From f5c5d4c4f385f115d3a0f569c18e4b66106ca4ff Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 27 Oct 2022 10:57:07 +0800
Subject: [PATCH 001/503] init

---
 chunk_codegen.py     | 1047 ++++++++++++++++++++++++++++++++++++++++++
 chunk_codegen_run.py |  177 +++++++
 2 files changed, 1224 insertions(+)
 create mode 100644 chunk_codegen.py
 create mode 100644 chunk_codegen_run.py

diff --git a/chunk_codegen.py b/chunk_codegen.py
new file mode 100644
index 000000000000..684028c014de
--- /dev/null
+++ b/chunk_codegen.py
@@ -0,0 +1,1047 @@
+import colossalai
+import torch
+from typing import List, Callable, Any, Tuple, Dict, Iterable
+
+try:    # CodeGen / inplace_methods may be missing from the installed torch.fx
+    from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
+    from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin
+    CODEGEN_AVAILABLE = True
+except ImportError:    # only a failed import selects the fallback; other errors must propagate
+    from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, _origin_type_map, _format_args, _CustomBuiltin
+    from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
+    CODEGEN_AVAILABLE = False
+
+if CODEGEN_AVAILABLE:
+    __all__ = ['ActivationCheckpointCodeGen']
+else:
+    __all__ = ['python_code_with_activation_checkpoint']
+
+
+def _gen_saved_tensors_hooks():
+    """
+    Return the source text of the saved tensors hooks as a (pack_hook, unpack_hook) pair of strings
+    """
+    # pack_hook_input offloads only tensors whose `offload` attribute is truthy; pack_hook_no_input offloads unless it is falsy
+    pack_hook = """def pack_hook_input(self, x):
+    if getattr(x, "offload", False):
+        return (x.device, x.cpu())
+    else:
+        return x
+ 
+def pack_hook_no_input(self, x):
+    if getattr(x, "offload", True):
+        return (x.device, x.cpu())
+    else:
+        return x
+"""
+    # unpack_hook moves a (device, tensor) tuple back to its device and returns anything else unchanged
+    unpack_hook = """def unpack_hook(self, packed):
+    if isinstance(packed, tuple):
+        device, tensor = packed
+        return tensor.to(device)
+    else:
+        return packed
+"""
+
+    return pack_hook, unpack_hook
+
+
+def _gen_save_tensors_hooks_context(offload_input=True) -> str:
+    """Generate the ``with`` header line for customized saved_tensors_hooks
+
+    Args:
+        offload_input (bool, optional): if True the header uses self.pack_hook_input,
+        otherwise it uses self.pack_hook_no_input. Defaults to True.
+
+    Returns:
+        str: the ``with torch.autograd.graph.saved_tensors_hooks(...):`` line, ending with a newline
+    """
+
+    if offload_input:
+        context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):\n"
+    else:
+        context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):\n"
+    return context
+
+
+def _gen_save_on_cpu_context():
+    """
+    Return the ``with torch.autograd.graph.save_on_cpu(pin_memory=True):`` header line, ending with a newline
+    """
+
+    context = "with torch.autograd.graph.save_on_cpu(pin_memory=True):\n"
+    return context
+
+
+def _find_input_and_output_nodes(nodes: List[Node]):
+    """
+    Return (input_names, output_names): reprs of outside nodes consumed by `nodes`, and reprs of nodes in `nodes` used outside.
+    """
+    input_nodes = []
+    output_nodes = []
+
+    # if a node has an input node which is not in the node list
+    # we treat that input node as the input of the checkpoint function
+    for node in nodes:
+        for input_node in node._input_nodes.keys():
+            node_repr = repr(input_node)
+            if input_node not in nodes and node_repr not in input_nodes:    # each name is recorded once
+                input_nodes.append(node_repr)
+
+    # if a node has a user node which is not in the node list
+    # we treat that user node as the node receiving the current node output
+    for node in nodes:
+        for output_node in node.users.keys():
+            node_repr = repr(node)    # the recorded name is the producing node, not its outside user
+            if output_node not in nodes and node_repr not in output_nodes:
+                output_nodes.append(node_repr)
+
+    return input_nodes, output_nodes
+
+
+def _find_ckpt_regions(nodes: List[Node]):
+    """
+    Find the checkpoint regions given a list of consecutive nodes. The outputs will be list
+    of tuples, each tuple is in the form of (start_index, end_index), both inclusive.
+    A region that is still open at the last node is closed at that node.
+    """
+    ckpt_regions = []
+    start = -1
+    current_region = None
+
+    for idx, node in enumerate(nodes):
+        if hasattr(node, 'activation_checkpoint'):
+            act_ckpt_label = node.activation_checkpoint
+
+            # this activation checkpoint label is not set yet
+            # meaning this is the first node of the activation ckpt region
+            if current_region is None:
+                current_region = act_ckpt_label
+                start = idx
+
+            # if activation checkpoint has changed
+            # we restart the tracking
+            # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2]
+            if act_ckpt_label != current_region:
+                assert start != -1
+                ckpt_regions.append((start, idx - 1))
+                current_region = act_ckpt_label
+                start = idx
+        elif current_region is not None:
+            # a node without a label closes the open region
+            # e.g. node ckpt states = [ckpt, ckpt, non-ckpt]
+            assert start != -1
+            ckpt_regions.append((start, idx - 1))
+            start = -1
+            current_region = None
+
+    # e.g. node ckpt states = [non-ckpt, ckpt, ckpt]: the node list ends inside a region,
+    # close it at the last node (as _find_nested_ckpt_regions does) instead of dropping it
+    if current_region is not None:
+        ckpt_regions.append((start, len(nodes) - 1))
+    return ckpt_regions
+
+
+def _find_offload_regions(nodes: List[Node]):
+    """Find the offload regions and their labels
+    In pofo algorithm, during annotation, we will annotate the offload region with the
+    list in the form of [idx, offload_input, offload_bar]. idx indicates the offload
+    region's index, offload_input is a bool indicating whether we need to offload the
+    input, offload_bar is a bool indicating whether we need to offload all the
+    intermediate x_bars of this region.
+    Returns:
+        (offload_regions, offload_labels): inclusive (start_index, end_index) tuples and
+        the label of each region, in the same order
+    """
+    offload_regions = []
+    offload_labels = []
+    start = -1
+    current_region = None
+
+    for idx, node in enumerate(nodes):
+        # only iterable labels open a region; a missing label reads as None and is not iterable
+        act_offload_label = getattr(node, 'activation_offload', None)
+        if isinstance(act_offload_label, Iterable):
+            # first node of a new offload region
+            if current_region is None:
+                current_region = act_offload_label
+                start = idx
+                offload_labels.append(act_offload_label)
+            # the label changed: close the previous region and open the next one
+            if act_offload_label != current_region:
+                assert start != -1
+                offload_regions.append((start, idx - 1))
+                offload_labels.append(act_offload_label)
+                current_region = act_offload_label
+                start = idx
+        elif current_region is not None:
+            # a node without an iterable label closes the open region
+            assert start != -1
+            offload_regions.append((start, idx - 1))
+            start = -1
+            current_region = None
+
+    # the node list ends inside a region: close it so regions and labels keep the same length
+    if current_region is not None:
+        offload_regions.append((start, len(nodes) - 1))
+    return offload_regions, offload_labels
+
+
+def _gen_ckpt_fn_def(label, free_vars: List[str]) -> str:
+    """
+    Generate the checkpoint function definition line ``def checkpoint_<label>(self, <free_vars>):`` (no trailing newline)
+    """
+    return f"def checkpoint_{label}({', '.join(['self'] + free_vars)}):"
+
+
+def _gen_ckpt_output(output_vars: List[str]) -> str:
+    """
+    Generate the return statement for checkpoint region: ``return `` followed by output_vars joined with ', '
+    """
+    return f"return {', '.join(output_vars)}"
+
+
+def _gen_ckpt_usage(label, activation_offload, input_vars, output_vars, use_reentrant=True):
+    """
+    Generate the checkpoint function call code text. NOTE(review): empty input_vars or output_vars give text that is not valid Python
+    """
+    outputs = ', '.join(output_vars)
+    inputs = ', '.join(input_vars)
+    return f'{outputs} = colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_{label}, {activation_offload}, {inputs}, use_reentrant={use_reentrant})'
+
+
+def _end_of_ckpt(node: Node, check_idx: int) -> bool:
+    """Check if the node could end the ckpt region
+
+    Args:
+        node (Node): torch.fx.Node
+        check_idx (int): the index of checkpoint level for
+        nested checkpoint
+
+    Returns:
+        bool: True if the node has no label, or its list label is None at check_idx
+    """
+    if hasattr(node, "activation_checkpoint"):
+        if isinstance(node.activation_checkpoint, list):
+            return node.activation_checkpoint[check_idx] is None
+        else:
+            return False
+    else:
+        return True
+
+
+def _find_nested_ckpt_regions(nodes, check_idx=0):
+    """
+    Find the nested checkpoint regions at level `check_idx` given a list of consecutive nodes. The outputs
+    will be list of tuples, each tuple is in the form of (start_index, end_index), both inclusive.
+    """
+    ckpt_regions = []
+    start = -1
+    end = -1
+    current_region = None
+
+    for idx, node in enumerate(nodes):
+        if hasattr(node, 'activation_checkpoint'):
+            if isinstance(getattr(node, 'activation_checkpoint'), int):    # an int label is used as is at every level
+                act_ckpt_label = node.activation_checkpoint
+            else:
+                act_ckpt_label = node.activation_checkpoint[check_idx]
+
+            # this activation checkpoint label is not set yet
+            # meaning this is the first node of the activation ckpt region
+            if current_region is None:
+                current_region = act_ckpt_label
+                start = idx
+
+            # if activation checkpoint has changed
+            # we restart the tracking
+            # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2]
+            if act_ckpt_label != current_region:
+                assert start != -1
+                ckpt_regions.append((start, idx - 1))
+                current_region = act_ckpt_label
+                start = idx
+                end = -1
+        elif current_region is not None and _end_of_ckpt(node, check_idx):
+            # used to check the case below
+            # node ckpt states = [ckpt, ckpt, non-ckpt]
+            end = idx - 1
+            assert start != -1 and end != -1
+            ckpt_regions.append((start, end))
+            start = end = -1
+            current_region = None
+        else:
+            pass
+    # the node list ends inside a region: close it at the last node
+    if current_region is not None:
+        end = len(nodes) - 1
+        ckpt_regions.append((start, end))
+    return ckpt_regions
+
+
+def emit_ckpt_func(body,
+                   ckpt_func,
+                   node_list: List[Node],
+                   emit_node_func,
+                   delete_unused_value_func,
+                   level=0,
+                   in_ckpt=False):
+    """Emit ckpt function in nested way
+
+    Args:
+        body: forward code, in recursive calls, this part will be checkpoint
+        functions code; receives the line that calls the emitted function
+        ckpt_func: checkpoint functions code, in recursive calls, this part
+        will be a buffer; receives the emitted function definition
+        node_list (List[Node]): nodes of one region, the label is read from node_list[0]
+        emit_node_func: function to emit a node, called as emit_node_func(node, code_list)
+        delete_unused_value_func: function to delete unused value, called the same way
+        level (int, optional): checkpoint level. Defaults to 0.
+        in_ckpt (bool, optional): indicates whether the func is in recursive
+        call; the call line of a list-labelled region is then indented. Defaults to False.
+    """
+    inputs, outputs = _find_input_and_output_nodes(node_list)
+
+    # if the current checkpoint function use int as label, using old generation method
+    if isinstance(node_list[0].activation_checkpoint, int):
+        label = node_list[0].activation_checkpoint
+        ckpt_fn_def = _gen_ckpt_fn_def(label, inputs)
+        ckpt_func.append(f'{ckpt_fn_def}\n')
+        for node in node_list:
+            emit_node_func(node, ckpt_func)
+            ckpt_func[-1] = '    ' + ckpt_func[-1]    # indent the emitted line into the function body
+            delete_unused_value_func(node, ckpt_func)
+
+        ckpt_func.append('    ' + _gen_ckpt_output(outputs) + '\n\n')
+        activation_offload = getattr(node_list[0], "activation_offload", False)
+        usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False)
+        usage += "\n"
+        body.append(usage)
+
+    # use nested ckpt function codegen
+    else:
+        # label given by each layer, e.g. if you are currently at level [0, 1, 1]
+        # the label will be '0_1_1'
+        label = "_".join([str(idx) for idx in node_list[0].activation_checkpoint[:level + 1]])
+        ckpt_fn_def = _gen_ckpt_fn_def(label, inputs)
+        ckpt_func.append(f'{ckpt_fn_def}\n')
+
+        # if there is more level to fetch
+        if level + 1 < len(node_list[0].activation_checkpoint):
+            ckpt_regions = _find_nested_ckpt_regions(node_list, level + 1)
+            start_idx = [item[0] for item in ckpt_regions]
+            end_idx = [item[1] for item in ckpt_regions]
+
+            # use ckpt_func_buffer to store nested checkpoint functions
+            ckpt_func_buffer = []
+            node_idx = 0
+            while 1:
+                if node_idx >= len(node_list):
+                    break
+                # a nested region: its call line goes into ckpt_func, its definition into the buffer
+                if node_idx in start_idx:
+                    ckpt_node_list = node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1]
+                    emit_ckpt_func(ckpt_func, ckpt_func_buffer, ckpt_node_list, emit_node_func,
+                                   delete_unused_value_func, level + 1, True)
+                    node_idx += len(ckpt_node_list)
+
+                else:
+                    node = node_list[node_idx]
+                    emit_node_func(node, ckpt_func)
+                    ckpt_func[-1] = '    ' + ckpt_func[-1]
+                    delete_unused_value_func(node, ckpt_func)
+                    node_idx += 1
+
+            ckpt_func.append('    ' + _gen_ckpt_output(outputs) + '\n\n')
+            ckpt_func += ckpt_func_buffer    # nested definitions are placed after the enclosing function
+            activation_offload = getattr(node_list[0], "activation_offload", False)
+            usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n'
+            if in_ckpt:
+                usage = '    ' + usage
+            body.append(usage)
+
+        # last level
+        else:
+            for node in node_list:
+                emit_node_func(node, ckpt_func)
+                ckpt_func[-1] = '    ' + ckpt_func[-1]
+                delete_unused_value_func(node, ckpt_func)
+
+            ckpt_func.append('    ' + _gen_ckpt_output(outputs) + '\n\n')
+            activation_offload = getattr(node_list[0], "activation_offload", False)
+            usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n'
+            if in_ckpt:
+                usage = '    ' + usage
+            body.append(usage)
+
+
+def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
+    """Emit code with nested activation checkpoint
+    When we detect some of the node.activation_checkpoint is a List, we will use
+    this function to emit the activation checkpoint codes.
+
+    Args:
+        body: forward code
+        ckpt_func: checkpoint functions code
+        nodes: graph.nodes
+        emit_node_func: function to emit node
+        delete_unused_value_func: function to remove the unused value
+    """
+    ckpt_regions = _find_nested_ckpt_regions(nodes, 0)
+    start_idx = [item[0] for item in ckpt_regions]
+    end_idx = [item[1] for item in ckpt_regions]
+
+    # find the offload regions
+    offload_regions, offload_labels = _find_offload_regions(nodes)
+    offload_starts = [item[0] for item in offload_regions]
+    offload_ends = [item[1] for item in offload_regions]
+    offload_inputs = []
+    offload_outputs = []    # collected below but not read again in this function
+    within_offload_region = False
+
+    node_list = list(nodes)
+
+    # find the input and output var names for each offload region
+    for idx, (start, end) in enumerate(offload_regions):
+        offload_node_list = node_list[start:end + 1]
+        inputs, outputs = _find_input_and_output_nodes(offload_node_list)
+        offload_inputs.append(inputs)
+        offload_outputs.append(outputs)
+
+    # this flag is to prevent repeated insert of save tensors
+    # hooks definition in ckpt_func
+    is_hook_inserted = False
+    node_idx = 0
+    while 1:
+        # break if we finish the processing all the nodes
+        if node_idx >= len(node_list):
+            break
+
+        # process ckpt_regions
+        if node_idx in start_idx:
+            ckpt_node_list = node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1]
+            emit_ckpt_func(body, ckpt_func, ckpt_node_list, emit_node_func, delete_unused_value_func)
+            node_idx += len(ckpt_node_list)    # skip the whole region, it was emitted as one call
+
+        # process node in forward function
+        else:
+            node = node_list[node_idx]
+
+            if node_idx in offload_starts:
+                offload_label = offload_labels[offload_starts.index(node_idx)]
+                _, offload_input, offload_bar = offload_label
+                within_offload_region = True
+
+                # insert hook functions if needed
+                if not is_hook_inserted:
+                    pack_hook, unpack_hook = _gen_saved_tensors_hooks()
+                    ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n")
+                    is_hook_inserted = True
+
+                if offload_input and offload_bar:
+                    body.append(_gen_save_on_cpu_context())
+
+                elif offload_input:
+                    # NOTE(review): indexed by the label's idx field, not by region position - assumes both match; confirm
+                    for par in offload_inputs[offload_label[0]]:
+                        body.append(f"setattr({par}, 'offload', True)\n")
+                    body.append(_gen_save_tensors_hooks_context(offload_input=True))
+
+                else:
+                    for par in offload_inputs[offload_label[0]]:
+                        body.append(f"setattr({par}, 'offload', False)\n")
+                    body.append(_gen_save_tensors_hooks_context(offload_input=False))
+
+            if within_offload_region:
+                emit_node_func(node, body)
+                body[-1] = '    ' + body[-1]    # indent the node under the `with` header emitted above
+                delete_unused_value_func(node, body)
+
+            else:
+                emit_node_func(node, body)
+                delete_unused_value_func(node, body)
+
+            if node_idx in offload_ends:
+                within_offload_region = False
+
+            node_idx += 1
+
+
+def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
+    """Emit forward code into body, and one checkpoint_<i> function per region found by _find_ckpt_regions into ckpt_func"""
+    ckpt_regions = _find_ckpt_regions(nodes)    # find the activation checkpoint regions
+    start_idx = [item[0] for item in ckpt_regions]
+    end_idx = [item[1] for item in ckpt_regions]
+    input_vars = []
+    output_vars = []
+    within_ckpt_region = False
+
+    # find the offload regions
+    offload_regions, offload_labels = _find_offload_regions(nodes)
+    offload_starts = [item[0] for item in offload_regions]
+    offload_ends = [item[1] for item in offload_regions]
+    offload_inputs = []
+    offload_outputs = []    # collected below but not read again in this function
+    within_offload_region = False
+
+    node_list = list(nodes)
+
+    # use this variable to avoid inserting hook functions
+    # to ckpt_func repeatedly
+    is_hook_inserted = False
+
+    # find the input and output var names for each region
+    for idx, (start, end) in enumerate(ckpt_regions):
+        ckpt_node_list = node_list[start:end + 1]
+        inputs, outputs = _find_input_and_output_nodes(ckpt_node_list)
+        input_vars.append(inputs)
+        output_vars.append(outputs)
+
+    # find the input and output var names for each offload region
+    for idx, (start, end) in enumerate(offload_regions):
+        offload_node_list = node_list[start:end + 1]
+        inputs, outputs = _find_input_and_output_nodes(offload_node_list)
+        offload_inputs.append(inputs)
+        offload_outputs.append(outputs)
+
+    # append code text to body
+    for idx, node in enumerate(node_list):
+        # if this is the first node of the ckpt region
+        # append the ckpt function definition
+        if idx in start_idx:
+            label = start_idx.index(idx)    # the function is named after the region's position in ckpt_regions
+            ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label])
+            ckpt_func.append(f'{ckpt_fn_def}\n')
+            within_ckpt_region = True
+
+        if idx in offload_starts:
+            offload_label = offload_labels[offload_starts.index(idx)]
+            _, offload_input, offload_bar = offload_label
+            within_offload_region = True
+
+            # insert hook functions if needed
+            if not is_hook_inserted:
+                pack_hook, unpack_hook = _gen_saved_tensors_hooks()
+                ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n")
+                is_hook_inserted = True
+
+            if offload_input and offload_bar:
+                body.append(_gen_save_on_cpu_context())
+
+            elif offload_input:
+                # NOTE(review): indexed by the label's idx field, not by region position - assumes both match; confirm
+                for par in offload_inputs[offload_label[0]]:
+                    body.append(f"setattr({par}, 'offload', True)\n")
+                body.append(_gen_save_tensors_hooks_context(offload_input=True))
+
+            else:
+                for par in offload_inputs[offload_label[0]]:
+                    body.append(f"setattr({par}, 'offload', False)\n")
+                body.append(_gen_save_tensors_hooks_context(offload_input=False))
+
+        # NOTE: emit_node does not emit a string with newline. It depends
+        # on delete_unused_values to append one
+        # NOTE: currently we separate body and ckpt_func definition
+        if within_ckpt_region:
+            emit_node_func(node, ckpt_func)
+            ckpt_func[-1] = '    ' + ckpt_func[-1]
+            delete_unused_value_func(node, ckpt_func)
+
+        elif within_offload_region:
+            emit_node_func(node, body)
+            body[-1] = '    ' + body[-1]
+            delete_unused_value_func(node, body)
+
+        else:
+            emit_node_func(node, body)
+            delete_unused_value_func(node, body)
+
+        if idx in end_idx:
+            # if this is the last node of the ckpt region
+            # generate return statement
+            label = end_idx.index(idx)
+            return_statement = _gen_ckpt_output(output_vars[label])
+            return_statement = f'    {return_statement}\n\n'
+            ckpt_func.append(return_statement)
+
+            # we need to check if the checkpoint need to offload the input
+            start_node_idx = start_idx[label]
+            if hasattr(node_list[start_node_idx], 'activation_offload'):
+                activation_offload = node_list[start_node_idx].activation_offload
+            else:
+                activation_offload = False
+
+            # we need to check if the checkpoint need use_reentrant=False
+            use_reentrant = True
+            non_leaf_input = 0
+            for var in input_vars[label]:
+                input_node = next(item for item in node_list if item.name == var)    # no default: raises StopIteration if no node has this name
+                if input_node.op != "placeholder":
+                    non_leaf_input = 1
+                for user in input_node.users:
+                    if hasattr(user, "activation_checkpoint"):
+                        # NOTE(review): compares the annotation with the region's position - assumes annotations are numbered in region order; confirm
+                        if user.activation_checkpoint == label:
+                            if user.op == "call_module":
+                                if hasattr(user.graph.owning_module.get_submodule(user.target), "inplace"):
+                                    use_reentrant = not user.graph.owning_module.get_submodule(user.target).inplace
+
+                            elif user.op == "call_function":
+                                if "inplace" in user.kwargs:
+                                    use_reentrant = not user.kwargs["inplace"]
+
+            # if all the inputs are leaf nodes, we need to set use_reentrant = False
+            if not non_leaf_input:
+                use_reentrant = False
+
+            # generate checkpoint function call in a new line
+            usage = _gen_ckpt_usage(label, activation_offload, input_vars[label], output_vars[label], use_reentrant)
+            usage += '\n'
+            body.append(usage)
+            within_ckpt_region = False
+
+        if idx in offload_ends:
+            within_offload_region = False
+
+
+if CODEGEN_AVAILABLE:
+
+    class ActivationCheckpointCodeGen(CodeGen):
+
+        def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
+            free_vars: List[str] = []
+            body: List[str] = []
+            globals_: Dict[str, Any] = {}
+            wrapped_fns: Dict[str, None] = {}
+
+            # Wrap string in list to pass by reference
+            maybe_return_annotation: List[str] = ['']
+
+            def add_global(name_hint: str, obj: Any):
+                """Add an obj to be tracked as a global.
+
+                We call this for names that reference objects external to the
+                Graph, like functions or types.
+
+                Returns: the global name that should be used to reference 'obj' in generated source.
+                """
+                if _is_from_torch(obj) and obj != torch.device:    # to support registering torch.device
+                    # HACK: workaround for how torch custom ops are registered. We
+                    # can't import them like normal modules so they must retain their
+                    # fully qualified name.
+                    return _get_qualified_name(obj)
+
+                # normalize the name hint to get a proper identifier
+                global_name = namespace.create_name(name_hint, obj)
+
+                if global_name in globals_:
+                    assert globals_[global_name] is obj
+                    return global_name
+                globals_[global_name] = obj
+                return global_name
+
+            # set _custom_builtins here so that we needn't import colossalai in forward
+            _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)
+
+            # Pre-fill the globals table with registered builtins.
+            for name, (_, obj) in _custom_builtins.items():
+                add_global(name, obj)
+
+            def type_repr(o: Any):
+                if o == ():
+                    # Empty tuple is used for empty tuple type annotation Tuple[()]
+                    return '()'
+
+                typename = _type_repr(o)
+
+                if hasattr(o, '__origin__'):
+                    # This is a generic type, e.g. typing.List[torch.Tensor]
+                    origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
+                    origin_typename = add_global(_type_repr(origin_type), origin_type)
+
+                    if hasattr(o, '__args__'):
+                        # Assign global names for each of the inner type variables.
+                        args = [type_repr(arg) for arg in o.__args__]
+
+                        if len(args) == 0:
+                            # Bare type, such as `typing.Tuple` with no subscript
+                            # This code-path used in Python < 3.9
+                            return origin_typename
+
+                        return f'{origin_typename}[{",".join(args)}]'
+                    else:
+                        # Bare type, such as `typing.Tuple` with no subscript
+                        # This code-path used in Python 3.9+
+                        return origin_typename
+
+                # Common case: this is a regular module name like 'foo.bar.baz'
+                return add_global(typename, o)
+
+            def _format_args(args: Tuple[Argument, ...], kwargs: Dict[str, Argument]) -> str:
+
+                def _get_repr(arg):
+                    # Handle NamedTuples (if it has `_fields`) via add_global.
+                    if isinstance(arg, tuple) and hasattr(arg, '_fields'):
+                        qualified_name = _get_qualified_name(type(arg))
+                        global_name = add_global(qualified_name, type(arg))
+                        return f"{global_name}{repr(tuple(arg))}"
+                    return repr(arg)
+
+                args_s = ', '.join(_get_repr(a) for a in args)
+                kwargs_s = ', '.join(f'{k} = {_get_repr(v)}' for k, v in kwargs.items())
+                if args_s and kwargs_s:
+                    return f'{args_s}, {kwargs_s}'
+                return args_s or kwargs_s
+
+            # Run through reverse nodes and record the first instance of a use
+            # of a given node. This represents the *last* use of the node in the
+            # execution order of the program, which we will use to free unused
+            # values
+            node_to_last_use: Dict[Node, Node] = {}
+            user_to_last_uses: Dict[Node, List[Node]] = {}
+
+            def register_last_uses(n: Node, user: Node):
+                if n not in node_to_last_use:
+                    node_to_last_use[n] = user
+                    user_to_last_uses.setdefault(user, []).append(n)
+
+            for node in reversed(nodes):
+                map_arg(node.args, lambda n: register_last_uses(n, node))
+                map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+            # NOTE: we add a variable to distinguish body and ckpt_func
+            def delete_unused_values(user: Node, body):
+                """
+                Delete values after their last use. This ensures that values that are
+                not used in the remainder of the code are freed and the memory usage
+                of the code is optimal.
+                """
+                if user.op == 'placeholder':
+                    return
+                if user.op == 'output':
+                    body.append('\n')
+                    return
+                nodes_to_delete = user_to_last_uses.get(user, [])
+                if len(nodes_to_delete):
+                    to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
+                    body.append(f';  {to_delete_str}\n')
+                else:
+                    body.append('\n')
+
+            # NOTE: we add a variable to distinguish body and ckpt_func
+            def emit_node(node: Node, body):
+                maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
+                if node.op == 'placeholder':
+                    assert isinstance(node.target, str)
+                    maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}'
+                    free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
+                    raw_name = node.target.replace('*', '')
+                    if raw_name != repr(node):
+                        body.append(f'{repr(node)} = {raw_name}\n')
+                    return
+                elif node.op == 'call_method':
+                    assert isinstance(node.target, str)
+                    body.append(
+                        f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}'
+                        f'({_format_args(node.args[1:], node.kwargs)})')
+                    return
+                elif node.op == 'call_function':
+                    assert callable(node.target)
+                    # pretty print operators
+                    if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
+                        assert isinstance(node.args, tuple)
+                        body.append(f'{repr(node)}{maybe_type_annotation} = '
+                                    f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
+                        return
+
+                    # pretty print inplace operators; required for jit.script to work properly
+                    # not currently supported in normal FX graphs, but generated by torchdynamo
+                    if node.target.__module__ == '_operator' and node.target.__name__ in inplace_methods:
+                        body.append(f'{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  '
+                                    f'{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}')
+                        return
+
+                    qualified_name = _get_qualified_name(node.target)
+                    global_name = add_global(qualified_name, node.target)
+                    # special case for getattr: node.args could be 2-argument or 3-argument
+                    # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
+                    if global_name == 'getattr' and \
+                    isinstance(node.args, tuple) and \
+                    isinstance(node.args[1], str) and \
+                    node.args[1].isidentifier() and \
+                    len(node.args) == 2:
+                        body.append(
+                            f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
+                        return
+                    body.append(
+                        f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
+                    if node.meta.get('is_wrapped', False):
+                        wrapped_fns.setdefault(global_name)
+                    return
+                elif node.op == 'call_module':
+                    assert isinstance(node.target, str)
+                    body.append(f'{repr(node)}{maybe_type_annotation} = '
+                                f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
+                    return
+                elif node.op == 'get_attr':
+                    assert isinstance(node.target, str)
+                    body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
+                    return
+                elif node.op == 'output':
+                    if node.type is not None:
+                        maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
+                    body.append(self.generate_output(node.args[0]))
+                    return
+                raise NotImplementedError(f'node: {node.op} {node.target}')
+
+            # Modified for activation checkpointing
+            ckpt_func = []
+
+            # if any node has a list of labels for activation_checkpoint, we
+            # will use nested type of activation checkpoint codegen
+            if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in nodes):
+                emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values)
+            else:
+                emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values)
+
+            if len(body) == 0:
+                # If the Graph has no non-placeholder nodes, no lines for the body
+                # have been emitted. To continue to have valid Python code, emit a
+                # single pass statement
+                body.append('pass\n')
+
+            if len(wrapped_fns) > 0:
+                wrap_name = add_global('wrap', torch.fx.wrap)
+                wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
+            else:
+                wrap_stmts = ''
+
+            if self._body_transformer:
+                body = self._body_transformer(body)
+
+            for name, value in self.additional_globals():
+                add_global(name, value)
+
+            # as we need colossalai.utils.checkpoint, we need to import colossalai
+            # in forward function
+            prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
+            prologue = ''.join(ckpt_func) + prologue
+            prologue = prologue
+
+            code = ''.join(body)
+            code = '\n'.join('    ' + line for line in code.split('\n'))
+            fn_code = f"""
+{wrap_stmts}
+
+{prologue}
+{code}"""
+            return PythonCode(fn_code, globals_)
+
+else:
+
+    def python_code_with_activation_checkpoint(self, root_module: str, namespace: _Namespace) -> PythonCode:
+        """
+        This method is copied from the _python_code of torch.fx.graph.Graph. Modifications are made so that it can generate
+        code for activation checkpoint.
+        """
+        free_vars: List[str] = []
+        body: List[str] = []
+        globals_: Dict[str, Any] = {}
+        wrapped_fns: Dict[str, None] = {}
+
+        # Wrap string in list to pass by reference
+        maybe_return_annotation: List[str] = ['']
+
+        def add_global(name_hint: str, obj: Any):
+            """Add an obj to be tracked as a global.
+
+            We call this for names that reference objects external to the
+            Graph, like functions or types.
+
+            Returns: the global name that should be used to reference 'obj' in generated source.
+            """
+            if _is_from_torch(obj) and obj != torch.device:    # to support registering torch.device
+                # HACK: workaround for how torch custom ops are registered. We
+                # can't import them like normal modules so they must retain their
+                # fully qualified name.
+                return _get_qualified_name(obj)
+
+            # normalize the name hint to get a proper identifier
+            global_name = namespace.create_name(name_hint, obj)
+
+            if global_name in globals_:
+                assert globals_[global_name] is obj
+                return global_name
+            globals_[global_name] = obj
+            return global_name
+
+        # set _custom_builtins here so that we needn't import colossalai in forward
+        _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)
+
+        # Pre-fill the globals table with registered builtins.
+        for name, (_, obj) in _custom_builtins.items():
+            add_global(name, obj)
+
+        def type_repr(o: Any):
+            if o == ():
+                # Empty tuple is used for empty tuple type annotation Tuple[()]
+                return '()'
+
+            typename = _type_repr(o)
+
+            # This is a generic type, e.g. typing.List[torch.Tensor]
+            if hasattr(o, '__origin__'):
+                origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
+                origin_typename = add_global(_type_repr(origin_type), origin_type)
+
+                # Assign global names for each of the inner type variables.
+                args = [type_repr(arg) for arg in o.__args__]
+
+                return f'{origin_typename}[{",".join(args)}]'
+
+            # Common case: this is a regular module name like 'foo.bar.baz'
+            return add_global(typename, o)
+
+        # Run through reverse nodes and record the first instance of a use
+        # of a given node. This represents the *last* use of the node in the
+        # execution order of the program, which we will use to free unused
+        # values
+        node_to_last_use: Dict[Node, Node] = {}
+        user_to_last_uses: Dict[Node, List[Node]] = {}
+
+        def register_last_uses(n: Node, user: Node):
+            if n not in node_to_last_use:
+                node_to_last_use[n] = user
+                user_to_last_uses.setdefault(user, []).append(n)
+
+        for node in reversed(self.nodes):
+            map_arg(node.args, lambda n: register_last_uses(n, node))
+            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+        # NOTE: we add a variable to distinguish body and ckpt_func
+        def delete_unused_values(user: Node, body):
+            """
+            Delete values after their last use. This ensures that values that are
+            not used in the remainder of the code are freed and the memory usage
+            of the code is optimal.
+            """
+            if user.op == 'placeholder':
+                return
+            if user.op == 'output':
+                body.append('\n')
+                return
+            nodes_to_delete = user_to_last_uses.get(user, [])
+            if len(nodes_to_delete):
+                to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
+                body.append(f';  {to_delete_str}\n')
+            else:
+                body.append('\n')
+
+        # NOTE: we add a variable to distinguish body and ckpt_func
+        def emit_node(node: Node, body):
+            maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
+            if node.op == 'placeholder':
+                assert isinstance(node.target, str)
+                maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}'
+                free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
+                raw_name = node.target.replace('*', '')
+                if raw_name != repr(node):
+                    body.append(f'{repr(node)} = {raw_name}\n')
+                return
+            elif node.op == 'call_method':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}'
+                            f'({_format_args(node.args[1:], node.kwargs)})')
+                return
+            elif node.op == 'call_function':
+                assert callable(node.target)
+                # pretty print operators
+                if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
+                    assert isinstance(node.args, tuple)
+                    body.append(f'{repr(node)}{maybe_type_annotation} = '
+                                f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
+                    return
+                qualified_name = _get_qualified_name(node.target)
+                global_name = add_global(qualified_name, node.target)
+                # special case for getattr: node.args could be 2-argument or 3-argument
+                # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
+                if global_name == 'getattr' and \
+                   isinstance(node.args, tuple) and \
+                   isinstance(node.args[1], str) and \
+                   node.args[1].isidentifier() and \
+                   len(node.args) == 2:
+                    body.append(
+                        f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
+                    return
+                body.append(
+                    f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
+                if node.meta.get('is_wrapped', False):
+                    wrapped_fns.setdefault(global_name)
+                return
+            elif node.op == 'call_module':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = '
+                            f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
+                return
+            elif node.op == 'get_attr':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
+                return
+            elif node.op == 'output':
+                if node.type is not None:
+                    maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
+                if self._pytree_info is None:
+                    body.append(f'return {repr(node.args[0])}')
+                else:
+                    body.append(f'return pytree.tree_unflatten({repr(node.args[0])}, self._out_spec)')
+                return
+            raise NotImplementedError(f'node: {node.op} {node.target}')
+
+        # Modified for activation checkpointing
+        ckpt_func = []
+
+        # if any node has a list of labels for activation_checkpoint, we
+        # will use nested type of activation checkpoint codegen
+        if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in self.nodes):
+            emit_code_with_nested_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values)
+        else:
+            emit_code_with_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values)
+
+        if len(body) == 0:
+            # If the Graph has no non-placeholder nodes, no lines for the body
+            # have been emitted. To continue to have valid Python code, emit a
+            # single pass statement
+            body.append('pass\n')
+        if self._pytree_info is not None:
+            orig_args = self._pytree_info.orig_args
+            has_orig_self = (orig_args[0] == 'self')
+            if has_orig_self:
+                free_vars.insert(0, 'self')
+            if len(free_vars) > 0:    # pytree has placeholders in it
+                body.insert(
+                    0,
+                    f"{', '.join(free_vars)}, = fx_pytree.tree_flatten_spec([{', '.join(orig_args)}], self._in_spec)\n")
+        else:
+            orig_args = free_vars
+
+        if len(wrapped_fns) > 0:
+            wrap_name = add_global('wrap', torch.fx.wrap)
+            wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
+        else:
+            wrap_stmts = ''
+
+        ckpt_func = ''.join(ckpt_func)
+
+        # If the original function didn't have self as its first argument, we
+        # would have added it.
+        if len(orig_args) == 0 or orig_args[0] != 'self':
+            orig_args.insert(0, 'self')
+        code = ''.join(body)
+        code = '\n'.join('    ' + line for line in code.split('\n'))
+
+        # as we need colossalai.utils.checkpoint, we need to import colossalai
+        # in forward function
+        fn_code = f"""
+{wrap_stmts}
+
+{ckpt_func}
+def forward({', '.join(orig_args)}){maybe_return_annotation[0]}:
+{code}"""
+        return PythonCode(fn_code, globals_)
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
new file mode 100644
index 000000000000..9ac399a29b51
--- /dev/null
+++ b/chunk_codegen_run.py
@@ -0,0 +1,177 @@
+import copy
+import torch
+import torch.nn.functional as F
+import pytest
+import torch.multiprocessing as mp
+from torch.fx import GraphModule
+from colossalai.fx import ColoTracer
+import colossalai
+from colossalai.utils import free_port
+from colossalai.core import global_context as gpc
+from colossalai.fx.graph_module import ColoGraphModule
+
+try:
+    from chunk_codegen import ActivationCheckpointCodeGen
+    with_codegen = True
+except:
+    # fall back to older pytorch version
+    from chunk_codegen import python_code_with_activation_checkpoint
+    with_codegen = False
+
+
+class MyNet(torch.nn.Module):
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.linear0 = torch.nn.Linear(4, 4)
+        self.linear1 = torch.nn.Linear(4, 4)
+        self.linear2 = torch.nn.Linear(4, 4)
+        self.linear3 = torch.nn.Linear(4, 4)
+        self.linear4 = torch.nn.Linear(4, 4)
+        self.linear5 = torch.nn.Linear(4, 4)
+        self.linear6 = torch.nn.Linear(4, 4)
+
+    def forward(self, x):
+        x = self.linear0(x)
+        x = self.linear1(x)
+        x = self.linear2(x)
+        x = self.linear3(x)
+        x = self.linear4(x)
+        x = self.linear5(x)
+        x = self.linear6(x)
+        return x
+
+
+def _is_all_gradient_close(m: torch.nn.Module, gm: GraphModule) -> bool:
+    for m_p, gm_p in zip(m.parameters(), gm.parameters()):
+        if not torch.allclose(m_p.grad, gm_p.grad):
+            return False
+    return True
+
+
+def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.Tensor):
+
+    # test forward
+    non_fx_out = model(data)
+    fx_out = gm(data)
+    assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output"
+
+    # test barckward
+    loss0 = non_fx_out.sum()
+    loss0.backward()
+    loss1 = fx_out.sum()
+    loss1.backward()
+    assert _is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one"
+
+
+def _run_offload_codegen(rank):
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl')
+
+    # build model and input
+    model = MyNet().cuda()
+    data = torch.rand(4, 4).cuda()
+
+    # trace the module and replace codegen
+    tracer = ColoTracer(trace_act_ckpt=True)
+    graph = tracer.trace(model)
+    codegen = ActivationCheckpointCodeGen()
+    graph.set_codegen(codegen)
+
+    # annotate the activation offload part
+    # also annotate the activation_checkpoint so we could test both types
+    # of input offload
+    for node in graph.nodes:
+        if node.name == "linear0":
+            setattr(node, "activation_offload", [0, True, False])
+        if node.name == "linear1":
+            setattr(node, "activation_offload", [0, True, False])
+        if node.name == "linear2":
+            setattr(node, "activation_offload", [1, True, True])
+        if node.name == "linear4":
+            setattr(node, "activation_offload", [2, False, True])
+        if node.name == "linear5":
+            setattr(node, "activation_checkpoint", [0])
+            setattr(node, "activation_offload", True)
+
+    gm = ColoGraphModule(copy.deepcopy(model), graph)
+    gm.recompile()
+
+    # assert we have all the components
+    code = graph.python_code("self").src
+    assert "def pack_hook_input(self, x):" in code and \
+    "def unpack_hook(self, packed):" in code and \
+    "def pack_hook_no_input(self, x):" in code and \
+    "setattr(x, 'offload', True)" in code and \
+    "setattr(linear3, 'offload', False)" in code and \
+    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \
+    "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \
+    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \
+    "colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code
+
+    _test_fwd_and_bwd(model, gm, data)
+    gpc.destroy()
+
+
+@pytest.mark.skipif(not with_codegen, reason='torch version is lower than 1.12.0')
+def test_act_ckpt_codegen():
+    mp.spawn(_run_offload_codegen, nprocs=1)
+
+
+def _run_offload_codegen_torch11(rank):
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl')
+
+    # build model and input
+    model = MyNet().cuda()
+    data = torch.rand(4, 4).cuda()
+
+    # trace the module and replace codegen
+    tracer = ColoTracer(trace_act_ckpt=True)
+    graph = tracer.trace(model)
+
+    # replace a bound method of an object
+    graph._python_code = python_code_with_activation_checkpoint.__get__(graph)
+
+    # annotate the activation offload part
+    # also annotate the activation_checkpoint so we could test both types
+    # of input offload
+    for node in graph.nodes:
+        if node.name == "linear0":
+            setattr(node, "activation_offload", [0, True, False])
+        if node.name == "linear1":
+            setattr(node, "activation_offload", [0, True, False])
+        if node.name == "linear2":
+            setattr(node, "activation_offload", [1, True, True])
+        if node.name == "linear4":
+            setattr(node, "activation_offload", [2, False, True])
+        if node.name == "linear5":
+            setattr(node, "activation_checkpoint", [0])
+            setattr(node, "activation_offload", True)
+
+    gm = ColoGraphModule(copy.deepcopy(model), graph)
+    gm.recompile()
+
+    # assert we have all the components
+    code = graph.python_code("self").src
+    assert "def pack_hook_input(self, x):" in code and \
+    "def unpack_hook(self, packed):" in code and \
+    "def pack_hook_no_input(self, x):" in code and \
+    "setattr(x, 'offload', True)" in code and \
+    "setattr(linear3, 'offload', False)" in code and \
+    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \
+    "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \
+    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \
+    "colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code
+
+    _test_fwd_and_bwd(model, gm, data)
+    gpc.destroy()
+
+
+@pytest.mark.skip(reason="currently torch11 ColoGraphModule is not implemented")
+def test_act_ckpt_python_code_torch11():
+    mp.spawn(_run_offload_codegen_torch11, nprocs=1)
+
+
+if __name__ == "__main__":
+    _run_offload_codegen(0)

From 87cddf7e147f8db1c9710eb37961c489c09bd5b9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 27 Oct 2022 16:40:19 +0800
Subject: [PATCH 002/503] rename and remove useless func

---
 chunk_codegen.py     | 398 +++----------------------------------------
 chunk_codegen_run.py |  69 +-------
 2 files changed, 27 insertions(+), 440 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 684028c014de..09fda2b988eb 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -12,7 +12,7 @@
     CODEGEN_AVAILABLE = False
 
 if CODEGEN_AVAILABLE:
-    __all__ = ['ActivationCheckpointCodeGen']
+    __all__ = ['ChunkCodeGen']
 else:
     __all__ = ['python_code_with_activation_checkpoint']
 
@@ -375,7 +375,7 @@ def emit_ckpt_func(body,
             body.append(usage)
 
 
-def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
+def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
     """Emit code with nested activation checkpoint
     When we detect some of the node.activation_checkpoint is a List, we will use
     this function to emit the activation checkpoint codes.
@@ -392,21 +392,21 @@ def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod
     end_idx = [item[1] for item in ckpt_regions]
 
     # find the offload regions
-    offload_regions, offload_labels = _find_offload_regions(nodes)
-    offload_starts = [item[0] for item in offload_regions]
-    offload_ends = [item[1] for item in offload_regions]
-    offload_inputs = []
-    offload_outputs = []
-    within_offload_region = False
+    chunk_regions, chunk_labels = _find_offload_regions(nodes)
+    chunk_starts = [item[0] for item in chunk_regions]
+    chunk_ends = [item[1] for item in chunk_regions]
+    chunk_inputs = []
+    chunk_outputs = []
+    within_chunk_region = False
 
     node_list = list(nodes)
 
     # find the input and output var names for each offload region
-    for idx, (start, end) in enumerate(offload_regions):
+    for idx, (start, end) in enumerate(chunk_regions):
         offload_node_list = node_list[start:end + 1]
         inputs, outputs = _find_input_and_output_nodes(offload_node_list)
-        offload_inputs.append(inputs)
-        offload_outputs.append(outputs)
+        chunk_inputs.append(inputs)
+        chunk_outputs.append(outputs)
 
     # this flag is to prevent repeated insert of save tensors
     # hooks definition in ckpt_func
@@ -427,10 +427,10 @@ def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod
         else:
             node = node_list[node_idx]
 
-            if node_idx in offload_starts:
-                offload_label = offload_labels[offload_starts.index(node_idx)]
-                _, offload_input, offload_bar = offload_label
-                within_offload_region = True
+            if node_idx in chunk_starts:
+                chunk_label = chunk_labels[chunk_starts.index(node_idx)]
+                _, chunk_input, chunk_bar = chunk_label
+                within_chunk_region = True
 
                 # insert hook functions if needed
                 if not is_hook_inserted:
@@ -438,20 +438,20 @@ def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod
                     ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n")
                     is_hook_inserted = True
 
-                if offload_input and offload_bar:
+                if chunk_input and chunk_bar:
                     body.append(_gen_save_on_cpu_context())
 
-                elif offload_input:
-                    for par in offload_inputs[offload_label[0]]:
+                elif chunk_input:
+                    for par in chunk_inputs[chunk_label[0]]:
                         body.append(f"setattr({par}, 'offload', True)\n")
                     body.append(_gen_save_tensors_hooks_context(offload_input=True))
 
                 else:
-                    for par in offload_inputs[offload_label[0]]:
+                    for par in chunk_inputs[chunk_label[0]]:
                         body.append(f"setattr({par}, 'offload', False)\n")
                     body.append(_gen_save_tensors_hooks_context(offload_input=False))
 
-            if within_offload_region:
+            if within_chunk_region:
                 emit_node_func(node, body)
                 body[-1] = '    ' + body[-1]
                 delete_unused_value_func(node, body)
@@ -460,150 +460,15 @@ def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod
                 emit_node_func(node, body)
                 delete_unused_value_func(node, body)
 
-            if node_idx in offload_ends:
-                within_offload_region = False
+            if node_idx in chunk_ends:
+                within_chunk_region = False
 
             node_idx += 1
 
 
-def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
-    # find the activation checkpoint regions
-    ckpt_regions = _find_ckpt_regions(nodes)
-    start_idx = [item[0] for item in ckpt_regions]
-    end_idx = [item[1] for item in ckpt_regions]
-    input_vars = []
-    output_vars = []
-    within_ckpt_region = False
-
-    # find the offload regions
-    offload_regions, offload_labels = _find_offload_regions(nodes)
-    offload_starts = [item[0] for item in offload_regions]
-    offload_ends = [item[1] for item in offload_regions]
-    offload_inputs = []
-    offload_outputs = []
-    within_offload_region = False
-
-    node_list = list(nodes)
-
-    # use this variable to avoid inserting hook functions
-    # to ckpt_func repeatedly
-    is_hook_inserted = False
-
-    # find the input and output var names for each region
-    for idx, (start, end) in enumerate(ckpt_regions):
-        ckpt_node_list = node_list[start:end + 1]
-        inputs, outputs = _find_input_and_output_nodes(ckpt_node_list)
-        input_vars.append(inputs)
-        output_vars.append(outputs)
-
-    # find the input and output var names for each offload region
-    for idx, (start, end) in enumerate(offload_regions):
-        offload_node_list = node_list[start:end + 1]
-        inputs, outputs = _find_input_and_output_nodes(offload_node_list)
-        offload_inputs.append(inputs)
-        offload_outputs.append(outputs)
-
-    # append code text to body
-    for idx, node in enumerate(node_list):
-        # if this is the first node of the ckpt region
-        # append the ckpt function defition
-        if idx in start_idx:
-            label = start_idx.index(idx)
-            ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label])
-            ckpt_func.append(f'{ckpt_fn_def}\n')
-            within_ckpt_region = True
-
-        if idx in offload_starts:
-            offload_label = offload_labels[offload_starts.index(idx)]
-            _, offload_input, offload_bar = offload_label
-            within_offload_region = True
-
-            # insert hook functions if needed
-            if not is_hook_inserted:
-                pack_hook, unpack_hook = _gen_saved_tensors_hooks()
-                ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n")
-                is_hook_inserted = True
-
-            if offload_input and offload_bar:
-                body.append(_gen_save_on_cpu_context())
-
-            elif offload_input:
-                for par in offload_inputs[offload_label[0]]:
-                    body.append(f"setattr({par}, 'offload', True)\n")
-                body.append(_gen_save_tensors_hooks_context(offload_input=True))
-
-            else:
-                for par in offload_inputs[offload_label[0]]:
-                    body.append(f"setattr({par}, 'offload', False)\n")
-                body.append(_gen_save_tensors_hooks_context(offload_input=False))
-
-        # NOTE: emit_node does not emit a string with newline. It depends
-        # on delete_unused_values to append one
-        # NOTE: currently we separate body and ckpt_func definition
-        if within_ckpt_region:
-            emit_node_func(node, ckpt_func)
-            ckpt_func[-1] = '    ' + ckpt_func[-1]
-            delete_unused_value_func(node, ckpt_func)
-
-        elif within_offload_region:
-            emit_node_func(node, body)
-            body[-1] = '    ' + body[-1]
-            delete_unused_value_func(node, body)
-
-        else:
-            emit_node_func(node, body)
-            delete_unused_value_func(node, body)
-
-        if idx in end_idx:
-            # if this is the last node of the ckpt region
-            # generate return statement
-            label = end_idx.index(idx)
-            return_statement = _gen_ckpt_output(output_vars[label])
-            return_statement = f'    {return_statement}\n\n'
-            ckpt_func.append(return_statement)
-
-            # we need to check if the checkpoint need to offload the input
-            start_node_idx = start_idx[label]
-            if hasattr(node_list[start_node_idx], 'activation_offload'):
-                activation_offload = node_list[start_node_idx].activation_offload
-            else:
-                activation_offload = False
-
-            # we need to check if the checkpoint need use_reentrant=False
-            use_reentrant = True
-            non_leaf_input = 0
-            for var in input_vars[label]:
-                input_node = next(item for item in node_list if item.name == var)
-                if input_node.op != "placeholder":
-                    non_leaf_input = 1
-                for user in input_node.users:
-                    if hasattr(user, "activation_checkpoint"):
-                        if user.activation_checkpoint == label:
-                            if user.op == "call_module":
-                                if hasattr(user.graph.owning_module.get_submodule(user.target), "inplace"):
-                                    use_reentrant = not user.graph.owning_module.get_submodule(user.target).inplace
-
-                            elif user.op == "call_function":
-                                if "inplace" in user.kwargs:
-                                    use_reentrant = not user.kwargs["inplace"]
-
-            # if all the inputs are leaf nodes, we need to set use_reentrant = False
-            if not non_leaf_input:
-                use_reentrant = False
-
-            # generate checkpoint function call in a new line
-            usage = _gen_ckpt_usage(label, activation_offload, input_vars[label], output_vars[label], use_reentrant)
-            usage += '\n'
-            body.append(usage)
-            within_ckpt_region = False
-
-        if idx in offload_ends:
-            within_offload_region = False
-
-
 if CODEGEN_AVAILABLE:
 
-    class ActivationCheckpointCodeGen(CodeGen):
+    class ChunkCodeGen(CodeGen):
 
         def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
             free_vars: List[str] = []
@@ -796,10 +661,7 @@ def emit_node(node: Node, body):
 
             # if any node has a list of labels for activation_checkpoint, we
             # will use nested type of activation checkpoint codegen
-            if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in nodes):
-                emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values)
-            else:
-                emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values)
+            emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values)
 
             if len(body) == 0:
                 # If the Graph has no non-placeholder nodes, no lines for the body
@@ -833,215 +695,3 @@ def emit_node(node: Node, body):
 {prologue}
 {code}"""
             return PythonCode(fn_code, globals_)
-
-else:
-
-    def python_code_with_activation_checkpoint(self, root_module: str, namespace: _Namespace) -> PythonCode:
-        """
-        This method is copied from the _python_code of torch.fx.graph.Graph. Modifications are made so that it can generate
-        code for activation checkpoint.
-        """
-        free_vars: List[str] = []
-        body: List[str] = []
-        globals_: Dict[str, Any] = {}
-        wrapped_fns: Dict[str, None] = {}
-
-        # Wrap string in list to pass by reference
-        maybe_return_annotation: List[str] = ['']
-
-        def add_global(name_hint: str, obj: Any):
-            """Add an obj to be tracked as a global.
-
-            We call this for names that reference objects external to the
-            Graph, like functions or types.
-
-            Returns: the global name that should be used to reference 'obj' in generated source.
-            """
-            if _is_from_torch(obj) and obj != torch.device:    # to support registering torch.device
-                # HACK: workaround for how torch custom ops are registered. We
-                # can't import them like normal modules so they must retain their
-                # fully qualified name.
-                return _get_qualified_name(obj)
-
-            # normalize the name hint to get a proper identifier
-            global_name = namespace.create_name(name_hint, obj)
-
-            if global_name in globals_:
-                assert globals_[global_name] is obj
-                return global_name
-            globals_[global_name] = obj
-            return global_name
-
-        # set _custom_builtins here so that we needn't import colossalai in forward
-        _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)
-
-        # Pre-fill the globals table with registered builtins.
-        for name, (_, obj) in _custom_builtins.items():
-            add_global(name, obj)
-
-        def type_repr(o: Any):
-            if o == ():
-                # Empty tuple is used for empty tuple type annotation Tuple[()]
-                return '()'
-
-            typename = _type_repr(o)
-
-            # This is a generic type, e.g. typing.List[torch.Tensor]
-            if hasattr(o, '__origin__'):
-                origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
-                origin_typename = add_global(_type_repr(origin_type), origin_type)
-
-                # Assign global names for each of the inner type variables.
-                args = [type_repr(arg) for arg in o.__args__]
-
-                return f'{origin_typename}[{",".join(args)}]'
-
-            # Common case: this is a regular module name like 'foo.bar.baz'
-            return add_global(typename, o)
-
-        # Run through reverse nodes and record the first instance of a use
-        # of a given node. This represents the *last* use of the node in the
-        # execution order of the program, which we will use to free unused
-        # values
-        node_to_last_use: Dict[Node, Node] = {}
-        user_to_last_uses: Dict[Node, List[Node]] = {}
-
-        def register_last_uses(n: Node, user: Node):
-            if n not in node_to_last_use:
-                node_to_last_use[n] = user
-                user_to_last_uses.setdefault(user, []).append(n)
-
-        for node in reversed(self.nodes):
-            map_arg(node.args, lambda n: register_last_uses(n, node))
-            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-
-        # NOTE: we add a variable to distinguish body and ckpt_func
-        def delete_unused_values(user: Node, body):
-            """
-            Delete values after their last use. This ensures that values that are
-            not used in the remainder of the code are freed and the memory usage
-            of the code is optimal.
-            """
-            if user.op == 'placeholder':
-                return
-            if user.op == 'output':
-                body.append('\n')
-                return
-            nodes_to_delete = user_to_last_uses.get(user, [])
-            if len(nodes_to_delete):
-                to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
-                body.append(f';  {to_delete_str}\n')
-            else:
-                body.append('\n')
-
-        # NOTE: we add a variable to distinguish body and ckpt_func
-        def emit_node(node: Node, body):
-            maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
-            if node.op == 'placeholder':
-                assert isinstance(node.target, str)
-                maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}'
-                free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
-                raw_name = node.target.replace('*', '')
-                if raw_name != repr(node):
-                    body.append(f'{repr(node)} = {raw_name}\n')
-                return
-            elif node.op == 'call_method':
-                assert isinstance(node.target, str)
-                body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}'
-                            f'({_format_args(node.args[1:], node.kwargs)})')
-                return
-            elif node.op == 'call_function':
-                assert callable(node.target)
-                # pretty print operators
-                if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
-                    assert isinstance(node.args, tuple)
-                    body.append(f'{repr(node)}{maybe_type_annotation} = '
-                                f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
-                    return
-                qualified_name = _get_qualified_name(node.target)
-                global_name = add_global(qualified_name, node.target)
-                # special case for getattr: node.args could be 2-argument or 3-argument
-                # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
-                if global_name == 'getattr' and \
-                   isinstance(node.args, tuple) and \
-                   isinstance(node.args[1], str) and \
-                   node.args[1].isidentifier() and \
-                   len(node.args) == 2:
-                    body.append(
-                        f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
-                    return
-                body.append(
-                    f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
-                if node.meta.get('is_wrapped', False):
-                    wrapped_fns.setdefault(global_name)
-                return
-            elif node.op == 'call_module':
-                assert isinstance(node.target, str)
-                body.append(f'{repr(node)}{maybe_type_annotation} = '
-                            f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
-                return
-            elif node.op == 'get_attr':
-                assert isinstance(node.target, str)
-                body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
-                return
-            elif node.op == 'output':
-                if node.type is not None:
-                    maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
-                if self._pytree_info is None:
-                    body.append(f'return {repr(node.args[0])}')
-                else:
-                    body.append(f'return pytree.tree_unflatten({repr(node.args[0])}, self._out_spec)')
-                return
-            raise NotImplementedError(f'node: {node.op} {node.target}')
-
-        # Modified for activation checkpointing
-        ckpt_func = []
-
-        # if any node has a list of labels for activation_checkpoint, we
-        # will use nested type of activation checkpoint codegen
-        if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in self.nodes):
-            emit_code_with_nested_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values)
-        else:
-            emit_code_with_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values)
-
-        if len(body) == 0:
-            # If the Graph has no non-placeholder nodes, no lines for the body
-            # have been emitted. To continue to have valid Python code, emit a
-            # single pass statement
-            body.append('pass\n')
-        if self._pytree_info is not None:
-            orig_args = self._pytree_info.orig_args
-            has_orig_self = (orig_args[0] == 'self')
-            if has_orig_self:
-                free_vars.insert(0, 'self')
-            if len(free_vars) > 0:    # pytree has placeholders in it
-                body.insert(
-                    0,
-                    f"{', '.join(free_vars)}, = fx_pytree.tree_flatten_spec([{', '.join(orig_args)}], self._in_spec)\n")
-        else:
-            orig_args = free_vars
-
-        if len(wrapped_fns) > 0:
-            wrap_name = add_global('wrap', torch.fx.wrap)
-            wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
-        else:
-            wrap_stmts = ''
-
-        ckpt_func = ''.join(ckpt_func)
-
-        # If the original function didn't have self as its first argument, we
-        # would have added it.
-        if len(orig_args) == 0 or orig_args[0] != 'self':
-            orig_args.insert(0, 'self')
-        code = ''.join(body)
-        code = '\n'.join('    ' + line for line in code.split('\n'))
-
-        # as we need colossalai.utils.checkpoint, we need to import colossalai
-        # in forward function
-        fn_code = f"""
-{wrap_stmts}
-
-{ckpt_func}
-def forward({', '.join(orig_args)}){maybe_return_annotation[0]}:
-{code}"""
-        return PythonCode(fn_code, globals_)
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 9ac399a29b51..85164bdada96 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -11,7 +11,7 @@
 from colossalai.fx.graph_module import ColoGraphModule
 
 try:
-    from chunk_codegen import ActivationCheckpointCodeGen
+    from chunk_codegen import ChunkCodeGen
     with_codegen = True
 except:
     # fall back to older pytorch version
@@ -75,7 +75,7 @@ def _run_offload_codegen(rank):
     # trace the module and replace codegen
     tracer = ColoTracer(trace_act_ckpt=True)
     graph = tracer.trace(model)
-    codegen = ActivationCheckpointCodeGen()
+    codegen = ChunkCodeGen()
     graph.set_codegen(codegen)
 
     # annotate the activation offload part
@@ -99,15 +99,7 @@ def _run_offload_codegen(rank):
 
     # assert we have all the components
     code = graph.python_code("self").src
-    assert "def pack_hook_input(self, x):" in code and \
-    "def unpack_hook(self, packed):" in code and \
-    "def pack_hook_no_input(self, x):" in code and \
-    "setattr(x, 'offload', True)" in code and \
-    "setattr(linear3, 'offload', False)" in code and \
-    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \
-    "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \
-    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \
-    "colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code
+    print(code)
 
     _test_fwd_and_bwd(model, gm, data)
     gpc.destroy()
@@ -118,60 +110,5 @@ def test_act_ckpt_codegen():
     mp.spawn(_run_offload_codegen, nprocs=1)
 
 
-def _run_offload_codegen_torch11(rank):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
-    colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl')
-
-    # build model and input
-    model = MyNet().cuda()
-    data = torch.rand(4, 4).cuda()
-
-    # trace the module and replace codegen
-    tracer = ColoTracer(trace_act_ckpt=True)
-    graph = tracer.trace(model)
-
-    # replace a bound method of an object
-    graph._python_code = python_code_with_activation_checkpoint.__get__(graph)
-
-    # annotate the activation offload part
-    # also annotate the activation_checkpoint so we could test both types
-    # of input offload
-    for node in graph.nodes:
-        if node.name == "linear0":
-            setattr(node, "activation_offload", [0, True, False])
-        if node.name == "linear1":
-            setattr(node, "activation_offload", [0, True, False])
-        if node.name == "linear2":
-            setattr(node, "activation_offload", [1, True, True])
-        if node.name == "linear4":
-            setattr(node, "activation_offload", [2, False, True])
-        if node.name == "linear5":
-            setattr(node, "activation_checkpoint", [0])
-            setattr(node, "activation_offload", True)
-
-    gm = ColoGraphModule(copy.deepcopy(model), graph)
-    gm.recompile()
-
-    # assert we have all the components
-    code = graph.python_code("self").src
-    assert "def pack_hook_input(self, x):" in code and \
-    "def unpack_hook(self, packed):" in code and \
-    "def pack_hook_no_input(self, x):" in code and \
-    "setattr(x, 'offload', True)" in code and \
-    "setattr(linear3, 'offload', False)" in code and \
-    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \
-    "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \
-    "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \
-    "colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code
-
-    _test_fwd_and_bwd(model, gm, data)
-    gpc.destroy()
-
-
-@pytest.mark.skip(reason="currently torch11 ColoGraphModule is not implemented")
-def test_act_ckpt_python_code_torch11():
-    mp.spawn(_run_offload_codegen_torch11, nprocs=1)
-
-
 if __name__ == "__main__":
     _run_offload_codegen(0)

From 78cfe4362b4550635f609a8b52a8489c7f9aa564 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Wed, 2 Nov 2022 13:59:48 +0800
Subject: [PATCH 003/503] basic chunk

---
 chunk_codegen.py     | 66 ++++++++++++++++++++++----------------------
 chunk_codegen_run.py | 15 +++++-----
 2 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 09fda2b988eb..c605e35f4725 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -46,6 +46,19 @@ def pack_hook_no_input(self, x):
     return pack_hook, unpack_hook
 
 
+def _gen_loop_5(to_keep):  # build the source text that opens a chunk loop; `to_keep` is the name (str) of the variable to slice
+    context = "chunk_result = []\nfor gen_loop_idx in range(4):\n"  # the iteration count is hard-coded to 4
+    context += "    chunk_tensor = " + to_keep + "[gen_loop_idx, :]\n"  # first line of the loop body: index the first dimension
+    return context
+
+
+def _gen_loop_5_final(final_name, to_keep):  # build the source text that closes a chunk loop and gathers its results
+    context = "    chunk_result.append(" + final_name + ")\n"  # indented, so this line is still inside the generated loop
+    context += "chunk_result = torch.cat(chunk_result, dim=0);  " + to_keep[0] + " = None\n"  # after the loop: concatenate along dim 0 and set the variable named by to_keep[0] to None
+    context += final_name + " = chunk_result; chunk_result = None\n"  # rebind the final name to the concatenated result
+    return context
+
+    
 def _gen_save_tensors_hooks_context(offload_input=True) -> str:
     """Generate customized saved_tensors_hooks
 
@@ -410,57 +423,40 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
 
     # this flag is to prevent repeated insert of save tensors
     # hooks definition in ckpt_func
-    is_hook_inserted = False
     node_idx = 0
-    while 1:
+    to_keep = []
+    while node_idx < len(node_list):
         # break if we finish the processing all the nodes
         if node_idx >= len(node_list):
             break
 
-        # process ckpt_regions
-        if node_idx in start_idx:
-            ckpt_node_list = node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1]
-            emit_ckpt_func(body, ckpt_func, ckpt_node_list, emit_node_func, delete_unused_value_func)
-            node_idx += len(ckpt_node_list)
-
         # process node in forward function
         else:
             node = node_list[node_idx]
 
             if node_idx in chunk_starts:
-                chunk_label = chunk_labels[chunk_starts.index(node_idx)]
-                _, chunk_input, chunk_bar = chunk_label
+                # save the chunk input var; don't delete it
+                to_keep.extend(node.args[0].name)
                 within_chunk_region = True
-
-                # insert hook functions if needed
-                if not is_hook_inserted:
-                    pack_hook, unpack_hook = _gen_saved_tensors_hooks()
-                    ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n")
-                    is_hook_inserted = True
-
-                if chunk_input and chunk_bar:
-                    body.append(_gen_save_on_cpu_context())
-
-                elif chunk_input:
-                    for par in chunk_inputs[chunk_label[0]]:
-                        body.append(f"setattr({par}, 'offload', True)\n")
-                    body.append(_gen_save_tensors_hooks_context(offload_input=True))
-
-                else:
-                    for par in chunk_inputs[chunk_label[0]]:
-                        body.append(f"setattr({par}, 'offload', False)\n")
-                    body.append(_gen_save_tensors_hooks_context(offload_input=False))
+                # add for loop
+                body.append(_gen_loop_5(to_keep[0]))
+                # change first node's input to new chunked var
+                node_args = list(node.args)
+                node_args[0] = 'chunk_tensor'
 
             if within_chunk_region:
                 emit_node_func(node, body)
                 body[-1] = '    ' + body[-1]
-                delete_unused_value_func(node, body)
+                delete_unused_value_func(node, body, to_keep)
 
             else:
                 emit_node_func(node, body)
-                delete_unused_value_func(node, body)
+                if node_idx not in chunk_inputs:
+                    delete_unused_value_func(node, body, to_keep)
 
             if node_idx in chunk_ends:
+                body.append(_gen_loop_5_final(node.name, to_keep))
+                to_keep = []
                 within_chunk_region = False
 
             node_idx += 1
@@ -572,7 +568,7 @@ def register_last_uses(n: Node, user: Node):
                 map_arg(node.kwargs, lambda n: register_last_uses(n, node))
 
             # NOTE: we add a variable to distinguish body and ckpt_func
-            def delete_unused_values(user: Node, body):
+            def delete_unused_values(user: Node, body, to_keep=[]):
                 """
                 Delete values after their last use. This ensures that values that are
                 not used in the remainder of the code are freed and the memory usage
@@ -584,6 +580,9 @@ def delete_unused_values(user: Node, body):
                     body.append('\n')
                     return
                 nodes_to_delete = user_to_last_uses.get(user, [])
+                for n in nodes_to_delete:
+                    if n.name in to_keep:
+                        nodes_to_delete.remove(n)
                 if len(nodes_to_delete):
                     to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
                     body.append(f';  {to_delete_str}\n')
@@ -693,5 +692,6 @@ def emit_node(node: Node, body):
 {wrap_stmts}
 
 {prologue}
-{code}"""
+{code}"""   
+            print(fn_code)
             return PythonCode(fn_code, globals_)
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 85164bdada96..69b327d4bd5b 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -54,6 +54,7 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.T
     # test forward
     non_fx_out = model(data)
     fx_out = gm(data)
+    print(non_fx_out.shape, fx_out.shape)
     assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output"
 
     # test barckward
@@ -86,13 +87,13 @@ def _run_offload_codegen(rank):
             setattr(node, "activation_offload", [0, True, False])
         if node.name == "linear1":
             setattr(node, "activation_offload", [0, True, False])
-        if node.name == "linear2":
-            setattr(node, "activation_offload", [1, True, True])
-        if node.name == "linear4":
-            setattr(node, "activation_offload", [2, False, True])
-        if node.name == "linear5":
-            setattr(node, "activation_checkpoint", [0])
-            setattr(node, "activation_offload", True)
+        # if node.name == "linear2":
+        #     setattr(node, "activation_offload", [1, True, True])
+        # if node.name == "linear4":
+        #     setattr(node, "activation_offload", [2, False, True])
+        # if node.name == "linear5":
+        #     setattr(node, "activation_checkpoint", [0])
+        #     setattr(node, "activation_offload", True)
 
     gm = ColoGraphModule(copy.deepcopy(model), graph)
     gm.recompile()

From 86f2a3147415f2afe53019cd7b9d9414de1510e9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Wed, 2 Nov 2022 15:12:08 +0800
Subject: [PATCH 004/503] add evoformer

---
 evoformer/evoformer.py   |  47 ++++++++++
 evoformer/initializer.py |  29 ++++++
 evoformer/kernel.py      |  19 ++++
 evoformer/msa.py         |  95 +++++++++++++++++++
 evoformer/ops.py         | 176 +++++++++++++++++++++++++++++++++++
 evoformer/triangle.py    | 192 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 558 insertions(+)
 create mode 100644 evoformer/evoformer.py
 create mode 100755 evoformer/initializer.py
 create mode 100644 evoformer/kernel.py
 create mode 100644 evoformer/msa.py
 create mode 100755 evoformer/ops.py
 create mode 100644 evoformer/triangle.py

diff --git a/evoformer/evoformer.py b/evoformer/evoformer.py
new file mode 100644
index 000000000000..ef3df2769840
--- /dev/null
+++ b/evoformer/evoformer.py
@@ -0,0 +1,47 @@
+import torch
+import torch.nn as nn
+
+from .msa import MSAStack
+from .ops import OutProductMean
+from .triangle import PairStack
+
+
+class EvoformerBlock(nn.Module):
+    # One block: updates the node features via MSAStack, then the pair features via OutProductMean and PairStack.
+    def __init__(self, d_node, d_pair):
+        super(EvoformerBlock, self).__init__()
+
+        self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15)
+        self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32)
+        self.pair_stack = PairStack(d_pair=d_pair)
+
+    def forward(self, node, pair):  # returns the updated (node, pair) tuple
+        node = node + self.msa_stack(node, pair)  # residual update of node, computed from both node and pair
+        pair = pair + self.communication(node)  # uses the already-updated node
+        pair = pair + self.pair_stack(pair)
+        return node, pair
+
+
+class Evoformer(nn.Module):
+    # A stack of EvoformerBlock modules applied one after another to (node, pair).
+    def __init__(self, d_node, d_pair):
+        super(Evoformer, self).__init__()
+
+        self.blocks = nn.ModuleList()
+        for _ in range(3):  # the number of blocks is hard-coded to 3
+            self.blocks.append(EvoformerBlock(d_node, d_pair))
+
+    def forward(self, node, pair):  # returns the (node, pair) tuple produced by the last block
+        for b in self.blocks:
+            node, pair = b(node, pair)  # each block consumes the previous block's outputs
+        return node, pair
+
+def evoformer_base():  # factory: Evoformer with d_node=256 and d_pair=128
+    return Evoformer(d_node=256, d_pair=128)
+
+
+def evoformer_large():  # factory: Evoformer with d_node=512 and d_pair=256
+    return Evoformer(d_node=512, d_pair=256)
+
+
+__all__ = ['Evoformer', 'evoformer_base', 'evoformer_large']
diff --git a/evoformer/initializer.py b/evoformer/initializer.py
new file mode 100755
index 000000000000..c6ce0659e597
--- /dev/null
+++ b/evoformer/initializer.py
@@ -0,0 +1,29 @@
+import math
+
+import numpy as np
+import torch.nn as nn
+
+
+def glorot_uniform_af(x, gain=1.0):
+    """
+    initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different:
+    In PyTorch:
+    [feature_out, feature_in, n_head ...]
+    In Jax:
+    [... n_head, feature_in, feature_out]
+    However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like:
+    [feature_in, n_head, feature_out]
+
+    In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors
+    """
+    fan_in, fan_out = x.shape[-2:]  # the last two dimensions are taken as (fan_in, fan_out)
+    if len(x.shape) > 2:
+        receptive_field_size = np.prod(x.shape[:-2])  # product of all leading dimensions
+        fan_in *= receptive_field_size
+        fan_out *= receptive_field_size
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    dev = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
+    # fill x in place with samples drawn uniformly from [-dev, dev]
+    nn.init.uniform_(x, -dev, dev)
+    # the tensor that was passed in is returned
+    return x
diff --git a/evoformer/kernel.py b/evoformer/kernel.py
new file mode 100644
index 000000000000..2655901a2fe9
--- /dev/null
+++ b/evoformer/kernel.py
@@ -0,0 +1,19 @@
+import torch
+import torch.nn.functional as F
+
+
+def bias_sigmod_ele(y, bias, z):  # multiply z element-wise by sigmoid(y + bias)
+    return torch.sigmoid(y + bias) * z
+
+
+def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor,
+                     residual: torch.Tensor, prob: float) -> torch.Tensor:  # residual + (x + bias) * dropout(dropmask)
+    out = (x + bias) * F.dropout(dropmask, p=prob, training=True)  # training=True is hard-coded, so dropout is always applied to the mask
+    out = residual + out
+    return out
+
+
+def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor,
+                              dropout_mask: torch.Tensor, Z_raw: torch.Tensor,
+                              prob: float) -> torch.Tensor:  # Z_raw + dropout(dropout_mask) * (g * (ab + b))
+    return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b))  # training=True is hard-coded, so dropout is always applied to the mask
\ No newline at end of file
diff --git a/evoformer/msa.py b/evoformer/msa.py
new file mode 100644
index 000000000000..ccefa38c48be
--- /dev/null
+++ b/evoformer/msa.py
@@ -0,0 +1,95 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.nn import LayerNorm
+
+from .kernel import bias_dropout_add
+from .ops import SelfAttention, Transition
+
+
+class MSARowAttentionWithPairBias(nn.Module):
+    # Self-attention over layer-normalized M_raw with a bias projected from Z, then bias + dropout mask + residual add.
+    def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15):
+        super(MSARowAttentionWithPairBias, self).__init__()
+        self.d_node = d_node
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+
+        self.layernormM = LayerNorm(d_node)
+        self.layernormZ = LayerNorm(d_pair)
+        # weight of the bias-free projection d_pair -> n_head used in forward(); normal init with std = 1 / sqrt(d_pair)
+        _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True)
+
+        self.attention = SelfAttention(qkv_dim=d_node,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_node,
+                                       gating=True,
+                                       last_bias_fuse=True)
+        # output bias, added in forward() through bias_dropout_add; NOTE(review): presumably pairs with last_bias_fuse=True above - confirm in SelfAttention
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True)
+
+    def forward(self, M_raw, Z):  # returns M_raw + (attention output + out_bias) * dropout mask
+        ## Input projections
+        M = self.layernormM(M_raw)
+        Z = self.layernormZ(Z)
+        b = F.linear(Z, self.linear_b_weights)  # projects the last dimension of Z from d_pair to n_head (no bias term)
+        b = b.permute(0, 3, 1, 2)  # moves the last (head) dimension to position 1; b is 4-D here
+        # b = rearrange(b, 'b q k h -> b h q k')
+
+        M = self.attention(M, b)
+        dropout_mask = torch.ones_like(M[:, 0:1, :, :], device=M.device, dtype=M.dtype)  # all-ones mask with size 1 on dim 1, so it broadcasts along that dimension
+
+        return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop)
+
+
+class MSAColumnAttention(nn.Module):
+    # Self-attention applied after swapping dims -2 and -3 of the input, with the result added back to the input.
+    def __init__(self, d_node, c=32, n_head=8):
+        super(MSAColumnAttention, self).__init__()
+        self.d_node = d_node
+        self.c = c
+        self.n_head = n_head
+
+        self.layernormM = LayerNorm(d_node)
+        self.attention = SelfAttention(qkv_dim=d_node,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_node,
+                                       gating=True)
+
+    def forward(self, M_raw):  # returns M_raw + attention output
+        M = M_raw.transpose(-2, -3)  # swap dims -2 and -3 before normalization and attention
+        M = self.layernormM(M)
+
+        M = self.attention(M)
+
+        M = M.transpose(-2, -3)  # swap back so the layout matches M_raw for the residual add
+        return M_raw + M
+
+
+class MSAStack(nn.Module):
+    # Applies MSARowAttentionWithPairBias, MSAColumnAttention and Transition to the node features, in that order.
+    def __init__(self, d_node, d_pair, p_drop=0.15):
+        super(MSAStack, self).__init__()
+
+        self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node,
+                                                                       d_pair=d_pair,
+                                                                       p_drop=p_drop)
+
+        self.MSAColumnAttention = MSAColumnAttention(d_node=d_node)
+        self.MSATransition = Transition(d=d_node)
+
+    def forward(self, node, pair):  # returns the updated node features only
+        node = self.MSARowAttentionWithPairBias(node, pair)  # the only sub-module that receives pair
+        node = self.MSAColumnAttention(node)
+        node = self.MSATransition(node)
+
+        return node
diff --git a/evoformer/ops.py b/evoformer/ops.py
new file mode 100755
index 000000000000..ddbba441dd5f
--- /dev/null
+++ b/evoformer/ops.py
@@ -0,0 +1,176 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.nn import LayerNorm
+
+from .initializer import glorot_uniform_af
+from .kernel import bias_sigmod_ele
+
+
+class DropoutRowwise(nn.Module):
+    """Dropout on a 4-D tensor using a mask of size 1 along dim 1, broadcast across that dim."""
+    def __init__(self, p):
+        super(DropoutRowwise, self).__init__()
+        self.p = p
+        self.dropout = nn.Dropout(p=p)
+
+    def forward(self, x):
+        dropout_mask = torch.ones_like(x[:, 0:1, :, :])    # all-ones mask, size 1 along dim 1
+        dropout_mask = self.dropout(dropout_mask)    # nn.Dropout is applied to the mask, not to x
+        return dropout_mask * x    # broadcasting reuses the same mask along dim 1
+
+
+class DropoutColumnwise(nn.Module):
+    """Dropout on a 4-D tensor using a mask of size 1 along dim 2, broadcast across that dim."""
+    def __init__(self, p):
+        super(DropoutColumnwise, self).__init__()
+        self.p = p
+        self.dropout = nn.Dropout(p=p)
+
+    def forward(self, x):
+        dropout_mask = torch.ones_like(x[:, :, 0:1, :])    # all-ones mask, size 1 along dim 2
+        dropout_mask = self.dropout(dropout_mask)    # nn.Dropout is applied to the mask, not to x
+        return dropout_mask * x    # broadcasting reuses the same mask along dim 2
+
+
+class Transition(nn.Module):
+    """Residual block: LayerNorm -> Linear(d, n * d) -> ReLU -> Linear(n * d, d)."""
+    def __init__(self, d, n=4):
+        super(Transition, self).__init__()
+        self.norm = LayerNorm(d)
+        self.linear1 = Linear(d, n * d, initializer='relu')    # widen by a factor of n
+        self.linear2 = Linear(n * d, d, initializer='zeros')    # project back to d; weight starts at zero
+
+    def forward(self, src):
+        x = self.norm(src)
+        x = self.linear2(F.relu(self.linear1(x)))
+        return src + x    # residual connection around the whole block
+
+
+class OutProductMean(nn.Module):
+    """Outer product of two projections of M, contracted over einsum index `s`, mapped to n_feat_out."""
+    def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32):
+        super(OutProductMean, self).__init__()
+
+        self.layernormM = LayerNorm(n_feat)
+        self.linear_a = Linear(n_feat, n_feat_proj)
+        self.linear_b = Linear(n_feat, n_feat_proj)
+        # Input width is n_feat_proj ** 2 because forward flattens the (d, e) outer-product dims.
+        self.o_linear = Linear(n_feat_proj * n_feat_proj,
+                               n_feat_out,
+                               initializer='zero',    # NOTE(review): spelled 'zeros' elsewhere - confirm Linear treats both alike
+                               use_bias=True)
+
+    def forward(self, M):
+        M = self.layernormM(M)
+        left_act = self.linear_a(M)
+        right_act = self.linear_b(M)
+        # NOTE(review): the einsum sums over `s`; no division by its length is visible here.
+        O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
+        # O = rearrange(O, 'b i j d e -> b i j (d e)')
+        O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1)
+        Z = self.o_linear(O)
+
+        return Z
+
+
+class Linear(nn.Linear):
+    """
+    nn.Linear with a selectable weight initializer (the ones in 1.11.4) and a bias fill.
+    initializer: 'linear' / 'relu' -> glorot_uniform_af with gain 1.0 / 2.0;
+    'zeros' (or the alias 'zero' used by some callers) -> all-zero weight;
+    any other value keeps nn.Linear's default weight init.
+    bias_init: constant written into the bias when use_bias is True.
+    """
+    def __init__(
+        self,
+        feature_in: int,
+        feature_out: int,
+        initializer: str = 'linear',
+        use_bias: bool = True,
+        bias_init: float = 0.,
+    ):
+        super(Linear, self).__init__(feature_in, feature_out, bias=use_bias)
+        # The parent has already run its default init; the branches below overwrite the weight.
+        self.use_bias = use_bias
+        if initializer == 'linear':
+            glorot_uniform_af(self.weight, gain=1.0)
+        elif initializer == 'relu':
+            glorot_uniform_af(self.weight, gain=2.0)
+        elif initializer in ('zero', 'zeros'):    # accept both spellings so 'zero' is not silently ignored
+            nn.init.zeros_(self.weight)
+        if self.use_bias:
+            with torch.no_grad():
+                self.bias.fill_(bias_init)
+
+
+class SelfAttention(nn.Module):
+    """
+    Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors.
+    c is the per-head width; with gating the result also goes through bias_sigmod_ele.
+    """
+    def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False):
+        super(SelfAttention, self).__init__()
+        self.qkv_dim = qkv_dim
+        self.c = c
+        self.n_head = n_head
+        self.out_dim = out_dim
+        self.gating = gating
+        self.last_bias_fuse = last_bias_fuse
+        # Applied to q in forward, giving logits of q.k^T / sqrt(c).
+        self.scaling = self.c**(-0.5)
+
+        # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear')
+        self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+        self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+        self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+
+        if gating:
+            self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,)))
+            self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False)
+        # With last_bias_fuse the output bias is left out here so the caller can add its own.
+        self.o_linear = Linear(n_head * c,
+                               out_dim,
+                               initializer='zero',
+                               use_bias=(not last_bias_fuse))
+
+    def forward(self, in_data, nonbatched_bias=None):
+        """
+        :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim]
+        :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv]
+        :return: [batch_size1, batch_size2, len_qkv, out_dim]
+        """
+
+        # qkv = self.to_qkv(in_data).chunk(3, dim=-1)
+        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv)
+
+        q = self.to_q(in_data)
+        k = self.to_k(in_data)
+        v = self.to_v(in_data)    # values use their own projection (was mistakenly self.to_k)
+
+        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head),
+        #               [q, k, v])
+        q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4),
+                      [q, k, v])
+        # q, k, v are now [b1, b2, h, n, d]; scale q before the dot product.
+        q = q * self.scaling
+
+        logits = torch.matmul(q, k.transpose(-1, -2))
+
+        if nonbatched_bias is not None:
+            logits += nonbatched_bias.unsqueeze(1)    # new dim 1 broadcasts over batch_size2
+        weights = torch.softmax(logits, dim=-1)
+        # weights = softmax(logits)
+
+        weighted_avg = torch.matmul(weights, v)
+        # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)')
+        weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4)
+        weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1)
+
+        if self.gating:
+            gate_values = self.gating_linear(in_data)
+            weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg)
+
+        output = self.o_linear(weighted_avg)
+        return output
diff --git a/evoformer/triangle.py b/evoformer/triangle.py
new file mode 100644
index 000000000000..7db0482f5557
--- /dev/null
+++ b/evoformer/triangle.py
@@ -0,0 +1,192 @@
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn import LayerNorm
+
+from .kernel import bias_dropout_add, bias_ele_dropout_residual
+from .ops import Linear, SelfAttention, Transition
+
+
+def permute_final_dims(tensor, inds):    # permute only the last len(inds) dims of `tensor`, in the order given by `inds`
+    zero_index = -1 * len(inds)    # negative index of the first trailing dim being permuted
+    first_inds = list(range(len(tensor.shape[:zero_index])))    # leading dims keep their positions
+    return tensor.permute(first_inds + [zero_index + i for i in inds])
+
+
+class TriangleMultiplicationOutgoing(nn.Module):
+    """Gated multiplicative update of Z built from entries (i, k) and (j, k), summed over k."""
+    def __init__(self, d_pair, p_drop, c=128):
+        super(TriangleMultiplicationOutgoing, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+
+        self.layernorm1 = LayerNorm(d_pair)
+        self.left_projection = Linear(d_pair, c)
+        self.right_projection = Linear(d_pair, c)
+        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        # The einsum result has c channels, so normalise over c and project c -> d_pair.
+        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
+        self.layernorm2 = LayerNorm(c)
+        self.output_projection = Linear(c, d_pair, initializer='zeros', use_bias=False)    # was (d_pair, d_pair): broke when c != d_pair
+        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+        self.p_drop = p_drop
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        left_proj_act = self.left_projection(Z)
+        right_proj_act = self.right_projection(Z)
+
+        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
+        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
+
+        g = torch.sigmoid(self.output_gate(Z))
+        # p = torch.matmul(
+        #     permute_final_dims(left_proj_act, (2, 0, 1)),
+        #     permute_final_dims(right_proj_act, (2, 1, 0)),
+        # )
+        # ab = permute_final_dims(p, (1, 2, 0))
+
+        ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act)
+        ab = self.output_projection(self.layernorm2(ab))
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype)
+        return bias_ele_dropout_residual(ab,
+                                         self.output_bias,
+                                         g,
+                                         dropout_mask,
+                                         Z_raw,
+                                         prob=self.p_drop)
+
+
+class TriangleMultiplicationIncoming(nn.Module):
+    """Gated multiplicative update of Z built from entries (k, i) and (k, j), summed over k."""
+    def __init__(self, d_pair, p_drop, c=128):
+        super(TriangleMultiplicationIncoming, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+
+        self.layernorm1 = LayerNorm(d_pair)
+        self.left_projection = Linear(d_pair, c)
+        self.right_projection = Linear(d_pair, c)
+        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        # The einsum result has c channels, so normalise over c and project c -> d_pair.
+        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
+        self.layernorm2 = LayerNorm(c)
+        self.output_projection = Linear(c, d_pair, initializer='zeros', use_bias=False)    # was (d_pair, d_pair): broke when c != d_pair
+        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+        self.p_drop = p_drop
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        left_proj_act = self.left_projection(Z)
+        right_proj_act = self.right_projection(Z)
+
+        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
+        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
+
+        g = torch.sigmoid(self.output_gate(Z))
+        # p = torch.matmul(
+        #     permute_final_dims(left_proj_act, (2, 1, 0)),
+        #     permute_final_dims(right_proj_act, (2, 0, 1)),
+        # )
+        # ab = permute_final_dims(p, (1, 2, 0))
+
+        ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act)
+        ab = self.output_projection(self.layernorm2(ab))
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype)
+        return bias_ele_dropout_residual(ab,
+                                         self.output_bias,
+                                         g,
+                                         dropout_mask,
+                                         Z_raw,
+                                         prob=self.p_drop)
+
+
+class TriangleAttentionStartingNode(nn.Module):
+    """Gated self-attention over the normalised pair tensor, biased by a projection of that same tensor."""
+    def __init__(self, d_pair, p_drop, c=32, n_head=4):
+        super(TriangleAttentionStartingNode, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+        # linear_b_weights ([d_pair, n_head]) turns the pair channels into one attention bias per head.
+        self.layernorm1 = LayerNorm(d_pair)
+        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
+        self.attention = SelfAttention(qkv_dim=d_pair,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_pair,
+                                       gating=True,
+                                       last_bias_fuse=True)
+        # last_bias_fuse=True builds the attention output layer without a bias; out_bias is passed to bias_dropout_add instead.
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)    # per-head bias, [b, h, q, k]
+
+        Z = self.attention(Z, b)
+
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype)
+        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
+
+
+class TriangleAttentionEndingNode(nn.Module):
+    """Same attention as the starting-node variant, run on the pair tensor with dims -2 and -3 swapped."""
+    def __init__(self, d_pair, p_drop, c=32, n_head=4):
+        super(TriangleAttentionEndingNode, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+        # linear_b_weights ([d_pair, n_head]) turns the pair channels into one attention bias per head.
+        self.layernorm1 = LayerNorm(d_pair)
+        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
+        self.attention = SelfAttention(qkv_dim=d_pair,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_pair,
+                                       gating=True,
+                                       last_bias_fuse=True)
+        # last_bias_fuse=True builds the attention output layer without a bias; out_bias is passed to bias_dropout_add instead.
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+    def forward(self, Z_raw):
+        Z = Z_raw.transpose(-2, -3)    # swap the two middle dims; swapped back after attention
+        Z = self.layernorm1(Z)
+        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)    # per-head bias, [b, h, q, k]
+
+        Z = self.attention(Z, b)
+
+        Z = Z.transpose(-2, -3)
+        dropout_mask = torch.ones_like(Z[:, :, 0:1, :], device=Z.device, dtype=Z.dtype)
+        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
+
+
+class PairStack(nn.Module):
+    """Chains the two triangle multiplications, the two triangle attentions and a transition over `pair`."""
+    def __init__(self, d_pair, p_drop=0.25):
+        super(PairStack, self).__init__()
+        # The four triangle blocks share p_drop; the transition takes no dropout argument.
+        self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop)
+        self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop)
+        self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop)
+        self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop)
+        self.PairTransition = Transition(d=d_pair)
+
+    def forward(self, pair):
+        pair = self.TriangleMultiplicationOutgoing(pair)    # each stage consumes the previous stage's output
+        pair = self.TriangleMultiplicationIncoming(pair)
+        pair = self.TriangleAttentionStartingNode(pair)
+        pair = self.TriangleAttentionEndingNode(pair)
+        pair = self.PairTransition(pair)
+        return pair

From 820ea4d056e4ca943ca1d143325fb582128a1b96 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Wed, 2 Nov 2022 15:49:25 +0800
Subject: [PATCH 005/503] align evoformer

---
 chunk_codegen.py       | 143 ++++++-----------------------------------
 chunk_codegen_run.py   |  97 ++++++++++------------------
 evoformer/evoformer.py |   7 +-
 evoformer/kernel.py    |   2 +-
 evoformer/msa.py       |   2 +-
 evoformer/triangle.py  |   8 +--
 6 files changed, 67 insertions(+), 192 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index c605e35f4725..cb2a3a8a90ee 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1,5 +1,6 @@
 import colossalai
 import torch
+import copy
 from typing import List, Callable, Any, Tuple, Dict, Iterable
 
 try:
@@ -17,74 +18,18 @@
     __all__ = ['python_code_with_activation_checkpoint']
 
 
-def _gen_saved_tensors_hooks():
-    """
-    Generate saved tensors hooks
-    """
-
-    pack_hook = """def pack_hook_input(self, x):
-    if getattr(x, "offload", False):
-        return (x.device, x.cpu())
-    else:
-        return x
- 
-def pack_hook_no_input(self, x):
-    if getattr(x, "offload", True):
-        return (x.device, x.cpu())
-    else:
-        return x
-"""
-
-    unpack_hook = """def unpack_hook(self, packed):
-    if isinstance(packed, tuple):
-        device, tensor = packed
-        return tensor.to(device)
-    else:
-        return packed
-"""
-
-    return pack_hook, unpack_hook
-
-
-def _gen_loop_5(to_keep):
-    context = "chunk_result = []\nfor gen_loop_idx in range(4):\n"
-    context += "    chunk_tensor = " + to_keep + "[gen_loop_idx, :]\n"
+def _gen_loop_start(to_keep, chunk_size=2):    # to_keep: name (str) of the variable to slice along dim 0; returns the loop-header source text
+    context = "chunk_result = []; chunk_size = %d\nfor gen_loop_idx in range(0, %s.shape[0], chunk_size):\n" % (chunk_size, to_keep)    # use the full name; to_keep[0] was only its first character
+    context += "    chunk_tensor = " + to_keep + "[gen_loop_idx:gen_loop_idx + chunk_size, :]\n"
     return context
 
 
-def _gen_loop_5_final(final_name, to_keep):
+def _gen_loop_end(final_name, to_keep):
     context = "    chunk_result.append(" + final_name + ")\n"
     context += "chunk_result = torch.cat(chunk_result, dim=0);  " + to_keep[0] + " = None\n"
     context += final_name + " = chunk_result; chunk_result = None\n"
     return context
 
-    
-def _gen_save_tensors_hooks_context(offload_input=True) -> str:
-    """Generate customized saved_tensors_hooks
-
-    Args:
-        offload_input (bool, optional): whether we need offload input, if offload_input=False, 
-        we will use self.pack_hook_no_input instead. Defaults to True.
-
-    Returns:
-        str: generated context
-    """
-
-    if offload_input:
-        context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):\n"
-    else:
-        context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):\n"
-    return context
-
-
-def _gen_save_on_cpu_context():
-    """
-    Generate save on cpu context
-    """
-
-    context = "with torch.autograd.graph.save_on_cpu(pin_memory=True):\n"
-    return context
-
 
 def _find_input_and_output_nodes(nodes: List[Node]):
     """
@@ -112,49 +57,6 @@ def _find_input_and_output_nodes(nodes: List[Node]):
     return input_nodes, output_nodes
 
 
-def _find_ckpt_regions(nodes: List[Node]):
-    """
-    Find the checkpoint regions given a list of consecutive nodes. The outputs will be list
-    of tuples, each tuple is in the form of (start_index, end_index).
-    """
-    ckpt_nodes = []
-    ckpt_regions = []
-    start = -1
-    end = -1
-    current_region = None
-
-    for idx, node in enumerate(nodes):
-        if hasattr(node, 'activation_checkpoint'):
-            act_ckpt_label = node.activation_checkpoint
-
-            # this activation checkpoint label is not set yet
-            # meaning this is the first node of the activation ckpt region
-            if current_region is None:
-                current_region = act_ckpt_label
-                start = idx
-
-            # if activation checkpoint has changed
-            # we restart the tracking
-            # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2]
-            if act_ckpt_label != current_region:
-                assert start != -1
-                ckpt_regions.append((start, idx - 1))
-                current_region = act_ckpt_label
-                start = idx
-                end = -1
-        elif current_region is not None and not hasattr(node, 'activation_checkpoint'):
-            # used to check the case below
-            # node ckpt states = [ckpt, ckpt, non-ckpt]
-            end = idx - 1
-            assert start != -1 and end != -1
-            ckpt_regions.append((start, end))
-            start = end = -1
-            current_region = None
-        else:
-            pass
-    return ckpt_regions
-
-
 def _find_offload_regions(nodes: List[Node]):
     """This function is to find the offload regions
     In pofo algorithm, during annotation, we will annotate the offload region with the 
@@ -400,12 +302,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
         emit_node_func: function to emit node
         delete_unused_value_func: function to remove the unused value
     """
-    ckpt_regions = _find_nested_ckpt_regions(nodes, 0)
-    start_idx = [item[0] for item in ckpt_regions]
-    end_idx = [item[1] for item in ckpt_regions]
 
     # find the offload regions
-    chunk_regions, chunk_labels = _find_offload_regions(nodes)
+    chunk_regions = [(1, 4)]
     chunk_starts = [item[0] for item in chunk_regions]
     chunk_ends = [item[1] for item in chunk_regions]
     chunk_inputs = []
@@ -424,7 +323,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     # this flag is to prevent repeated insert of save tensors
     # hooks definition in ckpt_func
     node_idx = 0
-    to_keep = []
+    chunk_var = []
     while node_idx < len(node_list):
         # break if we finish the processing all the nodes
         if node_idx >= len(node_list):
@@ -435,28 +334,30 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
             node = node_list[node_idx]
 
             if node_idx in chunk_starts:
-                # save chunk input var, dont delete it
-                to_keep.extend(node.args[0].name)
                 within_chunk_region = True
-                # add for loop
-                body.append(_gen_loop_5(to_keep[0]))
-                # change first node's input to new chunked var
-                node_args = list(node.args)
-                node_args[0] = 'chunk_tensor'
 
+                # save chunk input var, dont delete it
+                chunk_var.append(node.args[0].name)
+                
+                # add for loop
+                body.append(_gen_loop_start(chunk_var[0]))
+                
             if within_chunk_region:
                 emit_node_func(node, body)
+                # replace input var with chunk var
+                if node_idx in chunk_starts:
+                    body[-1] = body[-1].replace("("+ chunk_var[0] +")", '(chunk_tensor)')
                 body[-1] = '    ' + body[-1]
-                delete_unused_value_func(node, body, to_keep)
+                delete_unused_value_func(node, body, chunk_var)
 
             else:
                 emit_node_func(node, body)
                 if node_idx not in chunk_inputs:
-                    delete_unused_value_func(node, body, to_keep)
+                    delete_unused_value_func(node, body, chunk_var)
 
             if node_idx in chunk_ends:
-                body.append(_gen_loop_5_final(node.name, to_keep))
-                to_keep = []
+                body.append(_gen_loop_end(node.name, chunk_var))
+                chunk_var = []
                 within_chunk_region = False
 
             node_idx += 1
@@ -580,9 +481,7 @@ def delete_unused_values(user: Node, body, to_keep=[]):
                     body.append('\n')
                     return
                 nodes_to_delete = user_to_last_uses.get(user, [])
-                for n in nodes_to_delete:
-                    if n.name in to_keep:
-                        nodes_to_delete.remove(n)
+                nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
                 if len(nodes_to_delete):
                     to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
                     body.append(f';  {to_delete_str}\n')
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 69b327d4bd5b..7667fa691558 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -9,60 +9,39 @@
 from colossalai.utils import free_port
 from colossalai.core import global_context as gpc
 from colossalai.fx.graph_module import ColoGraphModule
-
-try:
-    from chunk_codegen import ChunkCodeGen
-    with_codegen = True
-except:
-    # fall back to older pytorch version
-    from chunk_codegen import python_code_with_activation_checkpoint
-    with_codegen = False
-
-
-class MyNet(torch.nn.Module):
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.linear0 = torch.nn.Linear(4, 4)
-        self.linear1 = torch.nn.Linear(4, 4)
-        self.linear2 = torch.nn.Linear(4, 4)
-        self.linear3 = torch.nn.Linear(4, 4)
-        self.linear4 = torch.nn.Linear(4, 4)
-        self.linear5 = torch.nn.Linear(4, 4)
-        self.linear6 = torch.nn.Linear(4, 4)
-
-    def forward(self, x):
-        x = self.linear0(x)
-        x = self.linear1(x)
-        x = self.linear2(x)
-        x = self.linear3(x)
-        x = self.linear4(x)
-        x = self.linear5(x)
-        x = self.linear6(x)
-        return x
+from evoformer.evoformer import evoformer_base
+from chunk_codegen import ChunkCodeGen
+with_codegen = True
 
 
 def _is_all_gradient_close(m: torch.nn.Module, gm: GraphModule) -> bool:
     for m_p, gm_p in zip(m.parameters(), gm.parameters()):
-        if not torch.allclose(m_p.grad, gm_p.grad):
+        if m_p.grad is not None and not torch.allclose(m_p.grad, gm_p.grad):
             return False
     return True
 
 
-def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.Tensor):
+def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool:
+    for m_p, gm_p in zip(m.parameters(), gm.parameters()):
+        if m_p.grad is not None and not torch.allclose(m_p.data, gm_p.data):
+            return False
+    return True
+
 
+def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     # test forward
-    non_fx_out = model(data)
-    fx_out = gm(data)
-    print(non_fx_out.shape, fx_out.shape)
-    assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output"
+    non_fx_out = model(node.clone(), pair.clone())
+    fx_out = gm(node.clone(), pair.clone())
+    assert torch.equal(non_fx_out[0], fx_out[0]), "fx_out doesn't comply with original output"
+    assert torch.equal(non_fx_out[1], fx_out[1]), "fx_out doesn't comply with original output"
 
     # test barckward
-    loss0 = non_fx_out.sum()
-    loss0.backward()
-    loss1 = fx_out.sum()
-    loss1.backward()
-    assert _is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one"
+    # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum()
+    # loss0.backward()
+    # loss1 = fx_out[0].sum() + fx_out[1].sum()
+    # loss1.backward()
+    # assert _is_all_param_close(model, gm)
+    # assert _is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one"
 
 
 def _run_offload_codegen(rank):
@@ -70,30 +49,22 @@ def _run_offload_codegen(rank):
     colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl')
 
     # build model and input
-    model = MyNet().cuda()
-    data = torch.rand(4, 4).cuda()
+    model = evoformer_base().cuda()
+    node = torch.randn(1, 16, 32, 256).cuda()
+    pair = torch.randn(1, 32, 32, 128).cuda()
 
     # trace the module and replace codegen
     tracer = ColoTracer(trace_act_ckpt=True)
     graph = tracer.trace(model)
-    codegen = ChunkCodeGen()
-    graph.set_codegen(codegen)
-
-    # annotate the activation offload part
-    # also annotate the activation_checkpoint so we could test both types
-    # of input offload
-    for node in graph.nodes:
-        if node.name == "linear0":
-            setattr(node, "activation_offload", [0, True, False])
-        if node.name == "linear1":
-            setattr(node, "activation_offload", [0, True, False])
-        # if node.name == "linear2":
-        #     setattr(node, "activation_offload", [1, True, True])
-        # if node.name == "linear4":
-        #     setattr(node, "activation_offload", [2, False, True])
-        # if node.name == "linear5":
-        #     setattr(node, "activation_checkpoint", [0])
-        #     setattr(node, "activation_offload", True)
+    # codegen = ChunkCodeGen()
+    # graph.set_codegen(codegen)
+
+    # annotate the chunk part
+    # for node in graph.nodes:
+    #     if node.name == "linear0":
+    #         setattr(node, "activation_offload", [0, True, False])
+    #     if node.name == "linear1":
+    #         setattr(node, "activation_offload", [0, True, False])
 
     gm = ColoGraphModule(copy.deepcopy(model), graph)
     gm.recompile()
@@ -102,7 +73,7 @@ def _run_offload_codegen(rank):
     code = graph.python_code("self").src
     print(code)
 
-    _test_fwd_and_bwd(model, gm, data)
+    _test_fwd_and_bwd(model, gm, node, pair)
     gpc.destroy()
 
 
diff --git a/evoformer/evoformer.py b/evoformer/evoformer.py
index ef3df2769840..0c5ab952a779 100644
--- a/evoformer/evoformer.py
+++ b/evoformer/evoformer.py
@@ -28,7 +28,7 @@ def __init__(self, d_node, d_pair):
         super(Evoformer, self).__init__()
 
         self.blocks = nn.ModuleList()
-        for _ in range(3):
+        for _ in range(1):
             self.blocks.append(EvoformerBlock(d_node, d_pair))
 
     def forward(self, node, pair):
@@ -36,6 +36,11 @@ def forward(self, node, pair):
             node, pair = b(node, pair)
         return node, pair
 
+
+def evoformer_tiny():
+    return Evoformer(d_node=64, d_pair=32)
+
+
 def evoformer_base():
     return Evoformer(d_node=256, d_pair=128)
 
diff --git a/evoformer/kernel.py b/evoformer/kernel.py
index 2655901a2fe9..26ab5dc53261 100644
--- a/evoformer/kernel.py
+++ b/evoformer/kernel.py
@@ -8,7 +8,7 @@ def bias_sigmod_ele(y, bias, z):
 
 def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor,
                      residual: torch.Tensor, prob: float) -> torch.Tensor:
-    out = (x + bias) * F.dropout(dropmask, p=prob, training=True)
+    out = (x + bias) * F.dropout(dropmask, p=prob, training=False)
     out = residual + out
     return out
 
diff --git a/evoformer/msa.py b/evoformer/msa.py
index ccefa38c48be..cac456638a55 100644
--- a/evoformer/msa.py
+++ b/evoformer/msa.py
@@ -45,7 +45,7 @@ def forward(self, M_raw, Z):
         # b = rearrange(b, 'b q k h -> b h q k')
 
         M = self.attention(M, b)
-        dropout_mask = torch.ones_like(M[:, 0:1, :, :], device=M.device, dtype=M.dtype)
+        dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype)
 
         return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop)
 
diff --git a/evoformer/triangle.py b/evoformer/triangle.py
index 7db0482f5557..f479469c3836 100644
--- a/evoformer/triangle.py
+++ b/evoformer/triangle.py
@@ -51,7 +51,7 @@ def forward(self, Z_raw):
 
         ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act)
         ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype)
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
         return bias_ele_dropout_residual(ab,
                                          self.output_bias,
                                          g,
@@ -97,7 +97,7 @@ def forward(self, Z_raw):
 
         ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act)
         ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype)
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
         return bias_ele_dropout_residual(ab,
                                          self.output_bias,
                                          g,
@@ -134,7 +134,7 @@ def forward(self, Z_raw):
 
         Z = self.attention(Z, b)
 
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype)
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
         return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
 
 
@@ -168,7 +168,7 @@ def forward(self, Z_raw):
         Z = self.attention(Z, b)
 
         Z = Z.transpose(-2, -3)
-        dropout_mask = torch.ones_like(Z[:, :, 0:1, :], device=Z.device, dtype=Z.dtype)
+        dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype)
         return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
 
 
From f8aeecef46461ff574f51982d03310fa8c57888e Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 3 Nov 2022 14:33:35 +0800
Subject: [PATCH 006/503] add meta

---
 chunk_codegen.py     |  3 +++
 chunk_codegen_run.py | 13 +++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index cb2a3a8a90ee..1f336eb2bf35 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -366,6 +366,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
 if CODEGEN_AVAILABLE:
 
     class ChunkCodeGen(CodeGen):
+        def __init__(self, meta_graph):
+            super().__init__()
+            self.meta_node = list(meta_graph.graph.nodes)
 
         def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
             free_vars: List[str] = []
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 7667fa691558..b875b6308f55 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -9,6 +9,8 @@
 from colossalai.utils import free_port
 from colossalai.core import global_context as gpc
 from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata
+from colossalai.fx.profiler import MetaTensor
 from evoformer.evoformer import evoformer_base
 from chunk_codegen import ChunkCodeGen
 with_codegen = True
@@ -56,9 +58,10 @@ def _run_offload_codegen(rank):
     # trace the module and replace codegen
     tracer = ColoTracer(trace_act_ckpt=True)
     graph = tracer.trace(model)
-    # codegen = ChunkCodeGen()
-    # graph.set_codegen(codegen)
-
+    gm_prop = torch.fx.GraphModule(model, graph)
+    interp = MetaInfoProp(gm_prop)
+    interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0'))
+    
     # annotate the chunk part
     # for node in graph.nodes:
     #     if node.name == "linear0":
@@ -66,7 +69,9 @@ def _run_offload_codegen(rank):
     #     if node.name == "linear1":
     #         setattr(node, "activation_offload", [0, True, False])
 
-    gm = ColoGraphModule(copy.deepcopy(model), graph)
+    codegen = ChunkCodeGen(gm_prop)
+    # graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph)
     gm.recompile()
 
     # assert we have all the components

From c35718e8db5f3fbbb5749a2a0b5f4b46241a43b1 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 4 Nov 2022 11:18:09 +0800
Subject: [PATCH 007/503] basic chunk

---
 chunk_codegen.py     | 138 +++++++++++++++++++++++++++++--------------
 chunk_codegen_run.py |   2 +-
 2 files changed, 95 insertions(+), 45 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 1f336eb2bf35..1267f64cbbb2 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -18,16 +18,61 @@
     __all__ = ['python_code_with_activation_checkpoint']
 
 
-def _gen_loop_start(to_keep, chunk_size=2):
-    context = "chunk_result = []; chunk_size = %d\nfor gen_loop_idx in range(0, %s.shape[0], chunk_size):\n" % (chunk_size, to_keep[0])
-    context += "    chunk_tensor = " + to_keep + "[gen_loop_idx:gen_loop_idx + chunk_size, :]\n"
+def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
+    new_shape = "["
+    for idx, i in enumerate(shape):
+        if idx == chunk_dim:
+            new_shape += "%s:%s + chunk_size" % (chunk_idx_name, chunk_idx_name)
+        else:
+            new_shape += ":"
+        new_shape += ", "
+    new_shape = new_shape[:-2] + "]"
+    return new_shape
+
+
+def _get_first_non_single_dim(shape):
+    for idx, i in enumerate(shape):
+        if i == 1:
+            continue
+        else:
+            return idx
+    raise RuntimeError("can not get first non single dim for shape", shape)
+
+
+def _gen_loop_start(chunk_input_meta, chunk_output, chunk_size=2):
+    if len(chunk_input_meta) == 1:
+        node = chunk_input_meta[0]
+        node_shape = node.meta['tensor_meta'].shape
+        chunk_dim = _get_first_non_single_dim(node_shape)
+        chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape)
+        out_shape = str(list(chunk_output.meta['tensor_meta'].shape))
+        
+        context = "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" % (
+            out_shape, node.name, node.name, chunk_size)
+        context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim)
+        context += "    chunk_tensor = %s%s\n" % (node.name, chunk_slice)
+    else:
+        raise NotImplementedError("input with size %d not implemented" % len(chunk_input_meta))
     return context
 
 
-def _gen_loop_end(final_name, to_keep):
-    context = "    chunk_result.append(" + final_name + ")\n"
-    context += "chunk_result = torch.cat(chunk_result, dim=0);  " + to_keep[0] + " = None\n"
-    context += final_name + " = chunk_result; chunk_result = None\n"
+def _gen_loop_end(chunk_outputs, chunk_inputs, node_list):
+    chunk_inputs_name = chunk_inputs[0].name
+    chunk_outputs_name = chunk_outputs.name
+    chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list)
+    chunk_output_shape = chunk_outputs.meta['tensor_meta'].shape
+    chunk_dim = _get_first_non_single_dim(chunk_output_shape)
+    chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape)
+    context = "    chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name)
+
+    context += chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
+    
+    # determine if its the last use for chunk input
+    users_name = list(chunk_inputs[0].users.keys())
+    if all([_find_idx_by_name(user.name, node_list) <= chunk_outputs_idx for user in users_name]):
+        context += ";  %s = None" % chunk_inputs_name
+
+    context += "\n"
     return context
 
 
@@ -44,7 +89,7 @@ def _find_input_and_output_nodes(nodes: List[Node]):
         for input_node in node._input_nodes.keys():
             node_repr = repr(input_node)
             if input_node not in nodes and node_repr not in input_nodes:
-                input_nodes.append(node_repr)
+                input_nodes.append(input_node)
 
     # if a node has a user node which is not in the node list
     # we treat that user node as the node receiving the current node output
@@ -52,11 +97,18 @@ def _find_input_and_output_nodes(nodes: List[Node]):
         for output_node in node.users.keys():
             node_repr = repr(node)
             if output_node not in nodes and node_repr not in output_nodes:
-                output_nodes.append(node_repr)
+                output_nodes.append(output_node)
 
     return input_nodes, output_nodes
 
 
+def _find_idx_by_name(name, nodes_list):
+    for idx, node in enumerate(nodes_list):
+        if node.name == name:
+            return idx
+    raise RuntimeError("name %s not found in node list" % name)
+        
+
 def _find_offload_regions(nodes: List[Node]):
     """This function is to find the offload regions
     In pofo algorithm, during annotation, we will annotate the offload region with the 
@@ -290,7 +342,7 @@ def emit_ckpt_func(body,
             body.append(usage)
 
 
-def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
+def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes):
     """Emit code with nested activation checkpoint
     When we detect some of the node.activation_checkpoint is a List, we will use
     this function to emit the activation checkpoint codes.
@@ -304,7 +356,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     """
 
     # find the offload regions
-    chunk_regions = [(1, 4)]
+    chunk_regions = [(2, 5)]
     chunk_starts = [item[0] for item in chunk_regions]
     chunk_ends = [item[1] for item in chunk_regions]
     chunk_inputs = []
@@ -319,48 +371,46 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
         inputs, outputs = _find_input_and_output_nodes(offload_node_list)
         chunk_inputs.append(inputs)
         chunk_outputs.append(outputs)
-
+    chunk_inputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs]
+    chunk_outputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs]
+    chunk_inputs_names = []
+    for i in chunk_inputs:
+        for j in i:
+            chunk_inputs_names.append(j.name)
+    
     # this flag is to prevent repeated insert of save tensors
     # hooks definition in ckpt_func
     node_idx = 0
-    chunk_var = []
+    region_idx = 0
     while node_idx < len(node_list):
-        # break if we finish the processing all the nodes
-        if node_idx >= len(node_list):
-            break
+        node = node_list[node_idx]
 
-        # process node in forward function
-        else:
-            node = node_list[node_idx]
+        if node_idx in chunk_starts:
+            within_chunk_region = True
+                
+            # add for loop
+            chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]]
+            body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]]))
 
+        if within_chunk_region:
+            emit_node_func(node, body)
+            # replace input var with chunk var
             if node_idx in chunk_starts:
-                within_chunk_region = True
-
-                # save chunk input var, dont delete it
-                chunk_var.append(node.args[0].name)
-                
-                # add for loop
-                body.append(_gen_loop_start(chunk_var[0]))
-                
-            if within_chunk_region:
-                emit_node_func(node, body)
-                # replace input var with chunk var
-                if node_idx in chunk_starts:
-                    body[-1] = body[-1].replace("("+ chunk_var[0] +")", '(chunk_tensor)')
-                body[-1] = '    ' + body[-1]
-                delete_unused_value_func(node, body, chunk_var)
+                body[-1] = body[-1].replace("("+ chunk_inputs[region_idx][0].name +")", '(chunk_tensor)')
+            body[-1] = '    ' + body[-1]
+            delete_unused_value_func(node, body, chunk_inputs_names)
 
-            else:
-                emit_node_func(node, body)
-                if node_idx not in chunk_inputs:
-                    delete_unused_value_func(node, body, chunk_var)
+        else:
+            emit_node_func(node, body)
+            if node_idx not in chunk_inputs:
+                delete_unused_value_func(node, body, chunk_inputs_names)
 
-            if node_idx in chunk_ends:
-                body.append(_gen_loop_end(node.name, chunk_var))
-                chunk_var = []
-                within_chunk_region = False
+        if node_idx in chunk_ends:
+            body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list))
+            within_chunk_region = False
+            region_idx += 1
 
-            node_idx += 1
+        node_idx += 1
 
 
 if CODEGEN_AVAILABLE:
@@ -562,7 +612,7 @@ def emit_node(node: Node, body):
 
             # if any node has a list of labels for activation_checkpoint, we
             # will use nested type of activation checkpoint codegen
-            emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values)
+            emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node)
 
             if len(body) == 0:
                 # If the Graph has no non-placeholder nodes, no lines for the body
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index b875b6308f55..547b983a9c0c 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -70,7 +70,7 @@ def _run_offload_codegen(rank):
     #         setattr(node, "activation_offload", [0, True, False])
 
     codegen = ChunkCodeGen(gm_prop)
-    # graph.set_codegen(codegen)
+    graph.set_codegen(codegen)
     gm = ColoGraphModule(model, graph)
     gm.recompile()
 

From d95cfe26222427e483df7f23f4bb208cec6ae4c3 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 7 Nov 2022 18:26:13 +0800
Subject: [PATCH 008/503] basic memory

---
 chunk_codegen.py     | 83 ++++++++++++++++++++++++++++++++++++++++++--
 chunk_codegen_run.py | 20 +++++------
 2 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 1267f64cbbb2..4ca33a4d5914 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -6,6 +6,7 @@
 try:
     from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
     from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin
+    from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size
     CODEGEN_AVAILABLE = True
 except:
     from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, _origin_type_map, _format_args, _CustomBuiltin
@@ -18,6 +19,82 @@
     __all__ = ['python_code_with_activation_checkpoint']
 
 
+def _get_meta_node_size(x):
+    x = x.meta['tensor_meta']
+    x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
+    return x
+
+
+def _get_output_node_size(n):
+    fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')}
+    return activation_size(fwd_out)
+
+
+def _get_delete_node_size(user, user_to_last_uses):
+    if user.op in ('placeholder', 'output'):
+        return 0
+    nodes_to_delete = user_to_last_uses.get(user, [])
+    if len(nodes_to_delete):
+        delete_size = sum([_get_output_node_size(i) for i in nodes_to_delete])
+        return delete_size
+    return 0
+
+
+def _get_last_usr(nodes):
+    node_to_last_use: Dict[Node, Node] = {}
+    user_to_last_uses: Dict[Node, List[Node]] = {}
+
+    def register_last_uses(n: Node, user: Node):
+        if n not in node_to_last_use:
+            node_to_last_use[n] = user
+            user_to_last_uses.setdefault(user, []).append(n)
+
+    for node in reversed(nodes):
+        map_arg(node.args, lambda n: register_last_uses(n, node))
+        map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+    return user_to_last_uses
+
+
+def _estimate_inference_mem(gm: torch.fx.GraphModule):
+    act_memory = 0
+    act_memory_peak_log = []
+    act_memory_after_node_log = []
+    user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
+    for node in gm.graph.nodes:
+        # if node is placeholder, just add the size of the node
+        if node.op == 'placeholder':
+            act_memory += _get_meta_node_size(node)
+        # skip output
+        elif node.op == 'output':
+            continue
+        # node is an operation, calculate tmp, output node and delete node memory
+        else:
+            # forward memory
+            act_memory += calculate_fwd_tmp(node)
+            # act_memory += calculate_fwd_out(node)
+            act_memory += _get_output_node_size(node)
+            # record max act memory
+            act_memory_peak_log.append(act_memory)
+            # delete useless memory
+            act_memory -= calculate_fwd_tmp(node)
+            act_memory -= _get_delete_node_size(node, user_to_last_uses)
+            act_memory_after_node_log.append(act_memory)
+
+    act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log]
+    param_memory = parameter_size(gm)
+    return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2)
+
+
+def _estimate_chunk_forward_mem(gm: torch.fx.GraphModule, start_node, end_node, chunk_size):
+    node_size = 0
+    param_size = 0
+    for node in gm.graph.nodes:
+        node_size += calculate_fwd_tmp(node)
+        node_size += calculate_fwd_out(node)
+    param_size = parameter_size(gm)
+    return (node_size + param_size) / 1024**2, param_size / 1024**2
+
+
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
     new_shape = "["
     for idx, i in enumerate(shape):
@@ -342,7 +419,7 @@ def emit_ckpt_func(body,
             body.append(usage)
 
 
-def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes):
+def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph):
     """Emit code with nested activation checkpoint
     When we detect some of the node.activation_checkpoint is a List, we will use
     this function to emit the activation checkpoint codes.
@@ -364,6 +441,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     within_chunk_region = False
 
     node_list = list(nodes)
+    _estimate_inference_mem(meta_graph)
 
     # find the input and output var names for each offload region
     for idx, (start, end) in enumerate(chunk_regions):
@@ -418,6 +496,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     class ChunkCodeGen(CodeGen):
         def __init__(self, meta_graph):
             super().__init__()
+            self.meta_graph = meta_graph
             self.meta_node = list(meta_graph.graph.nodes)
 
         def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
@@ -612,7 +691,7 @@ def emit_node(node: Node, body):
 
             # if any node has a list of labels for activation_checkpoint, we
             # will use nested type of activation checkpoint codegen
-            emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node)
+            emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node, self.meta_graph)
 
             if len(body) == 0:
                 # If the Graph has no non-placeholder nodes, no lines for the body
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 547b983a9c0c..1ab7d958b0a9 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -2,6 +2,7 @@
 import torch
 import torch.nn.functional as F
 import pytest
+import torch.fx
 import torch.multiprocessing as mp
 from torch.fx import GraphModule
 from colossalai.fx import ColoTracer
@@ -56,18 +57,15 @@ def _run_offload_codegen(rank):
     pair = torch.randn(1, 32, 32, 128).cuda()
 
     # trace the module and replace codegen
-    tracer = ColoTracer(trace_act_ckpt=True)
-    graph = tracer.trace(model)
-    gm_prop = torch.fx.GraphModule(model, graph)
-    interp = MetaInfoProp(gm_prop)
+    graph = ColoTracer().trace(model, meta_args={'node': node.to(torch.device('meta')), 'pair': pair.to(torch.device('meta'))})
+    gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace
+    interp = MetaInfoProp(gm_prop) 
+    interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0'))
+
+    # now run it twice to get meta info in graph module, not necessary
+    gm = torch.fx.GraphModule(model, graph)
+    interp = MetaInfoProp(gm)
     interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0'))
-    
-    # annotate the chunk part
-    # for node in graph.nodes:
-    #     if node.name == "linear0":
-    #         setattr(node, "activation_offload", [0, True, False])
-    #     if node.name == "linear1":
-    #         setattr(node, "activation_offload", [0, True, False])
 
     codegen = ChunkCodeGen(gm_prop)
     graph.set_codegen(codegen)

From 12301dd2e9a1889fe76c6ab719aff1404e92aea0 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 8 Nov 2022 10:34:14 +0800
Subject: [PATCH 009/503] finish basic inference memory estimation

---
 chunk_codegen.py     | 11 +++++++++++
 chunk_codegen_run.py | 14 ++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 4ca33a4d5914..01b29cb33d43 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -64,6 +64,8 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule):
         # if node is placeholder, just add the size of the node
         if node.op == 'placeholder':
             act_memory += _get_meta_node_size(node)
+            act_memory_peak_log.append(act_memory)
+            act_memory_after_node_log.append(act_memory)
         # skip output
         elif node.op == 'output':
             continue
@@ -81,6 +83,15 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule):
             act_memory_after_node_log.append(act_memory)
 
     act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log]
+    act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log]
+
+    # for i in act_memory_peak_log:
+    #     print("%.2f " % i, end='')
+    # print("\n")
+    # for i in act_memory_after_node_log:
+    #     print("%.2f " % i, end='')
+    # print("\n")
+    
     param_memory = parameter_size(gm)
     return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2)
 
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 1ab7d958b0a9..cc975f2eaf84 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -32,9 +32,19 @@ def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool:
 
 
 def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
+    # now_mem = torch.cuda.memory_allocated() / 1024**2
+    # max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    # print("now:%.2f max:%.2f" %(torch.cuda.memory_allocated() / 1024**2, torch.cuda.max_memory_allocated() / 1024**2))
+    # with torch.no_grad():
+    #     fx_out = gm(node, pair)
+    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
+    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    # print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - max_mem))
+    
     # test forward
-    non_fx_out = model(node.clone(), pair.clone())
-    fx_out = gm(node.clone(), pair.clone())
+    with torch.no_grad():
+        non_fx_out = model(node, pair)
+        fx_out = gm(node, pair)
     assert torch.equal(non_fx_out[0], fx_out[0]), "fx_out doesn't comply with original output"
     assert torch.equal(non_fx_out[1], fx_out[1]), "fx_out doesn't comply with original output"
 

From 8cca684c5684ffb0ac0b68d63df3cbde848d3d08 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 8 Nov 2022 14:41:57 +0800
Subject: [PATCH 010/503] finish memory estimation

---
 chunk_codegen.py | 103 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 88 insertions(+), 15 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 01b29cb33d43..baf207795b60 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -85,25 +85,97 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule):
     act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log]
     act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log]
 
-    # for i in act_memory_peak_log:
-    #     print("%.2f " % i, end='')
-    # print("\n")
-    # for i in act_memory_after_node_log:
-    #     print("%.2f " % i, end='')
-    # print("\n")
+    print("no chunk")
+    _print_mem_log(act_memory_peak_log, "peak")
+    _print_mem_log(act_memory_after_node_log, "after")
     
     param_memory = parameter_size(gm)
     return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2)
 
 
-def _estimate_chunk_forward_mem(gm: torch.fx.GraphModule, start_node, end_node, chunk_size):
-    node_size = 0
-    param_size = 0
-    for node in gm.graph.nodes:
-        node_size += calculate_fwd_tmp(node)
-        node_size += calculate_fwd_out(node)
-    param_size = parameter_size(gm)
-    return (node_size + param_size) / 1024**2, param_size / 1024**2
+def _get_chunk_ratio(node, chunk_dim, chunk_size):
+    shape = node.meta['tensor_meta'].shape
+    chunk_ratio = float(chunk_size) / shape[chunk_dim]
+    return chunk_ratio
+
+
+def _get_chunk_delete_node_size(user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node):
+    if user.op in ('placeholder', 'output'):
+        return 0
+    nodes_to_delete = user_to_last_uses.get(user, [])
+    delete_size = 0
+    for n in nodes_to_delete:
+        node_idx = _find_idx_by_name(n.name, node_list)
+        if start_node <= node_idx < end_node:
+            delete_size += _get_output_node_size(n) * chunk_ratio
+    return delete_size
+
+
+def _print_mem_log(log, title=None):
+    if title:
+        print("%-8s" % title, end=' ')
+    for i in log:
+        print("%.2f " % i, end='')
+    print("")
+
+
+def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes):
+    act_memory = 0
+    act_memory_peak_log = []
+    act_memory_after_node_log = []
+    user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
+    within_chunk = False
+    region_idx = 0
+    chunk_ratio = 1 # use it to estimate chunk mem
+    node_list = list(gm.graph.nodes)
+
+    for idx, node in enumerate(node_list):
+        # if node in chunk start nodes, change chunk ratio and add chunk_tensor
+        if idx in start_nodes:
+            within_chunk = True
+            chunk_ratio = _get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx])
+            act_memory += _get_output_node_size(node_list[end_nodes[region_idx]])
+            
+        # if node is placeholder, just add the size of the node
+        if node.op == 'placeholder':
+            act_memory += _get_meta_node_size(node) * chunk_ratio
+            act_memory_peak_log.append(act_memory)
+        # skip output
+        elif node.op == 'output':
+            continue
+        # node is an operation, calculate tmp, output node and delete node memory
+        else:
+            # forward memory
+            act_memory += calculate_fwd_tmp(node) * chunk_ratio
+            # act_memory += calculate_fwd_out(node)
+            act_memory += _get_output_node_size(node) * chunk_ratio
+            # record max act memory
+            act_memory_peak_log.append(act_memory)
+            # delete useless memory
+            act_memory -= calculate_fwd_tmp(node) * chunk_ratio
+            if within_chunk:
+                act_memory -= _get_chunk_delete_node_size(
+                    node, user_to_last_uses, chunk_ratio, node_list, start_nodes[region_idx], end_nodes[region_idx])
+            else:
+                act_memory -= _get_delete_node_size(node, user_to_last_uses)
+            
+        if idx in end_nodes:
+            act_memory -= _get_output_node_size(node) * chunk_ratio
+            within_chunk = False
+            chunk_ratio = 1
+            region_idx += 1
+        
+        act_memory_after_node_log.append(act_memory)
+
+    act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log]
+    act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log]
+
+    print("chunk")
+    _print_mem_log(act_memory_peak_log, "peak")
+    _print_mem_log(act_memory_after_node_log, "after")
+    
+    param_memory = parameter_size(gm)
+    return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2)
 
 
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
@@ -444,7 +516,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     """
 
     # find the offload regions
-    chunk_regions = [(2, 5)]
+    chunk_regions = [(2, 6)]
     chunk_starts = [item[0] for item in chunk_regions]
     chunk_ends = [item[1] for item in chunk_regions]
     chunk_inputs = []
@@ -452,6 +524,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     within_chunk_region = False
 
     node_list = list(nodes)
+    _estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
     _estimate_inference_mem(meta_graph)
 
     # find the input and output var names for each offload region

From 22f9c60b6bea147c38127f5a4420a91ab73dc84b Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Wed, 9 Nov 2022 17:50:39 +0800
Subject: [PATCH 011/503] fix bug

---
 evoformer/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evoformer/ops.py b/evoformer/ops.py
index ddbba441dd5f..611b7b0fe777 100755
--- a/evoformer/ops.py
+++ b/evoformer/ops.py
@@ -147,7 +147,7 @@ def forward(self, in_data, nonbatched_bias=None):
 
         q = self.to_q(in_data)
         k = self.to_k(in_data)
-        v = self.to_k(in_data)
+        v = self.to_v(in_data)
 
         # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head),
         #               [q, k, v])

From d7634af5c031aa9f4faaf6ee5ea0c1662d6c6f25 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 11 Nov 2022 15:43:03 +0800
Subject: [PATCH 012/503] finish memory estimation

---
 chunk_codegen.py     | 107 ++++++++++++++++++++++++++++---------------
 chunk_codegen_run.py |  20 ++++----
 2 files changed, 80 insertions(+), 47 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index baf207795b60..c8bb433ef6b5 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -55,15 +55,49 @@ def register_last_uses(n: Node, user: Node):
     return user_to_last_uses
 
 
+def _delete_free_var_from_last_use(user_to_last_uses):
+    for key, value in user_to_last_uses.items():
+        for n in value:
+            if n.op == 'placeholder':
+                user_to_last_uses[key].remove(n)
+
+
+def _get_contiguous_memory(node, not_contiguous_list, delete=False):
+    mem = 0
+    not_contiguous_ops = ['transpose', 'permute']
+
+    if node.op == 'call_function' and 'matmul' in node.name:
+        for n in node.args:
+            if n in not_contiguous_list:
+                # matmul won't change origin tensor, but create a tmp copy
+                mem += _get_output_node_size(n)
+    elif node.op == 'call_module':
+        for n in node.args:
+            if n in not_contiguous_list:
+                # module will just make origin tensor to contiguous
+                if delete:
+                    not_contiguous_list.remove(n)
+    elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops):
+        if node not in not_contiguous_list:
+            not_contiguous_list.append(node)
+    elif any(i in node.args for i in not_contiguous_list):
+        if node not in not_contiguous_list:
+            not_contiguous_list.append(node)
+
+    return mem
+
+
 def _estimate_inference_mem(gm: torch.fx.GraphModule):
-    act_memory = 0
+    act_memory = 0.0
     act_memory_peak_log = []
     act_memory_after_node_log = []
+    not_contiguous_list = []
     user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
+    _delete_free_var_from_last_use(user_to_last_uses)
     for node in gm.graph.nodes:
         # if node is placeholder, just add the size of the node
         if node.op == 'placeholder':
-            act_memory += _get_meta_node_size(node)
+            act_memory += _get_meta_node_size(node) / (1024 ** 2)
             act_memory_peak_log.append(act_memory)
             act_memory_after_node_log.append(act_memory)
         # skip output
@@ -72,25 +106,21 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule):
         # node is an operation, calculate tmp, output node and delete node memory
         else:
             # forward memory
-            act_memory += calculate_fwd_tmp(node)
-            # act_memory += calculate_fwd_out(node)
-            act_memory += _get_output_node_size(node)
+            act_memory += _get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2)
+            act_memory += _get_output_node_size(node) / (1024 ** 2)
             # record max act memory
             act_memory_peak_log.append(act_memory)
             # delete useless memory
-            act_memory -= calculate_fwd_tmp(node)
-            act_memory -= _get_delete_node_size(node, user_to_last_uses)
+            act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
+            act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2)
             act_memory_after_node_log.append(act_memory)
 
-    act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log]
-    act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log]
-
     print("no chunk")
-    _print_mem_log(act_memory_peak_log, "peak")
-    _print_mem_log(act_memory_after_node_log, "after")
+    _print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak")
+    _print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after")
     
     param_memory = parameter_size(gm)
-    return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2)
+    return act_memory + param_memory, param_memory
 
 
 def _get_chunk_ratio(node, chunk_dim, chunk_size):
@@ -111,19 +141,23 @@ def _get_chunk_delete_node_size(user, user_to_last_uses, chunk_ratio, node_list,
     return delete_size
 
 
-def _print_mem_log(log, title=None):
+def _print_mem_log(log, nodes, title=None):
     if title:
-        print("%-8s" % title, end=' ')
-    for i in log:
-        print("%.2f " % i, end='')
-    print("")
+        print(title)
+    for idx, (l, n) in enumerate(zip(log, nodes)):
+        print("%s:%.2f \t" % (n.name, l), end='')
+        if (idx + 1) % 3 == 0:
+            print("")
+    print("\n")
 
 
 def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes):
-    act_memory = 0
+    act_memory = 0.0
     act_memory_peak_log = []
     act_memory_after_node_log = []
+    not_contiguous_list = []
     user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
+    _delete_free_var_from_last_use(user_to_last_uses)
     within_chunk = False
     region_idx = 0
     chunk_ratio = 1 # use it to estimate chunk mem
@@ -134,11 +168,11 @@ def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nod
         if idx in start_nodes:
             within_chunk = True
             chunk_ratio = _get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx])
-            act_memory += _get_output_node_size(node_list[end_nodes[region_idx]])
+            act_memory += _get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2)
             
         # if node is placeholder, just add the size of the node
         if node.op == 'placeholder':
-            act_memory += _get_meta_node_size(node) * chunk_ratio
+            act_memory += _get_meta_node_size(node) * chunk_ratio / (1024 ** 2)
             act_memory_peak_log.append(act_memory)
         # skip output
         elif node.op == 'output':
@@ -146,36 +180,33 @@ def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nod
         # node is an operation, calculate tmp, output node and delete node memory
         else:
             # forward memory
-            act_memory += calculate_fwd_tmp(node) * chunk_ratio
-            # act_memory += calculate_fwd_out(node)
-            act_memory += _get_output_node_size(node) * chunk_ratio
+            act_memory += _get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
+            act_memory += _get_output_node_size(node) * chunk_ratio / (1024 ** 2)
             # record max act memory
             act_memory_peak_log.append(act_memory)
             # delete useless memory
-            act_memory -= calculate_fwd_tmp(node) * chunk_ratio
+            act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
             if within_chunk:
                 act_memory -= _get_chunk_delete_node_size(
-                    node, user_to_last_uses, chunk_ratio, node_list, start_nodes[region_idx], end_nodes[region_idx])
+                    node, user_to_last_uses, chunk_ratio, node_list, 
+                    start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2)
             else:
-                act_memory -= _get_delete_node_size(node, user_to_last_uses)
+                act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
             
         if idx in end_nodes:
-            act_memory -= _get_output_node_size(node) * chunk_ratio
+            act_memory -= _get_output_node_size(node) * chunk_ratio / (1024 ** 2)
             within_chunk = False
             chunk_ratio = 1
             region_idx += 1
         
         act_memory_after_node_log.append(act_memory)
 
-    act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log]
-    act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log]
-
     print("chunk")
-    _print_mem_log(act_memory_peak_log, "peak")
-    _print_mem_log(act_memory_after_node_log, "after")
-    
+    _print_mem_log(act_memory_peak_log, node_list, "peak")
+    _print_mem_log(act_memory_after_node_log, node_list, "after")
+
     param_memory = parameter_size(gm)
-    return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2)
+    return act_memory + param_memory, param_memory
 
 
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
@@ -516,7 +547,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     """
 
     # find the offload regions
-    chunk_regions = [(2, 6)]
+    chunk_regions = [(58, 62)]
     chunk_starts = [item[0] for item in chunk_regions]
     chunk_ends = [item[1] for item in chunk_regions]
     chunk_inputs = []
@@ -683,7 +714,9 @@ def register_last_uses(n: Node, user: Node):
             for node in reversed(nodes):
                 map_arg(node.args, lambda n: register_last_uses(n, node))
                 map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-
+            
+            _delete_free_var_from_last_use(user_to_last_uses)
+            
             # NOTE: we add a variable to distinguish body and ckpt_func
             def delete_unused_values(user: Node, body, to_keep=[]):
                 """
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index cc975f2eaf84..39363a80abcb 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -32,14 +32,14 @@ def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool:
 
 
 def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
-    # now_mem = torch.cuda.memory_allocated() / 1024**2
-    # max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print("now:%.2f max:%.2f" %(torch.cuda.memory_allocated() / 1024**2, torch.cuda.max_memory_allocated() / 1024**2))
-    # with torch.no_grad():
-    #     fx_out = gm(node, pair)
-    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
-    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - max_mem))
+    now_mem = torch.cuda.memory_allocated() / 1024**2
+    with torch.no_grad():
+        node0 = node.clone()
+        pair0 = pair.clone()
+        node1, pair1 = gm(node0, pair0)        
+    new_now_mem = torch.cuda.memory_allocated() / 1024**2
+    new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem))
     
     # test forward
     with torch.no_grad():
@@ -63,8 +63,8 @@ def _run_offload_codegen(rank):
 
     # build model and input
     model = evoformer_base().cuda()
-    node = torch.randn(1, 16, 32, 256).cuda()
-    pair = torch.randn(1, 32, 32, 128).cuda()
+    node = torch.randn(1, 100, 300, 256).cuda()
+    pair = torch.randn(1, 300, 300, 128).cuda()
 
     # trace the module and replace codegen
     graph = ColoTracer().trace(model, meta_args={'node': node.to(torch.device('meta')), 'pair': pair.to(torch.device('meta'))})

From 1607d04e81530a3de96ce064b961c2b10ed7067a Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 14 Nov 2022 16:02:47 +0800
Subject: [PATCH 013/503] add part of index tracer

---
 chunk_codegen.py | 119 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index c8bb433ef6b5..4b8882afc105 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -19,6 +19,123 @@
     __all__ = ['python_code_with_activation_checkpoint']
 
 
+class NodeIndexTracer(object):
+    def __init__(self, gm) -> None:
+        self.gm = gm
+        self.nodes_list = list(gm.graph.nodes)
+        self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] 
+        self.idx_trace_equal = []
+        self.idx_count = 1
+
+    def add_index(self):
+        self.idx_count += 1
+        return self.idx_count - 1
+
+    def inherit_computation(self, node_from, node_to):
+        _, compute_from = self.find_trace_from_node(node_from)
+        idx_to, compute_to = self.find_trace_from_node(node_to)
+        for i in compute_from:
+            if i in idx_to:
+                compute_to.append(i)
+    
+    def mark_idx_equal(self, idx1, idx2):
+        self.idx_trace_equal.append((idx1, idx2))
+        
+    def mark_computation(self, node, idx, dim):
+        input_node_idx_trace = self.find_idx_trace_from_node(node)
+        if isinstance(dim, int):
+            dim = [dim]
+        for d in dim:
+            cur_idx = input_node_idx_trace[d]
+            self.idx_trace_list[idx]['compute'].append(cur_idx)
+    
+    def find_trace_from_node(self, node):
+        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_dict = self.idx_trace_list[node_idx]
+        return node_dict['idx'], node_dict['compute']
+    
+    def find_idx_trace_from_node(self, node):
+        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_idx_trace = self.idx_trace_list[node_idx]['idx']
+        return node_idx_trace
+    
+    def assign_index_as_input(self, node, node_idx):
+        input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list)
+        input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx']
+        
+        new_idx_trace = copy.deepcopy(input_node_idx_trace)
+        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+    
+    def assign_all_index(self, node, node_idx):
+        shape = node.meta['tensor_meta'].shape
+        new_trace = []
+        for _ in shape:
+            new_trace.append(self.add_index())
+        self.idx_trace_list[node_idx]['idx'] = new_trace   
+
+    def assign_transpose_index(self, node, node_idx):
+        tranpose_dim = node.args[1:]
+        input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
+        
+        new_idx_trace = copy.deepcopy(input_node_idx_trace)
+        new_idx_trace[tranpose_dim[0]] = input_node_idx_trace[tranpose_dim[1]]
+        new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]]
+
+        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+        
+    def assign_linear_index(self, node, node_idx):
+        input_node, weight, bias = node.args
+        input_node_idx_trace = self.find_idx_trace_from_node(input_node)
+        weight_idx_trace = self.find_idx_trace_from_node(weight)
+        
+        new_idx_trace = copy.deepcopy(input_node_idx_trace)
+        new_idx_trace[-1] = weight_idx_trace[1]
+        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+
+        self.inherit_computation(input_node, node)
+        self.mark_computation(node, node_idx, [-1])
+        self.mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0])
+        
+        if bias:
+            bias_idx_trace = self.find_idx_trace_from_node(bias)
+            self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])
+
+    def assign_layernorm_index(self, node, idx):
+        self.assign_index_as_input(node, idx)
+        self.mark_computation(node, idx, [-1, -2])
+            
+    def trace_node_idx(self):
+        for idx, node in enumerate(self.nodes_list):
+            if node.op == 'placeholder':
+                self.assign_all_index(node, idx)
+            elif node.op == 'call_method':
+                if 'transpose' in node.name:
+                    self.assign_transpose_index(node, idx)
+                elif 'view' in node.name:
+                    pass
+                elif 'permute' in node.name:
+                    pass
+                else:
+                    raise NotImplementedError(node.name, "method not implemented yet!")
+            elif node.op == 'call_function':
+                if 'linear' in node.name:
+                    self.assign_linear_index(node, idx)
+                elif 'getattr' in node.name:
+                    continue # get attr like shape
+                elif 'getitem' in node.name:
+                    continue # get item in list
+                else:
+                    raise NotImplementedError(node.name, "function not implemented yet!")
+            elif node.op == 'call_module':
+                if 'layernorm' in node.name:
+                    self.assign_layernorm_index(node, idx)
+                else:
+                    raise NotImplementedError(node.name, "module not implemented yet!")
+            elif node.op == 'get_attr':
+                self.assign_all_index(node, idx) # get param
+            else:
+                raise NotImplementedError(node.op, "op not implemented yet!")
+
 def _get_meta_node_size(x):
     x = x.meta['tensor_meta']
     x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
@@ -557,6 +674,8 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     node_list = list(nodes)
     _estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
     _estimate_inference_mem(meta_graph)
+    node_index_tracer = NodeIndexTracer(meta_graph)
+    node_index_tracer.trace_node_idx()
 
     # find the input and output var names for each offload region
     for idx, (start, end) in enumerate(chunk_regions):

From c36dba07defa3069ba65d5aafc53d8292e78cf60 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 14 Nov 2022 23:38:05 +0800
Subject: [PATCH 014/503] finish basic index tracer

---
 chunk_codegen.py | 133 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 124 insertions(+), 9 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 4b8882afc105..8477fe9a1702 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -25,6 +25,7 @@ def __init__(self, gm) -> None:
         self.nodes_list = list(gm.graph.nodes)
         self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] 
         self.idx_trace_equal = []
+        self.idx_view_list = []
         self.idx_count = 1
 
     def add_index(self):
@@ -35,7 +36,7 @@ def inherit_computation(self, node_from, node_to):
         _, compute_from = self.find_trace_from_node(node_from)
         idx_to, compute_to = self.find_trace_from_node(node_to)
         for i in compute_from:
-            if i in idx_to:
+            if i in idx_to and i not in compute_to:
                 compute_to.append(i)
     
     def mark_idx_equal(self, idx1, idx2):
@@ -47,7 +48,8 @@ def mark_computation(self, node, idx, dim):
             dim = [dim]
         for d in dim:
             cur_idx = input_node_idx_trace[d]
-            self.idx_trace_list[idx]['compute'].append(cur_idx)
+            if cur_idx not in self.idx_trace_list[idx]['compute']:
+                self.idx_trace_list[idx]['compute'].append(cur_idx)
     
     def find_trace_from_node(self, node):
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
@@ -56,8 +58,11 @@ def find_trace_from_node(self, node):
     
     def find_idx_trace_from_node(self, node):
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
-        node_idx_trace = self.idx_trace_list[node_idx]['idx']
-        return node_idx_trace
+        return self.idx_trace_list[node_idx]['idx']
+    
+    def find_compute_trace_from_node(self, node):
+        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        return self.idx_trace_list[node_idx]['compute']
     
     def assign_index_as_input(self, node, node_idx):
         input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list)
@@ -82,6 +87,18 @@ def assign_transpose_index(self, node, node_idx):
         new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]]
 
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+        self.inherit_computation(node.args[0], node)
+        
+    def assign_permute_index(self, node, node_idx):
+        permute_dim = node.args[1:]
+        input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
+        
+        new_idx_trace = copy.deepcopy(input_node_idx_trace)
+        for idx, d in enumerate(permute_dim):
+            new_idx_trace[idx] = input_node_idx_trace[d]
+
+        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+        self.inherit_computation(node.args[0], node)
         
     def assign_linear_index(self, node, node_idx):
         input_node, weight, bias = node.args
@@ -100,10 +117,99 @@ def assign_linear_index(self, node, node_idx):
             bias_idx_trace = self.find_idx_trace_from_node(bias)
             self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])
 
+    def assign_matmul_index(self, node, node_idx):
+        matmul_left, matmul_right = node.args
+        matmul_left_idx_trace = self.find_idx_trace_from_node(matmul_left)
+        matmul_right_idx_trace = self.find_idx_trace_from_node(matmul_right)
+        
+        assert(len(matmul_left_idx_trace) == len(matmul_right_idx_trace))
+        new_idx_trace = copy.deepcopy(matmul_left_idx_trace)
+        new_idx_trace[-1] = matmul_right_idx_trace[-1]
+        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+
+        self.inherit_computation(matmul_left, node)
+        self.inherit_computation(matmul_right, node)
+        self.mark_computation(node, node_idx, [-1])
+        self.mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2])
+
     def assign_layernorm_index(self, node, idx):
         self.assign_index_as_input(node, idx)
+        self.inherit_computation(node.args[0], node)
         self.mark_computation(node, idx, [-1, -2])
-            
+    
+    def assign_elementwise_index(self, node, idx):
+        self.assign_index_as_input(node, idx)
+        for node_in in node.args:
+            if type(node_in) not in (int, float):
+                self.inherit_computation(node_in, node)
+                
+    def assign_softmax_index(self, node, idx):
+        self.assign_index_as_input(node, idx)
+        self.mark_computation(node, idx, [node.kwargs['dim']])
+
+    def assign_view_reshape_index(self, node, node_idx):
+        # get data, turn into number
+        origin_node = node.args[0]
+        origin_shape = origin_node.meta['tensor_meta'].shape
+        target_shape = []
+        for i in range(1, len(node.args)):
+            if isinstance(node.args[i], int):
+                target_shape.append(node.args[i])
+            else:
+                target_shape.append(node.args[i].meta['fwd_out'][0])
+
+        # compute the value of -1
+        if -1 in target_shape:
+            origin_product = 1
+            for i in origin_shape:
+                origin_product *= i
+            target_product = -1
+            for i in target_shape:
+                target_product *= i
+            shape_idx = target_shape.index(-1)
+            target_shape[shape_idx] = origin_product // target_product
+
+        # determine changed dim
+        len_diff = len(origin_shape) - len(target_shape)
+        if len_diff == 1:
+            # dim merge
+            dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)]
+            dim_to = [dim_equal.index(False)]
+            dim_from = [dim_equal.index(False), dim_equal.index(False) + 1]
+        elif len_diff == -1:
+            # dim expand
+            dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])]
+            dim_from = [dim_equal.index(False)]
+            dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
+        else:
+            raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented")
+
+        # get new index
+        origin_trace = self.find_idx_trace_from_node(origin_node)
+        new_trace = copy.deepcopy(origin_trace)
+        dim_from.reverse()
+        for i in dim_from:
+            new_trace.pop(i)
+        for i in dim_to:
+            new_trace.insert(i, self.add_index())
+        self.idx_trace_list[node_idx]['idx'] = new_trace
+        
+        # inherit computation
+        self.inherit_computation(origin_node, node)
+        compute_log = self.find_compute_trace_from_node(origin_node)
+        for i in dim_from:
+            if origin_trace[i] in compute_log:
+                for j in dim_to:
+                    self.mark_computation(node, node_idx, [j])
+                break
+        
+        # log view
+        view_dict = {"idx_from": [origin_trace[i] for i in dim_from],
+                     "dim_from": dim_from,
+                     "idx_to": [new_trace[i] for i in dim_to],
+                     "dim_to": dim_to}
+        self.idx_view_list.append(view_dict) 
+        
     def trace_node_idx(self):
         for idx, node in enumerate(self.nodes_list):
             if node.op == 'placeholder':
@@ -111,15 +217,21 @@ def trace_node_idx(self):
             elif node.op == 'call_method':
                 if 'transpose' in node.name:
                     self.assign_transpose_index(node, idx)
-                elif 'view' in node.name:
-                    pass
                 elif 'permute' in node.name:
-                    pass
+                    self.assign_permute_index(node, idx)
+                elif 'view' in node.name or 'reshape' in node.name:
+                    self.assign_view_reshape_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "method not implemented yet!")
             elif node.op == 'call_function':
                 if 'linear' in node.name:
                     self.assign_linear_index(node, idx)
+                elif 'matmul' in node.name:
+                    self.assign_matmul_index(node, idx)
+                elif 'softmax' in node.name:
+                    self.assign_softmax_index(node, idx)
+                elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']):
+                    self.assign_elementwise_index(node, idx)
                 elif 'getattr' in node.name:
                     continue # get attr like shape
                 elif 'getitem' in node.name:
@@ -127,12 +239,14 @@ def trace_node_idx(self):
                 else:
                     raise NotImplementedError(node.name, "function not implemented yet!")
             elif node.op == 'call_module':
-                if 'layernorm' in node.name:
+                if any(n in node.name for n in ['layernorm', 'norm']):
                     self.assign_layernorm_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "module not implemented yet!")
             elif node.op == 'get_attr':
                 self.assign_all_index(node, idx) # get param
+            elif node.op == 'output':
+                continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
 
@@ -297,6 +411,7 @@ def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nod
         # node is an operation, calculate tmp, output node and delete node memory
         else:
             # forward memory
+            # TODO: permute will create a tmp copy if not contiguous
             act_memory += _get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
             act_memory += _get_output_node_size(node) * chunk_ratio / (1024 ** 2)
             # record max act memory

From 70a98b8f56e690b75039561a729c5b623d175512 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 14 Nov 2022 23:49:48 +0800
Subject: [PATCH 015/503] add doc string

---
 chunk_codegen.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 8477fe9a1702..aa9d7ecd861f 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -26,13 +26,28 @@ def __init__(self, gm) -> None:
         self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] 
         self.idx_trace_equal = []
         self.idx_view_list = []
-        self.idx_count = 1
+        self.idx_count = -1
 
     def add_index(self):
+        """
+        Increment the index count and return it; the returned value is used as a new idx number.
+        
+        Returns:
+            idx_count: int
+        """        
         self.idx_count += 1
-        return self.idx_count - 1
+        return self.idx_count
 
     def inherit_computation(self, node_from, node_to):
+        """
+        Inherit computed dim from node_from to node_to.
+        If a dim in node_from is marked as computed and exists in node_to,
+        still mark it as computed in node_to.
+
+        Args:
+            node_from (node): node to be inherited
+            node_to (node): new node to inherit
+        """        
         _, compute_from = self.find_trace_from_node(node_from)
         idx_to, compute_to = self.find_trace_from_node(node_to)
         for i in compute_from:
@@ -40,9 +55,24 @@ def inherit_computation(self, node_from, node_to):
                 compute_to.append(i)
     
     def mark_idx_equal(self, idx1, idx2):
+        """
+        Mark two indices as equal.
+
+        Args:
+            idx1 (int): index count.
+            idx2 (int): index count.
+        """        
         self.idx_trace_equal.append((idx1, idx2))
         
     def mark_computation(self, node, idx, dim):
+        """
+        Mark some dims of node as computed.
+
+        Args:
+            node (node)
+            idx (int): node index
+            dim (list or int): dims to be marked as computed
+        """        
         input_node_idx_trace = self.find_idx_trace_from_node(node)
         if isinstance(dim, int):
             dim = [dim]
@@ -52,15 +82,40 @@ def mark_computation(self, node, idx, dim):
                 self.idx_trace_list[idx]['compute'].append(cur_idx)
     
     def find_trace_from_node(self, node):
+        """
+        Find node idx and compute trace by the node.
+
+        Args:
+            node (node)
+        Returns:
+            idx (list): idx of the node
+            compute (list): computed idx of the node.
+        """        
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         node_dict = self.idx_trace_list[node_idx]
         return node_dict['idx'], node_dict['compute']
     
     def find_idx_trace_from_node(self, node):
+        """
+        Find node idx trace by the node.
+
+        Args:
+            node (node)
+        Returns:
+            idx (list): idx of the node
+        """ 
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         return self.idx_trace_list[node_idx]['idx']
     
     def find_compute_trace_from_node(self, node):
+        """
+        Find node compute trace by the node.
+
+        Args:
+            node (node)
+        Returns:
+            compute (list): computed idx of the node.
+        """ 
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         return self.idx_trace_list[node_idx]['compute']
     

From f379d1a94d5ffc7aa4a0c47ffc56cddbf99f4650 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 15 Nov 2022 10:18:00 +0800
Subject: [PATCH 016/503] add doc str

---
 chunk_codegen.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index aa9d7ecd861f..a14f7c134985 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -120,6 +120,13 @@ def find_compute_trace_from_node(self, node):
         return self.idx_trace_list[node_idx]['compute']
     
     def assign_index_as_input(self, node, node_idx):
+        """
+        Assign the node's idx trace as a copy of its first input node's idx trace.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """        
         input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list)
         input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx']
         
@@ -127,6 +134,13 @@ def assign_index_as_input(self, node, node_idx):
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
     
     def assign_all_index(self, node, node_idx):
+        """
+        Add new index for all node's dims.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         shape = node.meta['tensor_meta'].shape
         new_trace = []
         for _ in shape:
@@ -134,6 +148,15 @@ def assign_all_index(self, node, node_idx):
         self.idx_trace_list[node_idx]['idx'] = new_trace   
 
     def assign_transpose_index(self, node, node_idx):
+        """
+        Assign index for transpose op.
+        1. swap input's dim according to transpose args
+        2. inherit input's computation
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         tranpose_dim = node.args[1:]
         input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
         
@@ -145,6 +168,15 @@ def assign_transpose_index(self, node, node_idx):
         self.inherit_computation(node.args[0], node)
         
     def assign_permute_index(self, node, node_idx):
+        """
+        Assign index for permute op.
+        1. swap input's dim according to permute args
+        2. inherit input's computation
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         permute_dim = node.args[1:]
         input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
         
@@ -156,6 +188,16 @@ def assign_permute_index(self, node, node_idx):
         self.inherit_computation(node.args[0], node)
         
     def assign_linear_index(self, node, node_idx):
+        """
+        Assign index for linear op.
+        1. copy trace from input node and change last index according to weight
+        2. mark equal for input node last index, weight first dim and bias dim.
+        3. inherit input's computation, mark computation for last dim.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         input_node, weight, bias = node.args
         input_node_idx_trace = self.find_idx_trace_from_node(input_node)
         weight_idx_trace = self.find_idx_trace_from_node(weight)
@@ -173,6 +215,16 @@ def assign_linear_index(self, node, node_idx):
             self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])
 
     def assign_matmul_index(self, node, node_idx):
+        """
+        Assign index for matmul op.
+        1. copy trace from matmul_left and change last index according to matmul_right. (assert they have same length)
+        2. mark equal for input matmul_left -1 index and matmul_right -2 dim.
+        3. inherit matmul_left and matmul_right computation, mark computation for last dim.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         matmul_left, matmul_right = node.args
         matmul_left_idx_trace = self.find_idx_trace_from_node(matmul_left)
         matmul_right_idx_trace = self.find_idx_trace_from_node(matmul_right)
@@ -188,21 +240,63 @@ def assign_matmul_index(self, node, node_idx):
         self.mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2])
 
     def assign_layernorm_index(self, node, idx):
+        """
+        Assign index for layernorm op.
+        1. assign index as input node
+        2. inherit computation and mark last 2 dims as computed.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
         self.assign_index_as_input(node, idx)
         self.inherit_computation(node.args[0], node)
         self.mark_computation(node, idx, [-1, -2])
     
     def assign_elementwise_index(self, node, idx):
+        """
+        Assign index for element-wise op (eg. relu sigmoid add mul).
+        1. assign index as input node
+        2. inherit computation from all input nodes.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         self.assign_index_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) not in (int, float):
                 self.inherit_computation(node_in, node)
                 
     def assign_softmax_index(self, node, idx):
+        """
+        Assign index for softmax op.
+        1. assign index as input node
+        2. inherit computation and mark softmax dim as computed.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         self.assign_index_as_input(node, idx)
+        self.inherit_computation(node.args[0], node)
         self.mark_computation(node, idx, [node.kwargs['dim']])
 
     def assign_view_reshape_index(self, node, node_idx):
+        """
+        Assign index for view and reshape op.
+        1. get origin shape and target shape by meta info.
+        2. compute the real value of -1 in target shape.
+        3. determine changed dim, and assgin index for generated dim.
+        4. log changed dim and generated dim for restore
+        5. look into view list to see whether the view is associated with other,
+           if so assgin equal dim according to previous view.
+        6. inherit computation.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """  
         # get data, turn into number
         origin_node = node.args[0]
         origin_shape = origin_node.meta['tensor_meta'].shape
@@ -305,6 +399,7 @@ def trace_node_idx(self):
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
 
+
 def _get_meta_node_size(x):
     x = x.meta['tensor_meta']
     x = x.numel * torch.tensor([], dtype=x.dtype).element_size()

From 7e2bd1e42892a3021b9882fb0d08f18cfcbcfe86 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 15 Nov 2022 10:36:02 +0800
Subject: [PATCH 017/503] polish code

---
 chunk_codegen.py | 258 ++---------------------------------------------
 1 file changed, 8 insertions(+), 250 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index a14f7c134985..9930a0570436 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -3,20 +3,11 @@
 import copy
 from typing import List, Callable, Any, Tuple, Dict, Iterable
 
-try:
-    from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
-    from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin
-    from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size
-    CODEGEN_AVAILABLE = True
-except:
-    from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, _origin_type_map, _format_args, _CustomBuiltin
-    from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
-    CODEGEN_AVAILABLE = False
-
-if CODEGEN_AVAILABLE:
-    __all__ = ['ChunkCodeGen']
-else:
-    __all__ = ['python_code_with_activation_checkpoint']
+from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
+from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin
+from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size
+CODEGEN_AVAILABLE = True
+__all__ = ['ChunkCodeGen']
 
 
 class NodeIndexTracer(object):
@@ -289,9 +280,9 @@ def assign_view_reshape_index(self, node, node_idx):
         2. compute the real value of -1 in target shape.
         3. determine changed dim, and assgin index for generated dim.
         4. log changed dim and generated dim for restore
-        5. look into view list to see whether the view is associated with other,
+        5. inherit computation.
+        6. TODO: look into view list to see whether the view is associated with other,
            if so assgin equal dim according to previous view.
-        6. inherit computation.
 
         Args:
             node (node)
@@ -352,7 +343,7 @@ def assign_view_reshape_index(self, node, node_idx):
                     self.mark_computation(node, node_idx, [j])
                 break
         
-        # log view
+        # log view, not used now
         view_dict = {"idx_from": [origin_trace[i] for i in dim_from],
                      "dim_from": dim_from,
                      "idx_to": [new_trace[i] for i in dim_to],
@@ -680,239 +671,6 @@ def _find_idx_by_name(name, nodes_list):
         if node.name == name:
             return idx
     raise RuntimeError("name %s not found in node list" % name)
-        
-
-def _find_offload_regions(nodes: List[Node]):
-    """This function is to find the offload regions
-    In pofo algorithm, during annotation, we will annotate the offload region with the 
-    list in the form of [idx, offload_input, offload_bar]. idx indicates the offload
-    region's index, offload_input is a bool type indicates whether we need to offload
-    the input, offload_bar is a bool type indicates whether we need to offload all the
-    intermediate x_bars of this region.
-    """
-    offload_regions = []
-    offload_labels = []
-    start = -1
-    end = -1
-    current_region = None
-
-    for idx, node in enumerate(nodes):
-        if hasattr(node, 'activation_offload') and isinstance(getattr(node, 'activation_offload', None), Iterable):
-            act_offload_label = node.activation_offload
-
-            if current_region == None:
-                current_region = act_offload_label
-                start = idx
-                offload_labels.append(act_offload_label)
-
-            if act_offload_label != current_region:
-                assert start != -1
-                offload_regions.append((start, idx - 1))
-                offload_labels.append(act_offload_label)
-                current_region = act_offload_label
-                start = idx
-                end = -1
-
-        else:
-            if current_region is not None:
-                end = idx - 1
-                assert start != -1 and end != -1
-                offload_regions.append((start, end))
-                start = end = -1
-                current_region = None
-
-            else:
-                pass
-
-    return offload_regions, offload_labels
-
-
-def _gen_ckpt_fn_def(label, free_vars: List[str]) -> str:
-    """
-    Generate the checkpoint function definition
-    """
-    return f"def checkpoint_{label}({', '.join(['self'] + free_vars)}):"
-
-
-def _gen_ckpt_output(output_vars: List[str]) -> str:
-    """
-    Generate the return statement for checkpoint region
-    """
-    return f"return {', '.join(output_vars)}"
-
-
-def _gen_ckpt_usage(label, activation_offload, input_vars, output_vars, use_reentrant=True):
-    """
-    Generate the checkpoint function call code text
-    """
-    outputs = ', '.join(output_vars)
-    inputs = ', '.join(input_vars)
-    return f'{outputs} = colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_{label}, {activation_offload}, {inputs}, use_reentrant={use_reentrant})'
-
-
-def _end_of_ckpt(node: Node, check_idx: int) -> bool:
-    """Check if the node could end the ckpt region
-
-    Args:
-        node (Node): torch.fx.Node
-        check_idx (int): the index of checkpoint level for 
-        nested checkpoint
-
-    Returns:
-        bool
-    """
-    if hasattr(node, "activation_checkpoint"):
-        if isinstance(node.activation_checkpoint, list):
-            return node.activation_checkpoint[check_idx] == None
-        else:
-            return False
-    else:
-        return True
-
-
-def _find_nested_ckpt_regions(nodes, check_idx=0):
-    """
-    Find the nested checkpoint regions given a list of consecutive nodes. The outputs 
-    will be list of tuples, each tuple is in the form of (start_index, end_index).
-    """
-    ckpt_regions = []
-    start = -1
-    end = -1
-    current_region = None
-
-    for idx, node in enumerate(nodes):
-        if hasattr(node, 'activation_checkpoint'):
-            if isinstance(getattr(node, 'activation_checkpoint'), int):
-                act_ckpt_label = node.activation_checkpoint
-            else:
-                act_ckpt_label = node.activation_checkpoint[check_idx]
-
-            # this activation checkpoint label is not set yet
-            # meaning this is the first node of the activation ckpt region
-            if current_region is None:
-                current_region = act_ckpt_label
-                start = idx
-
-            # if activation checkpoint has changed
-            # we restart the tracking
-            # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2]
-            if act_ckpt_label != current_region:
-                assert start != -1
-                ckpt_regions.append((start, idx - 1))
-                current_region = act_ckpt_label
-                start = idx
-                end = -1
-        elif current_region is not None and _end_of_ckpt(node, check_idx):
-            # used to check the case below
-            # node ckpt states = [ckpt, ckpt, non-ckpt]
-            end = idx - 1
-            assert start != -1 and end != -1
-            ckpt_regions.append((start, end))
-            start = end = -1
-            current_region = None
-        else:
-            pass
-
-    if current_region is not None:
-        end = len(nodes) - 1
-        ckpt_regions.append((start, end))
-    return ckpt_regions
-
-
-def emit_ckpt_func(body,
-                   ckpt_func,
-                   node_list: List[Node],
-                   emit_node_func,
-                   delete_unused_value_func,
-                   level=0,
-                   in_ckpt=False):
-    """Emit ckpt fuction in nested way
-
-    Args:
-        body: forward code, in recursive calls, this part will be checkpoint
-        functions code
-        ckpt_func: checkpoint functions code, in recursive calls, this part
-        will be a buffer
-        node_list (List[Node]): list of torch.fx.Node
-        emit_node_func: function to emit a node
-        delete_unused_value_func: function to delete unused value
-        level (int, optional): checkpoint level. Defaults to 0.
-        in_ckpt (bool, optional): indicates wether the func is in recursive
-        call. Defaults to False.
-    """
-    inputs, outputs = _find_input_and_output_nodes(node_list)
-
-    # if the current checkpoint function use int as label, using old generation method
-    if isinstance(node_list[0].activation_checkpoint, int):
-        label = node_list[0].activation_checkpoint
-        ckpt_fn_def = _gen_ckpt_fn_def(label, inputs)
-        ckpt_func.append(f'{ckpt_fn_def}\n')
-        for node in node_list:
-            emit_node_func(node, ckpt_func)
-            ckpt_func[-1] = '    ' + ckpt_func[-1]
-            delete_unused_value_func(node, ckpt_func)
-
-        ckpt_func.append('    ' + _gen_ckpt_output(outputs) + '\n\n')
-        activation_offload = getattr(node_list[0], "activation_offload", False)
-        usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False)
-        usage += "\n"
-        body.append(usage)
-
-    # use nested ckpt function codegen
-    else:
-        # label given by each layer, e.g. if you are currently at level [0, 1, 1]
-        # the label will be '0_1_1'
-        label = "_".join([str(idx) for idx in node_list[0].activation_checkpoint[:level + 1]])
-        ckpt_fn_def = _gen_ckpt_fn_def(label, inputs)
-        ckpt_func.append(f'{ckpt_fn_def}\n')
-
-        # if there is more level to fetch
-        if level + 1 < len(node_list[0].activation_checkpoint):
-            ckpt_regions = _find_nested_ckpt_regions(node_list, level + 1)
-            start_idx = [item[0] for item in ckpt_regions]
-            end_idx = [item[1] for item in ckpt_regions]
-
-            # use ckpt_func_buffer to store nested checkpoint functions
-            ckpt_func_buffer = []
-            node_idx = 0
-            while 1:
-                if node_idx >= len(node_list):
-                    break
-
-                if node_idx in start_idx:
-                    ckpt_node_list = node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1]
-                    emit_ckpt_func(ckpt_func, ckpt_func_buffer, ckpt_node_list, emit_node_func,
-                                   delete_unused_value_func, level + 1, True)
-                    node_idx += len(ckpt_node_list)
-
-                else:
-                    node = node_list[node_idx]
-                    emit_node_func(node, ckpt_func)
-                    ckpt_func[-1] = '    ' + ckpt_func[-1]
-                    delete_unused_value_func(node, ckpt_func)
-                    node_idx += 1
-
-            ckpt_func.append('    ' + _gen_ckpt_output(outputs) + '\n\n')
-            ckpt_func += ckpt_func_buffer
-            activation_offload = getattr(node_list[0], "activation_offload", False)
-            usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n'
-            if in_ckpt:
-                usage = '    ' + usage
-            body.append(usage)
-
-        # last level
-        else:
-            for node in node_list:
-                emit_node_func(node, ckpt_func)
-                ckpt_func[-1] = '    ' + ckpt_func[-1]
-                delete_unused_value_func(node, ckpt_func)
-
-            ckpt_func.append('    ' + _gen_ckpt_output(outputs) + '\n\n')
-            activation_offload = getattr(node_list[0], "activation_offload", False)
-            usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n'
-            if in_ckpt:
-                usage = '    ' + usage
-            body.append(usage)
 
 
 def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph):

From fad3b6d1a65ee04d18e4826045ce3af4e3d28f10 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 15 Nov 2022 10:46:51 +0800
Subject: [PATCH 018/503] polish code

---
 chunk_codegen.py | 478 +++++++++++++++++++++++------------------------
 1 file changed, 239 insertions(+), 239 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 9930a0570436..c1d9e26e790a 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -10,6 +10,13 @@
 __all__ = ['ChunkCodeGen']
 
 
+def _delete_free_var_from_last_use(user_to_last_uses):
+    """Remove placeholder nodes from every last-use list in user_to_last_uses, in place."""
+    for last_uses in user_to_last_uses.values():
+        # Rebuild via slice assignment: remove() during iteration skipped the next element.
+        last_uses[:] = [n for n in last_uses if n.op != 'placeholder']
+
+
 class NodeIndexTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
@@ -19,7 +26,7 @@ def __init__(self, gm) -> None:
         self.idx_view_list = []
         self.idx_count = -1
 
-    def add_index(self):
+    def _add_index(self):
         """
         Update the count and return it. To record the idx number.
         
@@ -29,7 +36,7 @@ def add_index(self):
         self.idx_count += 1
         return self.idx_count
 
-    def inherit_computation(self, node_from, node_to):
+    def _inherit_computation(self, node_from, node_to):
         """
         Inherit computed dim from node_from to node_to.
         If a dim in node_from is marked as computed and exists in node_to,
@@ -39,13 +46,13 @@ def inherit_computation(self, node_from, node_to):
             node_from (node): node to be inherited
             node_to (node): new node to inherit
         """        
-        _, compute_from = self.find_trace_from_node(node_from)
-        idx_to, compute_to = self.find_trace_from_node(node_to)
+        _, compute_from = self._find_trace_from_node(node_from)
+        idx_to, compute_to = self._find_trace_from_node(node_to)
         for i in compute_from:
             if i in idx_to and i not in compute_to:
                 compute_to.append(i)
     
-    def mark_idx_equal(self, idx1, idx2):
+    def _mark_idx_equal(self, idx1, idx2):
         """
         Mark 2 index to be equal.
 
@@ -55,7 +62,7 @@ def mark_idx_equal(self, idx1, idx2):
         """        
         self.idx_trace_equal.append((idx1, idx2))
         
-    def mark_computation(self, node, idx, dim):
+    def _mark_computation(self, node, idx, dim):
         """
         Mark some dims of node as computed.
 
@@ -64,7 +71,7 @@ def mark_computation(self, node, idx, dim):
             idx (int): node index
             dim (list or int): dims to be marked as computed
         """        
-        input_node_idx_trace = self.find_idx_trace_from_node(node)
+        input_node_idx_trace = self._find_idx_trace_from_node(node)
         if isinstance(dim, int):
             dim = [dim]
         for d in dim:
@@ -72,7 +79,7 @@ def mark_computation(self, node, idx, dim):
             if cur_idx not in self.idx_trace_list[idx]['compute']:
                 self.idx_trace_list[idx]['compute'].append(cur_idx)
     
-    def find_trace_from_node(self, node):
+    def _find_trace_from_node(self, node):
         """
         Find node idx and compute trace by the node.
 
@@ -86,7 +93,7 @@ def find_trace_from_node(self, node):
         node_dict = self.idx_trace_list[node_idx]
         return node_dict['idx'], node_dict['compute']
     
-    def find_idx_trace_from_node(self, node):
+    def _find_idx_trace_from_node(self, node):
         """
         Find node idx trace by the node.
 
@@ -98,7 +105,7 @@ def find_idx_trace_from_node(self, node):
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         return self.idx_trace_list[node_idx]['idx']
     
-    def find_compute_trace_from_node(self, node):
+    def _find_compute_trace_from_node(self, node):
         """
         Find node compute trace by the node.
 
@@ -110,7 +117,7 @@ def find_compute_trace_from_node(self, node):
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         return self.idx_trace_list[node_idx]['compute']
     
-    def assign_index_as_input(self, node, node_idx):
+    def _assign_index_as_input(self, node, node_idx):
         """
         Assign node's trace as its input node.
 
@@ -124,7 +131,7 @@ def assign_index_as_input(self, node, node_idx):
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
     
-    def assign_all_index(self, node, node_idx):
+    def _assign_all_index(self, node, node_idx):
         """
         Add new index for all node's dims.
 
@@ -135,10 +142,10 @@ def assign_all_index(self, node, node_idx):
         shape = node.meta['tensor_meta'].shape
         new_trace = []
         for _ in shape:
-            new_trace.append(self.add_index())
+            new_trace.append(self._add_index())
         self.idx_trace_list[node_idx]['idx'] = new_trace   
 
-    def assign_transpose_index(self, node, node_idx):
+    def _assign_transpose_index(self, node, node_idx):
         """
         Assign index for transpose op.
         1. swap input's dim according to transpose args
@@ -149,16 +156,16 @@ def assign_transpose_index(self, node, node_idx):
             node_idx (int)
         """  
         tranpose_dim = node.args[1:]
-        input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
+        input_node_idx_trace = self._find_idx_trace_from_node(node.args[0])
         
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
         new_idx_trace[tranpose_dim[0]] = input_node_idx_trace[tranpose_dim[1]]
         new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]]
 
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
-        self.inherit_computation(node.args[0], node)
+        self._inherit_computation(node.args[0], node)
         
-    def assign_permute_index(self, node, node_idx):
+    def _assign_permute_index(self, node, node_idx):
         """
         Assign index for permute op.
         1. swap input's dim according to permute args
@@ -169,16 +176,16 @@ def assign_permute_index(self, node, node_idx):
             node_idx (int)
         """  
         permute_dim = node.args[1:]
-        input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
+        input_node_idx_trace = self._find_idx_trace_from_node(node.args[0])
         
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
         for idx, d in enumerate(permute_dim):
             new_idx_trace[idx] = input_node_idx_trace[d]
 
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
-        self.inherit_computation(node.args[0], node)
+        self._inherit_computation(node.args[0], node)
         
-    def assign_linear_index(self, node, node_idx):
+    def _assign_linear_index(self, node, node_idx):
         """
         Assign index for linear op.
         1. copy trace from input node and change last index accroding to weight
@@ -190,22 +197,22 @@ def assign_linear_index(self, node, node_idx):
             node_idx (int)
         """  
         input_node, weight, bias = node.args
-        input_node_idx_trace = self.find_idx_trace_from_node(input_node)
-        weight_idx_trace = self.find_idx_trace_from_node(weight)
+        input_node_idx_trace = self._find_idx_trace_from_node(input_node)
+        weight_idx_trace = self._find_idx_trace_from_node(weight)
         
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
         new_idx_trace[-1] = weight_idx_trace[1]
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
 
-        self.inherit_computation(input_node, node)
-        self.mark_computation(node, node_idx, [-1])
-        self.mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0])
+        self._inherit_computation(input_node, node)
+        self._mark_computation(node, node_idx, [-1])
+        self._mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0])
         
         if bias:
-            bias_idx_trace = self.find_idx_trace_from_node(bias)
-            self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])
+            bias_idx_trace = self._find_idx_trace_from_node(bias)
+            self._mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])
 
-    def assign_matmul_index(self, node, node_idx):
+    def _assign_matmul_index(self, node, node_idx):
         """
         Assign index for matmul op.
         1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length)
@@ -217,20 +224,20 @@ def assign_matmul_index(self, node, node_idx):
             node_idx (int)
         """  
         matmul_left, matmul_right = node.args
-        matmul_left_idx_trace = self.find_idx_trace_from_node(matmul_left)
-        matmul_right_idx_trace = self.find_idx_trace_from_node(matmul_right)
+        matmul_left_idx_trace = self._find_idx_trace_from_node(matmul_left)
+        matmul_right_idx_trace = self._find_idx_trace_from_node(matmul_right)
         
         assert(len(matmul_left_idx_trace) == len(matmul_right_idx_trace))
         new_idx_trace = copy.deepcopy(matmul_left_idx_trace)
         new_idx_trace[-1] = matmul_right_idx_trace[-1]
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
 
-        self.inherit_computation(matmul_left, node)
-        self.inherit_computation(matmul_right, node)
-        self.mark_computation(node, node_idx, [-1])
-        self.mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2])
+        self._inherit_computation(matmul_left, node)
+        self._inherit_computation(matmul_right, node)
+        self._mark_computation(node, node_idx, [-1])
+        self._mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2])
 
-    def assign_layernorm_index(self, node, idx):
+    def _assign_layernorm_index(self, node, idx):
         """
         Assign index for layernorm op.
         1. assign index as input node
@@ -240,11 +247,11 @@ def assign_layernorm_index(self, node, idx):
             node (node)
             node_idx (int)
         """
-        self.assign_index_as_input(node, idx)
-        self.inherit_computation(node.args[0], node)
-        self.mark_computation(node, idx, [-1, -2])
+        self._assign_index_as_input(node, idx)
+        self._inherit_computation(node.args[0], node)
+        self._mark_computation(node, idx, [-1, -2])
     
-    def assign_elementwise_index(self, node, idx):
+    def _assign_elementwise_index(self, node, idx):
         """
         Assign index for element-wise op (eg. relu sigmoid add mul).
         1. assign index as input node
@@ -254,12 +261,12 @@ def assign_elementwise_index(self, node, idx):
             node (node)
             node_idx (int)
         """  
-        self.assign_index_as_input(node, idx)
+        self._assign_index_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) not in (int, float):
-                self.inherit_computation(node_in, node)
+                self._inherit_computation(node_in, node)
                 
-    def assign_softmax_index(self, node, idx):
+    def _assign_softmax_index(self, node, idx):
         """
         Assign index for softmax op.
         1. assign index as input node
@@ -269,11 +276,11 @@ def assign_softmax_index(self, node, idx):
             node (node)
             node_idx (int)
         """  
-        self.assign_index_as_input(node, idx)
-        self.inherit_computation(node.args[0], node)
-        self.mark_computation(node, idx, [node.kwargs['dim']])
+        self._assign_index_as_input(node, idx)
+        self._inherit_computation(node.args[0], node)
+        self._mark_computation(node, idx, [node.kwargs['dim']])
 
-    def assign_view_reshape_index(self, node, node_idx):
+    def _assign_view_reshape_index(self, node, node_idx):
         """
         Assign index for view and reshape op.
         1. get origin shape and target shape by meta info.
@@ -325,22 +332,22 @@ def assign_view_reshape_index(self, node, node_idx):
             raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented")
 
         # get new index
-        origin_trace = self.find_idx_trace_from_node(origin_node)
+        origin_trace = self._find_idx_trace_from_node(origin_node)
         new_trace = copy.deepcopy(origin_trace)
         dim_from.reverse()
         for i in dim_from:
             new_trace.pop(i)
         for i in dim_to:
-            new_trace.insert(i, self.add_index())
+            new_trace.insert(i, self._add_index())
         self.idx_trace_list[node_idx]['idx'] = new_trace
         
         # inherit computation
-        self.inherit_computation(origin_node, node)
-        compute_log = self.find_compute_trace_from_node(origin_node)
+        self._inherit_computation(origin_node, node)
+        compute_log = self._find_compute_trace_from_node(origin_node)
         for i in dim_from:
             if origin_trace[i] in compute_log:
                 for j in dim_to:
-                    self.mark_computation(node, node_idx, [j])
+                    self._mark_computation(node, node_idx, [j])
                 break
         
         # log view, not used now
@@ -353,25 +360,25 @@ def assign_view_reshape_index(self, node, node_idx):
     def trace_node_idx(self):
         for idx, node in enumerate(self.nodes_list):
             if node.op == 'placeholder':
-                self.assign_all_index(node, idx)
+                self._assign_all_index(node, idx)
             elif node.op == 'call_method':
                 if 'transpose' in node.name:
-                    self.assign_transpose_index(node, idx)
+                    self._assign_transpose_index(node, idx)
                 elif 'permute' in node.name:
-                    self.assign_permute_index(node, idx)
+                    self._assign_permute_index(node, idx)
                 elif 'view' in node.name or 'reshape' in node.name:
-                    self.assign_view_reshape_index(node, idx)
+                    self._assign_view_reshape_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "method not implemented yet!")
             elif node.op == 'call_function':
                 if 'linear' in node.name:
-                    self.assign_linear_index(node, idx)
+                    self._assign_linear_index(node, idx)
                 elif 'matmul' in node.name:
-                    self.assign_matmul_index(node, idx)
+                    self._assign_matmul_index(node, idx)
                 elif 'softmax' in node.name:
-                    self.assign_softmax_index(node, idx)
+                    self._assign_softmax_index(node, idx)
                 elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']):
-                    self.assign_elementwise_index(node, idx)
+                    self._assign_elementwise_index(node, idx)
                 elif 'getattr' in node.name:
                     continue # get attr like shape
                 elif 'getitem' in node.name:
@@ -380,206 +387,198 @@ def trace_node_idx(self):
                     raise NotImplementedError(node.name, "function not implemented yet!")
             elif node.op == 'call_module':
                 if any(n in node.name for n in ['layernorm', 'norm']):
-                    self.assign_layernorm_index(node, idx)
+                    self._assign_layernorm_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "module not implemented yet!")
             elif node.op == 'get_attr':
-                self.assign_all_index(node, idx) # get param
+                self._assign_all_index(node, idx) # get param
             elif node.op == 'output':
                 continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
 
 
-def _get_meta_node_size(x):
-    x = x.meta['tensor_meta']
-    x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
-    return x
-
+class MemoryEstimator(object):
+    def __init__(self) -> None:
+        pass
 
-def _get_output_node_size(n):
-    fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')}
-    return activation_size(fwd_out)
+    def _get_meta_node_size(self, x):
+        x = x.meta['tensor_meta']
+        x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
+        return x
 
+    def _get_output_node_size(self, n):
+        fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')}
+        return activation_size(fwd_out)
 
-def _get_delete_node_size(user, user_to_last_uses):
-    if user.op in ('placeholder', 'output'):
+    def _get_delete_node_size(self, user, user_to_last_uses):
+        if user.op in ('placeholder', 'output'):
+            return 0
+        nodes_to_delete = user_to_last_uses.get(user, [])
+        if len(nodes_to_delete):
+            delete_size = sum([self._get_output_node_size(i) for i in nodes_to_delete])
+            return delete_size
         return 0
-    nodes_to_delete = user_to_last_uses.get(user, [])
-    if len(nodes_to_delete):
-        delete_size = sum([_get_output_node_size(i) for i in nodes_to_delete])
-        return delete_size
-    return 0
-
-
-def _get_last_usr(nodes):
-    node_to_last_use: Dict[Node, Node] = {}
-    user_to_last_uses: Dict[Node, List[Node]] = {}
-
-    def register_last_uses(n: Node, user: Node):
-        if n not in node_to_last_use:
-            node_to_last_use[n] = user
-            user_to_last_uses.setdefault(user, []).append(n)
-
-    for node in reversed(nodes):
-        map_arg(node.args, lambda n: register_last_uses(n, node))
-        map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-    return user_to_last_uses
-
-
-def _delete_free_var_from_last_use(user_to_last_uses):
-    for key, value in user_to_last_uses.items():
-        for n in value:
-            if n.op == 'placeholder':
-                user_to_last_uses[key].remove(n)
-
-
-def _get_contiguous_memory(node, not_contiguous_list, delete=False):
-    mem = 0
-    not_contiguous_ops = ['transpose', 'permute']
-
-    if node.op == 'call_function' and 'matmul' in node.name:
-        for n in node.args:
-            if n in not_contiguous_list:
-                # matmul won't change origin tensor, but create a tmp copy
-                mem += _get_output_node_size(n)
-    elif node.op == 'call_module':
-        for n in node.args:
-            if n in not_contiguous_list:
-                # module will just make origin tensor to contiguous
-                if delete:
-                    not_contiguous_list.remove(n)
-    elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops):
-        if node not in not_contiguous_list:
-            not_contiguous_list.append(node)
-    elif any(i in node.args for i in not_contiguous_list):
-        if node not in not_contiguous_list:
-            not_contiguous_list.append(node)
-
-    return mem
-
-
-def _estimate_inference_mem(gm: torch.fx.GraphModule):
-    act_memory = 0.0
-    act_memory_peak_log = []
-    act_memory_after_node_log = []
-    not_contiguous_list = []
-    user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
-    _delete_free_var_from_last_use(user_to_last_uses)
-    for node in gm.graph.nodes:
-        # if node is placeholder, just add the size of the node
-        if node.op == 'placeholder':
-            act_memory += _get_meta_node_size(node) / (1024 ** 2)
-            act_memory_peak_log.append(act_memory)
-            act_memory_after_node_log.append(act_memory)
-        # skip output
-        elif node.op == 'output':
-            continue
-        # node is an operation, calculate tmp, output node and delete node memory
-        else:
-            # forward memory
-            act_memory += _get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2)
-            act_memory += _get_output_node_size(node) / (1024 ** 2)
-            # record max act memory
-            act_memory_peak_log.append(act_memory)
-            # delete useless memory
-            act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
-            act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2)
-            act_memory_after_node_log.append(act_memory)
 
-    print("no chunk")
-    _print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak")
-    _print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after")
-    
-    param_memory = parameter_size(gm)
-    return act_memory + param_memory, param_memory
+    def _get_last_usr(self, nodes):
+        node_to_last_use: Dict[Node, Node] = {}
+        user_to_last_uses: Dict[Node, List[Node]] = {}
+
+        def register_last_uses(n: Node, user: Node):
+            if n not in node_to_last_use:
+                node_to_last_use[n] = user
+                user_to_last_uses.setdefault(user, []).append(n)
+
+        for node in reversed(nodes):
+            map_arg(node.args, lambda n: register_last_uses(n, node))
+            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+        return user_to_last_uses
+
+    def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
+        mem = 0
+        not_contiguous_ops = ['transpose', 'permute']
+
+        if node.op == 'call_function' and 'matmul' in node.name:
+            for n in node.args:
+                if n in not_contiguous_list:
+                    # matmul won't change origin tensor, but create a tmp copy
+                    mem += self._get_output_node_size(n)
+        elif node.op == 'call_module':
+            for n in node.args:
+                if n in not_contiguous_list:
+                    # module will just make origin tensor to contiguous
+                    if delete:
+                        not_contiguous_list.remove(n)
+        elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops):
+            if node not in not_contiguous_list:
+                not_contiguous_list.append(node)
+        elif any(i in node.args for i in not_contiguous_list):
+            if node not in not_contiguous_list:
+                not_contiguous_list.append(node)
+
+        return mem
+
+    def estimate_inference_mem(self, gm: torch.fx.GraphModule):
+        act_memory = 0.0
+        act_memory_peak_log = []
+        act_memory_after_node_log = []
+        not_contiguous_list = []
+        user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
+        _delete_free_var_from_last_use(user_to_last_uses)
+        for node in gm.graph.nodes:
+            # if node is placeholder, just add the size of the node
+            if node.op == 'placeholder':
+                act_memory += self._get_meta_node_size(node) / (1024 ** 2)
+                act_memory_peak_log.append(act_memory)
+                act_memory_after_node_log.append(act_memory)
+            # skip output
+            elif node.op == 'output':
+                continue
+            # node is an operation, calculate tmp, output node and delete node memory
+            else:
+                # forward memory
+                act_memory += self._get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2)
+                act_memory += self._get_output_node_size(node) / (1024 ** 2)
+                # record max act memory
+                act_memory_peak_log.append(act_memory)
+                # delete useless memory
+                act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
+                act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2)
+                act_memory_after_node_log.append(act_memory)
+
+        print("no chunk")
+        self._print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak")
+        self._print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after")
+        
+        param_memory = parameter_size(gm)
+        return act_memory + param_memory, param_memory
 
 
-def _get_chunk_ratio(node, chunk_dim, chunk_size):
-    shape = node.meta['tensor_meta'].shape
-    chunk_ratio = float(chunk_size) / shape[chunk_dim]
-    return chunk_ratio
+    def _get_chunk_ratio(self, node, chunk_dim, chunk_size):
+        shape = node.meta['tensor_meta'].shape
+        chunk_ratio = float(chunk_size) / shape[chunk_dim]
+        return chunk_ratio
+
+
+    def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node):
+        if user.op in ('placeholder', 'output'):
+            return 0
+        nodes_to_delete = user_to_last_uses.get(user, [])
+        delete_size = 0
+        for n in nodes_to_delete:
+            node_idx = _find_idx_by_name(n.name, node_list)
+            if start_node <= node_idx < end_node:
+                delete_size += self._get_output_node_size(n) * chunk_ratio
+        return delete_size
 
 
-def _get_chunk_delete_node_size(user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node):
-    if user.op in ('placeholder', 'output'):
-        return 0
-    nodes_to_delete = user_to_last_uses.get(user, [])
-    delete_size = 0
-    for n in nodes_to_delete:
-        node_idx = _find_idx_by_name(n.name, node_list)
-        if start_node <= node_idx < end_node:
-            delete_size += _get_output_node_size(n) * chunk_ratio
-    return delete_size
-
-
-def _print_mem_log(log, nodes, title=None):
-    if title:
-        print(title)
-    for idx, (l, n) in enumerate(zip(log, nodes)):
-        print("%s:%.2f \t" % (n.name, l), end='')
-        if (idx + 1) % 3 == 0:
-            print("")
-    print("\n")
-
-
-def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes):
-    act_memory = 0.0
-    act_memory_peak_log = []
-    act_memory_after_node_log = []
-    not_contiguous_list = []
-    user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
-    _delete_free_var_from_last_use(user_to_last_uses)
-    within_chunk = False
-    region_idx = 0
-    chunk_ratio = 1 # use it to estimate chunk mem
-    node_list = list(gm.graph.nodes)
-
-    for idx, node in enumerate(node_list):
-        # if node in chunk start nodes, change chunk ratio and add chunk_tensor
-        if idx in start_nodes:
-            within_chunk = True
-            chunk_ratio = _get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx])
-            act_memory += _get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2)
-            
-        # if node is placeholder, just add the size of the node
-        if node.op == 'placeholder':
-            act_memory += _get_meta_node_size(node) * chunk_ratio / (1024 ** 2)
-            act_memory_peak_log.append(act_memory)
-        # skip output
-        elif node.op == 'output':
-            continue
-        # node is an operation, calculate tmp, output node and delete node memory
-        else:
-            # forward memory
-            # TODO: permute will create a tmp copy if not contiguous
-            act_memory += _get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
-            act_memory += _get_output_node_size(node) * chunk_ratio / (1024 ** 2)
-            # record max act memory
-            act_memory_peak_log.append(act_memory)
-            # delete useless memory
-            act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
-            if within_chunk:
-                act_memory -= _get_chunk_delete_node_size(
-                    node, user_to_last_uses, chunk_ratio, node_list, 
-                    start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2)
+    def _print_mem_log(self, log, nodes, title=None):
+        if title:
+            print(title)
+        for idx, (l, n) in enumerate(zip(log, nodes)):
+            print("%s:%.2f \t" % (n.name, l), end='')
+            if (idx + 1) % 3 == 0:
+                print("")
+        print("\n")
+
+
+    def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes):
+        act_memory = 0.0
+        act_memory_peak_log = []
+        act_memory_after_node_log = []
+        not_contiguous_list = []
+        user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
+        _delete_free_var_from_last_use(user_to_last_uses)
+        within_chunk = False
+        region_idx = 0
+        chunk_ratio = 1 # use it to estimate chunk mem
+        node_list = list(gm.graph.nodes)
+
+        for idx, node in enumerate(node_list):
+            # if node in chunk start nodes, change chunk ratio and add chunk_tensor
+            if idx in start_nodes:
+                within_chunk = True
+                chunk_ratio = self._get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx])
+                act_memory += self._get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2)
+                
+            # if node is placeholder, just add the size of the node
+            if node.op == 'placeholder':
+                act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024 ** 2)
+                act_memory_peak_log.append(act_memory)
+            # skip output
+            elif node.op == 'output':
+                continue
+            # node is an operation, calculate tmp, output node and delete node memory
             else:
-                act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
+                # forward memory
+                # TODO: permute will create a tmp copy if not contiguous
+                act_memory += self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
+                act_memory += self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
+                # record max act memory
+                act_memory_peak_log.append(act_memory)
+                # delete useless memory
+                act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
+                if within_chunk:
+                    act_memory -= self._get_chunk_delete_node_size(
+                        node, user_to_last_uses, chunk_ratio, node_list, 
+                        start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2)
+                else:
+                    act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
+                
+            if idx in end_nodes:
+                act_memory -= self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
+                within_chunk = False
+                chunk_ratio = 1
+                region_idx += 1
             
-        if idx in end_nodes:
-            act_memory -= _get_output_node_size(node) * chunk_ratio / (1024 ** 2)
-            within_chunk = False
-            chunk_ratio = 1
-            region_idx += 1
-        
-        act_memory_after_node_log.append(act_memory)
+            act_memory_after_node_log.append(act_memory)
 
-    print("chunk")
-    _print_mem_log(act_memory_peak_log, node_list, "peak")
-    _print_mem_log(act_memory_after_node_log, node_list, "after")
+        print("chunk")
+        self._print_mem_log(act_memory_peak_log, node_list, "peak")
+        self._print_mem_log(act_memory_after_node_log, node_list, "after")
 
-    param_memory = parameter_size(gm)
-    return act_memory + param_memory, param_memory
+        param_memory = parameter_size(gm)
+        return act_memory + param_memory, param_memory
 
 
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
@@ -695,8 +694,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     within_chunk_region = False
 
     node_list = list(nodes)
-    _estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
-    _estimate_inference_mem(meta_graph)
+    memory_estimator = MemoryEstimator()
+    memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
+    memory_estimator.estimate_inference_mem(meta_graph)
     node_index_tracer = NodeIndexTracer(meta_graph)
     node_index_tracer.trace_node_idx()
 

From 54a34a7e46d2f9e0234eb9295f3507e720ba21b2 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 15 Nov 2022 11:30:43 +0800
Subject: [PATCH 019/503] update active log

---
 chunk_codegen.py | 56 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index c1d9e26e790a..ade986d1e343 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -407,18 +407,41 @@ def _get_meta_node_size(self, x):
         x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
         return x
 
-    def _get_output_node_size(self, n):
+    def _get_output_node(self, n):
         fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')}
-        return activation_size(fwd_out)
+        out_size = activation_size(fwd_out)
+        out_node = [n.name] if out_size > 0 else []
+        return out_size, out_node
+    
+    def _get_output_node_size(self, n):
+        return self._get_output_node(n)[0]
+    
+    def _add_active_node(self, n, active_list):
+        new_active = self._get_output_node(n)[1]
+        for i in new_active:
+            if i not in active_list:
+                active_list.append(i)
 
+    def _get_delete_node(self, user, user_to_last_uses):
+        delete_size = 0
+        delete_node = []
+        if user.op not in ('placeholder', 'output'):
+            nodes_to_delete = user_to_last_uses.get(user, [])
+            if len(nodes_to_delete):
+                out_node = [self._get_output_node(i) for i in nodes_to_delete]
+                delete_size = sum([i[0] for i in out_node])
+                for i in range(len(out_node)):
+                    if out_node[i][0] > 0:
+                        delete_node.append(out_node[i][1][0])
+        return delete_size, delete_node
+    
     def _get_delete_node_size(self, user, user_to_last_uses):
-        if user.op in ('placeholder', 'output'):
-            return 0
-        nodes_to_delete = user_to_last_uses.get(user, [])
-        if len(nodes_to_delete):
-            delete_size = sum([self._get_output_node_size(i) for i in nodes_to_delete])
-            return delete_size
-        return 0
+        return self._get_delete_node(user, user_to_last_uses)[0]
+    
+    def _remove_active_node(self, user, user_to_last_uses, active_list):
+        delete_node = self._get_delete_node(user, user_to_last_uses)[1]
+        for i in delete_node:
+            active_list.remove(i)
 
     def _get_last_usr(self, nodes):
         node_to_last_use: Dict[Node, Node] = {}
@@ -438,7 +461,7 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
         mem = 0
         not_contiguous_ops = ['transpose', 'permute']
 
-        if node.op == 'call_function' and 'matmul' in node.name:
+        if node.op == 'call_function' and any(n in node.name for n in ['matmul', 'reshape']):
             for n in node.args:
                 if n in not_contiguous_list:
                     # matmul won't change origin tensor, but create a tmp copy
@@ -463,6 +486,8 @@ def estimate_inference_mem(self, gm: torch.fx.GraphModule):
         act_memory_peak_log = []
         act_memory_after_node_log = []
         not_contiguous_list = []
+        active_node_list = []
+        active_node_list_log = []
         user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
         _delete_free_var_from_last_use(user_to_last_uses)
         for node in gm.graph.nodes:
@@ -470,7 +495,7 @@ def estimate_inference_mem(self, gm: torch.fx.GraphModule):
             if node.op == 'placeholder':
                 act_memory += self._get_meta_node_size(node) / (1024 ** 2)
                 act_memory_peak_log.append(act_memory)
-                act_memory_after_node_log.append(act_memory)
+                active_node_list.append(node.name)
             # skip output
             elif node.op == 'output':
                 continue
@@ -484,8 +509,12 @@ def estimate_inference_mem(self, gm: torch.fx.GraphModule):
                 # delete useless memory
                 act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
                 act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2)
-                act_memory_after_node_log.append(act_memory)
+                # log active node
+                self._add_active_node(node, active_node_list)
+                self._remove_active_node(node, user_to_last_uses, active_node_list)
 
+            act_memory_after_node_log.append(act_memory)
+            active_node_list_log.append(copy.deepcopy(active_node_list))
         print("no chunk")
         self._print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak")
         self._print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after")
@@ -551,7 +580,6 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, en
             # node is an operation, calculate tmp, output node and delete node memory
             else:
                 # forward memory
-                # TODO: permute will create a tmp copy if not contiguous
                 act_memory += self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
                 act_memory += self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
                 # record max act memory
@@ -694,9 +722,11 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     within_chunk_region = False
 
     node_list = list(nodes)
+
     memory_estimator = MemoryEstimator()
     memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
     memory_estimator.estimate_inference_mem(meta_graph)
+
     node_index_tracer = NodeIndexTracer(meta_graph)
     node_index_tracer.trace_node_idx()
 

From d9ca2f898d1fb2a2b76ba663ebb27b9a778bd0ed Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 15 Nov 2022 15:50:50 +0800
Subject: [PATCH 020/503] polish code

---
 chunk_codegen.py | 87 +++++++++++++++---------------------------------
 1 file changed, 27 insertions(+), 60 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index ade986d1e343..77aca8deb81f 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -438,7 +438,7 @@ def _get_delete_node(self, user, user_to_last_uses):
     def _get_delete_node_size(self, user, user_to_last_uses):
         return self._get_delete_node(user, user_to_last_uses)[0]
     
-    def _remove_active_node(self, user, user_to_last_uses, active_list):
+    def _remove_deactive_node(self, user, user_to_last_uses, active_list):
         delete_node = self._get_delete_node(user, user_to_last_uses)[1]
         for i in delete_node:
             active_list.remove(i)
@@ -481,48 +481,6 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
 
         return mem
 
-    def estimate_inference_mem(self, gm: torch.fx.GraphModule):
-        act_memory = 0.0
-        act_memory_peak_log = []
-        act_memory_after_node_log = []
-        not_contiguous_list = []
-        active_node_list = []
-        active_node_list_log = []
-        user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
-        _delete_free_var_from_last_use(user_to_last_uses)
-        for node in gm.graph.nodes:
-            # if node is placeholder, just add the size of the node
-            if node.op == 'placeholder':
-                act_memory += self._get_meta_node_size(node) / (1024 ** 2)
-                act_memory_peak_log.append(act_memory)
-                active_node_list.append(node.name)
-            # skip output
-            elif node.op == 'output':
-                continue
-            # node is an operation, calculate tmp, output node and delete node memory
-            else:
-                # forward memory
-                act_memory += self._get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2)
-                act_memory += self._get_output_node_size(node) / (1024 ** 2)
-                # record max act memory
-                act_memory_peak_log.append(act_memory)
-                # delete useless memory
-                act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
-                act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2)
-                # log active node
-                self._add_active_node(node, active_node_list)
-                self._remove_active_node(node, user_to_last_uses, active_node_list)
-
-            act_memory_after_node_log.append(act_memory)
-            active_node_list_log.append(copy.deepcopy(active_node_list))
-        print("no chunk")
-        self._print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak")
-        self._print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after")
-        
-        param_memory = parameter_size(gm)
-        return act_memory + param_memory, param_memory
-
-
     def _get_chunk_ratio(self, node, chunk_dim, chunk_size):
         shape = node.meta['tensor_meta'].shape
         chunk_ratio = float(chunk_size) / shape[chunk_dim]
@@ -550,25 +508,28 @@ def _print_mem_log(self, log, nodes, title=None):
                 print("")
         print("\n")
 
-
-    def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes):
+    def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=None, end_nodes=None, chunk_dims=None, chunk_sizes=None):
         act_memory = 0.0
         act_memory_peak_log = []
         act_memory_after_node_log = []
+        active_node_list = []
+        active_node_list_log = []
         not_contiguous_list = []
+        node_list = list(gm.graph.nodes)
         user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
         _delete_free_var_from_last_use(user_to_last_uses)
-        within_chunk = False
-        region_idx = 0
+        
+        use_chunk = all(i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes])
+        chunk_within = False
+        chunk_region_idx = 0
         chunk_ratio = 1 # use it to estimate chunk mem
-        node_list = list(gm.graph.nodes)
 
         for idx, node in enumerate(node_list):
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
-            if idx in start_nodes:
-                within_chunk = True
-                chunk_ratio = self._get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx])
-                act_memory += self._get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2)
+            if use_chunk and idx in start_nodes:
+                chunk_within = True
+                chunk_ratio = self._get_chunk_ratio(node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx])
+                act_memory += self._get_output_node_size(node_list[end_nodes[chunk_region_idx]]) / (1024 ** 2)
                 
             # if node is placeholder, just add the size of the node
             if node.op == 'placeholder':
@@ -586,22 +547,28 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, en
                 act_memory_peak_log.append(act_memory)
                 # delete useless memory
                 act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
-                if within_chunk:
+                if chunk_within:
                     act_memory -= self._get_chunk_delete_node_size(
                         node, user_to_last_uses, chunk_ratio, node_list, 
-                        start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2)
+                        start_nodes[chunk_region_idx], end_nodes[chunk_region_idx]) / (1024 ** 2)
                 else:
                     act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
-                
-            if idx in end_nodes:
+
+            # log active node
+            self._add_active_node(node, active_node_list)
+            self._remove_deactive_node(node, user_to_last_uses, active_node_list)
+
+            # if node in chunk end nodes, restore chunk settings
+            if use_chunk and idx in end_nodes:
                 act_memory -= self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
-                within_chunk = False
+                chunk_within = False
                 chunk_ratio = 1
-                region_idx += 1
+                chunk_region_idx += 1
             
             act_memory_after_node_log.append(act_memory)
+            active_node_list_log.append(copy.deepcopy(active_node_list))
 
-        print("chunk")
+        print("with chunk" if use_chunk else "without chunk")
         self._print_mem_log(act_memory_peak_log, node_list, "peak")
         self._print_mem_log(act_memory_after_node_log, node_list, "after")
 
@@ -725,7 +692,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
 
     memory_estimator = MemoryEstimator()
     memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
-    memory_estimator.estimate_inference_mem(meta_graph)
+    memory_estimator.estimate_chunk_inference_mem(meta_graph)
 
     node_index_tracer = NodeIndexTracer(meta_graph)
     node_index_tracer.trace_node_idx()

From 7330d907459a220ebedaeafbbcc7c3cff3c8b1c4 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sun, 4 Dec 2022 17:05:28 +0800
Subject: [PATCH 021/503] add possible region search

---
 chunk_codegen.py | 116 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 109 insertions(+), 7 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 77aca8deb81f..ba83f7fec3be 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -356,7 +356,17 @@ def _assign_view_reshape_index(self, node, node_idx):
                      "idx_to": [new_trace[i] for i in dim_to],
                      "dim_to": dim_to}
         self.idx_view_list.append(view_dict) 
-        
+    
+    def _merge_equal_idx(self):
+        idx_equal = copy.deepcopy(self.idx_trace_equal)
+        idx_equal.reverse()
+        for idx in idx_equal:
+            merge_to = min(idx)
+            merge_from = max(idx)
+            for trace in self.idx_trace_list:
+                if merge_from in trace['idx']:
+                    trace['idx'] = [merge_to if i == merge_from else i for i in trace['idx']]
+    
     def trace_node_idx(self):
         for idx, node in enumerate(self.nodes_list):
             if node.op == 'placeholder':
@@ -396,6 +406,7 @@ def trace_node_idx(self):
                 continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
+        self._merge_equal_idx()
 
 
 class MemoryEstimator(object):
@@ -433,6 +444,8 @@ def _get_delete_node(self, user, user_to_last_uses):
                 for i in range(len(out_node)):
                     if out_node[i][0] > 0:
                         delete_node.append(out_node[i][1][0])
+                    elif nodes_to_delete[i].op == 'placeholder':
+                        delete_node.append(nodes_to_delete[i].name)
         return delete_size, delete_node
     
     def _get_delete_node_size(self, user, user_to_last_uses):
@@ -516,8 +529,9 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non
         active_node_list_log = []
         not_contiguous_list = []
         node_list = list(gm.graph.nodes)
-        user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
-        _delete_free_var_from_last_use(user_to_last_uses)
+        user_to_last_uses = self._get_last_usr(node_list)
+        user_to_last_uses_no_free_var = self._get_last_usr(node_list)
+        _delete_free_var_from_last_use(user_to_last_uses_no_free_var)
         
         use_chunk = all(i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes])
         chunk_within = False
@@ -535,6 +549,7 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non
             if node.op == 'placeholder':
                 act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024 ** 2)
                 act_memory_peak_log.append(act_memory)
+                active_node_list.append(node.name)
             # skip output
             elif node.op == 'output':
                 continue
@@ -549,10 +564,10 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non
                 act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
                 if chunk_within:
                     act_memory -= self._get_chunk_delete_node_size(
-                        node, user_to_last_uses, chunk_ratio, node_list, 
+                        node, user_to_last_uses_no_free_var, chunk_ratio, node_list, 
                         start_nodes[chunk_region_idx], end_nodes[chunk_region_idx]) / (1024 ** 2)
                 else:
-                    act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
+                    act_memory -= self._get_delete_node_size(node, user_to_last_uses_no_free_var) / (1024 ** 2)
 
             # log active node
             self._add_active_node(node, active_node_list)
@@ -572,8 +587,92 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non
         self._print_mem_log(act_memory_peak_log, node_list, "peak")
         self._print_mem_log(act_memory_after_node_log, node_list, "after")
 
-        param_memory = parameter_size(gm)
-        return act_memory + param_memory, param_memory
+        # param_memory = parameter_size(gm)
+        # all_memory = act_memory + param_memory
+        return act_memory_peak_log, act_memory_after_node_log, active_node_list_log
+
+
+class ChunkRegionSearch(object):
+    def __init__(self, gm) -> None:
+        self.gm = gm
+        self.node_list = list(gm.graph.nodes)
+        self.memory_estimator = MemoryEstimator()
+        self.index_tracer = NodeIndexTracer(gm)
+        self.index_tracer.trace_node_idx()
+
+    def _find_peak_node(self, mem_peak):
+        max_value = max(mem_peak)
+        max_idx = [mem_peak.index(max_value)]
+        return max_idx
+    
+    def _get_free_var(self):
+        free_var_idx = []
+        for idx, n in enumerate(self.node_list):
+            if n.op == 'placeholder':
+                free_var_idx.append(idx)
+        return free_var_idx
+    
+    def _get_min_free_var(self, active_node_list, free_vars):
+        min_len = 999
+        for idx, n in enumerate(active_node_list):
+            if idx in free_vars:
+                continue
+            if len(n) < min_len:
+                min_len = len(n)
+        return min_len
+    
+    def _search_max_chunk_region(self, active_node, peak_node):
+        free_vars = self._get_free_var()
+        min_var = self._get_min_free_var(active_node, free_vars)
+        
+        # from peak_node to free_var
+        chunk_region_start = None
+        for i in range(peak_node, -1, -1):
+            if len(active_node[i]) == min_var:
+                chunk_region_start = i + 1
+                break
+            if i in free_vars or i == 0:
+                raise RuntimeError()
+        # from peak_node to len-2
+        chunk_region_end = None
+        for i in range(peak_node, len(active_node) - 1):
+            if len(active_node[i]) == min_var:
+                chunk_region_end = i - 1
+                break
+            if i in free_vars or i == 0:
+                raise RuntimeError()
+        return chunk_region_start, chunk_region_end
+    
+    def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
+        possible_chunk_region = []
+        for before_idx in range(max_chunk_region[0], peak_node):
+            for after_idx in range(peak_node, max_chunk_region[1]):
+                # skip non compute nodes
+                if any(op in ['placeholder', 'get_attr', 'output'] for op in 
+                       [self.node_list[before_idx].op, self.node_list[after_idx].op]):
+                    continue
+                if any(any(i in name for i in ['getitem', 'getattr']) for name in 
+                       [self.node_list[before_idx].name, self.node_list[after_idx].name]):
+                    continue
+                
+                # select free dim
+                before_trace = self.index_tracer.idx_trace_list[before_idx]
+                after_trace = self.index_tracer.idx_trace_list[after_idx]
+                free_dim = []
+                for i in range(min(len(before_trace['idx']), len(after_trace['idx']))):
+                   if (before_trace['idx'][i] == after_trace['idx'][i] and 
+                       before_trace['idx'][i] not in before_trace['compute'] and
+                       after_trace['idx'][i] not in after_trace['compute']):
+                       free_dim.append(i)
+                possible_chunk_region.append({'region': (before_idx, after_idx), 'dim': free_dim})
+        return possible_chunk_region
+    
+    def search_region(self):
+        mem_peak, mem_after, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm)
+        peak_nodes = self._find_peak_node(mem_peak)
+        for idx, peak_node in enumerate(peak_nodes):
+            max_chunk_region = self._search_max_chunk_region(active_node, peak_node)
+            possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node)
 
 
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
@@ -696,6 +795,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
 
     node_index_tracer = NodeIndexTracer(meta_graph)
     node_index_tracer.trace_node_idx()
+    
+    chunk_region_search = ChunkRegionSearch(meta_graph)
+    chunk_region_search.search_region()
 
     # find the input and output var names for each offload region
     for idx, (start, end) in enumerate(chunk_regions):

From 3b7d6712065b65d9c93feb64a488739e4483981f Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 6 Dec 2022 11:08:39 +0800
Subject: [PATCH 022/503] finish region search loop

---
 chunk_codegen.py     | 152 ++++++++++++++++++++++++++++++++-----------
 chunk_codegen_run.py |   4 +-
 2 files changed, 116 insertions(+), 40 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index ba83f7fec3be..47cda0f8ed20 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -21,7 +21,7 @@ class NodeIndexTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
         self.nodes_list = list(gm.graph.nodes)
-        self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] 
+        self.idx_trace_list = [{'idx': [], 'compute': {}} for _ in range(len(self.nodes_list))] 
         self.idx_trace_equal = []
         self.idx_view_list = []
         self.idx_count = -1
@@ -48,9 +48,12 @@ def _inherit_computation(self, node_from, node_to):
         """        
         _, compute_from = self._find_trace_from_node(node_from)
         idx_to, compute_to = self._find_trace_from_node(node_to)
-        for i in compute_from:
-            if i in idx_to and i not in compute_to:
-                compute_to.append(i)
+        for k, v in compute_from.items():
+            if k in idx_to:
+                if k in compute_to:
+                    compute_to[k].extend(v)
+                else:
+                    compute_to[k] = copy.deepcopy(v)
     
     def _mark_idx_equal(self, idx1, idx2):
         """
@@ -77,7 +80,9 @@ def _mark_computation(self, node, idx, dim):
         for d in dim:
             cur_idx = input_node_idx_trace[d]
             if cur_idx not in self.idx_trace_list[idx]['compute']:
-                self.idx_trace_list[idx]['compute'].append(cur_idx)
+                self.idx_trace_list[idx]['compute'][cur_idx] = [idx]
+            else:
+                self.idx_trace_list[idx]['compute'][cur_idx].append(idx)
     
     def _find_trace_from_node(self, node):
         """
@@ -357,6 +362,11 @@ def _assign_view_reshape_index(self, node, node_idx):
                      "dim_to": dim_to}
         self.idx_view_list.append(view_dict) 
     
+    def _remove_duplicate_compute(self):
+        for i in self.idx_trace_list:
+            for k, v in i['compute'].items():
+                i['compute'][k] = list(set(v))
+    
     def _merge_equal_idx(self):
         idx_equal = copy.deepcopy(self.idx_trace_equal)
         idx_equal.reverse()
@@ -406,6 +416,8 @@ def trace_node_idx(self):
                 continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
+            
+        self._remove_duplicate_compute()
         self._merge_equal_idx()
 
 
@@ -521,6 +533,19 @@ def _print_mem_log(self, log, nodes, title=None):
                 print("")
         print("\n")
 
+    def _print_compute_op_mem_log(self, log, nodes, title=None):
+        if title:
+            print(title)
+        for idx, (l, n) in enumerate(zip(log, nodes)):
+            if n.op in ['placeholder', 'get_attr', 'output']:
+                continue
+            if any(i in n.name for i in ['getitem', 'getattr']):
+                continue
+            print("%s:%.2f \t" % (n.name, l), end='')
+            if (idx + 1) % 3 == 0:
+                print("")
+        print("\n")
+    
     def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=None, end_nodes=None, chunk_dims=None, chunk_sizes=None):
         act_memory = 0.0
         act_memory_peak_log = []
@@ -584,8 +609,10 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non
             active_node_list_log.append(copy.deepcopy(active_node_list))
 
         print("with chunk" if use_chunk else "without chunk")
-        self._print_mem_log(act_memory_peak_log, node_list, "peak")
-        self._print_mem_log(act_memory_after_node_log, node_list, "after")
+        # self._print_mem_log(act_memory_peak_log, node_list, "peak")
+        # self._print_mem_log(act_memory_after_node_log, node_list, "after")
+        self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
+        self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after")
 
         # param_memory = parameter_size(gm)
         # all_memory = act_memory + param_memory
@@ -602,7 +629,7 @@ def __init__(self, gm) -> None:
 
     def _find_peak_node(self, mem_peak):
         max_value = max(mem_peak)
-        max_idx = [mem_peak.index(max_value)]
+        max_idx = mem_peak.index(max_value)
         return max_idx
     
     def _get_free_var(self):
@@ -635,18 +662,35 @@ def _search_max_chunk_region(self, active_node, peak_node):
                 raise RuntimeError()
         # from peak_node to len-2
         chunk_region_end = None
-        for i in range(peak_node, len(active_node) - 1):
+        for i in range(peak_node, len(active_node)):
             if len(active_node[i]) == min_var:
-                chunk_region_end = i - 1
+                chunk_region_end = i
                 break
             if i in free_vars or i == 0:
                 raise RuntimeError()
         return chunk_region_start, chunk_region_end
     
+    def _not_compute(self, trace, chunk_range, dim_idx):
+        if trace['idx'][dim_idx] not in trace['compute']:
+            return True
+        if trace['idx'][dim_idx] in trace['compute'] and \
+            all(i < chunk_range[0] or i > chunk_range[1] for i in trace['compute'][trace['idx'][dim_idx]]):
+            return True
+        return False
+    
     def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
         possible_chunk_region = []
+        output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
+        input_trace = []
+        for i, n in enumerate(self.node_list):
+            if len(n.args) > 0 and n.op != 'output':
+                input_idx = _find_idx_by_name(n.args[0].name, self.node_list)
+                input_trace.append(output_trace[input_idx])
+            else:
+                input_trace.append(None)
+
         for before_idx in range(max_chunk_region[0], peak_node):
-            for after_idx in range(peak_node, max_chunk_region[1]):
+            for after_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
                 if any(op in ['placeholder', 'get_attr', 'output'] for op in 
                        [self.node_list[before_idx].op, self.node_list[after_idx].op]):
@@ -656,23 +700,59 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
                     continue
                 
                 # select free dim
-                before_trace = self.index_tracer.idx_trace_list[before_idx]
-                after_trace = self.index_tracer.idx_trace_list[after_idx]
+                before_trace = input_trace[before_idx]
+                after_trace = output_trace[after_idx]
                 free_dim = []
                 for i in range(min(len(before_trace['idx']), len(after_trace['idx']))):
                    if (before_trace['idx'][i] == after_trace['idx'][i] and 
-                       before_trace['idx'][i] not in before_trace['compute'] and
-                       after_trace['idx'][i] not in after_trace['compute']):
+                       self._not_compute(before_trace, (before_idx, after_idx), i) and
+                       self._not_compute(after_trace, (before_idx, after_idx), i) and
+                       self.node_list[after_idx].meta['tensor_meta'].shape[i] != 1):
                        free_dim.append(i)
                 possible_chunk_region.append({'region': (before_idx, after_idx), 'dim': free_dim})
         return possible_chunk_region
     
+    def _search_best_chunk_region(self, possible_chunk_regions):
+        max_region_range = 0
+        best_regions = None
+        for i in possible_chunk_regions:
+            if i['region'][1] - i['region'][0] > max_region_range:
+                best_regions = i
+                max_region_range = i['region'][1] - i['region'][0]
+        return best_regions
+    
+    def _step_search(self, peak_node, active_node):
+        max_chunk_region = self._search_max_chunk_region(active_node, peak_node)
+        possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node)
+        best_chunk_region = self._search_best_chunk_region(possible_chunk_regions)
+        return best_chunk_region
+    
+    def _stop_search(self, init_mem_peak, mem_peak):
+        sorted_init_mem_peak = sorted(init_mem_peak)
+        if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]:
+            return True
+        return False
+    
     def search_region(self):
-        mem_peak, mem_after, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm)
-        peak_nodes = self._find_peak_node(mem_peak)
-        for idx, peak_node in enumerate(peak_nodes):
-            max_chunk_region = self._search_max_chunk_region(active_node, peak_node)
-            possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node)
+        chunk_regions = []
+        init_mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm)
+        mem_peak = init_mem_peak
+        
+        while True:
+            peak_node = self._find_peak_node(mem_peak)
+            chunk_region = self._step_search(peak_node, active_node)
+            if chunk_region is None or len(chunk_region['dim']) == 0:
+                break
+            
+            chunk_regions.append(chunk_region)
+            mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem(
+                self.gm, [i['region'][0] for i in chunk_regions], 
+                [i['region'][1] for i in chunk_regions], [i['dim'][0] for i in chunk_regions], [1] * len(chunk_regions))
+            
+            if self._stop_search(init_mem_peak, mem_peak):
+                break
+
+        return chunk_regions
 
 
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
@@ -696,11 +776,12 @@ def _get_first_non_single_dim(shape):
     raise RuntimeError("can not get first non single dim for shape", shape)
 
 
-def _gen_loop_start(chunk_input_meta, chunk_output, chunk_size=2):
+def _gen_loop_start(chunk_input_meta, chunk_output, chunk_dim, chunk_size=2):
     if len(chunk_input_meta) == 1:
         node = chunk_input_meta[0]
         node_shape = node.meta['tensor_meta'].shape
-        chunk_dim = _get_first_non_single_dim(node_shape)
+        free_shape = [node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))]
+        chunk_dim = _get_first_non_single_dim(free_shape)
         chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape)
         out_shape = str(list(chunk_output.meta['tensor_meta'].shape))
         
@@ -713,12 +794,13 @@ def _gen_loop_start(chunk_input_meta, chunk_output, chunk_size=2):
     return context
 
 
-def _gen_loop_end(chunk_outputs, chunk_inputs, node_list):
+def _gen_loop_end(chunk_outputs, chunk_inputs, node_list, chunk_dim):
     chunk_inputs_name = chunk_inputs[0].name
     chunk_outputs_name = chunk_outputs.name
     chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list)
     chunk_output_shape = chunk_outputs.meta['tensor_meta'].shape
-    chunk_dim = _get_first_non_single_dim(chunk_output_shape)
+    free_shape = [chunk_output_shape[i] if i in chunk_dim else 1 for i in range(len(chunk_output_shape))]
+    chunk_dim = _get_first_non_single_dim(free_shape)
     chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape)
     context = "    chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name)
 
@@ -780,7 +862,11 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     """
 
     # find the offload regions
-    chunk_regions = [(58, 62)]
+    chunk_region_search = ChunkRegionSearch(meta_graph)
+    chunk_search = chunk_region_search.search_region()
+    chunk_regions = [i['region'] for i in chunk_search]
+    chunk_dims = [i['dim'] for i in chunk_search]
+    
     chunk_starts = [item[0] for item in chunk_regions]
     chunk_ends = [item[1] for item in chunk_regions]
     chunk_inputs = []
@@ -789,16 +875,6 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
 
     node_list = list(nodes)
 
-    memory_estimator = MemoryEstimator()
-    memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
-    memory_estimator.estimate_chunk_inference_mem(meta_graph)
-
-    node_index_tracer = NodeIndexTracer(meta_graph)
-    node_index_tracer.trace_node_idx()
-    
-    chunk_region_search = ChunkRegionSearch(meta_graph)
-    chunk_region_search.search_region()
-
     # find the input and output var names for each offload region
     for idx, (start, end) in enumerate(chunk_regions):
         offload_node_list = node_list[start:end + 1]
@@ -824,13 +900,13 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
                 
             # add for loop
             chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]]
-            body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]]))
+            body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]], chunk_dims[region_idx]))
 
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
             if node_idx in chunk_starts:
-                body[-1] = body[-1].replace("("+ chunk_inputs[region_idx][0].name +")", '(chunk_tensor)')
+                body[-1] = body[-1].replace(chunk_inputs[region_idx][0].name, 'chunk_tensor')
             body[-1] = '    ' + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
 
@@ -840,7 +916,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
                 delete_unused_value_func(node, body, chunk_inputs_names)
 
         if node_idx in chunk_ends:
-            body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list))
+            body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx]))
             within_chunk_region = False
             region_idx += 1
 
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 39363a80abcb..88c734903392 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -45,8 +45,8 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     with torch.no_grad():
         non_fx_out = model(node, pair)
         fx_out = gm(node, pair)
-    assert torch.equal(non_fx_out[0], fx_out[0]), "fx_out doesn't comply with original output"
-    assert torch.equal(non_fx_out[1], fx_out[1]), "fx_out doesn't comply with original output"
+    assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-6), "fx_out doesn't comply with original output"
+    assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-6), "fx_out doesn't comply with original output"
 
     # test barckward
     # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum()

From f24c418bb04a1e65eaa0f6cf8aada466deca2598 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 6 Dec 2022 16:29:07 +0800
Subject: [PATCH 023/503] finish chunk define

---
 chunk_codegen.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 47cda0f8ed20..6740cd44ab6a 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -827,7 +827,7 @@ def _find_input_and_output_nodes(nodes: List[Node]):
     for node in nodes:
         for input_node in node._input_nodes.keys():
             node_repr = repr(input_node)
-            if input_node not in nodes and node_repr not in input_nodes:
+            if input_node not in nodes and input_node not in input_nodes:
                 input_nodes.append(input_node)
 
     # if a node has a user node which is not in the node list
@@ -835,7 +835,7 @@ def _find_input_and_output_nodes(nodes: List[Node]):
     for node in nodes:
         for output_node in node.users.keys():
             node_repr = repr(node)
-            if output_node not in nodes and node_repr not in output_nodes:
+            if output_node not in nodes and output_node not in output_nodes:
                 output_nodes.append(output_node)
 
     return input_nodes, output_nodes
@@ -848,6 +848,16 @@ def _find_idx_by_name(name, nodes_list):
     raise RuntimeError("name %s not found in node list" % name)
 
 
+def _replace_name(context, name_from, name_to):
+    patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ",")]
+    for p in patterns:
+        source = p[0] + name_from + p[1]
+        target = p[0] + name_to + p[1]
+        if source in context:
+            context = context.replace(source, target)
+    return context
+
+
 def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph):
     """Emit code with nested activation checkpoint
     When we detect some of the node.activation_checkpoint is a List, we will use
@@ -905,8 +915,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
-            if node_idx in chunk_starts:
-                body[-1] = body[-1].replace(chunk_inputs[region_idx][0].name, 'chunk_tensor')
+            body[-1] = _replace_name(body[-1], chunk_inputs[region_idx][0].name, 'chunk_tensor')
             body[-1] = '    ' + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
 

From a9d64377bb237f34fdafaeec2abcfdfb6e080091 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 6 Dec 2022 17:34:24 +0800
Subject: [PATCH 024/503] support new op

---
 chunk_codegen.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 6740cd44ab6a..2dc44d381d85 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -200,8 +200,12 @@ def _assign_linear_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """  
-        input_node, weight, bias = node.args
+        """
+        if len(node.args) == 2:
+            input_node, weight = node.args
+            bias = None
+        else:
+            input_node, weight, bias = node.args
         input_node_idx_trace = self._find_idx_trace_from_node(input_node)
         weight_idx_trace = self._find_idx_trace_from_node(weight)
         
@@ -284,6 +288,53 @@ def _assign_softmax_index(self, node, idx):
         self._assign_index_as_input(node, idx)
         self._inherit_computation(node.args[0], node)
         self._mark_computation(node, idx, [node.kwargs['dim']])
+        
+    def _assign_unsqueeze_index(self, node, node_idx):
+        """
+        Assign index for unsqueeze op.
+        1. assign new index for unsqueeze dim
+
+        Args:
+            node (node)
+            node_idx (int)
+        """ 
+        self._assign_index_as_input(node, node_idx)
+        self._inherit_computation(node.args[0], node)
+        self.idx_trace_list[node_idx]['idx'].insert(node.args[1], self._add_index())
+        
+    def _assign_dropout_index(self, node, node_idx):
+        """
+        Assign index for unsqueeze op.
+        1. assign new index for unsqueeze dim
+
+        Args:
+            node (node)
+            node_idx (int)
+        """ 
+        self._assign_index_as_input(node, node_idx)
+
+        
+    def _assign_ones_like_index(self, node, node_idx):
+        """
+        Assign index for oneslike op.
+        1. assign new index for all dim
+
+        Args:
+            node (node)
+            node_idx (int)
+        """ 
+        self._assign_all_index(node, node_idx)
+        
+    def _assign_to_index(self, node, node_idx):
+        """
+        Assign index for to op.
+        1. assign new index for all dim
+
+        Args:
+            node (node)
+            node_idx (int)
+        """ 
+        self._assign_index_as_input(node, node_idx)
 
     def _assign_view_reshape_index(self, node, node_idx):
         """
@@ -388,6 +439,10 @@ def trace_node_idx(self):
                     self._assign_permute_index(node, idx)
                 elif 'view' in node.name or 'reshape' in node.name:
                     self._assign_view_reshape_index(node, idx)
+                elif 'unsqueeze' in node.name:
+                    self._assign_unsqueeze_index(node, idx)
+                elif 'to' in node.name:
+                    self._assign_to_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "method not implemented yet!")
             elif node.op == 'call_function':
@@ -399,6 +454,10 @@ def trace_node_idx(self):
                     self._assign_softmax_index(node, idx)
                 elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']):
                     self._assign_elementwise_index(node, idx)
+                elif 'ones_like' in node.name:
+                    self._assign_ones_like_index(node, idx)
+                elif 'dropout' in node.name:
+                    self._assign_dropout_index(node, idx)
                 elif 'getattr' in node.name:
                     continue # get attr like shape
                 elif 'getitem' in node.name:

From 6d99994a7afbfe290bcd798804b4e1e7e76d1281 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 6 Dec 2022 17:35:27 +0800
Subject: [PATCH 025/503] rename index tracer

---
 chunk_codegen.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 2dc44d381d85..0f97f94a9d21 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -17,7 +17,7 @@ def _delete_free_var_from_last_use(user_to_last_uses):
                 user_to_last_uses[key].remove(n)
 
 
-class NodeIndexTracer(object):
+class IndexTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
         self.nodes_list = list(gm.graph.nodes)
@@ -683,7 +683,7 @@ def __init__(self, gm) -> None:
         self.gm = gm
         self.node_list = list(gm.graph.nodes)
         self.memory_estimator = MemoryEstimator()
-        self.index_tracer = NodeIndexTracer(gm)
+        self.index_tracer = IndexTracer(gm)
         self.index_tracer.trace_node_idx()
 
     def _find_peak_node(self, mem_peak):

From 2b4ebcc27839b34c015c4fb79e69abd721b83ee6 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 8 Dec 2022 15:16:10 +0800
Subject: [PATCH 026/503] finish codegen on msa

---
 chunk_codegen.py | 212 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 188 insertions(+), 24 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 0f97f94a9d21..1e8305ba395b 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -17,6 +17,121 @@ def _delete_free_var_from_last_use(user_to_last_uses):
                 user_to_last_uses[key].remove(n)
 
 
+class FlowTracer(object):
+    def __init__(self, gm) -> None:
+        self.gm = gm
+        self.nodes_list = list(gm.graph.nodes)
+        self.flow_trace = {}
+
+    def _add_trace(self, name):
+        self.flow_trace[name] = []
+    
+    def _add_node(self, trace_name, node):
+        self.flow_trace[trace_name].append({'node': node, 'inside_depend': [], 'outside_depend': []})
+    
+    def _add_inside_depend(self, flow_name, node, inside_depend_node):
+        for i in self.flow_trace[flow_name]:
+            if i['node'] == node:
+                i['inside_depend'].append(inside_depend_node)
+                return
+        raise RuntimeError("node not found")
+                
+    def _add_outside_depend(self, flow_name, node, outside_depend_node, outside_depend_trace):
+        for i in self.flow_trace[flow_name]:
+            if i['node'] == node:
+                i['outside_depend'].append({outside_depend_trace: outside_depend_node}) 
+                return
+        raise RuntimeError("node not found")
+
+    def _init_trace(self):
+        for i in self.nodes_list:
+            if i.op == 'placeholder':
+                self._add_trace(i.name)
+                self._add_node(i.name, i)
+
+    def _is_non_compute_node(self, node):
+        if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \
+            any(i in node.name for i in ['getitem', 'getattr']):
+            return True
+        return False
+    
+    def _is_non_compute_node_except_placeholder(self, node):
+        if any(i in node.op for i in ['get_attr', 'output']) or \
+            any(i in node.name for i in ['getitem', 'getattr']):
+            return True
+        return False
+    
+    def _find_flow_for_node(self, node):
+        if type(self.nodes_list[0]) != type(node):
+            return None
+        if self._is_non_compute_node_except_placeholder(node):
+            return None
+        for name, trace in self.flow_trace.items():
+            for i in trace:
+                if node == i['node']:
+                    return name
+        if any(i in node.name for i in ["ones_like"]):
+            self._add_trace(node.name)
+            self._add_node(node.name, node)
+            return node.name
+        raise RuntimeError("node not found")
+    
+    def _find_first_valid_flow(self, flow):
+        for i in flow:
+            if i is not None:
+                return i
+        raise RuntimeError("invalid flow")
+    
+    def find_node_flow(self, node):
+        for name, trace in self.flow_trace.items():
+            for i in trace:
+                if node == i['node']:
+                    return name, i
+        raise RuntimeError("invalid node")
+        
+    def get_flow_mix(self, node):
+        if self._is_non_compute_node(node):
+            return None
+        _, node_trace = self.find_node_flow(node)
+        if len(node_trace['outside_depend']) == 0:
+            return None
+        elif len(node_trace['outside_depend']) > 1:
+            raise NotImplementedError
+        vars = list(node_trace['outside_depend'][0].values())[0]
+        return vars
+    
+    def get_same_flow_node(self, node_list, node):
+        name, _ = self.find_node_flow(node)
+        result = []
+        for i in self.flow_trace[name]:
+            if i['node'] in node_list:
+                result.append(i['node'])
+        return result
+        
+    def trace_flow(self):    
+        # init trace
+        self._init_trace()
+
+        for node in self.nodes_list:
+            # skip if non compute node
+            if all(type(arg) != type(node) or self._is_non_compute_node_except_placeholder(arg) for arg in node.args) \
+                or self._is_non_compute_node(node):
+                continue
+
+            node_input_flows = [self._find_flow_for_node(arg) for arg in node.args]
+
+            node_domin_flow = self._find_first_valid_flow(node_input_flows)
+            self._add_node(node_domin_flow, node)
+            for node_input_flow, arg in zip(node_input_flows, node.args):
+                if node_input_flow is None:
+                    continue
+                elif node_input_flow == node_domin_flow:
+                    self._add_inside_depend(node_domin_flow, node, arg)
+                else:
+                    self._add_outside_depend(node_domin_flow, node, arg, node_input_flow)
+        return self.flow_trace
+
+
 class IndexTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
@@ -428,7 +543,7 @@ def _merge_equal_idx(self):
                 if merge_from in trace['idx']:
                     trace['idx'] = [merge_to if i == merge_from else i for i in trace['idx']]
     
-    def trace_node_idx(self):
+    def trace_index(self):
         for idx, node in enumerate(self.nodes_list):
             if node.op == 'placeholder':
                 self._assign_all_index(node, idx)
@@ -684,7 +799,9 @@ def __init__(self, gm) -> None:
         self.node_list = list(gm.graph.nodes)
         self.memory_estimator = MemoryEstimator()
         self.index_tracer = IndexTracer(gm)
-        self.index_tracer.trace_node_idx()
+        self.index_tracer.trace_index()
+        self.flow_tracer = FlowTracer(gm)
+        self.flow_tracer.trace_flow()
 
     def _find_peak_node(self, mem_peak):
         max_value = max(mem_peak)
@@ -729,7 +846,7 @@ def _search_max_chunk_region(self, active_node, peak_node):
                 raise RuntimeError()
         return chunk_region_start, chunk_region_end
     
-    def _not_compute(self, trace, chunk_range, dim_idx):
+    def _is_not_compute(self, trace, chunk_range, dim_idx):
         if trace['idx'][dim_idx] not in trace['compute']:
             return True
         if trace['idx'][dim_idx] in trace['compute'] and \
@@ -737,6 +854,56 @@ def _not_compute(self, trace, chunk_range, dim_idx):
             return True
         return False
     
+    def _detect_flow(self, before_trace, after_trace, start_idx, end_idx, dim_idx):
+        inputs, outputs = _find_input_and_output_nodes(self.node_list[start_idx:end_idx + 1])
+        chunk_info = {'inputs': inputs, 'outputs': outputs}
+        flow_flag = False
+        
+        for idx in range(start_idx, end_idx + 1):
+            node = self.node_list[idx]
+            mix_flow_var = self.flow_tracer.get_flow_mix(node)
+            if mix_flow_var is None:
+                continue
+            
+            # if there is a flow mix, op must be in [mul, add, div, matmul]
+            # element-wise op requires dim to be equal in every dim
+            if any(n in node.name for n in ['mul', 'add']):
+                for i in node.args:
+                    if type(i) == type(mix_flow_var) and i != mix_flow_var:
+                        main_flow_var = i
+                # if mix flow is a broadcast in chunk dim, 
+                # TODO need to move that flow out of the chunk
+                if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1:
+                    flow_flag = True
+                    for i in self.flow_tracer.get_same_flow_node(chunk_info['inputs'], mix_flow_var):
+                        chunk_info['inputs'].remove(i)
+                # else, we need to chunk mix var as well
+                else:
+                    # TODO chunk another value
+                    flow_flag = False
+                    break
+            else:
+                raise NotImplementedError("%s not implemented" % node.name)
+        return flow_flag, chunk_info
+    
+    def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
+        before_trace = input_trace[start_idx]
+        after_trace = output_trace[end_idx]
+        free_dim = []
+        chunk_infos = []
+        for i in range(min(len(before_trace['idx']), len(after_trace['idx']))):
+            if not (before_trace['idx'][i] == after_trace['idx'][i] and 
+                self._is_not_compute(before_trace, (start_idx, end_idx), i) and
+                self._is_not_compute(after_trace, (start_idx, end_idx), i) and
+                self.node_list[end_idx].meta['tensor_meta'].shape[i] != 1):
+                continue
+            flow_flag, chunk_info = self._detect_flow(before_trace, after_trace, start_idx, end_idx, i)
+            if flow_flag == None:
+                continue
+            chunk_infos.append(chunk_info)
+            free_dim.append(i)
+        return free_dim, chunk_infos
+
     def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
         possible_chunk_region = []
         output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
@@ -748,27 +915,22 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
             else:
                 input_trace.append(None)
 
-        for before_idx in range(max_chunk_region[0], peak_node):
-            for after_idx in range(peak_node, max_chunk_region[1] + 1):
+        for start_idx in range(max_chunk_region[0], peak_node):
+            for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
                 if any(op in ['placeholder', 'get_attr', 'output'] for op in 
-                       [self.node_list[before_idx].op, self.node_list[after_idx].op]):
+                       [self.node_list[start_idx].op, self.node_list[end_idx].op]):
                     continue
                 if any(any(i in name for i in ['getitem', 'getattr']) for name in 
-                       [self.node_list[before_idx].name, self.node_list[after_idx].name]):
+                       [self.node_list[start_idx].name, self.node_list[end_idx].name]):
                     continue
                 
                 # select free dim
-                before_trace = input_trace[before_idx]
-                after_trace = output_trace[after_idx]
-                free_dim = []
-                for i in range(min(len(before_trace['idx']), len(after_trace['idx']))):
-                   if (before_trace['idx'][i] == after_trace['idx'][i] and 
-                       self._not_compute(before_trace, (before_idx, after_idx), i) and
-                       self._not_compute(after_trace, (before_idx, after_idx), i) and
-                       self.node_list[after_idx].meta['tensor_meta'].shape[i] != 1):
-                       free_dim.append(i)
-                possible_chunk_region.append({'region': (before_idx, after_idx), 'dim': free_dim})
+                free_dim, chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx)
+                if len(free_dim) > 0:
+                    free_dim = [free_dim[0]]
+                    chunk_info = [chunk_info[0]]
+                possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info})
         return possible_chunk_region
     
     def _search_best_chunk_region(self, possible_chunk_regions):
@@ -935,21 +1097,23 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     chunk_search = chunk_region_search.search_region()
     chunk_regions = [i['region'] for i in chunk_search]
     chunk_dims = [i['dim'] for i in chunk_search]
+    chunk_infos = [i['chunk_info'] for i in chunk_search]
     
     chunk_starts = [item[0] for item in chunk_regions]
     chunk_ends = [item[1] for item in chunk_regions]
-    chunk_inputs = []
-    chunk_outputs = []
+    chunk_inputs = [[j['inputs'][0] for j in i] for i in chunk_infos]
+    chunk_outputs = [[j['outputs'][0] for j in i] for i in chunk_infos]
     within_chunk_region = False
 
     node_list = list(nodes)
 
     # find the input and output var names for each offload region
-    for idx, (start, end) in enumerate(chunk_regions):
-        offload_node_list = node_list[start:end + 1]
-        inputs, outputs = _find_input_and_output_nodes(offload_node_list)
-        chunk_inputs.append(inputs)
-        chunk_outputs.append(outputs)
+    # for idx, (start, end) in enumerate(chunk_regions):
+    #     offload_node_list = node_list[start:end + 1]
+    #     inputs, outputs = _find_input_and_output_nodes(offload_node_list)
+    #     chunk_inputs.append(inputs)
+    #     chunk_outputs.append(outputs)
+    
     chunk_inputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs]
     chunk_outputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs]
     chunk_inputs_names = []

From 979e61db92a95b8bc2904c5b38264f24060be310 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 9 Dec 2022 17:39:02 +0800
Subject: [PATCH 027/503] redesign index tracer, add source and change compute

---
 chunk_codegen.py | 310 +++++++++++++++++++++++++++++++----------------
 1 file changed, 206 insertions(+), 104 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 1e8305ba395b..ce7d849178d1 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -16,6 +16,11 @@ def _delete_free_var_from_last_use(user_to_last_uses):
             if n.op == 'placeholder':
                 user_to_last_uses[key].remove(n)
 
+def _get_node_shape(node):
+    if hasattr(node.meta['tensor_meta'], "shape"):
+        return node.meta['tensor_meta'].shape
+    return None
+
 
 class FlowTracer(object):
     def __init__(self, gm) -> None:
@@ -136,11 +141,25 @@ class IndexTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
         self.nodes_list = list(gm.graph.nodes)
-        self.idx_trace_list = [{'idx': [], 'compute': {}} for _ in range(len(self.nodes_list))] 
+        self.idx_trace_list = self._init_idx_trace_list()
         self.idx_trace_equal = []
         self.idx_view_list = []
         self.idx_count = -1
 
+    def _init_idx_trace_list(self):
+        idx_trace_list = []
+        for n in self.nodes_list:
+            if _get_node_shape(n) != None:            
+                cur_trace = {
+                    'idx': [None for _ in range(len(_get_node_shape(n)))],
+                    'compute': [[] for _ in range(len(_get_node_shape(n)))],
+                    'source': [[] for _ in range(len(_get_node_shape(n)))],
+                }
+            else:
+                cur_trace = {'idx': [], 'compute': [], 'source': []}
+            idx_trace_list.append(cur_trace)
+        return idx_trace_list
+    
     def _add_index(self):
         """
         Update the count and return it. To record the idx number.
@@ -150,35 +169,81 @@ def _add_index(self):
         """        
         self.idx_count += 1
         return self.idx_count
-
-    def _inherit_computation(self, node_from, node_to):
-        """
-        Inherit computed dim from node_from to node_to.
-        If a dim in node_from is marked as computed and exists in node_to,
-        still mark it as computed in node_to.
-
-        Args:
-            node_from (node): node to be inherited
-            node_to (node): new node to inherit
-        """        
-        _, compute_from = self._find_trace_from_node(node_from)
-        idx_to, compute_to = self._find_trace_from_node(node_to)
-        for k, v in compute_from.items():
-            if k in idx_to:
-                if k in compute_to:
-                    compute_to[k].extend(v)
-                else:
-                    compute_to[k] = copy.deepcopy(v)
     
-    def _mark_idx_equal(self, idx1, idx2):
+    def _del_dim(self, idx, dim_idx):
+        self.idx_trace_list[idx]['idx'].pop(dim_idx)
+        self.idx_trace_list[idx]['compute'].pop(dim_idx)
+        self.idx_trace_list[idx]['source'].pop(dim_idx)
+    
+    def _add_dim(self, idx, dim_idx):
+        self.idx_trace_list[idx]['idx'].insert(dim_idx, self._add_index())
+        self.idx_trace_list[idx]['compute'].insert(dim_idx, [])
+        self.idx_trace_list[idx]['source'].insert(dim_idx, [])
+    
+    def _transform_index(self, node, node_dim):
+        node_idx = self._find_idx_trace_from_node(node)
+        dims = list(range(len(node_idx)))
+        return dims[node_dim]
+    
+    def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim):
+        node_from_dim = self._transform_index(node_from, node_from_dim)
+        node_to_dim = self._transform_index(node_to, node_to_dim)
+        node_from_trace = self._find_trace_from_node(node_from)
+        node_to_trace = self._find_trace_from_node(node_to)
+        node_to_trace['idx'][node_to_dim] = node_from_trace['idx'][node_from_dim]
+        node_to_trace['compute'][node_to_dim] = copy.deepcopy(node_from_trace['compute'][node_from_dim])
+        node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list)
+        node_to_trace['source'][node_to_dim] = []
+        node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim})
+        node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim])
+    
+    def _inherit_all_computation(self, node_from, node_to):
+        node_from_compute = self._find_compute_trace_from_node(node_from)
+        node_to_compute = self._find_compute_trace_from_node(node_to)
+        assert len(node_from_compute) == len(node_to_compute)
+        for i in range(len(node_from_compute)):
+            self._add_source(node_from, i, node_to, i)
+            node_to_compute[i] = copy.deepcopy(node_from_compute[i])
+    
+    def _add_source(self, node_from, node_from_dim, node_to, node_to_dim):
+        node_from_dim = self._transform_index(node_from, node_from_dim)
+        node_from_trace = self._find_trace_from_node(node_from)
+        node_to_dim = self._transform_index(node_to, node_to_dim)
+        node_to_trace = self._find_trace_from_node(node_to)
+        node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list)
+        node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim})
+        node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim])
+    
+    def _mark_computation_from_node(self, node_from, node_to, exclude=None):
+        if exclude == None:
+            exclude = []
+        else:
+            exclude = [self._transform_index(node_to, i) for i in exclude]
+        node_from_compute = self._find_compute_trace_from_node(node_from)
+        node_to_compute = self._find_compute_trace_from_node(node_to)
+        # assert len(node_from_compute) == len(node_to_compute)
+        for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1):
+            if self._transform_index(node_to, i) in exclude:
+                continue
+            self._add_source(node_from, i, node_to, i)
+            for j in node_from_compute[i]:
+                if j not in node_to_compute[i]:
+                    node_to_compute[i].append(j)
+    
+    def _mark_idx_equal(self, node1, dim1, node2, dim2):
         """
         Mark 2 index to be equal.
 
         Args:
             idx1 (int): index count.
             idx2 (int): index count.
-        """        
-        self.idx_trace_equal.append((idx1, idx2))
+        """
+        # node1_idx = _find_idx_by_name(node1.name, self.nodes_list)
+        # node2_idx = _find_idx_by_name(node2.name, self.nodes_list)
+        # if node1_idx > node2_idx:
+        #     self._add_source(node2, dim2, node1, dim1)
+        # else:
+        #     self._add_source(node1, dim1, node2, dim2)
         
     def _mark_computation(self, node, idx, dim):
         """
@@ -189,16 +254,14 @@ def _mark_computation(self, node, idx, dim):
             idx (int): node index
             dim (list or int): dims to be marked as computed
         """        
-        input_node_idx_trace = self._find_idx_trace_from_node(node)
         if isinstance(dim, int):
             dim = [dim]
+        dims = list(range(len(_get_node_shape(node))))
         for d in dim:
-            cur_idx = input_node_idx_trace[d]
-            if cur_idx not in self.idx_trace_list[idx]['compute']:
-                self.idx_trace_list[idx]['compute'][cur_idx] = [idx]
-            else:
-                self.idx_trace_list[idx]['compute'][cur_idx].append(idx)
-    
+            cur_dim = dims[d]
+            if idx not in self.idx_trace_list[idx]['compute'][cur_dim]:
+                self.idx_trace_list[idx]['compute'][cur_dim].append(idx)
+
     def _find_trace_from_node(self, node):
         """
         Find node idx and compute trace by the node.
@@ -211,7 +274,7 @@ def _find_trace_from_node(self, node):
         """        
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         node_dict = self.idx_trace_list[node_idx]
-        return node_dict['idx'], node_dict['compute']
+        return node_dict
     
     def _find_idx_trace_from_node(self, node):
         """
@@ -237,19 +300,23 @@ def _find_compute_trace_from_node(self, node):
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         return self.idx_trace_list[node_idx]['compute']
     
-    def _assign_index_as_input(self, node, node_idx):
+    def _assign_index_as_input(self, node, node_idx, input_node=None):
         """
         Assign node's trace as its input node.
 
         Args:
             node (node)
             node_idx (int)
-        """        
-        input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list)
+        """
+        if input_node == None:
+            input_node = node.args[0]
+        input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list)
         input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx']
         
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
         self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+        
+        self._inherit_all_computation(input_node, node)
     
     def _assign_all_index(self, node, node_idx):
         """
@@ -275,15 +342,12 @@ def _assign_transpose_index(self, node, node_idx):
             node (node)
             node_idx (int)
         """  
+        input_node = node.args[0]
         tranpose_dim = node.args[1:]
-        input_node_idx_trace = self._find_idx_trace_from_node(node.args[0])
         
-        new_idx_trace = copy.deepcopy(input_node_idx_trace)
-        new_idx_trace[tranpose_dim[0]] = input_node_idx_trace[tranpose_dim[1]]
-        new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]]
-
-        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
-        self._inherit_computation(node.args[0], node)
+        self._assign_index_as_input(node, node_idx, input_node)
+        self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0])
+        self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1])
         
     def _assign_permute_index(self, node, node_idx):
         """
@@ -296,14 +360,11 @@ def _assign_permute_index(self, node, node_idx):
             node_idx (int)
         """  
         permute_dim = node.args[1:]
-        input_node_idx_trace = self._find_idx_trace_from_node(node.args[0])
+        input_node = node.args[0]
         
-        new_idx_trace = copy.deepcopy(input_node_idx_trace)
+        self._assign_index_as_input(node, node_idx, input_node)
         for idx, d in enumerate(permute_dim):
-            new_idx_trace[idx] = input_node_idx_trace[d]
-
-        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
-        self._inherit_computation(node.args[0], node)
+            self._inherit_index(input_node, d, node, idx)
         
     def _assign_linear_index(self, node, node_idx):
         """
@@ -321,20 +382,15 @@ def _assign_linear_index(self, node, node_idx):
             bias = None
         else:
             input_node, weight, bias = node.args
-        input_node_idx_trace = self._find_idx_trace_from_node(input_node)
-        weight_idx_trace = self._find_idx_trace_from_node(weight)
         
-        new_idx_trace = copy.deepcopy(input_node_idx_trace)
-        new_idx_trace[-1] = weight_idx_trace[1]
-        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+        self._assign_index_as_input(node, node_idx)
+        self._inherit_index(weight, 1, node, -1)
 
-        self._inherit_computation(input_node, node)
         self._mark_computation(node, node_idx, [-1])
-        self._mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0])
+        self._mark_idx_equal(input_node, -1, weight, 0)
         
         if bias:
-            bias_idx_trace = self._find_idx_trace_from_node(bias)
-            self._mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])
+            self._mark_idx_equal(input_node, -1, bias, 0)
 
     def _assign_matmul_index(self, node, node_idx):
         """
@@ -348,18 +404,14 @@ def _assign_matmul_index(self, node, node_idx):
             node_idx (int)
         """  
         matmul_left, matmul_right = node.args
-        matmul_left_idx_trace = self._find_idx_trace_from_node(matmul_left)
-        matmul_right_idx_trace = self._find_idx_trace_from_node(matmul_right)
         
-        assert(len(matmul_left_idx_trace) == len(matmul_right_idx_trace))
-        new_idx_trace = copy.deepcopy(matmul_left_idx_trace)
-        new_idx_trace[-1] = matmul_right_idx_trace[-1]
-        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
+        assert(len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right)))
+        self._assign_index_as_input(node, node_idx, matmul_left)
+        self._inherit_index(matmul_right, -1, node, -1)
 
-        self._inherit_computation(matmul_left, node)
-        self._inherit_computation(matmul_right, node)
+        self._mark_computation_from_node(matmul_right, node, [-1, -2])
         self._mark_computation(node, node_idx, [-1])
-        self._mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2])
+        self._mark_idx_equal(matmul_left, -1, matmul_right, -2)
 
     def _assign_layernorm_index(self, node, idx):
         """
@@ -372,7 +424,6 @@ def _assign_layernorm_index(self, node, idx):
             node_idx (int)
         """
         self._assign_index_as_input(node, idx)
-        self._inherit_computation(node.args[0], node)
         self._mark_computation(node, idx, [-1, -2])
     
     def _assign_elementwise_index(self, node, idx):
@@ -386,9 +437,59 @@ def _assign_elementwise_index(self, node, idx):
             node_idx (int)
         """  
         self._assign_index_as_input(node, idx)
+        nodes_in = []
         for node_in in node.args:
-            if type(node_in) not in (int, float):
-                self._inherit_computation(node_in, node)
+            if type(node_in) == type(node):
+                nodes_in.append(node_in)
+                self._mark_computation_from_node(node_in, node)
+        assert len(nodes_in) <= 2
+        if len(nodes_in) == 2:
+            node_in0_shape = _get_node_shape(nodes_in[0])
+            node_in1_shape = _get_node_shape(nodes_in[1])
+            for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1):
+                if node_in0_shape[i] == node_in1_shape[i]:
+                    self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i)
+    
+    def _assgin_no_change_index(self, node, idx):
+        self._assign_index_as_input(node, idx)
+        for node_in in node.args:
+            if type(node_in) == type(node):
+                self._mark_computation_from_node(node_in, node)
+            
+    def _assign_einsum_index(self, node, idx):
+        """
+        Assign index for einsum op.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        patterns = node.args[0]
+        input_nodes = node.args[1:]
+        
+        patterns = patterns.replace(" ", "")
+        left, right = patterns.split("->")
+        left = left.split(",")
+        
+        all_index = []
+        for i in left:
+            for c in i:
+                all_index.append(c)
+        all_index = set(all_index)
+        free_index = set([i for i in right])
+        sum_index = all_index - free_index
+        
+        for right_idx, right_indice in enumerate(right):
+            for left_idx, left_str in enumerate(left):
+                if right_indice in left_str:
+                    source_idx = left_str.index(right_indice)
+                    self._inherit_index(input_nodes[left_idx], source_idx, node, right_idx)
+        
+        for i in sum_index:
+            for left_idx, left_str in enumerate(left):
+                if i in left_str:
+                    self._mark_computation(node, idx, left_str.index(i))
+                    break
                 
     def _assign_softmax_index(self, node, idx):
         """
@@ -401,7 +502,6 @@ def _assign_softmax_index(self, node, idx):
             node_idx (int)
         """  
         self._assign_index_as_input(node, idx)
-        self._inherit_computation(node.args[0], node)
         self._mark_computation(node, idx, [node.kwargs['dim']])
         
     def _assign_unsqueeze_index(self, node, node_idx):
@@ -412,10 +512,12 @@ def _assign_unsqueeze_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """ 
+        """
+        self._del_dim(node_idx, -1)
         self._assign_index_as_input(node, node_idx)
-        self._inherit_computation(node.args[0], node)
         self.idx_trace_list[node_idx]['idx'].insert(node.args[1], self._add_index())
+        self.idx_trace_list[node_idx]['compute'].insert(node.args[1], [])
+        self.idx_trace_list[node_idx]['source'].insert(node.args[1], [])
         
     def _assign_dropout_index(self, node, node_idx):
         """
@@ -427,7 +529,6 @@ def _assign_dropout_index(self, node, node_idx):
             node_idx (int)
         """ 
         self._assign_index_as_input(node, node_idx)
-
         
     def _assign_ones_like_index(self, node, node_idx):
         """
@@ -439,17 +540,6 @@ def _assign_ones_like_index(self, node, node_idx):
             node_idx (int)
         """ 
         self._assign_all_index(node, node_idx)
-        
-    def _assign_to_index(self, node, node_idx):
-        """
-        Assign index for to op.
-        1. assign new index for all dim
-
-        Args:
-            node (node)
-            node_idx (int)
-        """ 
-        self._assign_index_as_input(node, node_idx)
 
     def _assign_view_reshape_index(self, node, node_idx):
         """
@@ -494,26 +584,26 @@ def _assign_view_reshape_index(self, node, node_idx):
             dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)]
             dim_to = [dim_equal.index(False)]
             dim_from = [dim_equal.index(False), dim_equal.index(False) + 1]
+            self._add_dim(node_idx, -1)
         elif len_diff == -1:
             # dim expand
             dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])]
             dim_from = [dim_equal.index(False)]
             dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
+            self._del_dim(node_idx, -1)
         else:
             raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented")
 
         # get new index
         origin_trace = self._find_idx_trace_from_node(origin_node)
-        new_trace = copy.deepcopy(origin_trace)
+        self._assign_index_as_input(node, node_idx, origin_node)
         dim_from.reverse()
         for i in dim_from:
-            new_trace.pop(i)
+            self._del_dim(node_idx, i)
         for i in dim_to:
-            new_trace.insert(i, self._add_index())
-        self.idx_trace_list[node_idx]['idx'] = new_trace
+            self._add_dim(node_idx, i)
         
         # inherit computation
-        self._inherit_computation(origin_node, node)
         compute_log = self._find_compute_trace_from_node(origin_node)
         for i in dim_from:
             if origin_trace[i] in compute_log:
@@ -524,15 +614,10 @@ def _assign_view_reshape_index(self, node, node_idx):
         # log view, not used now
         view_dict = {"idx_from": [origin_trace[i] for i in dim_from],
                      "dim_from": dim_from,
-                     "idx_to": [new_trace[i] for i in dim_to],
+                     "idx_to": [self.idx_trace_list[node_idx]['idx'][i] for i in dim_to],
                      "dim_to": dim_to}
         self.idx_view_list.append(view_dict) 
-    
-    def _remove_duplicate_compute(self):
-        for i in self.idx_trace_list:
-            for k, v in i['compute'].items():
-                i['compute'][k] = list(set(v))
-    
+
     def _merge_equal_idx(self):
         idx_equal = copy.deepcopy(self.idx_trace_equal)
         idx_equal.reverse()
@@ -556,8 +641,8 @@ def trace_index(self):
                     self._assign_view_reshape_index(node, idx)
                 elif 'unsqueeze' in node.name:
                     self._assign_unsqueeze_index(node, idx)
-                elif 'to' in node.name:
-                    self._assign_to_index(node, idx)
+                elif any(i in node.name for i in ['to', 'contiguous']):
+                    self._assgin_no_change_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "method not implemented yet!")
             elif node.op == 'call_function':
@@ -573,6 +658,8 @@ def trace_index(self):
                     self._assign_ones_like_index(node, idx)
                 elif 'dropout' in node.name:
                     self._assign_dropout_index(node, idx)
+                elif 'einsum' in node.name:
+                    self._assign_einsum_index(node, idx)
                 elif 'getattr' in node.name:
                     continue # get attr like shape
                 elif 'getitem' in node.name:
@@ -590,10 +677,20 @@ def trace_index(self):
                 continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
-            
-        self._remove_duplicate_compute()
-        self._merge_equal_idx()
-
+        # self._merge_equal_idx()
+        
+    def check_index(self, trace_idx, start_idx, end_idx):
+        for i in range(start_idx, end_idx + 1):
+            cur_idx = self.idx_trace_list[i]['idx']
+            cur_compute = self.idx_trace_list[i]['compute']
+            if trace_idx in cur_compute:
+                for j in cur_compute[trace_idx]:
+                    if j < start_idx or j > end_idx:
+                        return False
+            # same_idx = [1 if j == trace_idx else 0 for j in cur_idx]
+            # if sum(same_idx) > 1:
+            #     return False
+        return True
 
 class MemoryEstimator(object):
     def __init__(self) -> None:
@@ -897,6 +994,8 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                 self._is_not_compute(after_trace, (start_idx, end_idx), i) and
                 self.node_list[end_idx].meta['tensor_meta'].shape[i] != 1):
                 continue
+            if not self.index_tracer.check_index(before_trace['idx'][i], start_idx, end_idx):
+                continue
             flow_flag, chunk_info = self._detect_flow(before_trace, after_trace, start_idx, end_idx, i)
             if flow_flag == None:
                 continue
@@ -910,7 +1009,10 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
         input_trace = []
         for i, n in enumerate(self.node_list):
             if len(n.args) > 0 and n.op != 'output':
-                input_idx = _find_idx_by_name(n.args[0].name, self.node_list)
+                if isinstance(n.args[0], str):
+                    input_idx = _find_idx_by_name(n.args[1].name, self.node_list)
+                else:
+                    input_idx = _find_idx_by_name(n.args[0].name, self.node_list)
                 input_trace.append(output_trace[input_idx])
             else:
                 input_trace.append(None)
@@ -930,7 +1032,7 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
                 if len(free_dim) > 0:
                     free_dim = [free_dim[0]]
                     chunk_info = [chunk_info[0]]
-                possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info})
+                    possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info})
         return possible_chunk_region
     
     def _search_best_chunk_region(self, possible_chunk_regions):
@@ -1130,6 +1232,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
 
         if node_idx in chunk_starts:
             within_chunk_region = True
+            region_idx = chunk_starts.index(node_idx)
                 
             # add for loop
             chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]]
@@ -1150,7 +1253,6 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
         if node_idx in chunk_ends:
             body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx]))
             within_chunk_region = False
-            region_idx += 1
 
         node_idx += 1
 

From 929445116a14d30ebbd50c5978a8f4db52ab3cd6 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sat, 10 Dec 2022 17:29:51 +0800
Subject: [PATCH 028/503] pass outproduct mean

---
 chunk_codegen.py | 317 +++++++++++++++++++++++++++++++----------------
 1 file changed, 212 insertions(+), 105 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index ce7d849178d1..fc3c88cf91f6 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -16,16 +16,31 @@ def _delete_free_var_from_last_use(user_to_last_uses):
             if n.op == 'placeholder':
                 user_to_last_uses[key].remove(n)
 
+
 def _get_node_shape(node):
     if hasattr(node.meta['tensor_meta'], "shape"):
         return node.meta['tensor_meta'].shape
     return None
 
 
+def _is_non_compute_node(node):
+    if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \
+        any(i in node.name for i in ['getitem', 'getattr']):
+        return True
+    return False
+    
+    
+def _is_non_compute_node_except_placeholder(node):
+    if any(i in node.op for i in ['get_attr', 'output']) or \
+        any(i in node.name for i in ['getitem', 'getattr']):
+        return True
+    return False
+
+
 class FlowTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
-        self.nodes_list = list(gm.graph.nodes)
+        self.node_list = list(gm.graph.nodes)
         self.flow_trace = {}
 
     def _add_trace(self, name):
@@ -49,7 +64,7 @@ def _add_outside_depend(self, flow_name, node, outside_depend_node, outside_depe
         raise RuntimeError("node not found")
 
     def _init_trace(self):
-        for i in self.nodes_list:
+        for i in self.node_list:
             if i.op == 'placeholder':
                 self._add_trace(i.name)
                 self._add_node(i.name, i)
@@ -67,7 +82,7 @@ def _is_non_compute_node_except_placeholder(self, node):
         return False
     
     def _find_flow_for_node(self, node):
-        if type(self.nodes_list[0]) != type(node):
+        if type(self.node_list[0]) != type(node):
             return None
         if self._is_non_compute_node_except_placeholder(node):
             return None
@@ -117,7 +132,7 @@ def trace_flow(self):
         # init trace
         self._init_trace()
 
-        for node in self.nodes_list:
+        for node in self.node_list:
             # skip if non compute node
             if all(type(arg) != type(node) or self._is_non_compute_node_except_placeholder(arg) for arg in node.args) \
                 or self._is_non_compute_node(node):
@@ -135,6 +150,41 @@ def trace_flow(self):
                 else:
                     self._add_outside_depend(node_domin_flow, node, arg, node_input_flow)
         return self.flow_trace
+    
+    def _detect_flow(self, start_idx, start_dim, end_idx, end_dim):
+        inputs, outputs = _find_chunk_input_and_output_nodes(self.node_list[start_idx:end_idx + 1])
+        chunk_info = {'region': (start_idx, end_idx),
+                      'inputs': inputs, 'inputs_dim': start_dim,
+                      'outputs': outputs, 'outputs_dim': end_dim,
+                      'args': {}}
+        flow_flag = False
+        
+        for idx in range(start_idx, end_idx + 1):
+            node = self.node_list[idx]
+            mix_flow_var = self.get_flow_mix(node)
+            if mix_flow_var is None:
+                continue
+            
+            # if there is a flow mix, op must be in [mul, add, div, matmul]
+            # element-wise op requires dim to be equal in every dim
+            if any(n in node.name for n in ['mul', 'add']):
+                for i in node.args:
+                    if type(i) == type(mix_flow_var) and i != mix_flow_var:
+                        main_flow_var = i
+                # if mix flow is a broadcast in chunk dim, 
+                # TODO need to move that flow out of the chunk
+                if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1:
+                    flow_flag = True
+                    for i in self.get_same_flow_node(chunk_info['inputs'], mix_flow_var):
+                        chunk_info['inputs'].remove(i)
+                # else, we need to chunk mix var as well
+                else:
+                    # TODO chunk another value
+                    flow_flag = False
+                    break
+            else:
+                raise NotImplementedError("%s not implemented" % node.name)
+        return flow_flag, chunk_info
 
 
 class IndexTracer(object):
@@ -153,7 +203,7 @@ def _init_idx_trace_list(self):
                 cur_trace = {
                     'idx': [None for _ in range(len(_get_node_shape(n)))],
                     'compute': [[] for _ in range(len(_get_node_shape(n)))],
-                    'source': [[] for _ in range(len(_get_node_shape(n)))],
+                    'source': [{} for _ in range(len(_get_node_shape(n)))],
                 }
             else:
                 cur_trace = {'idx': [], 'compute': [], 'source': []}
@@ -178,7 +228,7 @@ def _del_dim(self, idx, dim_idx):
     def _add_dim(self, idx, dim_idx):
         self.idx_trace_list[idx]['idx'].insert(dim_idx, self._add_index())
         self.idx_trace_list[idx]['compute'].insert(dim_idx, [])
-        self.idx_trace_list[idx]['source'].insert(dim_idx, [])
+        self.idx_trace_list[idx]['source'].insert(dim_idx, {})
     
     def _transform_index(self, node, node_dim):
         node_idx = self._find_idx_trace_from_node(node)
@@ -192,10 +242,7 @@ def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim):
         node_to_trace = self._find_trace_from_node(node_to)
         node_to_trace['idx'][node_to_dim] = node_from_trace['idx'][node_from_dim]
         node_to_trace['compute'][node_to_dim] = copy.deepcopy(node_from_trace['compute'][node_from_dim])
-        node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list)
-        node_to_trace['source'][node_to_dim] = []
-        node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim})
-        node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim])
+        self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True)
     
     def _inherit_all_computation(self, node_from, node_to):
         node_from_compute = self._find_compute_trace_from_node(node_from)
@@ -205,14 +252,16 @@ def _inherit_all_computation(self, node_from, node_to):
             self._add_source(node_from, i, node_to, i)
             node_to_compute[i] = copy.deepcopy(node_from_compute[i])
     
-    def _add_source(self, node_from, node_from_dim, node_to, node_to_dim):
+    def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False):
         node_from_dim = self._transform_index(node_from, node_from_dim)
         node_from_trace = self._find_trace_from_node(node_from)
         node_to_dim = self._transform_index(node_to, node_to_dim)
         node_to_trace = self._find_trace_from_node(node_to)
         node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list)
-        node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim})
-        node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim])
+        if init:
+            node_to_trace['source'][node_to_dim] = {}
+        node_to_trace['source'][node_to_dim][node_from_idx] = node_from_dim
+        node_to_trace['source'][node_to_dim].update(node_from_trace['source'][node_from_dim])
     
     def _mark_computation_from_node(self, node_from, node_to, exclude=None):
         if exclude == None:
@@ -485,11 +534,11 @@ def _assign_einsum_index(self, node, idx):
                     source_idx = left_str.index(right_indice)
                     self._inherit_index(input_nodes[left_idx], source_idx, node, right_idx)
         
-        for i in sum_index:
-            for left_idx, left_str in enumerate(left):
-                if i in left_str:
-                    self._mark_computation(node, idx, left_str.index(i))
-                    break
+        # for i in sum_index:
+        #     for left_idx, left_str in enumerate(left):
+        #         if i in left_str:
+        #             self._mark_computation(node, idx, left_str.index(i))
+        #             break
                 
     def _assign_softmax_index(self, node, idx):
         """
@@ -679,18 +728,56 @@ def trace_index(self):
                 raise NotImplementedError(node.op, "op not implemented yet!")
         # self._merge_equal_idx()
         
-    def check_index(self, trace_idx, start_idx, end_idx):
-        for i in range(start_idx, end_idx + 1):
-            cur_idx = self.idx_trace_list[i]['idx']
-            cur_compute = self.idx_trace_list[i]['compute']
-            if trace_idx in cur_compute:
-                for j in cur_compute[trace_idx]:
-                    if j < start_idx or j > end_idx:
-                        return False
-            # same_idx = [1 if j == trace_idx else 0 for j in cur_idx]
-            # if sum(same_idx) > 1:
-            #     return False
+    def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
+        """
+        Check 2 given index: one index should be source of the other
+        Args:
+            start_idx(int): start node chunk dim
+            start_node(node): start node
+            end_idx(int): end node chunk dim
+            end_node(node): end node
+
+        Returns:
+            bool: True if check pass
+        """
+        start_node_idx = _find_idx_by_name(start_node.name, self.nodes_list)
+        end_node_trace = self._find_trace_from_node(end_node)
+        end_node_trace_source = end_node_trace['source'][end_dim]
+        sorted_source = sorted(end_node_trace_source.items(), key=lambda d:d[0], reverse=True)
+        for node_idx, node_dim in sorted_source:
+            if node_idx == start_node_idx and node_dim == start_dim:
+                return True
+            # it means we meet a node outside the loop, and the node is not input node
+            if node_idx < start_idx:
+                return False
+        return False
+
+    def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
+        """
+        Check 2 given index: check they haven't been computed in the source trace.
+        Args:
+            start_idx(int): start node chunk dim
+            start_node(node): start node
+            end_idx(int): end node chunk dim
+            end_node(node): end node
+
+        Returns:
+            bool: True if check pass
+        """
+        end_node_trace = self._find_trace_from_node(end_node)
+        end_node_compute = end_node_trace['compute'][end_dim]
+        if any(start_idx <= i <= end_idx for i in end_node_compute):
+            return False
         return True
+        # end_node_trace_source = end_node_trace['source'][end_dim]
+        # for node_idx, node_dim in end_node_trace_source.items():
+        #     if node_idx < start_node_idx or node_idx > end_node_idx:
+        #         continue
+        #     compute_list = self.idx_trace_list[node_idx]['compute'][node_dim]
+        #     if any(start_node_idx <= i <= end_node_idx for i in compute_list):
+        #         return False
+        # return True
+
 
 class MemoryEstimator(object):
     def __init__(self) -> None:
@@ -951,88 +1038,81 @@ def _is_not_compute(self, trace, chunk_range, dim_idx):
             return True
         return False
     
-    def _detect_flow(self, before_trace, after_trace, start_idx, end_idx, dim_idx):
-        inputs, outputs = _find_input_and_output_nodes(self.node_list[start_idx:end_idx + 1])
-        chunk_info = {'inputs': inputs, 'outputs': outputs}
-        flow_flag = False
-        
-        for idx in range(start_idx, end_idx + 1):
-            node = self.node_list[idx]
-            mix_flow_var = self.flow_tracer.get_flow_mix(node)
-            if mix_flow_var is None:
-                continue
-            
-            # if there is a flow mix, op must be in [mul, add, div, matmul]
-            # element-wise op requires dim to be equal in every dim
-            if any(n in node.name for n in ['mul', 'add']):
-                for i in node.args:
-                    if type(i) == type(mix_flow_var) and i != mix_flow_var:
-                        main_flow_var = i
-                # if mix flow is a broadcast in chunk dim, 
-                # TODO need to move that flow out of the chunk
-                if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1:
-                    flow_flag = True
-                    for i in self.flow_tracer.get_same_flow_node(chunk_info['inputs'], mix_flow_var):
-                        chunk_info['inputs'].remove(i)
-                # else, we need to chunk mix var as well
-                else:
-                    # TODO chunk another value
-                    flow_flag = False
-                    break
-            else:
-                raise NotImplementedError("%s not implemented" % node.name)
-        return flow_flag, chunk_info
+    def _check_duplicate_map(self, chunk_infos):
+        dim_map = [(i['inputs_dim'], i['outputs_dim']) for i in chunk_infos]
+        remove_list = []
+        for idx1, (input_dim1, output_dim1) in enumerate(dim_map):
+            for idx2, (input_dim2, output_dim2) in enumerate(dim_map):
+                if idx1 == idx2:
+                    continue
+                # it means an index create 2 copy of itself
+                # eg. a = torch.matmul(x, x.transpose(-1, -2))
+                # TODO currently remove it, deal with this in future
+                if input_dim1 == input_dim2 and output_dim1 != output_dim2:
+                    remove_list.append(chunk_infos[idx1])
+                    remove_list.append(chunk_infos[idx2])
+        for i in remove_list:
+            if i in chunk_infos:
+                chunk_infos.remove(i)
+        return chunk_infos
     
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
-        before_trace = input_trace[start_idx]
-        after_trace = output_trace[end_idx]
-        free_dim = []
+        start_traces = input_trace[start_idx]
+        end_trace = output_trace[end_idx]
+        end_node = self.node_list[end_idx]
         chunk_infos = []
-        for i in range(min(len(before_trace['idx']), len(after_trace['idx']))):
-            if not (before_trace['idx'][i] == after_trace['idx'][i] and 
-                self._is_not_compute(before_trace, (start_idx, end_idx), i) and
-                self._is_not_compute(after_trace, (start_idx, end_idx), i) and
-                self.node_list[end_idx].meta['tensor_meta'].shape[i] != 1):
-                continue
-            if not self.index_tracer.check_index(before_trace['idx'][i], start_idx, end_idx):
+        for end_dim, end_trace_idx in enumerate(end_trace['idx']):
+            if len(start_traces) > 1:
+                # TODO implement multi input chunk
                 continue
-            flow_flag, chunk_info = self._detect_flow(before_trace, after_trace, start_idx, end_idx, i)
-            if flow_flag == None:
-                continue
-            chunk_infos.append(chunk_info)
-            free_dim.append(i)
-        return free_dim, chunk_infos
+            for start_node, start_trace in start_traces.items():
+                for start_dim, start_trace_idx in enumerate(start_trace['idx']):
+                    # must be same trace idx
+                    if start_trace_idx != end_trace_idx:
+                        continue
+                    # dim size cannot be 1
+                    if _get_node_shape(end_node)[end_dim] == 1 or \
+                        _get_node_shape(start_node)[start_dim] == 1:
+                        continue
+                    # check index source align
+                    if not self.index_tracer.check_index_source(
+                        start_dim, start_node, start_idx, end_dim, end_node):
+                        continue
+                    # check index copmute
+                    if not self.index_tracer.check_index_compute(
+                        start_idx, end_dim, end_node, end_idx):
+                        continue
+                    # detect flow meet
+                    flow_flag, chunk_info = self.flow_tracer._detect_flow(
+                        start_idx, start_dim, end_idx, end_dim)
+                    if flow_flag:
+                        continue
+                    chunk_infos.append(chunk_info)
+        chunk_infos = self._check_duplicate_map(chunk_infos)
+        return chunk_infos
 
     def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
         possible_chunk_region = []
         output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
-        input_trace = []
-        for i, n in enumerate(self.node_list):
-            if len(n.args) > 0 and n.op != 'output':
-                if isinstance(n.args[0], str):
-                    input_idx = _find_idx_by_name(n.args[1].name, self.node_list)
-                else:
-                    input_idx = _find_idx_by_name(n.args[0].name, self.node_list)
-                input_trace.append(output_trace[input_idx])
-            else:
-                input_trace.append(None)
-
-        for start_idx in range(max_chunk_region[0], peak_node):
+        input_trace = []  # trace of a node's input nodes
+        for _, n in enumerate(self.node_list):
+            cur_trace = {}
+            for arg in n.args:
+                if type(arg) == type(n) and not _is_non_compute_node_except_placeholder(arg):
+                    cur_trace[arg] = self.index_tracer._find_trace_from_node(arg)
+            input_trace.append(cur_trace)
+
+        for start_idx in range(max_chunk_region[0], peak_node + 1):
             for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
-                if any(op in ['placeholder', 'get_attr', 'output'] for op in 
-                       [self.node_list[start_idx].op, self.node_list[end_idx].op]):
-                    continue
-                if any(any(i in name for i in ['getitem', 'getattr']) for name in 
-                       [self.node_list[start_idx].name, self.node_list[end_idx].name]):
+                if _is_non_compute_node(self.node_list[start_idx]) or \
+                    _is_non_compute_node(self.node_list[end_idx]):
                     continue
                 
                 # select free dim
-                free_dim, chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx)
-                if len(free_dim) > 0:
-                    free_dim = [free_dim[0]]
-                    chunk_info = [chunk_info[0]]
-                    possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info})
+                chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx)
+                if len(chunk_info) > 0:
+                    possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
     
     def _search_best_chunk_region(self, possible_chunk_regions):
@@ -1044,7 +1124,8 @@ def _search_best_chunk_region(self, possible_chunk_regions):
                 max_region_range = i['region'][1] - i['region'][0]
         return best_regions
     
-    def _step_search(self, peak_node, active_node):
+    def _step_search(self, mem_peak, active_node):
+        peak_node = self._find_peak_node(mem_peak)
         max_chunk_region = self._search_max_chunk_region(active_node, peak_node)
         possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node)
         best_chunk_region = self._search_best_chunk_region(possible_chunk_regions)
@@ -1062,19 +1143,16 @@ def search_region(self):
         mem_peak = init_mem_peak
         
         while True:
-            peak_node = self._find_peak_node(mem_peak)
-            chunk_region = self._step_search(peak_node, active_node)
-            if chunk_region is None or len(chunk_region['dim']) == 0:
+            chunk_region = self._step_search(mem_peak, active_node)
+            if chunk_region is None:
                 break
             
             chunk_regions.append(chunk_region)
             mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem(
                 self.gm, [i['region'][0] for i in chunk_regions], 
-                [i['region'][1] for i in chunk_regions], [i['dim'][0] for i in chunk_regions], [1] * len(chunk_regions))
-            
+                [i['region'][1] for i in chunk_regions], [i['inputs_dim'] for i in chunk_regions], [1] * len(chunk_regions))
             if self._stop_search(init_mem_peak, mem_peak):
                 break
-
         return chunk_regions
 
 
@@ -1164,6 +1242,35 @@ def _find_input_and_output_nodes(nodes: List[Node]):
     return input_nodes, output_nodes
 
 
+def _find_chunk_input_and_output_nodes(nodes: List[Node]):
+    """
+    Find non-compute input and output node names.
+    input nodes are nodes used in the list
+    output nodes are nodes will use nodes in the list
+    """
+    input_nodes = []
+    output_nodes = []
+
+    # if a node has an input node which is not in the node list
+    # we treat that input node as the input of the checkpoint function
+    for node in nodes:
+        for input_node in node._input_nodes.keys():
+            if input_node not in nodes and input_node not in input_nodes \
+                and not _is_non_compute_node_except_placeholder(input_node):
+                input_nodes.append(input_node)
+
+    # if a node has a user node which is not in the node list
+    # we treat that user node as the node receiving the current node output
+    # TODO it is unsafe to remove non compute node here
+    for node in nodes:
+        for output_node in node.users.keys():
+            if output_node not in nodes and node not in output_nodes \
+                and not _is_non_compute_node_except_placeholder(input_node):
+                output_nodes.append(node)
+
+    return input_nodes, output_nodes
+
+
 def _find_idx_by_name(name, nodes_list):
     for idx, node in enumerate(nodes_list):
         if node.name == name:

From d31e146687ebd4cefdc67500e84b7414b5760dd4 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sat, 10 Dec 2022 17:34:40 +0800
Subject: [PATCH 029/503] code format

---
 chunk_codegen.py | 908 +++++++++++++++++++++++++++++------------------
 1 file changed, 560 insertions(+), 348 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index fc3c88cf91f6..e8cf0d22f157 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -4,35 +4,52 @@
 from typing import List, Callable, Any, Tuple, Dict, Iterable
 
 from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
-from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin
-from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size
+from torch.fx.graph import (
+    _Namespace,
+    PythonCode,
+    _custom_builtins,
+    _is_from_torch,
+    _format_target,
+    magic_methods,
+    CodeGen,
+    _origin_type_map,
+    inplace_methods,
+    _CustomBuiltin,
+)
+from colossalai.fx.profiler import (
+    calculate_fwd_out,
+    calculate_fwd_tmp,
+    parameter_size,
+    activation_size,
+)
+
 CODEGEN_AVAILABLE = True
-__all__ = ['ChunkCodeGen']
+__all__ = ["ChunkCodeGen"]
 
 
 def _delete_free_var_from_last_use(user_to_last_uses):
     for key, value in user_to_last_uses.items():
         for n in value:
-            if n.op == 'placeholder':
+            if n.op == "placeholder":
                 user_to_last_uses[key].remove(n)
 
 
 def _get_node_shape(node):
-    if hasattr(node.meta['tensor_meta'], "shape"):
-        return node.meta['tensor_meta'].shape
+    if hasattr(node.meta["tensor_meta"], "shape"):
+        return node.meta["tensor_meta"].shape
     return None
 
 
 def _is_non_compute_node(node):
-    if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \
-        any(i in node.name for i in ['getitem', 'getattr']):
+    if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(
+        i in node.name for i in ["getitem", "getattr"]
+    ):
         return True
     return False
-    
-    
+
+
 def _is_non_compute_node_except_placeholder(node):
-    if any(i in node.op for i in ['get_attr', 'output']) or \
-        any(i in node.name for i in ['getitem', 'getattr']):
+    if (any(i in node.op for i in ["get_attr", "output"]) or any(i in node.name for i in ["getitem", "getattr"])):
         return True
     return False
 
@@ -45,42 +62,48 @@ def __init__(self, gm) -> None:
 
     def _add_trace(self, name):
         self.flow_trace[name] = []
-    
+
     def _add_node(self, trace_name, node):
-        self.flow_trace[trace_name].append({'node': node, 'inside_depend': [], 'outside_depend': []})
-    
+        self.flow_trace[trace_name].append(
+            {"node": node, "inside_depend": [], "outside_depend": []}
+        )
+
     def _add_inside_depend(self, flow_name, node, inside_depend_node):
         for i in self.flow_trace[flow_name]:
-            if i['node'] == node:
-                i['inside_depend'].append(inside_depend_node)
+            if i["node"] == node:
+                i["inside_depend"].append(inside_depend_node)
                 return
         raise RuntimeError("node not found")
-                
-    def _add_outside_depend(self, flow_name, node, outside_depend_node, outside_depend_trace):
+
+    def _add_outside_depend(
+        self, flow_name, node, outside_depend_node, outside_depend_trace
+    ):
         for i in self.flow_trace[flow_name]:
-            if i['node'] == node:
-                i['outside_depend'].append({outside_depend_trace: outside_depend_node}) 
+            if i["node"] == node:
+                i["outside_depend"].append({outside_depend_trace: outside_depend_node})
                 return
         raise RuntimeError("node not found")
 
     def _init_trace(self):
         for i in self.node_list:
-            if i.op == 'placeholder':
+            if i.op == "placeholder":
                 self._add_trace(i.name)
                 self._add_node(i.name, i)
 
     def _is_non_compute_node(self, node):
-        if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \
-            any(i in node.name for i in ['getitem', 'getattr']):
+        if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(
+            i in node.name for i in ["getitem", "getattr"]
+        ):
             return True
         return False
-    
+
     def _is_non_compute_node_except_placeholder(self, node):
-        if any(i in node.op for i in ['get_attr', 'output']) or \
-            any(i in node.name for i in ['getitem', 'getattr']):
+        if any(i in node.op for i in ["get_attr", "output"]) or any(
+            i in node.name for i in ["getitem", "getattr"]
+        ):
             return True
         return False
-    
+
     def _find_flow_for_node(self, node):
         if type(self.node_list[0]) != type(node):
             return None
@@ -88,54 +111,57 @@ def _find_flow_for_node(self, node):
             return None
         for name, trace in self.flow_trace.items():
             for i in trace:
-                if node == i['node']:
+                if node == i["node"]:
                     return name
         if any(i in node.name for i in ["ones_like"]):
             self._add_trace(node.name)
             self._add_node(node.name, node)
             return node.name
         raise RuntimeError("node not found")
-    
+
     def _find_first_valid_flow(self, flow):
         for i in flow:
             if i is not None:
                 return i
         raise RuntimeError("invalid flow")
-    
+
     def find_node_flow(self, node):
         for name, trace in self.flow_trace.items():
             for i in trace:
-                if node == i['node']:
+                if node == i["node"]:
                     return name, i
         raise RuntimeError("invalid node")
-        
+
     def get_flow_mix(self, node):
         if self._is_non_compute_node(node):
             return None
         _, node_trace = self.find_node_flow(node)
-        if len(node_trace['outside_depend']) == 0:
+        if len(node_trace["outside_depend"]) == 0:
             return None
-        elif len(node_trace['outside_depend']) > 1:
+        elif len(node_trace["outside_depend"]) > 1:
             raise NotImplementedError
-        vars = list(node_trace['outside_depend'][0].values())[0]
+        vars = list(node_trace["outside_depend"][0].values())[0]
         return vars
-    
+
     def get_same_flow_node(self, node_list, node):
         name, _ = self.find_node_flow(node)
         result = []
         for i in self.flow_trace[name]:
-            if i['node'] in node_list:
-                result.append(i['node'])
+            if i["node"] in node_list:
+                result.append(i["node"])
         return result
-        
-    def trace_flow(self):    
+
+    def trace_flow(self):
         # init trace
         self._init_trace()
 
         for node in self.node_list:
             # skip if non compute node
-            if all(type(arg) != type(node) or self._is_non_compute_node_except_placeholder(arg) for arg in node.args) \
-                or self._is_non_compute_node(node):
+            if all(
+                type(arg) != type(node)
+                or self._is_non_compute_node_except_placeholder(arg)
+                for arg in node.args
+            ) or self._is_non_compute_node(node):
                 continue
 
             node_input_flows = [self._find_flow_for_node(arg) for arg in node.args]
@@ -148,35 +174,45 @@ def trace_flow(self):
                 elif node_input_flow == node_domin_flow:
                     self._add_inside_depend(node_domin_flow, node, arg)
                 else:
-                    self._add_outside_depend(node_domin_flow, node, arg, node_input_flow)
+                    self._add_outside_depend(
+                        node_domin_flow, node, arg, node_input_flow
+                    )
         return self.flow_trace
-    
+
     def _detect_flow(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = _find_chunk_input_and_output_nodes(self.node_list[start_idx:end_idx + 1])
-        chunk_info = {'region': (start_idx, end_idx),
-                      'inputs': inputs, 'inputs_dim': start_dim,
-                      'outputs': outputs, 'outputs_dim': end_dim,
-                      'args': {}}
+        inputs, outputs = _find_chunk_input_and_output_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        chunk_info = {
+            "region": (start_idx, end_idx),
+            "inputs": inputs,
+            "inputs_dim": start_dim,
+            "outputs": outputs,
+            "outputs_dim": end_dim,
+            "args": {},
+        }
         flow_flag = False
-        
+
         for idx in range(start_idx, end_idx + 1):
             node = self.node_list[idx]
             mix_flow_var = self.get_flow_mix(node)
             if mix_flow_var is None:
                 continue
-            
+
             # if there is a flow mix, op must be in [mul, add, div, matmul]
             # element-wise op requires dim to be equal in every dim
-            if any(n in node.name for n in ['mul', 'add']):
+            if any(n in node.name for n in ["mul", "add"]):
                 for i in node.args:
                     if type(i) == type(mix_flow_var) and i != mix_flow_var:
                         main_flow_var = i
-                # if mix flow is a broadcast in chunk dim, 
+                # if mix flow is a broadcast in chunk dim,
                 # TODO need to move that flow out of the chunk
-                if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1:
+                if mix_flow_var.meta["tensor_meta"].shape[dim_idx] == 1:
                     flow_flag = True
-                    for i in self.get_same_flow_node(chunk_info['inputs'], mix_flow_var):
-                        chunk_info['inputs'].remove(i)
+                    for i in self.get_same_flow_node(
+                        chunk_info["inputs"], mix_flow_var
+                    ):
+                        chunk_info["inputs"].remove(i)
                 # else, we need to chunk mix var as well
                 else:
                     # TODO chunk another value
@@ -199,51 +235,53 @@ def __init__(self, gm) -> None:
     def _init_idx_trace_list(self):
         idx_trace_list = []
         for n in self.nodes_list:
-            if _get_node_shape(n) != None:            
+            if _get_node_shape(n) != None:
                 cur_trace = {
-                    'idx': [None for _ in range(len(_get_node_shape(n)))],
-                    'compute': [[] for _ in range(len(_get_node_shape(n)))],
-                    'source': [{} for _ in range(len(_get_node_shape(n)))],
+                    "idx": [None for _ in range(len(_get_node_shape(n)))],
+                    "compute": [[] for _ in range(len(_get_node_shape(n)))],
+                    "source": [{} for _ in range(len(_get_node_shape(n)))],
                 }
             else:
-                cur_trace = {'idx': [], 'compute': [], 'source': []}
+                cur_trace = {"idx": [], "compute": [], "source": []}
             idx_trace_list.append(cur_trace)
         return idx_trace_list
-    
+
     def _add_index(self):
         """
         Update the count and return it. To record the idx number.
-        
+
         Returns:
             idx_count: int
-        """        
+        """
         self.idx_count += 1
         return self.idx_count
-    
+
     def _del_dim(self, idx, dim_idx):
-        self.idx_trace_list[idx]['idx'].pop(dim_idx)
-        self.idx_trace_list[idx]['compute'].pop(dim_idx)
-        self.idx_trace_list[idx]['source'].pop(dim_idx)
-    
+        self.idx_trace_list[idx]["idx"].pop(dim_idx)
+        self.idx_trace_list[idx]["compute"].pop(dim_idx)
+        self.idx_trace_list[idx]["source"].pop(dim_idx)
+
     def _add_dim(self, idx, dim_idx):
-        self.idx_trace_list[idx]['idx'].insert(dim_idx, self._add_index())
-        self.idx_trace_list[idx]['compute'].insert(dim_idx, [])
-        self.idx_trace_list[idx]['source'].insert(dim_idx, {})
-    
+        self.idx_trace_list[idx]["idx"].insert(dim_idx, self._add_index())
+        self.idx_trace_list[idx]["compute"].insert(dim_idx, [])
+        self.idx_trace_list[idx]["source"].insert(dim_idx, {})
+
     def _transform_index(self, node, node_dim):
         node_idx = self._find_idx_trace_from_node(node)
         dims = list(range(len(node_idx)))
         return dims[node_dim]
-    
+
     def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim):
         node_from_dim = self._transform_index(node_from, node_from_dim)
         node_to_dim = self._transform_index(node_to, node_to_dim)
         node_from_trace = self._find_trace_from_node(node_from)
         node_to_trace = self._find_trace_from_node(node_to)
-        node_to_trace['idx'][node_to_dim] = node_from_trace['idx'][node_from_dim]
-        node_to_trace['compute'][node_to_dim] = copy.deepcopy(node_from_trace['compute'][node_from_dim])
+        node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim]
+        node_to_trace["compute"][node_to_dim] = copy.deepcopy(
+            node_from_trace["compute"][node_from_dim]
+        )
         self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True)
-    
+
     def _inherit_all_computation(self, node_from, node_to):
         node_from_compute = self._find_compute_trace_from_node(node_from)
         node_to_compute = self._find_compute_trace_from_node(node_to)
@@ -251,7 +289,7 @@ def _inherit_all_computation(self, node_from, node_to):
         for i in range(len(node_from_compute)):
             self._add_source(node_from, i, node_to, i)
             node_to_compute[i] = copy.deepcopy(node_from_compute[i])
-    
+
     def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False):
         node_from_dim = self._transform_index(node_from, node_from_dim)
         node_from_trace = self._find_trace_from_node(node_from)
@@ -259,10 +297,12 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False
         node_to_trace = self._find_trace_from_node(node_to)
         node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list)
         if init:
-            node_to_trace['source'][node_to_dim] = {}
-        node_to_trace['source'][node_to_dim][node_from_idx] = node_from_dim
-        node_to_trace['source'][node_to_dim].update(node_from_trace['source'][node_from_dim])
-    
+            node_to_trace["source"][node_to_dim] = {}
+        node_to_trace["source"][node_to_dim][node_from_idx] = node_from_dim
+        node_to_trace["source"][node_to_dim].update(
+            node_from_trace["source"][node_from_dim]
+        )
+
     def _mark_computation_from_node(self, node_from, node_to, exclude=None):
         if exclude == None:
             exclude = []
@@ -278,7 +318,7 @@ def _mark_computation_from_node(self, node_from, node_to, exclude=None):
             for j in node_from_compute[i]:
                 if j not in node_to_compute[i]:
                     node_to_compute[i].append(j)
-    
+
     def _mark_idx_equal(self, node1, dim1, node2, dim2):
         """
         Mark 2 index to be equal.
@@ -293,7 +333,7 @@ def _mark_idx_equal(self, node1, dim1, node2, dim2):
         #     self._add_source(node2, dim2, node1, dim1)
         # else:
         #     self._add_source(node1, dim1, node2, dim2)
-        
+
     def _mark_computation(self, node, idx, dim):
         """
         Mark some dims of node as computed.
@@ -302,14 +342,14 @@ def _mark_computation(self, node, idx, dim):
             node (node)
             idx (int): node index
             dim (list or int): dims to be marked as computed
-        """        
+        """
         if isinstance(dim, int):
             dim = [dim]
         dims = list(range(len(_get_node_shape(node))))
         for d in dim:
             cur_dim = dims[d]
-            if idx not in self.idx_trace_list[idx]['compute'][cur_dim]:
-                self.idx_trace_list[idx]['compute'][cur_dim].append(idx)
+            if idx not in self.idx_trace_list[idx]["compute"][cur_dim]:
+                self.idx_trace_list[idx]["compute"][cur_dim].append(idx)
 
     def _find_trace_from_node(self, node):
         """
@@ -320,11 +360,11 @@ def _find_trace_from_node(self, node):
         Returns:
             idx (list): idx of the node
             compute (list): computed idx of the node.
-        """        
+        """
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
         node_dict = self.idx_trace_list[node_idx]
         return node_dict
-    
+
     def _find_idx_trace_from_node(self, node):
         """
         Find node idx trace by the node.
@@ -333,10 +373,10 @@ def _find_idx_trace_from_node(self, node):
             node (node)
         Returns:
             idx (list): idx of the node
-        """ 
+        """
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
-        return self.idx_trace_list[node_idx]['idx']
-    
+        return self.idx_trace_list[node_idx]["idx"]
+
     def _find_compute_trace_from_node(self, node):
         """
         Find node compute trace by the node.
@@ -345,10 +385,10 @@ def _find_compute_trace_from_node(self, node):
             node (node)
         Returns:
             compute (list): computed idx of the node.
-        """ 
+        """
         node_idx = _find_idx_by_name(node.name, self.nodes_list)
-        return self.idx_trace_list[node_idx]['compute']
-    
+        return self.idx_trace_list[node_idx]["compute"]
+
     def _assign_index_as_input(self, node, node_idx, input_node=None):
         """
         Assign node's trace as its input node.
@@ -360,13 +400,13 @@ def _assign_index_as_input(self, node, node_idx, input_node=None):
         if input_node == None:
             input_node = node.args[0]
         input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list)
-        input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx']
-        
+        input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"]
+
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
-        self.idx_trace_list[node_idx]['idx'] = new_idx_trace
-        
+        self.idx_trace_list[node_idx]["idx"] = new_idx_trace
+
         self._inherit_all_computation(input_node, node)
-    
+
     def _assign_all_index(self, node, node_idx):
         """
         Add new index for all node's dims.
@@ -374,12 +414,12 @@ def _assign_all_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """  
-        shape = node.meta['tensor_meta'].shape
+        """
+        shape = node.meta["tensor_meta"].shape
         new_trace = []
         for _ in shape:
             new_trace.append(self._add_index())
-        self.idx_trace_list[node_idx]['idx'] = new_trace   
+        self.idx_trace_list[node_idx]["idx"] = new_trace
 
     def _assign_transpose_index(self, node, node_idx):
         """
@@ -390,14 +430,14 @@ def _assign_transpose_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """  
+        """
         input_node = node.args[0]
         tranpose_dim = node.args[1:]
-        
+
         self._assign_index_as_input(node, node_idx, input_node)
         self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0])
         self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1])
-        
+
     def _assign_permute_index(self, node, node_idx):
         """
         Assign index for permute op.
@@ -407,14 +447,14 @@ def _assign_permute_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """  
+        """
         permute_dim = node.args[1:]
         input_node = node.args[0]
-        
+
         self._assign_index_as_input(node, node_idx, input_node)
         for idx, d in enumerate(permute_dim):
             self._inherit_index(input_node, d, node, idx)
-        
+
     def _assign_linear_index(self, node, node_idx):
         """
         Assign index for linear op.
@@ -431,13 +471,13 @@ def _assign_linear_index(self, node, node_idx):
             bias = None
         else:
             input_node, weight, bias = node.args
-        
+
         self._assign_index_as_input(node, node_idx)
         self._inherit_index(weight, 1, node, -1)
 
         self._mark_computation(node, node_idx, [-1])
         self._mark_idx_equal(input_node, -1, weight, 0)
-        
+
         if bias:
             self._mark_idx_equal(input_node, -1, bias, 0)
 
@@ -451,10 +491,10 @@ def _assign_matmul_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """  
+        """
         matmul_left, matmul_right = node.args
-        
-        assert(len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right)))
+
+        assert len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right))
         self._assign_index_as_input(node, node_idx, matmul_left)
         self._inherit_index(matmul_right, -1, node, -1)
 
@@ -474,7 +514,7 @@ def _assign_layernorm_index(self, node, idx):
         """
         self._assign_index_as_input(node, idx)
         self._mark_computation(node, idx, [-1, -2])
-    
+
     def _assign_elementwise_index(self, node, idx):
         """
         Assign index for element-wise op (eg. relu sigmoid add mul).
@@ -484,7 +524,7 @@ def _assign_elementwise_index(self, node, idx):
         Args:
             node (node)
             node_idx (int)
-        """  
+        """
         self._assign_index_as_input(node, idx)
         nodes_in = []
         for node_in in node.args:
@@ -498,13 +538,13 @@ def _assign_elementwise_index(self, node, idx):
             for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1):
                 if node_in0_shape[i] == node_in1_shape[i]:
                     self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i)
-    
+
     def _assgin_no_change_index(self, node, idx):
         self._assign_index_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) == type(node):
                 self._mark_computation_from_node(node_in, node)
-            
+
     def _assign_einsum_index(self, node, idx):
         """
         Assign index for einsum op.
@@ -515,11 +555,11 @@ def _assign_einsum_index(self, node, idx):
         """
         patterns = node.args[0]
         input_nodes = node.args[1:]
-        
+
         patterns = patterns.replace(" ", "")
         left, right = patterns.split("->")
         left = left.split(",")
-        
+
         all_index = []
         for i in left:
             for c in i:
@@ -527,19 +567,21 @@ def _assign_einsum_index(self, node, idx):
         all_index = set(all_index)
         free_index = set([i for i in right])
         sum_index = all_index - free_index
-        
+
         for right_idx, right_indice in enumerate(right):
             for left_idx, left_str in enumerate(left):
                 if right_indice in left_str:
                     source_idx = left_str.index(right_indice)
-                    self._inherit_index(input_nodes[left_idx], source_idx, node, right_idx)
-        
+                    self._inherit_index(
+                        input_nodes[left_idx], source_idx, node, right_idx
+                    )
+
         # for i in sum_index:
         #     for left_idx, left_str in enumerate(left):
         #         if i in left_str:
         #             self._mark_computation(node, idx, left_str.index(i))
         #             break
-                
+
     def _assign_softmax_index(self, node, idx):
         """
         Assign index for softmax op.
@@ -549,10 +591,10 @@ def _assign_softmax_index(self, node, idx):
         Args:
             node (node)
             node_idx (int)
-        """  
+        """
         self._assign_index_as_input(node, idx)
-        self._mark_computation(node, idx, [node.kwargs['dim']])
-        
+        self._mark_computation(node, idx, [node.kwargs["dim"]])
+
     def _assign_unsqueeze_index(self, node, node_idx):
         """
         Assign index for unsqueeze op.
@@ -564,10 +606,10 @@ def _assign_unsqueeze_index(self, node, node_idx):
         """
         self._del_dim(node_idx, -1)
         self._assign_index_as_input(node, node_idx)
-        self.idx_trace_list[node_idx]['idx'].insert(node.args[1], self._add_index())
-        self.idx_trace_list[node_idx]['compute'].insert(node.args[1], [])
-        self.idx_trace_list[node_idx]['source'].insert(node.args[1], [])
-        
+        self.idx_trace_list[node_idx]["idx"].insert(node.args[1], self._add_index())
+        self.idx_trace_list[node_idx]["compute"].insert(node.args[1], [])
+        self.idx_trace_list[node_idx]["source"].insert(node.args[1], [])
+
     def _assign_dropout_index(self, node, node_idx):
         """
         Assign index for unsqueeze op.
@@ -576,9 +618,9 @@ def _assign_dropout_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """ 
+        """
         self._assign_index_as_input(node, node_idx)
-        
+
     def _assign_ones_like_index(self, node, node_idx):
         """
         Assign index for oneslike op.
@@ -587,7 +629,7 @@ def _assign_ones_like_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """ 
+        """
         self._assign_all_index(node, node_idx)
 
     def _assign_view_reshape_index(self, node, node_idx):
@@ -604,16 +646,16 @@ def _assign_view_reshape_index(self, node, node_idx):
         Args:
             node (node)
             node_idx (int)
-        """  
+        """
         # get data, turn into number
         origin_node = node.args[0]
-        origin_shape = origin_node.meta['tensor_meta'].shape
+        origin_shape = origin_node.meta["tensor_meta"].shape
         target_shape = []
         for i in range(1, len(node.args)):
             if isinstance(node.args[i], int):
                 target_shape.append(node.args[i])
             else:
-                target_shape.append(node.args[i].meta['fwd_out'][0])
+                target_shape.append(node.args[i].meta["fwd_out"][0])
 
         # compute the value of -1
         if -1 in target_shape:
@@ -641,7 +683,13 @@ def _assign_view_reshape_index(self, node, node_idx):
             dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
             self._del_dim(node_idx, -1)
         else:
-            raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented")
+            raise NotImplementedError(
+                "shape"
+                + str(origin_shape)
+                + "and"
+                + str(target_shape)
+                + "view not implemented"
+            )
 
         # get new index
         origin_trace = self._find_idx_trace_from_node(origin_node)
@@ -651,7 +699,7 @@ def _assign_view_reshape_index(self, node, node_idx):
             self._del_dim(node_idx, i)
         for i in dim_to:
             self._add_dim(node_idx, i)
-        
+
         # inherit computation
         compute_log = self._find_compute_trace_from_node(origin_node)
         for i in dim_from:
@@ -659,13 +707,15 @@ def _assign_view_reshape_index(self, node, node_idx):
                 for j in dim_to:
                     self._mark_computation(node, node_idx, [j])
                 break
-        
+
         # log view, not used now
-        view_dict = {"idx_from": [origin_trace[i] for i in dim_from],
-                     "dim_from": dim_from,
-                     "idx_to": [self.idx_trace_list[node_idx]['idx'][i] for i in dim_to],
-                     "dim_to": dim_to}
-        self.idx_view_list.append(view_dict) 
+        view_dict = {
+            "idx_from": [origin_trace[i] for i in dim_from],
+            "dim_from": dim_from,
+            "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in dim_to],
+            "dim_to": dim_to,
+        }
+        self.idx_view_list.append(view_dict)
 
     def _merge_equal_idx(self):
         idx_equal = copy.deepcopy(self.idx_trace_equal)
@@ -674,60 +724,64 @@ def _merge_equal_idx(self):
             merge_to = min(idx)
             merge_from = max(idx)
             for trace in self.idx_trace_list:
-                if merge_from in trace['idx']:
-                    trace['idx'] = [merge_to if i == merge_from else i for i in trace['idx']]
-    
+                if merge_from in trace["idx"]:
+                    trace["idx"] = [
+                        merge_to if i == merge_from else i for i in trace["idx"]
+                    ]
+
     def trace_index(self):
         for idx, node in enumerate(self.nodes_list):
-            if node.op == 'placeholder':
+            if node.op == "placeholder":
                 self._assign_all_index(node, idx)
-            elif node.op == 'call_method':
-                if 'transpose' in node.name:
+            elif node.op == "call_method":
+                if "transpose" in node.name:
                     self._assign_transpose_index(node, idx)
-                elif 'permute' in node.name:
+                elif "permute" in node.name:
                     self._assign_permute_index(node, idx)
-                elif 'view' in node.name or 'reshape' in node.name:
+                elif "view" in node.name or "reshape" in node.name:
                     self._assign_view_reshape_index(node, idx)
-                elif 'unsqueeze' in node.name:
+                elif "unsqueeze" in node.name:
                     self._assign_unsqueeze_index(node, idx)
-                elif any(i in node.name for i in ['to', 'contiguous']):
+                elif any(i in node.name for i in ["to", "contiguous"]):
                     self._assgin_no_change_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "method not implemented yet!")
-            elif node.op == 'call_function':
-                if 'linear' in node.name:
+            elif node.op == "call_function":
+                if "linear" in node.name:
                     self._assign_linear_index(node, idx)
-                elif 'matmul' in node.name:
+                elif "matmul" in node.name:
                     self._assign_matmul_index(node, idx)
-                elif 'softmax' in node.name:
+                elif "softmax" in node.name:
                     self._assign_softmax_index(node, idx)
-                elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']):
+                elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu"]):
                     self._assign_elementwise_index(node, idx)
-                elif 'ones_like' in node.name:
+                elif "ones_like" in node.name:
                     self._assign_ones_like_index(node, idx)
-                elif 'dropout' in node.name:
+                elif "dropout" in node.name:
                     self._assign_dropout_index(node, idx)
-                elif 'einsum' in node.name:
+                elif "einsum" in node.name:
                     self._assign_einsum_index(node, idx)
-                elif 'getattr' in node.name:
-                    continue # get attr like shape
-                elif 'getitem' in node.name:
-                    continue # get item in list
+                elif "getattr" in node.name:
+                    continue  # get attr like shape
+                elif "getitem" in node.name:
+                    continue  # get item in list
                 else:
-                    raise NotImplementedError(node.name, "function not implemented yet!")
-            elif node.op == 'call_module':
-                if any(n in node.name for n in ['layernorm', 'norm']):
+                    raise NotImplementedError(
+                        node.name, "function not implemented yet!"
+                    )
+            elif node.op == "call_module":
+                if any(n in node.name for n in ["layernorm", "norm"]):
                     self._assign_layernorm_index(node, idx)
                 else:
                     raise NotImplementedError(node.name, "module not implemented yet!")
-            elif node.op == 'get_attr':
-                self._assign_all_index(node, idx) # get param
-            elif node.op == 'output':
+            elif node.op == "get_attr":
+                self._assign_all_index(node, idx)  # get param
+            elif node.op == "output":
                 continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
         # self._merge_equal_idx()
-        
+
     def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
         """
         Check 2 given index: one index should be source of the other
@@ -742,8 +796,10 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node
         """
         start_node_idx = _find_idx_by_name(start_node.name, self.nodes_list)
         end_node_trace = self._find_trace_from_node(end_node)
-        end_node_trace_source = end_node_trace['source'][end_dim]
-        sorted_source = sorted(end_node_trace_source.items(), key=lambda d:d[0], reverse=True)
+        end_node_trace_source = end_node_trace["source"][end_dim]
+        sorted_source = sorted(
+            end_node_trace_source.items(), key=lambda d: d[0], reverse=True
+        )
         for node_idx, node_dim in sorted_source:
             if node_idx == start_node_idx and node_dim == start_dim:
                 return True
@@ -765,7 +821,7 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
             bool: True if check pass
         """
         end_node_trace = self._find_trace_from_node(end_node)
-        end_node_compute = end_node_trace['compute'][end_dim]
+        end_node_compute = end_node_trace["compute"][end_dim]
         if any(start_idx <= i <= end_idx for i in end_node_compute):
             return False
         return True
@@ -784,19 +840,23 @@ def __init__(self) -> None:
         pass
 
     def _get_meta_node_size(self, x):
-        x = x.meta['tensor_meta']
+        x = x.meta["tensor_meta"]
         x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
         return x
 
     def _get_output_node(self, n):
-        fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')}
+        fwd_out = {
+            x.uuid: x
+            for x in n.meta["fwd_out"]
+            if isinstance(x, torch.Tensor) and hasattr(x, "uuid")
+        }
         out_size = activation_size(fwd_out)
         out_node = [n.name] if out_size > 0 else []
         return out_size, out_node
-    
+
     def _get_output_node_size(self, n):
         return self._get_output_node(n)[0]
-    
+
     def _add_active_node(self, n, active_list):
         new_active = self._get_output_node(n)[1]
         for i in new_active:
@@ -806,7 +866,7 @@ def _add_active_node(self, n, active_list):
     def _get_delete_node(self, user, user_to_last_uses):
         delete_size = 0
         delete_node = []
-        if user.op not in ('placeholder', 'output'):
+        if user.op not in ("placeholder", "output"):
             nodes_to_delete = user_to_last_uses.get(user, [])
             if len(nodes_to_delete):
                 out_node = [self._get_output_node(i) for i in nodes_to_delete]
@@ -814,13 +874,13 @@ def _get_delete_node(self, user, user_to_last_uses):
                 for i in range(len(out_node)):
                     if out_node[i][0] > 0:
                         delete_node.append(out_node[i][1][0])
-                    elif nodes_to_delete[i].op == 'placeholder':
+                    elif nodes_to_delete[i].op == "placeholder":
                         delete_node.append(nodes_to_delete[i].name)
         return delete_size, delete_node
-    
+
     def _get_delete_node_size(self, user, user_to_last_uses):
         return self._get_delete_node(user, user_to_last_uses)[0]
-    
+
     def _remove_deactive_node(self, user, user_to_last_uses, active_list):
         delete_node = self._get_delete_node(user, user_to_last_uses)[1]
         for i in delete_node:
@@ -842,20 +902,24 @@ def register_last_uses(n: Node, user: Node):
 
     def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
         mem = 0
-        not_contiguous_ops = ['transpose', 'permute']
+        not_contiguous_ops = ["transpose", "permute"]
 
-        if node.op == 'call_function' and any(n in node.name for n in ['matmul', 'reshape']):
+        if node.op == "call_function" and any(
+            n in node.name for n in ["matmul", "reshape"]
+        ):
             for n in node.args:
                 if n in not_contiguous_list:
                     # matmul won't change origin tensor, but create a tmp copy
                     mem += self._get_output_node_size(n)
-        elif node.op == 'call_module':
+        elif node.op == "call_module":
             for n in node.args:
                 if n in not_contiguous_list:
                     # module will just make origin tensor to contiguous
                     if delete:
                         not_contiguous_list.remove(n)
-        elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops):
+        elif node.op == "call_method" and any(
+            i in node.name for i in not_contiguous_ops
+        ):
             if node not in not_contiguous_list:
                 not_contiguous_list.append(node)
         elif any(i in node.args for i in not_contiguous_list):
@@ -865,13 +929,14 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
         return mem
 
     def _get_chunk_ratio(self, node, chunk_dim, chunk_size):
-        shape = node.meta['tensor_meta'].shape
+        shape = node.meta["tensor_meta"].shape
         chunk_ratio = float(chunk_size) / shape[chunk_dim]
         return chunk_ratio
 
-
-    def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node):
-        if user.op in ('placeholder', 'output'):
+    def _get_chunk_delete_node_size(
+        self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node
+    ):
+        if user.op in ("placeholder", "output"):
             return 0
         nodes_to_delete = user_to_last_uses.get(user, [])
         delete_size = 0
@@ -881,12 +946,11 @@ def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, node
                 delete_size += self._get_output_node_size(n) * chunk_ratio
         return delete_size
 
-
     def _print_mem_log(self, log, nodes, title=None):
         if title:
             print(title)
         for idx, (l, n) in enumerate(zip(log, nodes)):
-            print("%s:%.2f \t" % (n.name, l), end='')
+            print("%s:%.2f \t" % (n.name, l), end="")
             if (idx + 1) % 3 == 0:
                 print("")
         print("\n")
@@ -895,16 +959,23 @@ def _print_compute_op_mem_log(self, log, nodes, title=None):
         if title:
             print(title)
         for idx, (l, n) in enumerate(zip(log, nodes)):
-            if n.op in ['placeholder', 'get_attr', 'output']:
+            if n.op in ["placeholder", "get_attr", "output"]:
                 continue
-            if any(i in n.name for i in ['getitem', 'getattr']):
+            if any(i in n.name for i in ["getitem", "getattr"]):
                 continue
-            print("%s:%.2f \t" % (n.name, l), end='')
+            print("%s:%.2f \t" % (n.name, l), end="")
             if (idx + 1) % 3 == 0:
                 print("")
         print("\n")
-    
-    def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=None, end_nodes=None, chunk_dims=None, chunk_sizes=None):
+
+    def estimate_chunk_inference_mem(
+        self,
+        gm: torch.fx.GraphModule,
+        start_nodes=None,
+        end_nodes=None,
+        chunk_dims=None,
+        chunk_sizes=None,
+    ):
         act_memory = 0.0
         act_memory_peak_log = []
         act_memory_after_node_log = []
@@ -915,42 +986,65 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non
         user_to_last_uses = self._get_last_usr(node_list)
         user_to_last_uses_no_free_var = self._get_last_usr(node_list)
         _delete_free_var_from_last_use(user_to_last_uses_no_free_var)
-        
-        use_chunk = all(i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes])
+
+        use_chunk = all(
+            i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes]
+        )
         chunk_within = False
         chunk_region_idx = 0
-        chunk_ratio = 1 # use it to estimate chunk mem
+        chunk_ratio = 1  # use it to estimate chunk mem
 
         for idx, node in enumerate(node_list):
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
             if use_chunk and idx in start_nodes:
                 chunk_within = True
-                chunk_ratio = self._get_chunk_ratio(node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx])
-                act_memory += self._get_output_node_size(node_list[end_nodes[chunk_region_idx]]) / (1024 ** 2)
-                
+                chunk_ratio = self._get_chunk_ratio(
+                    node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx]
+                )
+                act_memory += self._get_output_node_size(
+                    node_list[end_nodes[chunk_region_idx]]
+                ) / (1024**2)
+
             # if node is placeholder, just add the size of the node
-            if node.op == 'placeholder':
-                act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024 ** 2)
+            if node.op == "placeholder":
+                act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2)
                 act_memory_peak_log.append(act_memory)
                 active_node_list.append(node.name)
             # skip output
-            elif node.op == 'output':
+            elif node.op == "output":
                 continue
             # node is an operation, calculate tmp, output node and delete node memory
             else:
                 # forward memory
-                act_memory += self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
-                act_memory += self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
+                act_memory += (
+                    self._get_contiguous_memory(node, not_contiguous_list)
+                    * chunk_ratio
+                    / (1024**2)
+                )
+                act_memory += (
+                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
+                )
                 # record max act memory
                 act_memory_peak_log.append(act_memory)
                 # delete useless memory
-                act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
+                act_memory -= (
+                    self._get_contiguous_memory(node, not_contiguous_list, delete=True)
+                    * chunk_ratio
+                    / (1024**2)
+                )
                 if chunk_within:
                     act_memory -= self._get_chunk_delete_node_size(
-                        node, user_to_last_uses_no_free_var, chunk_ratio, node_list, 
-                        start_nodes[chunk_region_idx], end_nodes[chunk_region_idx]) / (1024 ** 2)
+                        node,
+                        user_to_last_uses_no_free_var,
+                        chunk_ratio,
+                        node_list,
+                        start_nodes[chunk_region_idx],
+                        end_nodes[chunk_region_idx],
+                    ) / (1024**2)
                 else:
-                    act_memory -= self._get_delete_node_size(node, user_to_last_uses_no_free_var) / (1024 ** 2)
+                    act_memory -= self._get_delete_node_size(
+                        node, user_to_last_uses_no_free_var
+                    ) / (1024**2)
 
             # log active node
             self._add_active_node(node, active_node_list)
@@ -958,11 +1052,13 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non
 
             # if node in chunk end nodes, restore chunk settings
             if use_chunk and idx in end_nodes:
-                act_memory -= self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
+                act_memory -= (
+                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
+                )
                 chunk_within = False
                 chunk_ratio = 1
                 chunk_region_idx += 1
-            
+
             act_memory_after_node_log.append(act_memory)
             active_node_list_log.append(copy.deepcopy(active_node_list))
 
@@ -991,14 +1087,14 @@ def _find_peak_node(self, mem_peak):
         max_value = max(mem_peak)
         max_idx = mem_peak.index(max_value)
         return max_idx
-    
+
     def _get_free_var(self):
         free_var_idx = []
         for idx, n in enumerate(self.node_list):
-            if n.op == 'placeholder':
+            if n.op == "placeholder":
                 free_var_idx.append(idx)
         return free_var_idx
-    
+
     def _get_min_free_var(self, active_node_list, free_vars):
         min_len = 999
         for idx, n in enumerate(active_node_list):
@@ -1007,11 +1103,11 @@ def _get_min_free_var(self, active_node_list, free_vars):
             if len(n) < min_len:
                 min_len = len(n)
         return min_len
-    
+
     def _search_max_chunk_region(self, active_node, peak_node):
         free_vars = self._get_free_var()
         min_var = self._get_min_free_var(active_node, free_vars)
-        
+
         # from peak_node to free_var
         chunk_region_start = None
         for i in range(peak_node, -1, -1):
@@ -1029,17 +1125,19 @@ def _search_max_chunk_region(self, active_node, peak_node):
             if i in free_vars or i == 0:
                 raise RuntimeError()
         return chunk_region_start, chunk_region_end
-    
+
     def _is_not_compute(self, trace, chunk_range, dim_idx):
-        if trace['idx'][dim_idx] not in trace['compute']:
+        if trace["idx"][dim_idx] not in trace["compute"]:
             return True
-        if trace['idx'][dim_idx] in trace['compute'] and \
-            all(i < chunk_range[0] or i > chunk_range[1] for i in trace['compute'][trace['idx'][dim_idx]]):
+        if trace["idx"][dim_idx] in trace["compute"] and all(
+            i < chunk_range[0] or i > chunk_range[1]
+            for i in trace["compute"][trace["idx"][dim_idx]]
+        ):
             return True
         return False
-    
+
     def _check_duplicate_map(self, chunk_infos):
-        dim_map = [(i['inputs_dim'], i['outputs_dim']) for i in chunk_infos]
+        dim_map = [(i["inputs_dim"], i["outputs_dim"]) for i in chunk_infos]
         remove_list = []
         for idx1, (input_dim1, output_dim1) in enumerate(dim_map):
             for idx2, (input_dim2, output_dim2) in enumerate(dim_map):
@@ -1055,36 +1153,41 @@ def _check_duplicate_map(self, chunk_infos):
             if i in chunk_infos:
                 chunk_infos.remove(i)
         return chunk_infos
-    
+
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
         end_node = self.node_list[end_idx]
         chunk_infos = []
-        for end_dim, end_trace_idx in enumerate(end_trace['idx']):
+        for end_dim, end_trace_idx in enumerate(end_trace["idx"]):
             if len(start_traces) > 1:
                 # TODO implement multi input chunk
                 continue
             for start_node, start_trace in start_traces.items():
-                for start_dim, start_trace_idx in enumerate(start_trace['idx']):
+                for start_dim, start_trace_idx in enumerate(start_trace["idx"]):
                     # must be same trace idx
                     if start_trace_idx != end_trace_idx:
                         continue
                     # dim size cannot be 1
-                    if _get_node_shape(end_node)[end_dim] == 1 or \
-                        _get_node_shape(start_node)[start_dim] == 1:
+                    if (
+                        _get_node_shape(end_node)[end_dim] == 1
+                        or _get_node_shape(start_node)[start_dim] == 1
+                    ):
                         continue
                     # check index source align
                     if not self.index_tracer.check_index_source(
-                        start_dim, start_node, start_idx, end_dim, end_node):
+                        start_dim, start_node, start_idx, end_dim, end_node
+                    ):
                         continue
                     # check index copmute
                     if not self.index_tracer.check_index_compute(
-                        start_idx, end_dim, end_node, end_idx):
+                        start_idx, end_dim, end_node, end_idx
+                    ):
                         continue
                     # detect flow meet
                     flow_flag, chunk_info = self.flow_tracer._detect_flow(
-                        start_idx, start_dim, end_idx, end_dim)
+                        start_idx, start_dim, end_idx, end_dim
+                    )
                     if flow_flag:
                         continue
                     chunk_infos.append(chunk_info)
@@ -1098,59 +1201,78 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
         for _, n in enumerate(self.node_list):
             cur_trace = {}
             for arg in n.args:
-                if type(arg) == type(n) and not _is_non_compute_node_except_placeholder(arg):
+                if type(arg) == type(n) and not _is_non_compute_node_except_placeholder(
+                    arg
+                ):
                     cur_trace[arg] = self.index_tracer._find_trace_from_node(arg)
             input_trace.append(cur_trace)
 
         for start_idx in range(max_chunk_region[0], peak_node + 1):
             for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
-                if _is_non_compute_node(self.node_list[start_idx]) or \
-                    _is_non_compute_node(self.node_list[end_idx]):
+                if _is_non_compute_node(
+                    self.node_list[start_idx]
+                ) or _is_non_compute_node(self.node_list[end_idx]):
                     continue
-                
+
                 # select free dim
-                chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx)
+                chunk_info = self._find_free_dim(
+                    input_trace, output_trace, start_idx, end_idx
+                )
                 if len(chunk_info) > 0:
                     possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
-    
+
     def _search_best_chunk_region(self, possible_chunk_regions):
         max_region_range = 0
         best_regions = None
         for i in possible_chunk_regions:
-            if i['region'][1] - i['region'][0] > max_region_range:
+            if i["region"][1] - i["region"][0] > max_region_range:
                 best_regions = i
-                max_region_range = i['region'][1] - i['region'][0]
+                max_region_range = i["region"][1] - i["region"][0]
         return best_regions
-    
+
     def _step_search(self, mem_peak, active_node):
         peak_node = self._find_peak_node(mem_peak)
         max_chunk_region = self._search_max_chunk_region(active_node, peak_node)
-        possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node)
+        possible_chunk_regions = self._search_possible_chunk_regions(
+            max_chunk_region, peak_node
+        )
         best_chunk_region = self._search_best_chunk_region(possible_chunk_regions)
         return best_chunk_region
-    
+
     def _stop_search(self, init_mem_peak, mem_peak):
         sorted_init_mem_peak = sorted(init_mem_peak)
         if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]:
             return True
         return False
-    
+
     def search_region(self):
         chunk_regions = []
-        init_mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm)
+        (
+            init_mem_peak,
+            _,
+            active_node,
+        ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm)
         mem_peak = init_mem_peak
-        
+
         while True:
             chunk_region = self._step_search(mem_peak, active_node)
             if chunk_region is None:
                 break
-            
+
             chunk_regions.append(chunk_region)
-            mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem(
-                self.gm, [i['region'][0] for i in chunk_regions], 
-                [i['region'][1] for i in chunk_regions], [i['inputs_dim'] for i in chunk_regions], [1] * len(chunk_regions))
+            (
+                mem_peak,
+                _,
+                active_node,
+            ) = self.memory_estimator.estimate_chunk_inference_mem(
+                self.gm,
+                [i["region"][0] for i in chunk_regions],
+                [i["region"][1] for i in chunk_regions],
+                [i["inputs_dim"] for i in chunk_regions],
+                [1] * len(chunk_regions),
+            )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
         return chunk_regions
@@ -1180,18 +1302,24 @@ def _get_first_non_single_dim(shape):
 def _gen_loop_start(chunk_input_meta, chunk_output, chunk_dim, chunk_size=2):
     if len(chunk_input_meta) == 1:
         node = chunk_input_meta[0]
-        node_shape = node.meta['tensor_meta'].shape
-        free_shape = [node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))]
+        node_shape = node.meta["tensor_meta"].shape
+        free_shape = [
+            node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))
+        ]
         chunk_dim = _get_first_non_single_dim(free_shape)
         chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape)
-        out_shape = str(list(chunk_output.meta['tensor_meta'].shape))
-        
-        context = "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" % (
-            out_shape, node.name, node.name, chunk_size)
+        out_shape = str(list(chunk_output.meta["tensor_meta"].shape))
+
+        context = (
+            "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range"
+            % (out_shape, node.name, node.name, chunk_size)
+        )
         context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim)
         context += "    chunk_tensor = %s%s\n" % (node.name, chunk_slice)
     else:
-        raise NotImplementedError("input with size %d not implemented" % len(chunk_input_meta))
+        raise NotImplementedError(
+            "input with size %d not implemented" % len(chunk_input_meta)
+        )
     return context
 
 
@@ -1199,17 +1327,27 @@ def _gen_loop_end(chunk_outputs, chunk_inputs, node_list, chunk_dim):
     chunk_inputs_name = chunk_inputs[0].name
     chunk_outputs_name = chunk_outputs.name
     chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list)
-    chunk_output_shape = chunk_outputs.meta['tensor_meta'].shape
-    free_shape = [chunk_output_shape[i] if i in chunk_dim else 1 for i in range(len(chunk_output_shape))]
+    chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
+    free_shape = [
+        chunk_output_shape[i] if i in chunk_dim else 1
+        for i in range(len(chunk_output_shape))
+    ]
     chunk_dim = _get_first_non_single_dim(free_shape)
     chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape)
     context = "    chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name)
 
-    context += chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
-    
+    context += (
+        chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
+    )
+
     # determine if its the last use for chunk input
     users_name = list(chunk_inputs[0].users.keys())
-    if all([_find_idx_by_name(user.name, node_list) <= chunk_outputs_idx for user in users_name]):
+    if all(
+        [
+            _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx
+            for user in users_name
+        ]
+    ):
         context += ";  %s = None" % chunk_inputs_name
 
     context += "\n"
@@ -1255,8 +1393,11 @@ def _find_chunk_input_and_output_nodes(nodes: List[Node]):
     # we treat that input node as the input of the checkpoint function
     for node in nodes:
         for input_node in node._input_nodes.keys():
-            if input_node not in nodes and input_node not in input_nodes \
-                and not _is_non_compute_node_except_placeholder(input_node):
+            if (
+                input_node not in nodes
+                and input_node not in input_nodes
+                and not _is_non_compute_node_except_placeholder(input_node)
+            ):
                 input_nodes.append(input_node)
 
     # if a node has a user node which is not in the node list
@@ -1264,8 +1405,11 @@ def _find_chunk_input_and_output_nodes(nodes: List[Node]):
     # TODO it is unsafe to remove non compute node here
     for node in nodes:
         for output_node in node.users.keys():
-            if output_node not in nodes and node not in output_nodes \
-                and not _is_non_compute_node_except_placeholder(input_node):
+            if (
+                output_node not in nodes
+                and node not in output_nodes
+                and not _is_non_compute_node_except_placeholder(input_node)
+            ):
                 output_nodes.append(node)
 
     return input_nodes, output_nodes
@@ -1288,7 +1432,15 @@ def _replace_name(context, name_from, name_to):
     return context
 
 
-def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph):
+def emit_code_with_chunk(
+    body,
+    ckpt_func,
+    nodes,
+    emit_node_func,
+    delete_unused_value_func,
+    meta_nodes,
+    meta_graph,
+):
     """Emit code with nested activation checkpoint
     When we detect some of the node.activation_checkpoint is a List, we will use
     this function to emit the activation checkpoint codes.
@@ -1304,14 +1456,14 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     # find the offload regions
     chunk_region_search = ChunkRegionSearch(meta_graph)
     chunk_search = chunk_region_search.search_region()
-    chunk_regions = [i['region'] for i in chunk_search]
-    chunk_dims = [i['dim'] for i in chunk_search]
-    chunk_infos = [i['chunk_info'] for i in chunk_search]
-    
+    chunk_regions = [i["region"] for i in chunk_search]
+    chunk_dims = [i["dim"] for i in chunk_search]
+    chunk_infos = [i["chunk_info"] for i in chunk_search]
+
     chunk_starts = [item[0] for item in chunk_regions]
     chunk_ends = [item[1] for item in chunk_regions]
-    chunk_inputs = [[j['inputs'][0] for j in i] for i in chunk_infos]
-    chunk_outputs = [[j['outputs'][0] for j in i] for i in chunk_infos]
+    chunk_inputs = [[j["inputs"][0] for j in i] for i in chunk_infos]
+    chunk_outputs = [[j["outputs"][0] for j in i] for i in chunk_infos]
     within_chunk_region = False
 
     node_list = list(nodes)
@@ -1322,14 +1474,18 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
     #     inputs, outputs = _find_input_and_output_nodes(offload_node_list)
     #     chunk_inputs.append(inputs)
     #     chunk_outputs.append(outputs)
-    
-    chunk_inputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs]
-    chunk_outputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs]
+
+    chunk_inputs_idx = [
+        [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs
+    ]
+    chunk_outputs_idx = [
+        [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs
+    ]
     chunk_inputs_names = []
     for i in chunk_inputs:
         for j in i:
             chunk_inputs_names.append(j.name)
-    
+
     # this flag is to prevent repeated insert of save tensors
     # hooks definition in ckpt_func
     node_idx = 0
@@ -1340,16 +1496,24 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
         if node_idx in chunk_starts:
             within_chunk_region = True
             region_idx = chunk_starts.index(node_idx)
-                
+
             # add for loop
             chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]]
-            body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]], chunk_dims[region_idx]))
+            body.append(
+                _gen_loop_start(
+                    chunk_input_meta,
+                    node_list[chunk_ends[region_idx]],
+                    chunk_dims[region_idx],
+                )
+            )
 
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
-            body[-1] = _replace_name(body[-1], chunk_inputs[region_idx][0].name, 'chunk_tensor')
-            body[-1] = '    ' + body[-1]
+            body[-1] = _replace_name(
+                body[-1], chunk_inputs[region_idx][0].name, "chunk_tensor"
+            )
+            body[-1] = "    " + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
 
         else:
@@ -1358,7 +1522,11 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
                 delete_unused_value_func(node, body, chunk_inputs_names)
 
         if node_idx in chunk_ends:
-            body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx]))
+            body.append(
+                _gen_loop_end(
+                    node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx]
+                )
+            )
             within_chunk_region = False
 
         node_idx += 1
@@ -1372,14 +1540,16 @@ def __init__(self, meta_graph):
             self.meta_graph = meta_graph
             self.meta_node = list(meta_graph.graph.nodes)
 
-        def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
+        def _gen_python_code(
+            self, nodes, root_module: str, namespace: _Namespace
+        ) -> PythonCode:
             free_vars: List[str] = []
             body: List[str] = []
             globals_: Dict[str, Any] = {}
             wrapped_fns: Dict[str, None] = {}
 
             # Wrap string in list to pass by reference
-            maybe_return_annotation: List[str] = ['']
+            maybe_return_annotation: List[str] = [""]
 
             def add_global(name_hint: str, obj: Any):
                 """Add an obj to be tracked as a global.
@@ -1389,7 +1559,9 @@ def add_global(name_hint: str, obj: Any):
 
                 Returns: the global name that should be used to reference 'obj' in generated source.
                 """
-                if _is_from_torch(obj) and obj != torch.device:    # to support registering torch.device
+                if (
+                    _is_from_torch(obj) and obj != torch.device
+                ):  # to support registering torch.device
                     # HACK: workaround for how torch custom ops are registered. We
                     # can't import them like normal modules so they must retain their
                     # fully qualified name.
@@ -1405,7 +1577,9 @@ def add_global(name_hint: str, obj: Any):
                 return global_name
 
             # set _custom_builtins here so that we needn't import colossalai in forward
-            _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)
+            _custom_builtins["colossalai"] = _CustomBuiltin(
+                "import colossalai", colossalai
+            )
 
             # Pre-fill the globals table with registered builtins.
             for name, (_, obj) in _custom_builtins.items():
@@ -1414,16 +1588,16 @@ def add_global(name_hint: str, obj: Any):
             def type_repr(o: Any):
                 if o == ():
                     # Empty tuple is used for empty tuple type annotation Tuple[()]
-                    return '()'
+                    return "()"
 
                 typename = _type_repr(o)
 
-                if hasattr(o, '__origin__'):
+                if hasattr(o, "__origin__"):
                     # This is a generic type, e.g. typing.List[torch.Tensor]
                     origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
                     origin_typename = add_global(_type_repr(origin_type), origin_type)
 
-                    if hasattr(o, '__args__'):
+                    if hasattr(o, "__args__"):
                         # Assign global names for each of the inner type variables.
                         args = [type_repr(arg) for arg in o.__args__]
 
@@ -1441,20 +1615,21 @@ def type_repr(o: Any):
                 # Common case: this is a regular module name like 'foo.bar.baz'
                 return add_global(typename, o)
 
-            def _format_args(args: Tuple[Argument, ...], kwargs: Dict[str, Argument]) -> str:
-
+            def _format_args(
+                args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
+            ) -> str:
                 def _get_repr(arg):
                     # Handle NamedTuples (if it has `_fields`) via add_global.
-                    if isinstance(arg, tuple) and hasattr(arg, '_fields'):
+                    if isinstance(arg, tuple) and hasattr(arg, "_fields"):
                         qualified_name = _get_qualified_name(type(arg))
                         global_name = add_global(qualified_name, type(arg))
                         return f"{global_name}{repr(tuple(arg))}"
                     return repr(arg)
 
-                args_s = ', '.join(_get_repr(a) for a in args)
-                kwargs_s = ', '.join(f'{k} = {_get_repr(v)}' for k, v in kwargs.items())
+                args_s = ", ".join(_get_repr(a) for a in args)
+                kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items())
                 if args_s and kwargs_s:
-                    return f'{args_s}, {kwargs_s}'
+                    return f"{args_s}, {kwargs_s}"
                 return args_s or kwargs_s
 
             # Run through reverse nodes and record the first instance of a use
@@ -1472,9 +1647,9 @@ def register_last_uses(n: Node, user: Node):
             for node in reversed(nodes):
                 map_arg(node.args, lambda n: register_last_uses(n, node))
                 map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-            
+
             _delete_free_var_from_last_use(user_to_last_uses)
-            
+
             # NOTE: we add a variable to distinguish body and ckpt_func
             def delete_unused_values(user: Node, body, to_keep=[]):
                 """
@@ -1482,103 +1657,140 @@ def delete_unused_values(user: Node, body, to_keep=[]):
                 not used in the remainder of the code are freed and the memory usage
                 of the code is optimal.
                 """
-                if user.op == 'placeholder':
+                if user.op == "placeholder":
                     return
-                if user.op == 'output':
-                    body.append('\n')
+                if user.op == "output":
+                    body.append("\n")
                     return
                 nodes_to_delete = user_to_last_uses.get(user, [])
                 nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
                 if len(nodes_to_delete):
-                    to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
-                    body.append(f';  {to_delete_str}\n')
+                    to_delete_str = " = ".join(
+                        [repr(n) for n in nodes_to_delete] + ["None"]
+                    )
+                    body.append(f";  {to_delete_str}\n")
                 else:
-                    body.append('\n')
+                    body.append("\n")
 
             # NOTE: we add a variable to distinguish body and ckpt_func
             def emit_node(node: Node, body):
-                maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
-                if node.op == 'placeholder':
+                maybe_type_annotation = (
+                    "" if node.type is None else f" : {type_repr(node.type)}"
+                )
+                if node.op == "placeholder":
                     assert isinstance(node.target, str)
-                    maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}'
-                    free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
-                    raw_name = node.target.replace('*', '')
+                    maybe_default_arg = (
+                        "" if not node.args else f" = {repr(node.args[0])}"
+                    )
+                    free_vars.append(
+                        f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
+                    )
+                    raw_name = node.target.replace("*", "")
                     if raw_name != repr(node):
-                        body.append(f'{repr(node)} = {raw_name}\n')
+                        body.append(f"{repr(node)} = {raw_name}\n")
                     return
-                elif node.op == 'call_method':
+                elif node.op == "call_method":
                     assert isinstance(node.target, str)
                     body.append(
-                        f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}'
-                        f'({_format_args(node.args[1:], node.kwargs)})')
+                        f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
+                        f"({_format_args(node.args[1:], node.kwargs)})"
+                    )
                     return
-                elif node.op == 'call_function':
+                elif node.op == "call_function":
                     assert callable(node.target)
                     # pretty print operators
-                    if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
+                    if (
+                        node.target.__module__ == "_operator"
+                        and node.target.__name__ in magic_methods
+                    ):
                         assert isinstance(node.args, tuple)
-                        body.append(f'{repr(node)}{maybe_type_annotation} = '
-                                    f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
+                        body.append(
+                            f"{repr(node)}{maybe_type_annotation} = "
+                            f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
+                        )
                         return
 
                     # pretty print inplace operators; required for jit.script to work properly
                     # not currently supported in normal FX graphs, but generated by torchdynamo
-                    if node.target.__module__ == '_operator' and node.target.__name__ in inplace_methods:
-                        body.append(f'{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  '
-                                    f'{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}')
+                    if (
+                        node.target.__module__ == "_operator"
+                        and node.target.__name__ in inplace_methods
+                    ):
+                        body.append(
+                            f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
+                            f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
+                        )
                         return
 
                     qualified_name = _get_qualified_name(node.target)
                     global_name = add_global(qualified_name, node.target)
                     # special case for getattr: node.args could be 2-argument or 3-argument
                     # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
-                    if global_name == 'getattr' and \
-                    isinstance(node.args, tuple) and \
-                    isinstance(node.args[1], str) and \
-                    node.args[1].isidentifier() and \
-                    len(node.args) == 2:
+                    if (
+                        global_name == "getattr"
+                        and isinstance(node.args, tuple)
+                        and isinstance(node.args[1], str)
+                        and node.args[1].isidentifier()
+                        and len(node.args) == 2
+                    ):
                         body.append(
-                            f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
+                            f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
+                        )
                         return
                     body.append(
-                        f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
-                    if node.meta.get('is_wrapped', False):
+                        f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
+                    )
+                    if node.meta.get("is_wrapped", False):
                         wrapped_fns.setdefault(global_name)
                     return
-                elif node.op == 'call_module':
+                elif node.op == "call_module":
                     assert isinstance(node.target, str)
-                    body.append(f'{repr(node)}{maybe_type_annotation} = '
-                                f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
+                    body.append(
+                        f"{repr(node)}{maybe_type_annotation} = "
+                        f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
+                    )
                     return
-                elif node.op == 'get_attr':
+                elif node.op == "get_attr":
                     assert isinstance(node.target, str)
-                    body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
+                    body.append(
+                        f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
+                    )
                     return
-                elif node.op == 'output':
+                elif node.op == "output":
                     if node.type is not None:
                         maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
                     body.append(self.generate_output(node.args[0]))
                     return
-                raise NotImplementedError(f'node: {node.op} {node.target}')
+                raise NotImplementedError(f"node: {node.op} {node.target}")
 
             # Modified for activation checkpointing
             ckpt_func = []
 
             # if any node has a list of labels for activation_checkpoint, we
             # will use nested type of activation checkpoint codegen
-            emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node, self.meta_graph)
+            emit_code_with_chunk(
+                body,
+                ckpt_func,
+                nodes,
+                emit_node,
+                delete_unused_values,
+                self.meta_node,
+                self.meta_graph,
+            )
 
             if len(body) == 0:
                 # If the Graph has no non-placeholder nodes, no lines for the body
                 # have been emitted. To continue to have valid Python code, emit a
                 # single pass statement
-                body.append('pass\n')
+                body.append("pass\n")
 
             if len(wrapped_fns) > 0:
-                wrap_name = add_global('wrap', torch.fx.wrap)
-                wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
+                wrap_name = add_global("wrap", torch.fx.wrap)
+                wrap_stmts = "\n".join(
+                    [f'{wrap_name}("{name}")' for name in wrapped_fns]
+                )
             else:
-                wrap_stmts = ''
+                wrap_stmts = ""
 
             if self._body_transformer:
                 body = self._body_transformer(body)
@@ -1589,15 +1801,15 @@ def emit_node(node: Node, body):
             # as we need colossalai.utils.checkpoint, we need to import colossalai
             # in forward function
             prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
-            prologue = ''.join(ckpt_func) + prologue
+            prologue = "".join(ckpt_func) + prologue
             prologue = prologue
 
-            code = ''.join(body)
-            code = '\n'.join('    ' + line for line in code.split('\n'))
+            code = "".join(body)
+            code = "\n".join("    " + line for line in code.split("\n"))
             fn_code = f"""
 {wrap_stmts}
 
 {prologue}
-{code}"""   
+{code}"""
             print(fn_code)
             return PythonCode(fn_code, globals_)

From 5de9e46381f35a40ffff3675c2170a987b6fd9b9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sat, 10 Dec 2022 17:34:48 +0800
Subject: [PATCH 030/503] code format

---
 chunk_codegen.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index e8cf0d22f157..9147aa9fcc20 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -49,7 +49,9 @@ def _is_non_compute_node(node):
 
 
 def _is_non_compute_node_except_placeholder(node):
-    if (any(i in node.op for i in ["get_attr", "output"]) or any(i in node.name for i in ["getitem", "getattr"])):
+    if any(i in node.op for i in ["get_attr", "output"]) or any(
+        i in node.name for i in ["getitem", "getattr"]
+    ):
         return True
     return False
 

From 31a2c5d09fb5496c90f740b3e7cac787ef489e91 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 12 Dec 2022 17:24:06 +0800
Subject: [PATCH 031/503] work with outerproductmean and msa

---
 chunk_codegen.py | 258 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 168 insertions(+), 90 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 9147aa9fcc20..191eab564853 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -134,7 +134,7 @@ def find_node_flow(self, node):
                     return name, i
         raise RuntimeError("invalid node")
 
-    def get_flow_mix(self, node):
+    def _get_flow_mix_node(self, node):
         if self._is_non_compute_node(node):
             return None
         _, node_trace = self.find_node_flow(node)
@@ -145,7 +145,7 @@ def get_flow_mix(self, node):
         vars = list(node_trace["outside_depend"][0].values())[0]
         return vars
 
-    def get_same_flow_node(self, node_list, node):
+    def _get_same_flow_node(self, node_list, node):
         name, _ = self.find_node_flow(node)
         result = []
         for i in self.flow_trace[name]:
@@ -181,13 +181,14 @@ def trace_flow(self):
                     )
         return self.flow_trace
 
-    def _detect_flow(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = _find_chunk_input_and_output_nodes(
+    def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
+        inputs, outputs = _find_chunk_compute_input_and_output_nodes(
             self.node_list[start_idx : end_idx + 1]
         )
         chunk_info = {
             "region": (start_idx, end_idx),
             "inputs": inputs,
+            "inputs_non_chunk": [],
             "inputs_dim": start_dim,
             "outputs": outputs,
             "outputs_dim": end_dim,
@@ -197,31 +198,71 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim):
 
         for idx in range(start_idx, end_idx + 1):
             node = self.node_list[idx]
-            mix_flow_var = self.get_flow_mix(node)
-            if mix_flow_var is None:
+            mix_flow_node = self._get_flow_mix_node(node)
+            if mix_flow_node is None:
                 continue
 
-            # if there is a flow mix, op must be in [mul, add, div, matmul]
+            # if there is a flow mix, op must be in [mul, add, matmul]
             # element-wise op requires dim to be equal in every dim
             if any(n in node.name for n in ["mul", "add"]):
                 for i in node.args:
-                    if type(i) == type(mix_flow_var) and i != mix_flow_var:
+                    if type(i) == type(mix_flow_node) and i != mix_flow_node:
                         main_flow_var = i
                 # if mix flow is a broadcast in chunk dim,
                 # TODO need to move that flow out of the chunk
-                if mix_flow_var.meta["tensor_meta"].shape[dim_idx] == 1:
+                mix_flow_node_dim = index_tracer._get_node_chunk_dim(
+                    self.node_list[end_idx], end_dim, node
+                )
+                if mix_flow_node_dim is None:
                     flow_flag = True
-                    for i in self.get_same_flow_node(
-                        chunk_info["inputs"], mix_flow_var
+                    break
+                if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1:
+                    flow_flag = False
+                    for i in self._get_same_flow_node(
+                        chunk_info["inputs"], mix_flow_node
                     ):
                         chunk_info["inputs"].remove(i)
                 # else, we need to chunk mix var as well
                 else:
                     # TODO chunk another value
-                    flow_flag = False
+                    flow_flag = True
                     break
             else:
                 raise NotImplementedError("%s not implemented" % node.name)
+        
+        inputs_dim = []
+        remove_inputs = []
+        for input_node in chunk_info['inputs']:
+            input_dict = {}
+            for user in input_node.users.keys():
+                if _is_non_compute_node(user):
+                    continue
+                user_idx = _find_idx_by_name(user.name, self.node_list)
+                dim = None
+                if start_dim <= user_idx < end_idx:
+                    dim = index_tracer._get_node_chunk_dim(
+                        self.node_list[end_idx], end_dim, input_node
+                    )
+                elif user_idx == end_idx:
+                    dim = end_dim
+                # n has relation with chunk dim
+                if dim is not None and _get_node_shape(user)[dim] != 1:
+                    input_dict[user_idx] = dim
+            if len(input_dict) == 0:
+                remove_inputs.append(input_node)
+            else:
+                inputs_dim.append(input_dict)
+        chunk_info['inputs_dim'] = inputs_dim
+        for i in remove_inputs:
+            if i in chunk_info['inputs']:
+                chunk_info['inputs'].remove(i)
+        
+        # we need to log input nodes to avoid deleteing them in the loop
+        non_chunk_inputs = _find_chunk_all_input_nodes(self.node_list[start_idx : end_idx + 1])
+        for i in non_chunk_inputs:
+            if i not in chunk_info['inputs']:
+                chunk_info["inputs_non_chunk"].append(i)
+
         return flow_flag, chunk_info
 
 
@@ -367,6 +408,20 @@ def _find_trace_from_node(self, node):
         node_dict = self.idx_trace_list[node_idx]
         return node_dict
 
+    def _find_source_trace_from_node(self, node):
+        """
+        Find node source trace by the node.
+
+        Args:
+            node (node)
+        Returns:
+            source: the "source" entry of this node's trace dict,
+                looked up in self.idx_trace_list by the node's index.
+        """
+        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_dict = self.idx_trace_list[node_idx]
+        return node_dict["source"]
+
     def _find_idx_trace_from_node(self, node):
         """
         Find node idx trace by the node.
@@ -836,6 +891,15 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
         #         return False
         # return True
 
+    def _get_node_chunk_dim(self, node_from, node_from_dim, node_to):
+        node_from_source = self._find_source_trace_from_node(node_from)
+        dim_source = node_from_source[node_from_dim]
+        node_to_idx = _find_idx_by_name(node_to.name, self.nodes_list)
+        for k, v in dim_source.items():
+            if k == node_to_idx:
+                return v
+        return None
+
 
 class MemoryEstimator(object):
     def __init__(self) -> None:
@@ -931,8 +995,10 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
         return mem
 
     def _get_chunk_ratio(self, node, chunk_dim, chunk_size):
+        sorted_dim = sorted(chunk_dim, key=lambda x: list(x.keys())[0])
+        dim = list(sorted_dim[-1].values())[0]
         shape = node.meta["tensor_meta"].shape
-        chunk_ratio = float(chunk_size) / shape[chunk_dim]
+        chunk_ratio = float(chunk_size) / shape[dim]
         return chunk_ratio
 
     def _get_chunk_delete_node_size(
@@ -1157,6 +1223,8 @@ def _check_duplicate_map(self, chunk_infos):
         return chunk_infos
 
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
+        if start_idx == 71 and end_idx == 126:
+            print(1)
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
         end_node = self.node_list[end_idx]
@@ -1188,7 +1256,7 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                         continue
                     # detect flow meet
                     flow_flag, chunk_info = self.flow_tracer._detect_flow(
-                        start_idx, start_dim, end_idx, end_dim
+                        start_idx, start_dim, end_idx, end_dim, self.index_tracer
                     )
                     if flow_flag:
                         continue
@@ -1301,56 +1369,53 @@ def _get_first_non_single_dim(shape):
     raise RuntimeError("can not get first non single dim for shape", shape)
 
 
-def _gen_loop_start(chunk_input_meta, chunk_output, chunk_dim, chunk_size=2):
-    if len(chunk_input_meta) == 1:
-        node = chunk_input_meta[0]
-        node_shape = node.meta["tensor_meta"].shape
-        free_shape = [
-            node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))
-        ]
-        chunk_dim = _get_first_non_single_dim(free_shape)
-        chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape)
-        out_shape = str(list(chunk_output.meta["tensor_meta"].shape))
-
-        context = (
-            "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range"
-            % (out_shape, node.name, node.name, chunk_size)
-        )
-        context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim)
-        context += "    chunk_tensor = %s%s\n" % (node.name, chunk_slice)
-    else:
-        raise NotImplementedError(
-            "input with size %d not implemented" % len(chunk_input_meta)
-        )
+def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2):
+    input_node = chunk_input[0]
+    
+    out_shape = _get_node_shape(chunk_output)
+    out_str = str(list(out_shape))
+
+    context = (
+        "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range"
+        % (out_str, input_node.name, input_node.name, chunk_size)
+    )
+    context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim])
+
+    # node = chunk_input[0]
+    # node_shape = node.meta["tensor_meta"].shape
+    # free_shape = [
+    #     node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))
+    # ]
+    # chunk_dim = _get_first_non_single_dim(free_shape)
+    # chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape)
+    # out_shape = str(list(chunk_output.meta["tensor_meta"].shape))
+
+    # context = (
+    #     "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range"
+    #     % (out_shape, node.name, node.name, chunk_size)
+    # )
+    # context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim)
+    # context += "    chunk_tensor = %s%s\n" % (node.name, chunk_slice)
     return context
 
 
-def _gen_loop_end(chunk_outputs, chunk_inputs, node_list, chunk_dim):
-    chunk_inputs_name = chunk_inputs[0].name
+def _gen_loop_end(chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list):
     chunk_outputs_name = chunk_outputs.name
     chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list)
     chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
-    free_shape = [
-        chunk_output_shape[i] if i in chunk_dim else 1
-        for i in range(len(chunk_output_shape))
-    ]
-    chunk_dim = _get_first_non_single_dim(free_shape)
-    chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape)
+    chunk_slice = _gen_chunk_slice_dim(chunk_outputs_dim, "chunk_idx", chunk_output_shape)
     context = "    chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name)
-
-    context += (
-        chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
-    )
+    context += (chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None")
 
     # determine if its the last use for chunk input
-    users_name = list(chunk_inputs[0].users.keys())
-    if all(
-        [
-            _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx
-            for user in users_name
-        ]
-    ):
-        context += ";  %s = None" % chunk_inputs_name
+    for chunk_input in (chunk_inputs + chunk_non_compute_inputs):
+        if all(
+            [
+                _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx
+                for user in chunk_input.users.keys()
+            ]
+        ):
+            context += ";  %s = None" % chunk_input.name
 
     context += "\n"
     return context
@@ -1382,7 +1447,24 @@ def _find_input_and_output_nodes(nodes: List[Node]):
     return input_nodes, output_nodes
 
 
-def _find_chunk_input_and_output_nodes(nodes: List[Node]):
+def _find_chunk_all_input_nodes(nodes: List[Node]):
+    """
+    Find all input nodes of the given node list, compute or not.
+    input nodes are nodes outside the list that are used by nodes in the list;
+    each input node is returned once, in the order it is first encountered.
+    """
+    input_nodes = []
+    for node in nodes:
+        for input_node in node._input_nodes.keys():
+            if (
+                input_node not in nodes
+                and input_node not in input_nodes
+            ):
+                input_nodes.append(input_node)
+    return input_nodes
+
+
+def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
     """
     Find non-compute input and output node names.
     input nodes are nodes used in the list
@@ -1410,7 +1492,7 @@ def _find_chunk_input_and_output_nodes(nodes: List[Node]):
             if (
                 output_node not in nodes
                 and node not in output_nodes
-                and not _is_non_compute_node_except_placeholder(input_node)
+                and not _is_non_compute_node_except_placeholder(output_node)
             ):
                 output_nodes.append(node)
 
@@ -1454,44 +1536,34 @@ def emit_code_with_chunk(
         emit_node_func: function to emit node
         delete_unused_value_func: function to remove the unused value
     """
+    node_list = list(nodes)
 
-    # find the offload regions
+    # find the chunk regions
     chunk_region_search = ChunkRegionSearch(meta_graph)
     chunk_search = chunk_region_search.search_region()
-    chunk_regions = [i["region"] for i in chunk_search]
-    chunk_dims = [i["dim"] for i in chunk_search]
-    chunk_infos = [i["chunk_info"] for i in chunk_search]
-
-    chunk_starts = [item[0] for item in chunk_regions]
-    chunk_ends = [item[1] for item in chunk_regions]
-    chunk_inputs = [[j["inputs"][0] for j in i] for i in chunk_infos]
-    chunk_outputs = [[j["outputs"][0] for j in i] for i in chunk_infos]
-    within_chunk_region = False
-
-    node_list = list(nodes)
 
-    # find the input and output var names for each offload region
-    # for idx, (start, end) in enumerate(chunk_regions):
-    #     offload_node_list = node_list[start:end + 1]
-    #     inputs, outputs = _find_input_and_output_nodes(offload_node_list)
-    #     chunk_inputs.append(inputs)
-    #     chunk_outputs.append(outputs)
+    chunk_regions = [i["region"] for i in chunk_search]
+    chunk_starts = [i[0] for i in chunk_regions]
+    chunk_ends = [i[1] for i in chunk_regions]
 
+    chunk_inputs = [i["inputs"] for i in chunk_search]
+    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_search]
+    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_search]
     chunk_inputs_idx = [
         [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs
     ]
+    chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [j.name for i in chunk_inputs_non_chunk for j in i]
+
+    chunk_outputs = [i["outputs"][0] for i in chunk_search]
+    chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search]
     chunk_outputs_idx = [
-        [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs
+        _find_idx_by_name(i.name, node_list) for i in chunk_outputs
     ]
-    chunk_inputs_names = []
-    for i in chunk_inputs:
-        for j in i:
-            chunk_inputs_names.append(j.name)
 
-    # this flag is to prevent repeated insert of save tensors
-    # hooks definition in ckpt_func
     node_idx = 0
     region_idx = 0
+    within_chunk_region = False
+
     while node_idx < len(node_list):
         node = node_list[node_idx]
 
@@ -1500,21 +1572,24 @@ def emit_code_with_chunk(
             region_idx = chunk_starts.index(node_idx)
 
             # add for loop
-            chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]]
             body.append(
                 _gen_loop_start(
-                    chunk_input_meta,
-                    node_list[chunk_ends[region_idx]],
-                    chunk_dims[region_idx],
+                    chunk_inputs[region_idx],
+                    chunk_outputs[region_idx],
+                    chunk_outputs_dim[region_idx],
                 )
             )
 
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
-            body[-1] = _replace_name(
-                body[-1], chunk_inputs[region_idx][0].name, "chunk_tensor"
-            )
+            for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
+                for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
+                    if idx == node_idx:
+                        chunk_slice = _gen_chunk_slice_dim(dim, "chunk_idx", _get_node_shape(input_node))    
+                        body[-1] = _replace_name(
+                            body[-1], input_node.name, input_node.name + chunk_slice
+                        )
             body[-1] = "    " + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
 
@@ -1526,7 +1601,10 @@ def emit_code_with_chunk(
         if node_idx in chunk_ends:
             body.append(
                 _gen_loop_end(
-                    node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx]
+                    chunk_inputs[region_idx],
+                    chunk_inputs_non_chunk[region_idx],
+                    chunk_outputs[region_idx],
+                    chunk_outputs_dim[region_idx], node_list
                 )
             )
             within_chunk_region = False

From b7b67c32ad79c4e81775b32fc4a36ec733915f56 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 12 Dec 2022 17:25:38 +0800
Subject: [PATCH 032/503] code style

---
 chunk_codegen.py | 70 +++++++++++++++++++-----------------------------
 1 file changed, 28 insertions(+), 42 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 191eab564853..3bea84faeabb 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -229,10 +229,10 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
                     break
             else:
                 raise NotImplementedError("%s not implemented" % node.name)
-        
+
         inputs_dim = []
         remove_inputs = []
-        for input_node in chunk_info['inputs']:
+        for input_node in chunk_info["inputs"]:
             input_dict = {}
             for user in input_node.users.keys():
                 if _is_non_compute_node(user):
@@ -252,15 +252,17 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
                 remove_inputs.append(input_node)
             else:
                 inputs_dim.append(input_dict)
-        chunk_info['inputs_dim'] = inputs_dim
+        chunk_info["inputs_dim"] = inputs_dim
         for i in remove_inputs:
-            if i in chunk_info['inputs']:
-                chunk_info['inputs'].remove(i)
-        
+            if i in chunk_info["inputs"]:
+                chunk_info["inputs"].remove(i)
+
         # we need to log input nodes to avoid deleteing them in the loop
-        non_chunk_inputs = _find_chunk_all_input_nodes(self.node_list[start_idx : end_idx + 1])
+        non_chunk_inputs = _find_chunk_all_input_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
         for i in non_chunk_inputs:
-            if i not in chunk_info['inputs']:
+            if i not in chunk_info["inputs"]:
                 chunk_info["inputs_non_chunk"].append(i)
 
         return flow_flag, chunk_info
@@ -1371,44 +1373,32 @@ def _get_first_non_single_dim(shape):
 
 def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2):
     input_node = chunk_input[0]
-    
     out_shape = _get_node_shape(chunk_output)
     out_str = str(list(out_shape))
-
     context = (
         "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range"
         % (out_str, input_node.name, input_node.name, chunk_size)
     )
     context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim])
-
-    # node = chunk_input[0]
-    # node_shape = node.meta["tensor_meta"].shape
-    # free_shape = [
-    #     node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))
-    # ]
-    # chunk_dim = _get_first_non_single_dim(free_shape)
-    # chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape)
-    # out_shape = str(list(chunk_output.meta["tensor_meta"].shape))
-
-    # context = (
-    #     "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range"
-    #     % (out_shape, node.name, node.name, chunk_size)
-    # )
-    # context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim)
-    # context += "    chunk_tensor = %s%s\n" % (node.name, chunk_slice)
     return context
 
 
-def _gen_loop_end(chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list):
+def _gen_loop_end(
+    chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list
+):
     chunk_outputs_name = chunk_outputs.name
     chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list)
     chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
-    chunk_slice = _gen_chunk_slice_dim(chunk_outputs_dim, "chunk_idx", chunk_output_shape)
+    chunk_slice = _gen_chunk_slice_dim(
+        chunk_outputs_dim, "chunk_idx", chunk_output_shape
+    )
     context = "    chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name)
-    context += (chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None")
+    context += (
+        chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
+    )
 
     # determine if its the last use for chunk input
-    for chunk_input in (chunk_inputs + chunk_non_compute_inputs):
+    for chunk_input in chunk_inputs + chunk_non_compute_inputs:
         if all(
             [
                 _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx
@@ -1456,10 +1446,7 @@ def _find_chunk_all_input_nodes(nodes: List[Node]):
     input_nodes = []
     for node in nodes:
         for input_node in node._input_nodes.keys():
-            if (
-                input_node not in nodes
-                and input_node not in input_nodes
-            ):
+            if input_node not in nodes and input_node not in input_nodes:
                 input_nodes.append(input_node)
     return input_nodes
 
@@ -1549,16 +1536,12 @@ def emit_code_with_chunk(
     chunk_inputs = [i["inputs"] for i in chunk_search]
     chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_search]
     chunk_inputs_dim = [i["inputs_dim"] for i in chunk_search]
-    chunk_inputs_idx = [
-        [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs
+    chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
+        j.name for i in chunk_inputs_non_chunk for j in i
     ]
-    chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [j.name for i in chunk_inputs_non_chunk for j in i]
 
     chunk_outputs = [i["outputs"][0] for i in chunk_search]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search]
-    chunk_outputs_idx = [
-        _find_idx_by_name(i.name, node_list) for i in chunk_outputs
-    ]
 
     node_idx = 0
     region_idx = 0
@@ -1586,7 +1569,9 @@ def emit_code_with_chunk(
             for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
                 for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
                     if idx == node_idx:
-                        chunk_slice = _gen_chunk_slice_dim(dim, "chunk_idx", _get_node_shape(input_node))    
+                        chunk_slice = _gen_chunk_slice_dim(
+                            dim, "chunk_idx", _get_node_shape(input_node)
+                        )
                         body[-1] = _replace_name(
                             body[-1], input_node.name, input_node.name + chunk_slice
                         )
@@ -1604,7 +1589,8 @@ def emit_code_with_chunk(
                     chunk_inputs[region_idx],
                     chunk_inputs_non_chunk[region_idx],
                     chunk_outputs[region_idx],
-                    chunk_outputs_dim[region_idx], node_list
+                    chunk_outputs_dim[region_idx],
+                    node_list,
                 )
             )
             within_chunk_region = False

From 5cdfcfe1d168e39d39a741112c036fa1455f0d06 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 12 Dec 2022 17:29:07 +0800
Subject: [PATCH 033/503] code style

---
 chunk_codegen.py | 49 ++++--------------------------------------------
 1 file changed, 4 insertions(+), 45 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 3bea84faeabb..96dcbfc0f79d 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -92,24 +92,10 @@ def _init_trace(self):
                 self._add_trace(i.name)
                 self._add_node(i.name, i)
 
-    def _is_non_compute_node(self, node):
-        if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(
-            i in node.name for i in ["getitem", "getattr"]
-        ):
-            return True
-        return False
-
-    def _is_non_compute_node_except_placeholder(self, node):
-        if any(i in node.op for i in ["get_attr", "output"]) or any(
-            i in node.name for i in ["getitem", "getattr"]
-        ):
-            return True
-        return False
-
     def _find_flow_for_node(self, node):
         if type(self.node_list[0]) != type(node):
             return None
-        if self._is_non_compute_node_except_placeholder(node):
+        if _is_non_compute_node_except_placeholder(node):
             return None
         for name, trace in self.flow_trace.items():
             for i in trace:
@@ -135,7 +121,7 @@ def find_node_flow(self, node):
         raise RuntimeError("invalid node")
 
     def _get_flow_mix_node(self, node):
-        if self._is_non_compute_node(node):
+        if _is_non_compute_node(node):
             return None
         _, node_trace = self.find_node_flow(node)
         if len(node_trace["outside_depend"]) == 0:
@@ -160,10 +146,9 @@ def trace_flow(self):
         for node in self.node_list:
             # skip if non compute node
             if all(
-                type(arg) != type(node)
-                or self._is_non_compute_node_except_placeholder(arg)
+                type(arg) != type(node) or _is_non_compute_node_except_placeholder(arg)
                 for arg in node.args
-            ) or self._is_non_compute_node(node):
+            ) or _is_non_compute_node(node):
                 continue
 
             node_input_flows = [self._find_flow_for_node(arg) for arg in node.args]
@@ -1411,32 +1396,6 @@ def _gen_loop_end(
     return context
 
 
-def _find_input_and_output_nodes(nodes: List[Node]):
-    """
-    Find the input and output node names which are not found in the given list of nodes.
-    """
-    input_nodes = []
-    output_nodes = []
-
-    # if a node has an input node which is not in the node list
-    # we treat that input node as the input of the checkpoint function
-    for node in nodes:
-        for input_node in node._input_nodes.keys():
-            node_repr = repr(input_node)
-            if input_node not in nodes and input_node not in input_nodes:
-                input_nodes.append(input_node)
-
-    # if a node has a user node which is not in the node list
-    # we treat that user node as the node receiving the current node output
-    for node in nodes:
-        for output_node in node.users.keys():
-            node_repr = repr(node)
-            if output_node not in nodes and output_node not in output_nodes:
-                output_nodes.append(output_node)
-
-    return input_nodes, output_nodes
-
-
 def _find_chunk_all_input_nodes(nodes: List[Node]):
     """
     Find non-compute input and output node names.

From 8511d900a88638cb04ced2db35b171a96f6f310c Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 12 Dec 2022 17:36:17 +0800
Subject: [PATCH 034/503] code style

---
 chunk_codegen.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 96dcbfc0f79d..88d9178091b7 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1210,8 +1210,6 @@ def _check_duplicate_map(self, chunk_infos):
         return chunk_infos
 
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
-        if start_idx == 71 and end_idx == 126:
-            print(1)
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
         end_node = self.node_list[end_idx]
@@ -1347,15 +1345,6 @@ def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
     return new_shape
 
 
-def _get_first_non_single_dim(shape):
-    for idx, i in enumerate(shape):
-        if i == 1:
-            continue
-        else:
-            return idx
-    raise RuntimeError("can not get first non single dim for shape", shape)
-
-
 def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2):
     input_node = chunk_input[0]
     out_shape = _get_node_shape(chunk_output)

From 98f9728e29f463692cea1533c998f0e7f2381e59 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 12 Dec 2022 18:15:47 +0800
Subject: [PATCH 035/503] code style

---
 chunk_codegen.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 88d9178091b7..22d48f5d661a 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -194,7 +194,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
                     if type(i) == type(mix_flow_node) and i != mix_flow_node:
                         main_flow_var = i
                 # if mix flow is a broadcast in chunk dim,
-                # TODO need to move that flow out of the chunk
+                # TODO: need to move that flow out of the chunk
                 mix_flow_node_dim = index_tracer._get_node_chunk_dim(
                     self.node_list[end_idx], end_dim, node
                 )
@@ -1200,7 +1200,7 @@ def _check_duplicate_map(self, chunk_infos):
                     continue
                 # it means an index create 2 copy of itself
                 # eg. a = torch.matmul(x, x.transpose(-1, -2))
-                # TODO currently remove it, deal with this in future
+                # TODO: currently remove it, deal with this in future
                 if input_dim1 == input_dim2 and output_dim1 != output_dim2:
                     remove_list.append(chunk_infos[idx1])
                     remove_list.append(chunk_infos[idx2])
@@ -1216,7 +1216,7 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
         chunk_infos = []
         for end_dim, end_trace_idx in enumerate(end_trace["idx"]):
             if len(start_traces) > 1:
-                # TODO implement multi input chunk
+                # TODO: implement multi input chunk
                 continue
             for start_node, start_trace in start_traces.items():
                 for start_dim, start_trace_idx in enumerate(start_trace["idx"]):
@@ -1421,7 +1421,7 @@ def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
 
     # if a node has a user node which is not in the node list
     # we treat that user node as the node receiving the current node output
-    # TODO it is unsafe to remove non compute node here
+    # TODO: it is unsafe to remove non compute node here
     for node in nodes:
         for output_node in node.users.keys():
             if (

From 8754fa255376055c01aab4a3fab385454b8b7930 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 12 Dec 2022 18:25:47 +0800
Subject: [PATCH 036/503] change threshold

---
 chunk_codegen_run.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 88c734903392..99700e1af9d8 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -45,8 +45,9 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     with torch.no_grad():
         non_fx_out = model(node, pair)
         fx_out = gm(node, pair)
-    assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-6), "fx_out doesn't comply with original output"
-    assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-6), "fx_out doesn't comply with original output"
+
+    assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with original output"
+    assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output"
 
     # test barckward
     # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum()

From 1e0fd11bc1773ca47cbd95fb19b86517265390ce Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 13 Dec 2022 10:01:30 +0800
Subject: [PATCH 037/503] support check_index_duplicate

---
 chunk_codegen.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 22d48f5d661a..64bff4a801a1 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -179,7 +179,12 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
             "outputs_dim": end_dim,
             "args": {},
         }
-        flow_flag = False
+        flow_block = False
+        
+        # TODO don't allow multi outputs now
+        if len(outputs) > 1:
+            flow_block = True
+            return flow_block, chunk_info
 
         for idx in range(start_idx, end_idx + 1):
             node = self.node_list[idx]
@@ -199,10 +204,10 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
                     self.node_list[end_idx], end_dim, node
                 )
                 if mix_flow_node_dim is None:
-                    flow_flag = True
+                    flow_block = True
                     break
                 if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1:
-                    flow_flag = False
+                    flow_block = False
                     for i in self._get_same_flow_node(
                         chunk_info["inputs"], mix_flow_node
                     ):
@@ -210,11 +215,15 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
                 # else, we need to chunk mix var as well
                 else:
                     # TODO chunk another value
-                    flow_flag = True
+                    flow_block = True
                     break
             else:
                 raise NotImplementedError("%s not implemented" % node.name)
 
+        if flow_block:
+            flow_block = True
+            return flow_block, chunk_info
+        
         inputs_dim = []
         remove_inputs = []
         for input_node in chunk_info["inputs"]:
@@ -250,7 +259,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
             if i not in chunk_info["inputs"]:
                 chunk_info["inputs_non_chunk"].append(i)
 
-        return flow_flag, chunk_info
+        return flow_block, chunk_info
 
 
 class IndexTracer(object):
@@ -869,14 +878,6 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
         if any(start_idx <= i <= end_idx for i in end_node_compute):
             return False
         return True
-        # end_node_trace_source = end_node_trace['source'][end_dim]
-        # for node_idx, node_dim in end_node_trace_source.items():
-        #     if node_idx < start_node_idx or node_idx > end_node_idx:
-        #         continue
-        #     compute_list = self.idx_trace_list[node_idx]['compute'][node_dim]
-        #     if any(start_node_idx <= i <= end_node_idx for i in compute_list):
-        #         return False
-        # return True
 
     def _get_node_chunk_dim(self, node_from, node_from_dim, node_to):
         node_from_source = self._find_source_trace_from_node(node_from)
@@ -1240,10 +1241,10 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                     ):
                         continue
                     # detect flow meet
-                    flow_flag, chunk_info = self.flow_tracer._detect_flow(
+                    flow_block, chunk_info = self.flow_tracer._detect_flow(
                         start_idx, start_dim, end_idx, end_dim, self.index_tracer
                     )
-                    if flow_flag:
+                    if flow_block:
                         continue
                     chunk_infos.append(chunk_info)
         chunk_infos = self._check_duplicate_map(chunk_infos)

From cda3e8572a8ab1f0c48342ad305fadbf892d62b2 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 13 Dec 2022 10:02:26 +0800
Subject: [PATCH 038/503] support index duplicate and update loop

---
 chunk_codegen.py     | 109 +++++++++++++++++++++++++++++--------------
 chunk_codegen_run.py |   4 +-
 2 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 64bff4a801a1..b5bb8f18560a 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -180,7 +180,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
             "args": {},
         }
         flow_block = False
-        
+
         # TODO don't allow multi outputs now
         if len(outputs) > 1:
             flow_block = True
@@ -200,7 +200,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
                         main_flow_var = i
                 # if mix flow is a broadcast in chunk dim,
                 # TODO: need to move that flow out of the chunk
-                mix_flow_node_dim = index_tracer._get_node_chunk_dim(
+                mix_flow_node_dim = index_tracer.get_node_chunk_dim(
                     self.node_list[end_idx], end_dim, node
                 )
                 if mix_flow_node_dim is None:
@@ -223,7 +223,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
         if flow_block:
             flow_block = True
             return flow_block, chunk_info
-        
+
         inputs_dim = []
         remove_inputs = []
         for input_node in chunk_info["inputs"]:
@@ -234,7 +234,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
                 user_idx = _find_idx_by_name(user.name, self.node_list)
                 dim = None
                 if start_dim <= user_idx < end_idx:
-                    dim = index_tracer._get_node_chunk_dim(
+                    dim = index_tracer.get_node_chunk_dim(
                         self.node_list[end_idx], end_dim, input_node
                     )
                 elif user_idx == end_idx:
@@ -300,10 +300,10 @@ def _del_dim(self, idx, dim_idx):
         self.idx_trace_list[idx]["compute"].pop(dim_idx)
         self.idx_trace_list[idx]["source"].pop(dim_idx)
 
-    def _add_dim(self, idx, dim_idx):
-        self.idx_trace_list[idx]["idx"].insert(dim_idx, self._add_index())
-        self.idx_trace_list[idx]["compute"].insert(dim_idx, [])
-        self.idx_trace_list[idx]["source"].insert(dim_idx, {})
+    def _add_dim(self, node_idx, dim_idx):
+        self.idx_trace_list[node_idx]["idx"].insert(dim_idx, self._add_index())
+        self.idx_trace_list[node_idx]["compute"].insert(dim_idx, [])
+        self.idx_trace_list[node_idx]["source"].insert(dim_idx, {})
 
     def _transform_index(self, node, node_dim):
         node_idx = self._find_idx_trace_from_node(node)
@@ -659,9 +659,7 @@ def _assign_unsqueeze_index(self, node, node_idx):
         """
         self._del_dim(node_idx, -1)
         self._assign_index_as_input(node, node_idx)
-        self.idx_trace_list[node_idx]["idx"].insert(node.args[1], self._add_index())
-        self.idx_trace_list[node_idx]["compute"].insert(node.args[1], [])
-        self.idx_trace_list[node_idx]["source"].insert(node.args[1], [])
+        self._add_dim(node_idx, node.args[1])
 
     def _assign_dropout_index(self, node, node_idx):
         """
@@ -879,7 +877,7 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
             return False
         return True
 
-    def _get_node_chunk_dim(self, node_from, node_from_dim, node_to):
+    def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
         node_from_source = self._find_source_trace_from_node(node_from)
         dim_source = node_from_source[node_from_dim]
         node_to_idx = _find_idx_by_name(node_to.name, self.nodes_list)
@@ -888,6 +886,44 @@ def _get_node_chunk_dim(self, node_from, node_from_dim, node_to):
                 return v
         return None
 
+    def _find_inherit_dim(self, input_node, input_dim, node):
+        input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list)
+        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_trace_source = self._find_source_trace_from_node(node)
+        for node_dim in range(len(_get_node_shape(node))):
+            if (
+                input_node_idx in node_trace_source[node_dim]
+                and node_trace_source[node_dim][input_node_idx] == input_dim
+            ):
+                return {node_idx: node_dim}
+        return {}
+
+    def check_index_duplicate(self, chunk_infos):
+        input_dim_after_node = {}
+        for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
+            for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
+                input_dim_after_node.update(
+                    self._find_inherit_dim(input_node, v, self.nodes_list[k])
+                )
+
+        for node in self.nodes_list[
+            chunk_infos["region"][0] : chunk_infos["region"][1] + 1
+        ]:
+            if _is_non_compute_node_except_placeholder(node):
+                continue
+            count = 0
+            node_trace_source = self._find_source_trace_from_node(node)
+            for node_dim in range(len(_get_node_shape(node))):
+                dim_source = node_trace_source[node_dim]
+                for k, v in dim_source.items():
+                    if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]:
+                        if k in input_dim_after_node and input_dim_after_node[k] == v:
+                            count += 1
+                            break
+            if count > 1:
+                return False
+        return True
+
 
 class MemoryEstimator(object):
     def __init__(self) -> None:
@@ -1160,7 +1196,7 @@ def _get_min_free_var(self, active_node_list, free_vars):
                 min_len = len(n)
         return min_len
 
-    def _search_max_chunk_region(self, active_node, peak_node):
+    def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
         free_vars = self._get_free_var()
         min_var = self._get_min_free_var(active_node, free_vars)
 
@@ -1180,6 +1216,21 @@ def _search_max_chunk_region(self, active_node, peak_node):
                 break
             if i in free_vars or i == 0:
                 raise RuntimeError()
+
+        for i in chunk_regions:
+            region = i["region"]
+            if chunk_region_start >= region[0] and chunk_region_end <= region[1]:
+                return None
+            elif (
+                region[0] <= chunk_region_start <= region[1]
+                and chunk_region_end > region[1]
+            ):
+                chunk_region_start = region[1] + 1
+            elif (
+                region[0] <= chunk_region_end <= region[1]
+                and chunk_region_start < region[0]
+            ):
+                chunk_region_end = region[0] - 1
         return chunk_region_start, chunk_region_end
 
     def _is_not_compute(self, trace, chunk_range, dim_idx):
@@ -1192,24 +1243,6 @@ def _is_not_compute(self, trace, chunk_range, dim_idx):
             return True
         return False
 
-    def _check_duplicate_map(self, chunk_infos):
-        dim_map = [(i["inputs_dim"], i["outputs_dim"]) for i in chunk_infos]
-        remove_list = []
-        for idx1, (input_dim1, output_dim1) in enumerate(dim_map):
-            for idx2, (input_dim2, output_dim2) in enumerate(dim_map):
-                if idx1 == idx2:
-                    continue
-                # it means an index create 2 copy of itself
-                # eg. a = torch.matmul(x, x.transpose(-1, -2))
-                # TODO: currently remove it, deal with this in future
-                if input_dim1 == input_dim2 and output_dim1 != output_dim2:
-                    remove_list.append(chunk_infos[idx1])
-                    remove_list.append(chunk_infos[idx2])
-        for i in remove_list:
-            if i in chunk_infos:
-                chunk_infos.remove(i)
-        return chunk_infos
-
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
@@ -1246,8 +1279,10 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                     )
                     if flow_block:
                         continue
+                    # check index copmute
+                    if not self.index_tracer.check_index_duplicate(chunk_info):
+                        continue
                     chunk_infos.append(chunk_info)
-        chunk_infos = self._check_duplicate_map(chunk_infos)
         return chunk_infos
 
     def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
@@ -1288,9 +1323,13 @@ def _search_best_chunk_region(self, possible_chunk_regions):
                 max_region_range = i["region"][1] - i["region"][0]
         return best_regions
 
-    def _step_search(self, mem_peak, active_node):
+    def _step_search(self, mem_peak, active_node, chunk_regions):
         peak_node = self._find_peak_node(mem_peak)
-        max_chunk_region = self._search_max_chunk_region(active_node, peak_node)
+        max_chunk_region = self._search_max_chunk_region(
+            active_node, peak_node, chunk_regions
+        )
+        if max_chunk_region == None:
+            return None
         possible_chunk_regions = self._search_possible_chunk_regions(
             max_chunk_region, peak_node
         )
@@ -1313,7 +1352,7 @@ def search_region(self):
         mem_peak = init_mem_peak
 
         while True:
-            chunk_region = self._step_search(mem_peak, active_node)
+            chunk_region = self._step_search(mem_peak, active_node, chunk_regions)
             if chunk_region is None:
                 break
 
diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index 99700e1af9d8..ae4653d6545b 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -46,8 +46,8 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
         non_fx_out = model(node, pair)
         fx_out = gm(node, pair)
 
-    assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with original output"
-    assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output"
+    assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[0] - fx_out[0]))
+    assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[1] - fx_out[1]))
 
     # test barckward
     # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum()

From de65e6c3e88bc1b217b894bf20a4769748145605 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 13 Dec 2022 11:00:51 +0800
Subject: [PATCH 039/503] support output

---
 chunk_codegen.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index b5bb8f18560a..79cefddf07d2 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -56,6 +56,14 @@ def _is_non_compute_node_except_placeholder(node):
     return False
 
 
+def _is_non_compute_node_except_placeholder_output(node):
+    if any(i in node.op for i in ["get_attr"]) or any(
+        i in node.name for i in ["getitem", "getattr"]
+    ):
+        return True
+    return False
+
+
 class FlowTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
@@ -1083,13 +1091,14 @@ def estimate_chunk_inference_mem(
             i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes]
         )
         chunk_within = False
-        chunk_region_idx = 0
+        chunk_region_idx = None
         chunk_ratio = 1  # use it to estimate chunk mem
 
         for idx, node in enumerate(node_list):
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
             if use_chunk and idx in start_nodes:
                 chunk_within = True
+                chunk_region_idx = start_nodes.index(idx)
                 chunk_ratio = self._get_chunk_ratio(
                     node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx]
                 )
@@ -1149,7 +1158,7 @@ def estimate_chunk_inference_mem(
                 )
                 chunk_within = False
                 chunk_ratio = 1
-                chunk_region_idx += 1
+                chunk_region_idx = None
 
             act_memory_after_node_log.append(act_memory)
             active_node_list_log.append(copy.deepcopy(active_node_list))
@@ -1467,7 +1476,7 @@ def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
             if (
                 output_node not in nodes
                 and node not in output_nodes
-                and not _is_non_compute_node_except_placeholder(output_node)
+                and not _is_non_compute_node_except_placeholder_output(output_node)
             ):
                 output_nodes.append(node)
 

From e83e3c615452c5f8ab04f558880c378256d95802 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 16 Dec 2022 11:09:35 +0800
Subject: [PATCH 040/503] update memory estimate

---
 chunk_codegen.py | 177 +++++++++++++++++++++++++++++------------------
 1 file changed, 111 insertions(+), 66 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 79cefddf07d2..18d9a0c8d764 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -896,23 +896,22 @@ def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
 
     def _find_inherit_dim(self, input_node, input_dim, node):
         input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list)
-        node_idx = _find_idx_by_name(node.name, self.nodes_list)
         node_trace_source = self._find_source_trace_from_node(node)
         for node_dim in range(len(_get_node_shape(node))):
             if (
                 input_node_idx in node_trace_source[node_dim]
                 and node_trace_source[node_dim][input_node_idx] == input_dim
             ):
-                return {node_idx: node_dim}
-        return {}
+                return node_dim
+        return None
 
     def check_index_duplicate(self, chunk_infos):
         input_dim_after_node = {}
         for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
             for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
-                input_dim_after_node.update(
-                    self._find_inherit_dim(input_node, v, self.nodes_list[k])
-                )
+                inherit_dim = self._find_inherit_dim(input_node, v, self.nodes_list[k])
+                if inherit_dim:
+                    input_dim_after_node[k] = inherit_dim
 
         for node in self.nodes_list[
             chunk_infos["region"][0] : chunk_infos["region"][1] + 1
@@ -934,8 +933,8 @@ def check_index_duplicate(self, chunk_infos):
 
 
 class MemoryEstimator(object):
-    def __init__(self) -> None:
-        pass
+    def __init__(self, index_tracer: IndexTracer) -> None:
+        self.index_tracer = index_tracer
 
     def _get_meta_node_size(self, x):
         x = x.meta["tensor_meta"]
@@ -950,6 +949,8 @@ def _get_output_node(self, n):
         }
         out_size = activation_size(fwd_out)
         out_node = [n.name] if out_size > 0 else []
+        # if any(i in n.name for i in ['transpose', 'permute', 'view']):
+        #     out_size = 0
         return out_size, out_node
 
     def _get_output_node_size(self, n):
@@ -961,11 +962,19 @@ def _add_active_node(self, n, active_list):
             if i not in active_list:
                 active_list.append(i)
 
-    def _get_delete_node(self, user, user_to_last_uses):
+    def _get_delete_node(self, user, user_to_last_uses, to_keep=None):
         delete_size = 0
         delete_node = []
         if user.op not in ("placeholder", "output"):
             nodes_to_delete = user_to_last_uses.get(user, [])
+            if to_keep is not None:
+                keep_list = []
+                for n in nodes_to_delete:
+                    if n.name in to_keep:
+                        keep_list.append(n)
+                for n in keep_list:
+                    if n in nodes_to_delete:
+                        nodes_to_delete.remove(n)
             if len(nodes_to_delete):
                 out_node = [self._get_output_node(i) for i in nodes_to_delete]
                 delete_size = sum([i[0] for i in out_node])
@@ -974,15 +983,30 @@ def _get_delete_node(self, user, user_to_last_uses):
                         delete_node.append(out_node[i][1][0])
                     elif nodes_to_delete[i].op == "placeholder":
                         delete_node.append(nodes_to_delete[i].name)
+                    # elif any(j in nodes_to_delete[i].name for j in ['transpose', 'permute', 'view']):
+                    #     delete_node.append(nodes_to_delete[i].name)
         return delete_size, delete_node
 
-    def _get_delete_node_size(self, user, user_to_last_uses):
-        return self._get_delete_node(user, user_to_last_uses)[0]
+    def _get_delete_node_size(self, user, user_to_last_uses, to_keep):
+        return self._get_delete_node(user, user_to_last_uses, to_keep)[0]
 
     def _remove_deactive_node(self, user, user_to_last_uses, active_list):
         delete_node = self._get_delete_node(user, user_to_last_uses)[1]
         for i in delete_node:
-            active_list.remove(i)
+            if i in active_list:
+                active_list.remove(i)
+    
+    def _get_chunk_inputs_size(self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx):
+        nodes_to_delete = []
+        for chunk_input in chunk_inputs + chunk_inputs_non_chunk:
+            chunk_input_users = chunk_input.users.keys()
+            chunk_input_users_idx = [_find_idx_by_name(i.name, node_list) for i in chunk_input_users]
+            if all(i <= chunk_end_idx for i in chunk_input_users_idx):
+                if chunk_input not in nodes_to_delete:
+                    nodes_to_delete.append(chunk_input)
+        out_node = [self._get_output_node(i) for i in nodes_to_delete]
+        delete_size = sum([i[0] for i in out_node])
+        return delete_size
 
     def _get_last_usr(self, nodes):
         node_to_last_use: Dict[Node, Node] = {}
@@ -1000,7 +1024,8 @@ def register_last_uses(n: Node, user: Node):
 
     def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
         mem = 0
-        not_contiguous_ops = ["transpose", "permute"]
+        not_contiguous_ops = ["permute"]
+        inherit_contiguous_ops = ["transpose", "view"]
 
         if node.op == "call_function" and any(
             n in node.name for n in ["matmul", "reshape"]
@@ -1020,30 +1045,36 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
         ):
             if node not in not_contiguous_list:
                 not_contiguous_list.append(node)
-        elif any(i in node.args for i in not_contiguous_list):
-            if node not in not_contiguous_list:
-                not_contiguous_list.append(node)
-
         return mem
 
-    def _get_chunk_ratio(self, node, chunk_dim, chunk_size):
-        sorted_dim = sorted(chunk_dim, key=lambda x: list(x.keys())[0])
-        dim = list(sorted_dim[-1].values())[0]
-        shape = node.meta["tensor_meta"].shape
-        chunk_ratio = float(chunk_size) / shape[dim]
-        return chunk_ratio
+    def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size):
+        node_shape = _get_node_shape(node)
+        node_source = self.index_tracer._find_source_trace_from_node(node)
+        for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim):
+            for k, v in input_node_dim.items():
+                inherit_dim = self.index_tracer._find_inherit_dim(input_node, v, self.index_tracer.nodes_list[k])
+                if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list):
+                    chunk_ratio = float(chunk_size) / node_shape[inherit_dim]
+                    return chunk_ratio
+                for dim, source in enumerate(node_source):
+                    if k in source and source[k] == inherit_dim:
+                        chunk_ratio = float(chunk_size) / node_shape[dim]
+                        return chunk_ratio
+        return 1.
 
     def _get_chunk_delete_node_size(
-        self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node
+        self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names
     ):
+        # if any(j in user.name for j in ['transpose', 'permute', 'view']):
+        #     return 0
         if user.op in ("placeholder", "output"):
             return 0
         nodes_to_delete = user_to_last_uses.get(user, [])
         delete_size = 0
         for n in nodes_to_delete:
-            node_idx = _find_idx_by_name(n.name, node_list)
-            if start_node <= node_idx < end_node:
-                delete_size += self._get_output_node_size(n) * chunk_ratio
+            if n.name in chunk_inputs_names:
+                continue
+            delete_size += self._get_output_node_size(n) * chunk_ratio
         return delete_size
 
     def _print_mem_log(self, log, nodes, title=None):
@@ -1071,10 +1102,7 @@ def _print_compute_op_mem_log(self, log, nodes, title=None):
     def estimate_chunk_inference_mem(
         self,
         gm: torch.fx.GraphModule,
-        start_nodes=None,
-        end_nodes=None,
-        chunk_dims=None,
-        chunk_sizes=None,
+        chunk_infos=None,
     ):
         act_memory = 0.0
         act_memory_peak_log = []
@@ -1087,36 +1115,53 @@ def estimate_chunk_inference_mem(
         user_to_last_uses_no_free_var = self._get_last_usr(node_list)
         _delete_free_var_from_last_use(user_to_last_uses_no_free_var)
 
-        use_chunk = all(
-            i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes]
-        )
+        use_chunk = True if chunk_infos is not None else False
         chunk_within = False
         chunk_region_idx = None
         chunk_ratio = 1  # use it to estimate chunk mem
+        chunk_size = 1
+        chunk_inputs_names = []
+        
+        if use_chunk:
+            chunk_regions = [i["region"] for i in chunk_infos]
+            chunk_starts = [i[0] for i in chunk_regions]
+            chunk_ends = [i[1] for i in chunk_regions]
+            chunk_inputs = [i["inputs"] for i in chunk_infos]
+            chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
+            chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]
+            chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
+                j.name for i in chunk_inputs_non_chunk for j in i
+            ]
+            chunk_outputs = [i["outputs"][0] for i in chunk_infos]
 
         for idx, node in enumerate(node_list):
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
-            if use_chunk and idx in start_nodes:
+            if use_chunk and idx in chunk_starts:
                 chunk_within = True
-                chunk_region_idx = start_nodes.index(idx)
+                chunk_region_idx = chunk_starts.index(idx)
+                act_memory += self._get_output_node_size(chunk_outputs[chunk_region_idx]) / (1024**2)
+
+            # determine chunk ratio for current node
+            if chunk_within:
                 chunk_ratio = self._get_chunk_ratio(
-                    node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx]
+                    node, chunk_inputs[chunk_region_idx], chunk_inputs_dim[chunk_region_idx], chunk_size
                 )
-                act_memory += self._get_output_node_size(
-                    node_list[end_nodes[chunk_region_idx]]
-                ) / (1024**2)
 
             # if node is placeholder, just add the size of the node
             if node.op == "placeholder":
                 act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2)
                 act_memory_peak_log.append(act_memory)
-                active_node_list.append(node.name)
             # skip output
             elif node.op == "output":
                 continue
-            # node is an operation, calculate tmp, output node and delete node memory
+            # no change for non compute node
+            elif _is_non_compute_node_except_placeholder(node):
+                act_memory_peak_log.append(act_memory)
+            # node is a compute op
+            # calculate tmp, output node and delete node memory
             else:
                 # forward memory
+                # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose
                 act_memory += (
                     self._get_contiguous_memory(node, not_contiguous_list)
                     * chunk_ratio
@@ -1133,29 +1178,35 @@ def estimate_chunk_inference_mem(
                     * chunk_ratio
                     / (1024**2)
                 )
+                # delete unused vars not in chunk_input_list
+                # we can't delete input nodes until chunk ends 
                 if chunk_within:
                     act_memory -= self._get_chunk_delete_node_size(
                         node,
                         user_to_last_uses_no_free_var,
                         chunk_ratio,
-                        node_list,
-                        start_nodes[chunk_region_idx],
-                        end_nodes[chunk_region_idx],
+                        chunk_inputs_names
                     ) / (1024**2)
                 else:
-                    act_memory -= self._get_delete_node_size(
-                        node, user_to_last_uses_no_free_var
-                    ) / (1024**2)
+                    act_memory -= (self._get_delete_node_size(
+                        node, user_to_last_uses_no_free_var, chunk_inputs_names
+                    ) / (1024**2))
 
-            # log active node
+            # log active node, only effective without chunk
             self._add_active_node(node, active_node_list)
             self._remove_deactive_node(node, user_to_last_uses, active_node_list)
 
             # if node in chunk end nodes, restore chunk settings
-            if use_chunk and idx in end_nodes:
+            if use_chunk and idx in chunk_ends:
                 act_memory -= (
                     self._get_output_node_size(node) * chunk_ratio / (1024**2)
                 )
+                act_memory -= self._get_chunk_inputs_size(
+                    chunk_inputs[chunk_region_idx], 
+                    chunk_inputs_non_chunk[chunk_region_idx], 
+                    node_list,
+                    chunk_regions[chunk_region_idx][1]
+                    ) / (1024**2)
                 chunk_within = False
                 chunk_ratio = 1
                 chunk_region_idx = None
@@ -1178,11 +1229,11 @@ class ChunkRegionSearch(object):
     def __init__(self, gm) -> None:
         self.gm = gm
         self.node_list = list(gm.graph.nodes)
-        self.memory_estimator = MemoryEstimator()
         self.index_tracer = IndexTracer(gm)
         self.index_tracer.trace_index()
         self.flow_tracer = FlowTracer(gm)
         self.flow_tracer.trace_flow()
+        self.memory_estimator = MemoryEstimator(self.index_tracer)
 
     def _find_peak_node(self, mem_peak):
         max_value = max(mem_peak)
@@ -1210,7 +1261,7 @@ def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
         min_var = self._get_min_free_var(active_node, free_vars)
 
         # from peak_node to free_var
-        chunk_region_start = None
+        chunk_region_start = len(free_vars)
         for i in range(peak_node, -1, -1):
             if len(active_node[i]) == min_var:
                 chunk_region_start = i + 1
@@ -1218,7 +1269,7 @@ def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
             if i in free_vars or i == 0:
                 raise RuntimeError()
         # from peak_node to len-2
-        chunk_region_end = None
+        chunk_region_end = len(active_node) - 1
         for i in range(peak_node, len(active_node)):
             if len(active_node[i]) == min_var:
                 chunk_region_end = i
@@ -1352,7 +1403,7 @@ def _stop_search(self, init_mem_peak, mem_peak):
         return False
 
     def search_region(self):
-        chunk_regions = []
+        chunk_infos = []
         (
             init_mem_peak,
             _,
@@ -1361,25 +1412,19 @@ def search_region(self):
         mem_peak = init_mem_peak
 
         while True:
-            chunk_region = self._step_search(mem_peak, active_node, chunk_regions)
-            if chunk_region is None:
+            chunk_info = self._step_search(mem_peak, active_node, chunk_infos)
+            if chunk_info is None:
                 break
 
-            chunk_regions.append(chunk_region)
+            chunk_infos.append(chunk_info)
             (
                 mem_peak,
                 _,
                 active_node,
-            ) = self.memory_estimator.estimate_chunk_inference_mem(
-                self.gm,
-                [i["region"][0] for i in chunk_regions],
-                [i["region"][1] for i in chunk_regions],
-                [i["inputs_dim"] for i in chunk_regions],
-                [1] * len(chunk_regions),
-            )
+            ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm, chunk_infos)
             if self._stop_search(init_mem_peak, mem_peak):
                 break
-        return chunk_regions
+        return chunk_infos
 
 
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
@@ -1415,7 +1460,7 @@ def _gen_loop_end(
     chunk_slice = _gen_chunk_slice_dim(
         chunk_outputs_dim, "chunk_idx", chunk_output_shape
     )
-    context = "    chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name)
+    context = "    chunk_result%s = %s;  %s = None\n" % (chunk_slice, chunk_outputs_name, chunk_outputs_name)
     context += (
         chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
     )

From e66a18a0bfaa87767d5869ab21a76c48af8b81cf Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 16 Dec 2022 15:06:39 +0800
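Subject: [PATCH 041/503] optimise search

_search_max_chunk_region now bounds the region with an active-node-count
threshold and no longer raises RuntimeError while scanning.
_search_best_chunk_region validates its pick with the new
_is_legal_region check.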

---
 chunk_codegen.py | 67 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 18d9a0c8d764..5e2130ee76f4 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -958,6 +958,8 @@ def _get_output_node_size(self, n):
 
     def _add_active_node(self, n, active_list):
         new_active = self._get_output_node(n)[1]
+        if n.op == 'placeholder':
+            new_active.append(n.name)
         for i in new_active:
             if i not in active_list:
                 active_list.append(i)
@@ -965,7 +967,7 @@ def _add_active_node(self, n, active_list):
     def _get_delete_node(self, user, user_to_last_uses, to_keep=None):
         delete_size = 0
         delete_node = []
-        if user.op not in ("placeholder", "output"):
+        if user.op not in ("output",):
             nodes_to_delete = user_to_last_uses.get(user, [])
             if to_keep is not None:
                 keep_list = []
@@ -1258,24 +1260,30 @@ def _get_min_free_var(self, active_node_list, free_vars):
 
     def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
         free_vars = self._get_free_var()
-        min_var = self._get_min_free_var(active_node, free_vars)
-
+        free_var_num = len(free_vars)
+        active_node_num = [len(i) for i in active_node]
+        min_active_node_num = min(active_node_num[free_var_num:])
+        threshold = max(free_var_num, min_active_node_num)
+        
         # from peak_node to free_var
-        chunk_region_start = len(free_vars)
+        inside_flag = False
+        chunk_region_start = free_var_num
         for i in range(peak_node, -1, -1):
-            if len(active_node[i]) == min_var:
+            if active_node_num[i] <= threshold:
+                inside_flag = True
+            if inside_flag and active_node_num[i] > threshold:
                 chunk_region_start = i + 1
                 break
-            if i in free_vars or i == 0:
-                raise RuntimeError()
+
         # from peak_node to len-2
+        inside_flag = False
         chunk_region_end = len(active_node) - 1
         for i in range(peak_node, len(active_node)):
-            if len(active_node[i]) == min_var:
+            if active_node_num[i] <= threshold:
+                inside_flag = True
+            if inside_flag and active_node_num[i] > threshold:
                 chunk_region_end = i
                 break
-            if i in free_vars or i == 0:
-                raise RuntimeError()
 
         for i in chunk_regions:
             region = i["region"]
@@ -1374,15 +1382,34 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
                     possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
 
-    def _search_best_chunk_region(self, possible_chunk_regions):
+    def _search_best_chunk_region(self, possible_chunk_regions, chunk_infos):
         max_region_range = 0
-        best_regions = None
-        for i in possible_chunk_regions:
-            if i["region"][1] - i["region"][0] > max_region_range:
-                best_regions = i
-                max_region_range = i["region"][1] - i["region"][0]
-        return best_regions
-
+        best_region = None
+        while len(possible_chunk_regions) > 0:
+            for i in possible_chunk_regions:
+                if i["region"][1] - i["region"][0] > max_region_range:
+                    best_region = i
+                    max_region_range = i["region"][1] - i["region"][0]
+            if self._is_legal_region(best_region, chunk_infos):
+                break
+            possible_chunk_regions.remove(i)
+            max_region_range = 0
+            best_region = None
+        return best_region
+    
+    def _is_legal_region(self, cur_chunk_info, chunk_infos):
+        (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
+        if cur_chunk_info in chunk_infos:
+            return False
+        if chunk_region_end < chunk_region_start:
+            return False
+        for i in chunk_infos:
+            region = i["region"]
+            if not ((chunk_region_start > region[1] and chunk_region_end > region[1]) 
+                    or (chunk_region_start < region[0] and chunk_region_end < region[0])):
+                return False
+        return True
+    
     def _step_search(self, mem_peak, active_node, chunk_regions):
         peak_node = self._find_peak_node(mem_peak)
         max_chunk_region = self._search_max_chunk_region(
@@ -1393,7 +1420,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
         possible_chunk_regions = self._search_possible_chunk_regions(
             max_chunk_region, peak_node
         )
-        best_chunk_region = self._search_best_chunk_region(possible_chunk_regions)
+        best_chunk_region = self._search_best_chunk_region(possible_chunk_regions, chunk_regions)
         return best_chunk_region
 
     def _stop_search(self, init_mem_peak, mem_peak):
@@ -1919,5 +1946,5 @@ def emit_node(node: Node, body):
 
 {prologue}
 {code}"""
-            print(fn_code)
+            # print(fn_code)
             return PythonCode(fn_code, globals_)

From 9d516fa68f4e029d63b53d78803667bfa71e86d6 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sun, 18 Dec 2022 20:37:55 +0800
Subject: [PATCH 042/503] fix layernorm

_assign_layernorm_index now passes only [-1] to _mark_computation
instead of [-1, -2].

---
 chunk_codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 5e2130ee76f4..77c28fd32c88 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -574,7 +574,7 @@ def _assign_layernorm_index(self, node, idx):
             node_idx (int)
         """
         self._assign_index_as_input(node, idx)
-        self._mark_computation(node, idx, [-1, -2])
+        self._mark_computation(node, idx, [-1])
 
     def _assign_elementwise_index(self, node, idx):
         """

From d734529a390087f1366b7573410eca5775735b14 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Wed, 21 Dec 2022 15:00:24 +0800
Subject: [PATCH 043/503] move flow tracer

Move the FlowTracer class from above IndexTracer to below it in
chunk_codegen.py.

---
 chunk_codegen.py | 413 ++++++++++++++++++++++++-----------------------
 1 file changed, 207 insertions(+), 206 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 77c28fd32c88..2c1c09ae5238 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -64,212 +64,6 @@ def _is_non_compute_node_except_placeholder_output(node):
     return False
 
 
-class FlowTracer(object):
-    def __init__(self, gm) -> None:
-        self.gm = gm
-        self.node_list = list(gm.graph.nodes)
-        self.flow_trace = {}
-
-    def _add_trace(self, name):
-        self.flow_trace[name] = []
-
-    def _add_node(self, trace_name, node):
-        self.flow_trace[trace_name].append(
-            {"node": node, "inside_depend": [], "outside_depend": []}
-        )
-
-    def _add_inside_depend(self, flow_name, node, inside_depend_node):
-        for i in self.flow_trace[flow_name]:
-            if i["node"] == node:
-                i["inside_depend"].append(inside_depend_node)
-                return
-        raise RuntimeError("node not found")
-
-    def _add_outside_depend(
-        self, flow_name, node, outside_depend_node, outside_depend_trace
-    ):
-        for i in self.flow_trace[flow_name]:
-            if i["node"] == node:
-                i["outside_depend"].append({outside_depend_trace: outside_depend_node})
-                return
-        raise RuntimeError("node not found")
-
-    def _init_trace(self):
-        for i in self.node_list:
-            if i.op == "placeholder":
-                self._add_trace(i.name)
-                self._add_node(i.name, i)
-
-    def _find_flow_for_node(self, node):
-        if type(self.node_list[0]) != type(node):
-            return None
-        if _is_non_compute_node_except_placeholder(node):
-            return None
-        for name, trace in self.flow_trace.items():
-            for i in trace:
-                if node == i["node"]:
-                    return name
-        if any(i in node.name for i in ["ones_like"]):
-            self._add_trace(node.name)
-            self._add_node(node.name, node)
-            return node.name
-        raise RuntimeError("node not found")
-
-    def _find_first_valid_flow(self, flow):
-        for i in flow:
-            if i is not None:
-                return i
-        raise RuntimeError("invalid flow")
-
-    def find_node_flow(self, node):
-        for name, trace in self.flow_trace.items():
-            for i in trace:
-                if node == i["node"]:
-                    return name, i
-        raise RuntimeError("invalid node")
-
-    def _get_flow_mix_node(self, node):
-        if _is_non_compute_node(node):
-            return None
-        _, node_trace = self.find_node_flow(node)
-        if len(node_trace["outside_depend"]) == 0:
-            return None
-        elif len(node_trace["outside_depend"]) > 1:
-            raise NotImplementedError
-        vars = list(node_trace["outside_depend"][0].values())[0]
-        return vars
-
-    def _get_same_flow_node(self, node_list, node):
-        name, _ = self.find_node_flow(node)
-        result = []
-        for i in self.flow_trace[name]:
-            if i["node"] in node_list:
-                result.append(i["node"])
-        return result
-
-    def trace_flow(self):
-        # init trace
-        self._init_trace()
-
-        for node in self.node_list:
-            # skip if non compute node
-            if all(
-                type(arg) != type(node) or _is_non_compute_node_except_placeholder(arg)
-                for arg in node.args
-            ) or _is_non_compute_node(node):
-                continue
-
-            node_input_flows = [self._find_flow_for_node(arg) for arg in node.args]
-
-            node_domin_flow = self._find_first_valid_flow(node_input_flows)
-            self._add_node(node_domin_flow, node)
-            for node_input_flow, arg in zip(node_input_flows, node.args):
-                if node_input_flow is None:
-                    continue
-                elif node_input_flow == node_domin_flow:
-                    self._add_inside_depend(node_domin_flow, node, arg)
-                else:
-                    self._add_outside_depend(
-                        node_domin_flow, node, arg, node_input_flow
-                    )
-        return self.flow_trace
-
-    def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer):
-        inputs, outputs = _find_chunk_compute_input_and_output_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        chunk_info = {
-            "region": (start_idx, end_idx),
-            "inputs": inputs,
-            "inputs_non_chunk": [],
-            "inputs_dim": start_dim,
-            "outputs": outputs,
-            "outputs_dim": end_dim,
-            "args": {},
-        }
-        flow_block = False
-
-        # TODO don't allow multi outputs now
-        if len(outputs) > 1:
-            flow_block = True
-            return flow_block, chunk_info
-
-        for idx in range(start_idx, end_idx + 1):
-            node = self.node_list[idx]
-            mix_flow_node = self._get_flow_mix_node(node)
-            if mix_flow_node is None:
-                continue
-
-            # if there is a flow mix, op must be in [mul, add, matmul]
-            # element-wise op requires dim to be equal in every dim
-            if any(n in node.name for n in ["mul", "add"]):
-                for i in node.args:
-                    if type(i) == type(mix_flow_node) and i != mix_flow_node:
-                        main_flow_var = i
-                # if mix flow is a broadcast in chunk dim,
-                # TODO: need to move that flow out of the chunk
-                mix_flow_node_dim = index_tracer.get_node_chunk_dim(
-                    self.node_list[end_idx], end_dim, node
-                )
-                if mix_flow_node_dim is None:
-                    flow_block = True
-                    break
-                if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1:
-                    flow_block = False
-                    for i in self._get_same_flow_node(
-                        chunk_info["inputs"], mix_flow_node
-                    ):
-                        chunk_info["inputs"].remove(i)
-                # else, we need to chunk mix var as well
-                else:
-                    # TODO chunk another value
-                    flow_block = True
-                    break
-            else:
-                raise NotImplementedError("%s not implemented" % node.name)
-
-        if flow_block:
-            flow_block = True
-            return flow_block, chunk_info
-
-        inputs_dim = []
-        remove_inputs = []
-        for input_node in chunk_info["inputs"]:
-            input_dict = {}
-            for user in input_node.users.keys():
-                if _is_non_compute_node(user):
-                    continue
-                user_idx = _find_idx_by_name(user.name, self.node_list)
-                dim = None
-                if start_dim <= user_idx < end_idx:
-                    dim = index_tracer.get_node_chunk_dim(
-                        self.node_list[end_idx], end_dim, input_node
-                    )
-                elif user_idx == end_idx:
-                    dim = end_dim
-                # n has relation with chunk dim
-                if dim is not None and _get_node_shape(user)[dim] != 1:
-                    input_dict[user_idx] = dim
-            if len(input_dict) == 0:
-                remove_inputs.append(input_node)
-            else:
-                inputs_dim.append(input_dict)
-        chunk_info["inputs_dim"] = inputs_dim
-        for i in remove_inputs:
-            if i in chunk_info["inputs"]:
-                chunk_info["inputs"].remove(i)
-
-        # we need to log input nodes to avoid deleteing them in the loop
-        non_chunk_inputs = _find_chunk_all_input_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        for i in non_chunk_inputs:
-            if i not in chunk_info["inputs"]:
-                chunk_info["inputs_non_chunk"].append(i)
-
-        return flow_block, chunk_info
-
-
 class IndexTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
@@ -932,6 +726,213 @@ def check_index_duplicate(self, chunk_infos):
         return True
 
 
+
+class FlowTracer(object):
+    def __init__(self, gm) -> None:
+        self.gm = gm
+        self.node_list = list(gm.graph.nodes)
+        self.flow_trace = {}
+
+    def _add_trace(self, name):
+        self.flow_trace[name] = []
+
+    def _add_node(self, trace_name, node):
+        self.flow_trace[trace_name].append(
+            {"node": node, "inside_depend": [], "outside_depend": []}
+        )
+
+    def _add_inside_depend(self, flow_name, node, inside_depend_node):
+        for i in self.flow_trace[flow_name]:
+            if i["node"] == node:
+                i["inside_depend"].append(inside_depend_node)
+                return
+        raise RuntimeError("node not found")
+
+    def _add_outside_depend(
+        self, flow_name, node, outside_depend_node, outside_depend_trace
+    ):
+        for i in self.flow_trace[flow_name]:
+            if i["node"] == node:
+                i["outside_depend"].append({outside_depend_trace: outside_depend_node})
+                return
+        raise RuntimeError("node not found")
+
+    def _init_trace(self):
+        for i in self.node_list:
+            if i.op == "placeholder":
+                self._add_trace(i.name)
+                self._add_node(i.name, i)
+
+    def _find_flow_for_node(self, node):
+        if type(self.node_list[0]) != type(node):
+            return None
+        if _is_non_compute_node_except_placeholder(node):
+            return None
+        for name, trace in self.flow_trace.items():
+            for i in trace:
+                if node == i["node"]:
+                    return name
+        if any(i in node.name for i in ["ones_like"]):
+            self._add_trace(node.name)
+            self._add_node(node.name, node)
+            return node.name
+        raise RuntimeError("node not found")
+
+    def _find_first_valid_flow(self, flow):
+        for i in flow:
+            if i is not None:
+                return i
+        raise RuntimeError("invalid flow")
+
+    def find_node_flow(self, node):
+        for name, trace in self.flow_trace.items():
+            for i in trace:
+                if node == i["node"]:
+                    return name, i
+        raise RuntimeError("invalid node")
+
+    def _get_flow_mix_node(self, node):
+        if _is_non_compute_node(node):
+            return None
+        _, node_trace = self.find_node_flow(node)
+        if len(node_trace["outside_depend"]) == 0:
+            return None
+        elif len(node_trace["outside_depend"]) > 1:
+            raise NotImplementedError
+        vars = list(node_trace["outside_depend"][0].values())[0]
+        return vars
+
+    def _get_same_flow_node(self, node_list, node):
+        name, _ = self.find_node_flow(node)
+        result = []
+        for i in self.flow_trace[name]:
+            if i["node"] in node_list:
+                result.append(i["node"])
+        return result
+
+    def trace_flow(self):
+        # init trace
+        self._init_trace()
+
+        for node in self.node_list:
+            # skip if non compute node
+            if all(
+                type(arg) != type(node) or _is_non_compute_node_except_placeholder(arg)
+                for arg in node.args
+            ) or _is_non_compute_node(node):
+                continue
+
+            node_input_flows = [self._find_flow_for_node(arg) for arg in node.args]
+
+            node_domin_flow = self._find_first_valid_flow(node_input_flows)
+            self._add_node(node_domin_flow, node)
+            for node_input_flow, arg in zip(node_input_flows, node.args):
+                if node_input_flow is None:
+                    continue
+                elif node_input_flow == node_domin_flow:
+                    self._add_inside_depend(node_domin_flow, node, arg)
+                else:
+                    self._add_outside_depend(
+                        node_domin_flow, node, arg, node_input_flow
+                    )
+        return self.flow_trace
+
+    def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer):
+        inputs, outputs = _find_chunk_compute_input_and_output_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        chunk_info = {
+            "region": (start_idx, end_idx),
+            "inputs": inputs,
+            "inputs_non_chunk": [],
+            "inputs_dim": start_dim,
+            "outputs": outputs,
+            "outputs_dim": end_dim,
+            "args": {},
+        }
+        flow_block = False
+
+        # TODO don't allow multi outputs now
+        if len(outputs) > 1:
+            flow_block = True
+            return flow_block, chunk_info
+
+        for idx in range(start_idx, end_idx + 1):
+            node = self.node_list[idx]
+            mix_flow_node = self._get_flow_mix_node(node)
+            if mix_flow_node is None:
+                continue
+
+            # if there is a flow mix, op must be in [mul, add, matmul]
+            # element-wise op requires dim to be equal in every dim
+            if any(n in node.name for n in ["mul", "add"]):
+                for i in node.args:
+                    if type(i) == type(mix_flow_node) and i != mix_flow_node:
+                        main_flow_var = i
+                # if mix flow is a broadcast in chunk dim,
+                # TODO: need to move that flow out of the chunk
+                mix_flow_node_dim = index_tracer.get_node_chunk_dim(
+                    self.node_list[end_idx], end_dim, node
+                )
+                if mix_flow_node_dim is None:
+                    flow_block = True
+                    break
+                if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1:
+                    flow_block = False
+                    for i in self._get_same_flow_node(
+                        chunk_info["inputs"], mix_flow_node
+                    ):
+                        chunk_info["inputs"].remove(i)
+                # else, we need to chunk mix var as well
+                else:
+                    # TODO chunk another value
+                    flow_block = True
+                    break
+            else:
+                raise NotImplementedError("%s not implemented" % node.name)
+
+        if flow_block:
+            flow_block = True
+            return flow_block, chunk_info
+
+        inputs_dim = []
+        remove_inputs = []
+        for input_node in chunk_info["inputs"]:
+            input_dict = {}
+            for user in input_node.users.keys():
+                if _is_non_compute_node(user):
+                    continue
+                user_idx = _find_idx_by_name(user.name, self.node_list)
+                dim = None
+                if start_dim <= user_idx < end_idx:
+                    dim = index_tracer.get_node_chunk_dim(
+                        self.node_list[end_idx], end_dim, input_node
+                    )
+                elif user_idx == end_idx:
+                    dim = end_dim
+                # n has relation with chunk dim
+                if dim is not None and _get_node_shape(user)[dim] != 1:
+                    input_dict[user_idx] = dim
+            if len(input_dict) == 0:
+                remove_inputs.append(input_node)
+            else:
+                inputs_dim.append(input_dict)
+        chunk_info["inputs_dim"] = inputs_dim
+        for i in remove_inputs:
+            if i in chunk_info["inputs"]:
+                chunk_info["inputs"].remove(i)
+
+        # we need to log input nodes to avoid deleteing them in the loop
+        non_chunk_inputs = _find_chunk_all_input_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        for i in non_chunk_inputs:
+            if i not in chunk_info["inputs"]:
+                chunk_info["inputs_non_chunk"].append(i)
+
+        return flow_block, chunk_info
+
+
 class MemoryEstimator(object):
     def __init__(self, index_tracer: IndexTracer) -> None:
         self.index_tracer = index_tracer

From d361d533e8e7773d2009cc4ff5a82633401ab44a Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Wed, 21 Dec 2022 15:01:03 +0800
Subject: [PATCH 044/503] refactor flow tracer

---
 chunk_codegen.py       | 283 +++++++++++++++++++++++++++++++++--------
 evoformer/evoformer.py |  11 +-
 2 files changed, 240 insertions(+), 54 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 2c1c09ae5238..3ba082ceb845 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -139,7 +139,13 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False
         node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list)
         if init:
             node_to_trace["source"][node_to_dim] = {}
-        node_to_trace["source"][node_to_dim][node_from_idx] = node_from_dim
+        # add dim to cur new source
+        if node_from_idx not in node_to_trace["source"][node_to_dim]:
+            node_to_trace["source"][node_to_dim][node_from_idx] = [node_from_dim]
+        else:
+            if node_from_dim not in node_to_trace["source"][node_to_dim][node_from_idx]:
+                node_to_trace["source"][node_to_dim][node_from_idx].append(node_from_dim)
+        # update inputs source
         node_to_trace["source"][node_to_dim].update(
             node_from_trace["source"][node_from_dim]
         )
@@ -654,7 +660,7 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node
             end_node_trace_source.items(), key=lambda d: d[0], reverse=True
         )
         for node_idx, node_dim in sorted_source:
-            if node_idx == start_node_idx and node_dim == start_dim:
+            if node_idx == start_node_idx and start_dim in node_dim:
                 return True
             # it means we meet a node outside the loop, and the node is not input node
             if node_idx < start_idx:
@@ -694,12 +700,12 @@ def _find_inherit_dim(self, input_node, input_dim, node):
         for node_dim in range(len(_get_node_shape(node))):
             if (
                 input_node_idx in node_trace_source[node_dim]
-                and node_trace_source[node_dim][input_node_idx] == input_dim
+                and input_dim in node_trace_source[node_dim][input_node_idx]
             ):
                 return node_dim
         return None
 
-    def check_index_duplicate(self, chunk_infos):
+    def check_index_duplicate(self, chunk_infos, return_dim=False):
         input_dim_after_node = {}
         for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
             for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
@@ -713,17 +719,30 @@ def check_index_duplicate(self, chunk_infos):
             if _is_non_compute_node_except_placeholder(node):
                 continue
             count = 0
+            duplicate_dims = []
             node_trace_source = self._find_source_trace_from_node(node)
             for node_dim in range(len(_get_node_shape(node))):
+                duplicate_dim = []
+                duplicate_flag = False
                 dim_source = node_trace_source[node_dim]
                 for k, v in dim_source.items():
                     if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]:
-                        if k in input_dim_after_node and input_dim_after_node[k] == v:
-                            count += 1
-                            break
+                        if k in input_dim_after_node and input_dim_after_node[k] in v:
+                            duplicate_flag = True
+                            duplicate_dim.append((k, v))
+                duplicate_dims.append(duplicate_dim)
+                if duplicate_flag:
+                    count += 1
+
             if count > 1:
-                return False
-        return True
+                if return_dim:
+                    return False, duplicate_dims
+                else:
+                    return False
+        if return_dim:
+            return True, None
+        else:
+            return True
 
 
@@ -857,43 +876,45 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind
             flow_block = True
             return flow_block, chunk_info
 
-        for idx in range(start_idx, end_idx + 1):
-            node = self.node_list[idx]
-            mix_flow_node = self._get_flow_mix_node(node)
-            if mix_flow_node is None:
-                continue
-
-            # if there is a flow mix, op must be in [mul, add, matmul]
-            # element-wise op requires dim to be equal in every dim
-            if any(n in node.name for n in ["mul", "add"]):
-                for i in node.args:
-                    if type(i) == type(mix_flow_node) and i != mix_flow_node:
-                        main_flow_var = i
-                # if mix flow is a broadcast in chunk dim,
-                # TODO: need to move that flow out of the chunk
-                mix_flow_node_dim = index_tracer.get_node_chunk_dim(
-                    self.node_list[end_idx], end_dim, node
-                )
-                if mix_flow_node_dim is None:
-                    flow_block = True
-                    break
-                if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1:
-                    flow_block = False
-                    for i in self._get_same_flow_node(
-                        chunk_info["inputs"], mix_flow_node
-                    ):
-                        chunk_info["inputs"].remove(i)
-                # else, we need to chunk mix var as well
-                else:
-                    # TODO chunk another value
-                    flow_block = True
-                    break
-            else:
-                raise NotImplementedError("%s not implemented" % node.name)
-
-        if flow_block:
-            flow_block = True
-            return flow_block, chunk_info
+        # for idx in range(start_idx, end_idx + 1):
+        #     node = self.node_list[idx]
+        #     mix_flow_node = self._get_flow_mix_node(node)
+        #     if mix_flow_node is None:
+        #         continue
+
+        #     # if there is a flow mix, op must be in [mul, add, matmul]
+        #     # element-wise op requires dim to be equal in every dim
+        #     if any(n in node.name for n in ["mul", "add"]):
+        #         for i in node.args:
+        #             if type(i) == type(mix_flow_node) and i != mix_flow_node:
+        #                 main_flow_var = i
+        #         # if mix flow is a broadcast in chunk dim,
+        #         # TODO: need to move that flow out of the chunk
+        #         mix_flow_node_dim = index_tracer.get_node_chunk_dim(
+        #             self.node_list[end_idx], end_dim, node
+        #         )
+        #         # TODO: we need to loop every dim
+        #         if isinstance(mix_flow_node_dim, list):
+        #             mix_flow_node_dim = mix_flow_node_dim[0]
+        #         if mix_flow_node_dim is None:
+        #             flow_block = True
+        #             break
+        #         if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1:
+        #             flow_block = False
+        #             for i in self._get_same_flow_node(
+        #                 chunk_info["inputs"], mix_flow_node
+        #             ):
+        #                 chunk_info["inputs"].remove(i)
+        #         # else, we need to chunk mix var as well
+        #         else:
+        #             # TODO chunk another value
+        #             flow_block = True
+        #             break
+        #     else:
+        #         raise NotImplementedError("%s not implemented" % node.name)
+        # if flow_block:
+        #     flow_block = True
+        #     return flow_block, chunk_info
 
         inputs_dim = []
         remove_inputs = []
@@ -908,6 +929,9 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind
                     dim = index_tracer.get_node_chunk_dim(
                         self.node_list[end_idx], end_dim, input_node
                     )
+                    # TODO: we need to loop every dim
+                    if isinstance(dim, list):
+                        dim = dim[0]
                 elif user_idx == end_idx:
                     dim = end_dim
                 # n has relation with chunk dim
@@ -921,6 +945,8 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind
         for i in remove_inputs:
             if i in chunk_info["inputs"]:
                 chunk_info["inputs"].remove(i)
+        
+        duplicate_result, duplicate_dim = index_tracer.check_index_duplicate(chunk_info, return_dim=True)
 
         # we need to log input nodes to avoid deleteing them in the loop
         non_chunk_inputs = _find_chunk_all_input_nodes(
@@ -932,6 +958,150 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind
 
         return flow_block, chunk_info
 
+    def _assgin_single_node_flow(self, arg_node, start_idx, end_idx, 
+                                 inputs, index_tracer, cur_node_dim, 
+                                 cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info,
+                                 next_node_list):
+        arg_idx = _find_idx_by_name(arg_node.name, index_tracer.nodes_list)
+        # arg in chunk range or be inputs
+        if not (start_idx <= arg_idx < end_idx):
+            return True
+                    
+        # find arg dim
+        if cur_node_dim is not None:
+            # dim is computed
+            if arg_idx in cur_node_compute[cur_node_dim]:
+                return False
+            if arg_idx not in cur_node_source[cur_node_dim]:
+                arg_dim = None
+            else:
+                arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
+        else:
+            arg_dim = None
+                    
+        # get fix dim
+        arg_fix_dim = []
+        if cur_node_dim is not None:
+            for i in cur_node_fix_dim:
+                fix_dim_source = cur_node_source[i]
+                if arg_idx in fix_dim_source:
+                    arg_fix_dim.append(fix_dim_source[arg_idx][0])
+                    
+        # if already in node_info, arg dim must be same
+        if arg_node in all_node_info:
+            if all_node_info[arg_node] != arg_dim:
+                return False
+            all_node_info[arg_node]['fix_dim'] = list(set(all_node_info[arg_node]['fix_dim'] + arg_fix_dim))
+        # else add it to list
+        else:
+            all_node_info[arg_node] = {'chunk_dim': arg_dim, 'fix_dim': arg_fix_dim}
+                        
+        next_node_list.append(arg_node)
+        return True
+    
+    def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer):
+        inputs, outputs = _find_chunk_compute_input_and_output_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        # only single ouput
+        if len(outputs) > 1:
+            return None
+        
+        cur_node_list = [index_tracer.nodes_list[end_idx]]  # start from the last node
+        all_node_info = {cur_node_list[0]: {'chunk_dim': end_dim, 'fix_dim': []}}
+        
+        while len(cur_node_list) > 0:
+            next_node_list = []
+
+            for cur_node in cur_node_list:
+                # get cur node info
+                cur_node_chunk_dim = all_node_info[cur_node]['chunk_dim']
+                cur_node_fix_dim = all_node_info[cur_node]['fix_dim']
+                cur_node_idx = _find_idx_by_name(cur_node.name, index_tracer.nodes_list)
+                if cur_node_chunk_dim:
+                    cur_node_compute = index_tracer._find_compute_trace_from_node(cur_node)
+                    cur_node_source = index_tracer._find_source_trace_from_node(cur_node)
+                else:
+                    cur_node_compute = cur_node_source = None
+                
+                # get all valid args
+                arg_list = []
+                for arg in cur_node.args:
+                    if type(arg) != type(cur_node):
+                        continue
+                    if _is_non_compute_node(arg):
+                        continue
+                    arg_list.append(arg)
+                    flow_flag = self._assgin_single_node_flow(arg, start_idx, end_idx, 
+                        inputs, index_tracer, cur_node_chunk_dim, 
+                        cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info,
+                        next_node_list)
+                    if flow_flag == False:
+                        return None
+                        
+                if len(arg_list) == 2:
+                    if any(i in cur_node.name for i in ["add", "mul"]):
+                        for arg in arg_list:
+                            if not (start_idx <= _find_idx_by_name(arg.name, index_tracer.nodes_list) < end_idx):
+                                continue
+                            arg_chunk_dim = all_node_info[arg]['chunk_dim']
+                            arg_fix_dim = all_node_info[arg]['fix_dim']
+                            arg_shape = _get_node_shape(arg)
+                            # add all dim as fix dim except chunk dim
+                            for i, shape in enumerate(arg_shape):
+                                if shape != 1 and i != cur_node_chunk_dim:
+                                    if i == arg_chunk_dim:
+                                        return None
+                                    if i not in arg_fix_dim:
+                                        arg_fix_dim.append(i)
+                    elif "einsum" in cur_node.name:
+                        pass
+                    elif "matmul" in cur_node.name:
+                        pass
+                    else:
+                        raise NotImplementedError()
+            cur_node_list = next_node_list
+        
+        inputs_dim = []
+        remove_inputs = []
+        for input_node in inputs:
+            input_dict = {}
+            for user in input_node.users.keys():
+                if _is_non_compute_node(user):
+                    continue
+                user_idx = _find_idx_by_name(user.name, self.node_list)
+                if start_idx <= user_idx <= end_idx:
+                    chunk_dim = all_node_info[user]['chunk_dim']
+                    if chunk_dim is not None:
+                        input_dict[user_idx] = chunk_dim
+            if len(input_dict) == 0:
+                remove_inputs.append(input_node)
+            else:
+                inputs_dim.append(input_dict)
+        for i in remove_inputs:
+            if i in inputs:
+                inputs.remove(i)
+        
+        chunk_info = {
+            "region": (start_idx, end_idx),
+            "inputs": inputs,
+            "inputs_non_chunk": [],
+            "inputs_dim": inputs_dim,
+            "outputs": outputs,
+            "outputs_dim": end_dim,
+            "args": {},
+        }
+        
+        # we need to log input nodes to avoid deleteing them in the loop
+        non_chunk_inputs = _find_chunk_all_input_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        for i in non_chunk_inputs:
+            if i not in chunk_info["inputs"]:
+                chunk_info["inputs_non_chunk"].append(i)
+
+        return chunk_info
+
 
 class MemoryEstimator(object):
     def __init__(self, index_tracer: IndexTracer) -> None:
@@ -1055,12 +1225,13 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size):
         node_source = self.index_tracer._find_source_trace_from_node(node)
         for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim):
             for k, v in input_node_dim.items():
+                # TODO: inherit dim should be list too, int now
                 inherit_dim = self.index_tracer._find_inherit_dim(input_node, v, self.index_tracer.nodes_list[k])
                 if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list):
                     chunk_ratio = float(chunk_size) / node_shape[inherit_dim]
                     return chunk_ratio
                 for dim, source in enumerate(node_source):
-                    if k in source and source[k] == inherit_dim:
+                    if k in source and inherit_dim in source[k]:
                         chunk_ratio = float(chunk_size) / node_shape[dim]
                         return chunk_ratio
         return 1.
@@ -1323,9 +1494,11 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                 continue
             for start_node, start_trace in start_traces.items():
                 for start_dim, start_trace_idx in enumerate(start_trace["idx"]):
-                    # must be same trace idx
-                    if start_trace_idx != end_trace_idx:
-                        continue
+                    if start_idx == 199 and end_idx == 229 and start_dim == 2 and end_dim == 2:
+                        print(1)
+                        self.flow_tracer.flow_search(
+                            start_idx, start_dim, end_idx, end_dim, self.index_tracer
+                        )
                     # dim size cannot be 1
                     if (
                         _get_node_shape(end_node)[end_dim] == 1
@@ -1343,10 +1516,16 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                     ):
                         continue
                     # detect flow meet
-                    flow_block, chunk_info = self.flow_tracer._detect_flow(
+                    # flow_block, chunk_info = self.flow_tracer._detect_flow(
+                    #     start_idx, start_dim, end_idx, end_dim, self.index_tracer
+                    # )
+                    # if flow_block:
+                    #     continue
+                    # flow search
+                    chunk_info = self.flow_tracer.flow_search(
                         start_idx, start_dim, end_idx, end_dim, self.index_tracer
                     )
-                    if flow_block:
+                    if chunk_info is None:
                         continue
                     # check index copmute
                     if not self.index_tracer.check_index_duplicate(chunk_info):
diff --git a/evoformer/evoformer.py b/evoformer/evoformer.py
index 0c5ab952a779..cfd2bb2a2529 100644
--- a/evoformer/evoformer.py
+++ b/evoformer/evoformer.py
@@ -6,6 +6,13 @@
 from .triangle import PairStack
 
 
+def print_memory(init_mem, text=None):
+    now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem
+    max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem
+    print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem))
+    torch.cuda.reset_peak_memory_stats()
+
+
 class EvoformerBlock(nn.Module):
 
     def __init__(self, d_node, d_pair):
@@ -16,9 +23,9 @@ def __init__(self, d_node, d_pair):
         self.pair_stack = PairStack(d_pair=d_pair)
 
     def forward(self, node, pair):
-        node = node + self.msa_stack(node, pair)
+        node = self.msa_stack(node, pair)
         pair = pair + self.communication(node)
-        pair = pair + self.pair_stack(pair)
+        pair = self.pair_stack(pair)
         return node, pair
 
 
From ded1005667402ee9458afa53852ce2018b1ccb10 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Wed, 21 Dec 2022 15:03:08 +0800
Subject: [PATCH 045/503] format code

---
 chunk_codegen.py | 184 +++++++++++++++++++++++++++++++----------------
 1 file changed, 122 insertions(+), 62 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 3ba082ceb845..eb16361c04fc 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -144,7 +144,9 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False
             node_to_trace["source"][node_to_dim][node_from_idx] = [node_from_dim]
         else:
             if node_from_dim not in node_to_trace["source"][node_to_dim][node_from_idx]:
-                node_to_trace["source"][node_to_dim][node_from_idx].append(node_from_dim)
+                node_to_trace["source"][node_to_dim][node_from_idx].append(
+                    node_from_dim
+                )
         # update inputs source
         node_to_trace["source"][node_to_dim].update(
             node_from_trace["source"][node_from_dim]
@@ -745,7 +747,6 @@ def check_index_duplicate(self, chunk_infos, return_dim=False):
             return True
 
 
-
 class FlowTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
@@ -856,7 +857,9 @@ def trace_flow(self):
                     )
         return self.flow_trace
 
-    def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer):
+    def _detect_flow(
+        self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer
+    ):
         inputs, outputs = _find_chunk_compute_input_and_output_nodes(
             self.node_list[start_idx : end_idx + 1]
         )
@@ -945,8 +948,10 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind
         for i in remove_inputs:
             if i in chunk_info["inputs"]:
                 chunk_info["inputs"].remove(i)
-        
-        duplicate_result, duplicate_dim = index_tracer.check_index_duplicate(chunk_info, return_dim=True)
+
+        duplicate_result, duplicate_dim = index_tracer.check_index_duplicate(
+            chunk_info, return_dim=True
+        )
 
         # we need to log input nodes to avoid deleteing them in the loop
         non_chunk_inputs = _find_chunk_all_input_nodes(
@@ -958,15 +963,25 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind
 
         return flow_block, chunk_info
 
-    def _assgin_single_node_flow(self, arg_node, start_idx, end_idx, 
-                                 inputs, index_tracer, cur_node_dim, 
-                                 cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info,
-                                 next_node_list):
+    def _assgin_single_node_flow(
+        self,
+        arg_node,
+        start_idx,
+        end_idx,
+        inputs,
+        index_tracer,
+        cur_node_dim,
+        cur_node_compute,
+        cur_node_source,
+        cur_node_fix_dim,
+        all_node_info,
+        next_node_list,
+    ):
         arg_idx = _find_idx_by_name(arg_node.name, index_tracer.nodes_list)
         # arg in chunk range or be inputs
         if not (start_idx <= arg_idx < end_idx):
             return True
-                    
+
         # find arg dim
         if cur_node_dim is not None:
             # dim is computed
@@ -978,7 +993,7 @@ def _assgin_single_node_flow(self, arg_node, start_idx, end_idx,
                 arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
         else:
             arg_dim = None
-                    
+
         # get fix dim
         arg_fix_dim = []
         if cur_node_dim is not None:
@@ -986,44 +1001,52 @@ def _assgin_single_node_flow(self, arg_node, start_idx, end_idx,
                 fix_dim_source = cur_node_source[i]
                 if arg_idx in fix_dim_source:
                     arg_fix_dim.append(fix_dim_source[arg_idx][0])
-                    
+
         # if already in node_info, arg dim must be same
         if arg_node in all_node_info:
             if all_node_info[arg_node] != arg_dim:
                 return False
-            all_node_info[arg_node]['fix_dim'] = list(set(all_node_info[arg_node]['fix_dim'] + arg_fix_dim))
+            all_node_info[arg_node]["fix_dim"] = list(
+                set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
+            )
         # else add it to list
         else:
-            all_node_info[arg_node] = {'chunk_dim': arg_dim, 'fix_dim': arg_fix_dim}
-                        
+            all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim}
+
         next_node_list.append(arg_node)
         return True
-    
-    def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer):
+
+    def flow_search(
+        self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer
+    ):
         inputs, outputs = _find_chunk_compute_input_and_output_nodes(
             self.node_list[start_idx : end_idx + 1]
         )
         # only single ouput
         if len(outputs) > 1:
             return None
-        
+
         cur_node_list = [index_tracer.nodes_list[end_idx]]  # start from the last node
-        all_node_info = {cur_node_list[0]: {'chunk_dim': end_dim, 'fix_dim': []}}
-        
+        all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
+
         while len(cur_node_list) > 0:
             next_node_list = []
 
             for cur_node in cur_node_list:
                 # get cur node info
-                cur_node_chunk_dim = all_node_info[cur_node]['chunk_dim']
-                cur_node_fix_dim = all_node_info[cur_node]['fix_dim']
+                cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
+                cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
                 cur_node_idx = _find_idx_by_name(cur_node.name, index_tracer.nodes_list)
                 if cur_node_chunk_dim:
-                    cur_node_compute = index_tracer._find_compute_trace_from_node(cur_node)
-                    cur_node_source = index_tracer._find_source_trace_from_node(cur_node)
+                    cur_node_compute = index_tracer._find_compute_trace_from_node(
+                        cur_node
+                    )
+                    cur_node_source = index_tracer._find_source_trace_from_node(
+                        cur_node
+                    )
                 else:
                     cur_node_compute = cur_node_source = None
-                
+
                 # get all valid args
                 arg_list = []
                 for arg in cur_node.args:
@@ -1032,20 +1055,33 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde
                     if _is_non_compute_node(arg):
                         continue
                     arg_list.append(arg)
-                    flow_flag = self._assgin_single_node_flow(arg, start_idx, end_idx, 
-                        inputs, index_tracer, cur_node_chunk_dim, 
-                        cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info,
-                        next_node_list)
+                    flow_flag = self._assgin_single_node_flow(
+                        arg,
+                        start_idx,
+                        end_idx,
+                        inputs,
+                        index_tracer,
+                        cur_node_chunk_dim,
+                        cur_node_compute,
+                        cur_node_source,
+                        cur_node_fix_dim,
+                        all_node_info,
+                        next_node_list,
+                    )
                     if flow_flag == False:
                         return None
-                        
+
                 if len(arg_list) == 2:
                     if any(i in cur_node.name for i in ["add", "mul"]):
                         for arg in arg_list:
-                            if not (start_idx <= _find_idx_by_name(arg.name, index_tracer.nodes_list) < end_idx):
+                            if not (
+                                start_idx
+                                <= _find_idx_by_name(arg.name, index_tracer.nodes_list)
+                                < end_idx
+                            ):
                                 continue
-                            arg_chunk_dim = all_node_info[arg]['chunk_dim']
-                            arg_fix_dim = all_node_info[arg]['fix_dim']
+                            arg_chunk_dim = all_node_info[arg]["chunk_dim"]
+                            arg_fix_dim = all_node_info[arg]["fix_dim"]
                             arg_shape = _get_node_shape(arg)
                             # add all dim as fix dim except chunk dim
                             for i, shape in enumerate(arg_shape):
@@ -1061,7 +1097,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde
                     else:
                         raise NotImplementedError()
             cur_node_list = next_node_list
-        
+
         inputs_dim = []
         remove_inputs = []
         for input_node in inputs:
@@ -1071,7 +1107,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde
                     continue
                 user_idx = _find_idx_by_name(user.name, self.node_list)
                 if start_idx <= user_idx <= end_idx:
-                    chunk_dim = all_node_info[user]['chunk_dim']
+                    chunk_dim = all_node_info[user]["chunk_dim"]
                     if chunk_dim is not None:
                         input_dict[user_idx] = chunk_dim
             if len(input_dict) == 0:
@@ -1081,7 +1117,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde
         for i in remove_inputs:
             if i in inputs:
                 inputs.remove(i)
-        
+
         chunk_info = {
             "region": (start_idx, end_idx),
             "inputs": inputs,
@@ -1091,7 +1127,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde
             "outputs_dim": end_dim,
             "args": {},
         }
-        
+
         # we need to log input nodes to avoid deleteing them in the loop
         non_chunk_inputs = _find_chunk_all_input_nodes(
             self.node_list[start_idx : end_idx + 1]
@@ -1129,7 +1165,7 @@ def _get_output_node_size(self, n):
 
     def _add_active_node(self, n, active_list):
         new_active = self._get_output_node(n)[1]
-        if n.op == 'placeholder':
+        if n.op == "placeholder":
             new_active.append(n.name)
         for i in new_active:
             if i not in active_list:
@@ -1168,12 +1204,16 @@ def _remove_deactive_node(self, user, user_to_last_uses, active_list):
         for i in delete_node:
             if i in active_list:
                 active_list.remove(i)
-    
-    def _get_chunk_inputs_size(self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx):
+
+    def _get_chunk_inputs_size(
+        self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx
+    ):
         nodes_to_delete = []
         for chunk_input in chunk_inputs + chunk_inputs_non_chunk:
             chunk_input_users = chunk_input.users.keys()
-            chunk_input_users_idx = [_find_idx_by_name(i.name, node_list) for i in chunk_input_users]
+            chunk_input_users_idx = [
+                _find_idx_by_name(i.name, node_list) for i in chunk_input_users
+            ]
             if all(i <= chunk_end_idx for i in chunk_input_users_idx):
                 if chunk_input not in nodes_to_delete:
                     nodes_to_delete.append(chunk_input)
@@ -1226,7 +1266,9 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size):
         for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim):
             for k, v in input_node_dim.items():
                 # TODO: inherit dim should be list too, int now
-                inherit_dim = self.index_tracer._find_inherit_dim(input_node, v, self.index_tracer.nodes_list[k])
+                inherit_dim = self.index_tracer._find_inherit_dim(
+                    input_node, v, self.index_tracer.nodes_list[k]
+                )
                 if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list):
                     chunk_ratio = float(chunk_size) / node_shape[inherit_dim]
                     return chunk_ratio
@@ -1234,7 +1276,7 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size):
                     if k in source and inherit_dim in source[k]:
                         chunk_ratio = float(chunk_size) / node_shape[dim]
                         return chunk_ratio
-        return 1.
+        return 1.0
 
     def _get_chunk_delete_node_size(
         self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names
@@ -1295,7 +1337,7 @@ def estimate_chunk_inference_mem(
         chunk_ratio = 1  # use it to estimate chunk mem
         chunk_size = 1
         chunk_inputs_names = []
-        
+
         if use_chunk:
             chunk_regions = [i["region"] for i in chunk_infos]
             chunk_starts = [i[0] for i in chunk_regions]
@@ -1313,12 +1355,17 @@ def estimate_chunk_inference_mem(
             if use_chunk and idx in chunk_starts:
                 chunk_within = True
                 chunk_region_idx = chunk_starts.index(idx)
-                act_memory += self._get_output_node_size(chunk_outputs[chunk_region_idx]) / (1024**2)
+                act_memory += self._get_output_node_size(
+                    chunk_outputs[chunk_region_idx]
+                ) / (1024**2)
 
             # determine chunk ratio for current node
             if chunk_within:
                 chunk_ratio = self._get_chunk_ratio(
-                    node, chunk_inputs[chunk_region_idx], chunk_inputs_dim[chunk_region_idx], chunk_size
+                    node,
+                    chunk_inputs[chunk_region_idx],
+                    chunk_inputs_dim[chunk_region_idx],
+                    chunk_size,
                 )
 
             # if node is placeholder, just add the size of the node
@@ -1353,18 +1400,18 @@ def estimate_chunk_inference_mem(
                     / (1024**2)
                 )
                 # delete unused vars not in chunk_input_list
-                # we can't delete input nodes until chunk ends 
+                # we can't delete input nodes until chunk ends
                 if chunk_within:
                     act_memory -= self._get_chunk_delete_node_size(
                         node,
                         user_to_last_uses_no_free_var,
                         chunk_ratio,
-                        chunk_inputs_names
+                        chunk_inputs_names,
                     ) / (1024**2)
                 else:
-                    act_memory -= (self._get_delete_node_size(
+                    act_memory -= self._get_delete_node_size(
                         node, user_to_last_uses_no_free_var, chunk_inputs_names
-                    ) / (1024**2))
+                    ) / (1024**2)
 
             # log active node, only effective without chunk
             self._add_active_node(node, active_node_list)
@@ -1376,11 +1423,11 @@ def estimate_chunk_inference_mem(
                     self._get_output_node_size(node) * chunk_ratio / (1024**2)
                 )
                 act_memory -= self._get_chunk_inputs_size(
-                    chunk_inputs[chunk_region_idx], 
-                    chunk_inputs_non_chunk[chunk_region_idx], 
+                    chunk_inputs[chunk_region_idx],
+                    chunk_inputs_non_chunk[chunk_region_idx],
                     node_list,
-                    chunk_regions[chunk_region_idx][1]
-                    ) / (1024**2)
+                    chunk_regions[chunk_region_idx][1],
+                ) / (1024**2)
                 chunk_within = False
                 chunk_ratio = 1
                 chunk_region_idx = None
@@ -1436,7 +1483,7 @@ def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
         active_node_num = [len(i) for i in active_node]
         min_active_node_num = min(active_node_num[free_var_num:])
         threshold = max(free_var_num, min_active_node_num)
-        
+
         # from peak_node to free_var
         inside_flag = False
         chunk_region_start = free_var_num
@@ -1494,7 +1541,12 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                 continue
             for start_node, start_trace in start_traces.items():
                 for start_dim, start_trace_idx in enumerate(start_trace["idx"]):
-                    if start_idx == 199 and end_idx == 229 and start_dim == 2 and end_dim == 2:
+                    if (
+                        start_idx == 199
+                        and end_idx == 229
+                        and start_dim == 2
+                        and end_dim == 2
+                    ):
                         print(1)
                         self.flow_tracer.flow_search(
                             start_idx, start_dim, end_idx, end_dim, self.index_tracer
@@ -1576,7 +1628,7 @@ def _search_best_chunk_region(self, possible_chunk_regions, chunk_infos):
             max_region_range = 0
             best_region = None
         return best_region
-    
+
     def _is_legal_region(self, cur_chunk_info, chunk_infos):
         (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
         if cur_chunk_info in chunk_infos:
@@ -1585,11 +1637,13 @@ def _is_legal_region(self, cur_chunk_info, chunk_infos):
             return False
         for i in chunk_infos:
             region = i["region"]
-            if not ((chunk_region_start > region[1] and chunk_region_end > region[1]) 
-                    or (chunk_region_start < region[0] and chunk_region_end < region[0])):
+            if not (
+                (chunk_region_start > region[1] and chunk_region_end > region[1])
+                or (chunk_region_start < region[0] and chunk_region_end < region[0])
+            ):
                 return False
         return True
-    
+
     def _step_search(self, mem_peak, active_node, chunk_regions):
         peak_node = self._find_peak_node(mem_peak)
         max_chunk_region = self._search_max_chunk_region(
@@ -1600,7 +1654,9 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
         possible_chunk_regions = self._search_possible_chunk_regions(
             max_chunk_region, peak_node
         )
-        best_chunk_region = self._search_best_chunk_region(possible_chunk_regions, chunk_regions)
+        best_chunk_region = self._search_best_chunk_region(
+            possible_chunk_regions, chunk_regions
+        )
         return best_chunk_region
 
     def _stop_search(self, init_mem_peak, mem_peak):
@@ -1667,7 +1723,11 @@ def _gen_loop_end(
     chunk_slice = _gen_chunk_slice_dim(
         chunk_outputs_dim, "chunk_idx", chunk_output_shape
     )
-    context = "    chunk_result%s = %s;  %s = None\n" % (chunk_slice, chunk_outputs_name, chunk_outputs_name)
+    context = "    chunk_result%s = %s;  %s = None\n" % (
+        chunk_slice,
+        chunk_outputs_name,
+        chunk_outputs_name,
+    )
     context += (
         chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
     )

From 774d34f1aa2f9534557dd4a0ca866392a496e448 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 13:41:10 +0800
Subject: [PATCH 046/503] refactor flow search

---
 chunk_codegen.py | 78 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 58 insertions(+), 20 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index eb16361c04fc..0b0a164fe999 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1004,7 +1004,7 @@ def _assgin_single_node_flow(
 
         # if already in node_info, arg dim must be same
         if arg_node in all_node_info:
-            if all_node_info[arg_node] != arg_dim:
+            if all_node_info[arg_node]['chunk_dim'] != arg_dim:
                 return False
             all_node_info[arg_node]["fix_dim"] = list(
                 set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
@@ -1128,14 +1128,68 @@ def flow_search(
             "args": {},
         }
 
+        # move useless nodes ahead of loop
+        # get all possible prepose nodes
+        maybe_prepose_nodes = []
+        for node, node_info in all_node_info.items():
+            if node_info['chunk_dim'] is None:
+                maybe_prepose_nodes.append(node)
+        maybe_prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list), reverse=True) # from last node to first node
+        prepose_nodes = []
+        # set every node as root, search its args, if all legal, turn root and args as prepose nodes
+        while len(maybe_prepose_nodes) > 0:
+            tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]]
+            tmp_cur_related_prepose_nodes = []
+            prepose_flag = True
+            
+            # loop cur node's all arg until out of chunk
+            while len(tmp_cur_prepose_nodes) > 0:
+                tmp_next_prepose_nodes = []
+                tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes)
+                for cur_prepose_node in tmp_cur_prepose_nodes:
+                    for cur_prepose_node_arg in cur_prepose_node.args:
+                        if type(cur_prepose_node_arg) != type(cur_prepose_node):
+                            continue
+                        # out of loop
+                        if not (start_idx <= _find_idx_by_name(cur_prepose_node_arg.name, self.node_list) < end_idx):
+                            continue
+                        # compute op in loop
+                        elif cur_prepose_node_arg in all_node_info:
+                            if all_node_info[cur_prepose_node_arg]['chunk_dim'] is None:
+                                tmp_next_prepose_nodes.append(cur_prepose_node_arg)
+                            else:
+                                prepose_flag = False
+                                break; break; break
+                        # non compute op
+                        else:
+                            tmp_next_prepose_nodes.append(cur_prepose_node_arg)
+                tmp_cur_prepose_nodes = tmp_next_prepose_nodes
+            
+            if prepose_flag == False:
+                maybe_prepose_nodes.remove(maybe_prepose_nodes[0])
+                continue
+            else:
+                for n in tmp_cur_related_prepose_nodes:
+                    if n not in prepose_nodes:
+                        prepose_nodes.append(n)
+                    if n in maybe_prepose_nodes:
+                        maybe_prepose_nodes.remove(n)
+        # sort by index
+        prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list))
+        chunk_info["args"]["prepose_nodes"] = prepose_nodes
+        
         # we need to log input nodes to avoid deleteing them in the loop
+        chunk_node_list = self.node_list[start_idx : end_idx + 1]
+        # also need to get some prepose node's arg out of non_chunk_inputs
+        for n in prepose_nodes:
+            chunk_node_list.remove(n)
         non_chunk_inputs = _find_chunk_all_input_nodes(
-            self.node_list[start_idx : end_idx + 1]
+            chunk_node_list
         )
         for i in non_chunk_inputs:
-            if i not in chunk_info["inputs"]:
+            if i not in chunk_info["inputs"] and i not in prepose_nodes:
                 chunk_info["inputs_non_chunk"].append(i)
-
+        
         return chunk_info
 
 
@@ -1541,16 +1595,6 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                 continue
             for start_node, start_trace in start_traces.items():
                 for start_dim, start_trace_idx in enumerate(start_trace["idx"]):
-                    if (
-                        start_idx == 199
-                        and end_idx == 229
-                        and start_dim == 2
-                        and end_dim == 2
-                    ):
-                        print(1)
-                        self.flow_tracer.flow_search(
-                            start_idx, start_dim, end_idx, end_dim, self.index_tracer
-                        )
                     # dim size cannot be 1
                     if (
                         _get_node_shape(end_node)[end_dim] == 1
@@ -1567,12 +1611,6 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                         start_idx, end_dim, end_node, end_idx
                     ):
                         continue
-                    # detect flow meet
-                    # flow_block, chunk_info = self.flow_tracer._detect_flow(
-                    #     start_idx, start_dim, end_idx, end_dim, self.index_tracer
-                    # )
-                    # if flow_block:
-                    #     continue
                     # flow search
                     chunk_info = self.flow_tracer.flow_search(
                         start_idx, start_dim, end_idx, end_dim, self.index_tracer

From 522f01741864f3565f8e97837ecc7289774ee127 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 13:41:51 +0800
Subject: [PATCH 047/503] code style

---
 chunk_codegen.py | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 0b0a164fe999..a8b970116d1d 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1004,7 +1004,7 @@ def _assgin_single_node_flow(
 
         # if already in node_info, arg dim must be same
         if arg_node in all_node_info:
-            if all_node_info[arg_node]['chunk_dim'] != arg_dim:
+            if all_node_info[arg_node]["chunk_dim"] != arg_dim:
                 return False
             all_node_info[arg_node]["fix_dim"] = list(
                 set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
@@ -1132,16 +1132,19 @@ def flow_search(
         # get all possible prepose nodes
         maybe_prepose_nodes = []
         for node, node_info in all_node_info.items():
-            if node_info['chunk_dim'] is None:
+            if node_info["chunk_dim"] is None:
                 maybe_prepose_nodes.append(node)
-        maybe_prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list), reverse=True) # from last node to first node
+        maybe_prepose_nodes.sort(
+            key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list),
+            reverse=True,
+        )  # from last node to first node
         prepose_nodes = []
         # set every node as root, search its args, if all legal, turn root and args as prepose nodes
         while len(maybe_prepose_nodes) > 0:
             tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]]
             tmp_cur_related_prepose_nodes = []
             prepose_flag = True
-            
+
             # loop cur node's all arg until out of chunk
             while len(tmp_cur_prepose_nodes) > 0:
                 tmp_next_prepose_nodes = []
@@ -1151,20 +1154,28 @@ def flow_search(
                         if type(cur_prepose_node_arg) != type(cur_prepose_node):
                             continue
                         # out of loop
-                        if not (start_idx <= _find_idx_by_name(cur_prepose_node_arg.name, self.node_list) < end_idx):
+                        if not (
+                            start_idx
+                            <= _find_idx_by_name(
+                                cur_prepose_node_arg.name, self.node_list
+                            )
+                            < end_idx
+                        ):
                             continue
                         # compute op in loop
                         elif cur_prepose_node_arg in all_node_info:
-                            if all_node_info[cur_prepose_node_arg]['chunk_dim'] is None:
+                            if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None:
                                 tmp_next_prepose_nodes.append(cur_prepose_node_arg)
                             else:
                                 prepose_flag = False
-                                break; break; break
+                                break
+                                break
+                                break
                         # non compute op
                         else:
                             tmp_next_prepose_nodes.append(cur_prepose_node_arg)
                 tmp_cur_prepose_nodes = tmp_next_prepose_nodes
-            
+
             if prepose_flag == False:
                 maybe_prepose_nodes.remove(maybe_prepose_nodes[0])
                 continue
@@ -1175,21 +1186,21 @@ def flow_search(
                     if n in maybe_prepose_nodes:
                         maybe_prepose_nodes.remove(n)
         # sort by index
-        prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list))
+        prepose_nodes.sort(
+            key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list)
+        )
         chunk_info["args"]["prepose_nodes"] = prepose_nodes
-        
+
         # we need to log input nodes to avoid deleteing them in the loop
         chunk_node_list = self.node_list[start_idx : end_idx + 1]
         # also need to get some prepose node's arg out of non_chunk_inputs
         for n in prepose_nodes:
             chunk_node_list.remove(n)
-        non_chunk_inputs = _find_chunk_all_input_nodes(
-            chunk_node_list
-        )
+        non_chunk_inputs = _find_chunk_all_input_nodes(chunk_node_list)
         for i in non_chunk_inputs:
             if i not in chunk_info["inputs"] and i not in prepose_nodes:
                 chunk_info["inputs_non_chunk"].append(i)
-        
+
         return chunk_info
 
 
From d309e9338bde716ca356af8a27e0c484e97abbd9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 14:26:12 +0800
Subject: [PATCH 048/503] adapt codegen to prepose node

---
 chunk_codegen.py | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index a8b970116d1d..e3a7643d7499 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1198,7 +1198,7 @@ def flow_search(
             chunk_node_list.remove(n)
         non_chunk_inputs = _find_chunk_all_input_nodes(chunk_node_list)
         for i in non_chunk_inputs:
-            if i not in chunk_info["inputs"] and i not in prepose_nodes:
+            if i not in chunk_info["inputs"]:
                 chunk_info["inputs_non_chunk"].append(i)
 
         return chunk_info
@@ -1425,6 +1425,7 @@ def estimate_chunk_inference_mem(
                 ) / (1024**2)
 
             # determine chunk ratio for current node
+            # TODO: adapt to prepose node memory
             if chunk_within:
                 chunk_ratio = self._get_chunk_ratio(
                     node,
@@ -1602,7 +1603,6 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
         chunk_infos = []
         for end_dim, end_trace_idx in enumerate(end_trace["idx"]):
             if len(start_traces) > 1:
-                # TODO: implement multi input chunk
                 continue
             for start_node, start_trace in start_traces.items():
                 for start_dim, start_trace_idx in enumerate(start_trace["idx"]):
@@ -1831,7 +1831,6 @@ def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
 
     # if a node has a user node which is not in the node list
     # we treat that user node as the node receiving the current node output
-    # TODO: it is unsafe to remove non compute node here
     for node in nodes:
         for output_node in node.users.keys():
             if (
@@ -1900,6 +1899,8 @@ def emit_code_with_chunk(
 
     chunk_outputs = [i["outputs"][0] for i in chunk_search]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search]
+    
+    chunk_prepose_nodes = [i["args"]["prepose_nodes"] for i in chunk_search]
 
     node_idx = 0
     region_idx = 0
@@ -1911,7 +1912,11 @@ def emit_code_with_chunk(
         if node_idx in chunk_starts:
             within_chunk_region = True
             region_idx = chunk_starts.index(node_idx)
-
+            # add prepose nodes
+            for i in chunk_prepose_nodes[region_idx]:
+                prepose_node = node_list[_find_idx_by_name(i.name, node_list)]
+                emit_node_func(prepose_node, body)
+                delete_unused_value_func(prepose_node, body, chunk_inputs_names)
             # add for loop
             body.append(
                 _gen_loop_start(
@@ -1922,20 +1927,22 @@ def emit_code_with_chunk(
             )
 
         if within_chunk_region:
-            emit_node_func(node, body)
-            # replace input var with chunk var
-            for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
-                for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
-                    if idx == node_idx:
-                        chunk_slice = _gen_chunk_slice_dim(
-                            dim, "chunk_idx", _get_node_shape(input_node)
-                        )
-                        body[-1] = _replace_name(
-                            body[-1], input_node.name, input_node.name + chunk_slice
-                        )
-            body[-1] = "    " + body[-1]
-            delete_unused_value_func(node, body, chunk_inputs_names)
-
+            if any(node.name == i.name for i in chunk_prepose_nodes[region_idx]):
+                pass
+            else:
+                emit_node_func(node, body)
+                # replace input var with chunk var
+                for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
+                    for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
+                        if idx == node_idx:
+                            chunk_slice = _gen_chunk_slice_dim(
+                                dim, "chunk_idx", _get_node_shape(input_node)
+                            )
+                            body[-1] = _replace_name(
+                                body[-1], input_node.name, input_node.name + chunk_slice
+                            )
+                body[-1] = "    " + body[-1]
+                delete_unused_value_func(node, body, chunk_inputs_names)
         else:
             emit_node_func(node, body)
             if node_idx not in chunk_inputs:

From 49ba619085c33eef372e73b6a45aecdc3d37937f Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 14:26:43 +0800
Subject: [PATCH 049/503] code style

---
 chunk_codegen.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index e3a7643d7499..40196285ec8c 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1899,7 +1899,7 @@ def emit_code_with_chunk(
 
     chunk_outputs = [i["outputs"][0] for i in chunk_search]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search]
-    
+
     chunk_prepose_nodes = [i["args"]["prepose_nodes"] for i in chunk_search]
 
     node_idx = 0
@@ -1933,7 +1933,9 @@ def emit_code_with_chunk(
                 emit_node_func(node, body)
                 # replace input var with chunk var
                 for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
-                    for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
+                    for idx, dim in chunk_inputs_dim[region_idx][
+                        input_node_idx
+                    ].items():
                         if idx == node_idx:
                             chunk_slice = _gen_chunk_slice_dim(
                                 dim, "chunk_idx", _get_node_shape(input_node)

From 4d89525fc2f828c9c65bf4077b677db9a78c8466 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 14:28:49 +0800
Subject: [PATCH 050/503] remove abandoned function

---
 chunk_codegen.py | 106 -----------------------------------------------
 1 file changed, 106 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 40196285ec8c..e2786d5e244f 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -857,112 +857,6 @@ def trace_flow(self):
                     )
         return self.flow_trace
 
-    def _detect_flow(
-        self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer
-    ):
-        inputs, outputs = _find_chunk_compute_input_and_output_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        chunk_info = {
-            "region": (start_idx, end_idx),
-            "inputs": inputs,
-            "inputs_non_chunk": [],
-            "inputs_dim": start_dim,
-            "outputs": outputs,
-            "outputs_dim": end_dim,
-            "args": {},
-        }
-        flow_block = False
-
-        # TODO don't allow multi outputs now
-        if len(outputs) > 1:
-            flow_block = True
-            return flow_block, chunk_info
-
-        # for idx in range(start_idx, end_idx + 1):
-        #     node = self.node_list[idx]
-        #     mix_flow_node = self._get_flow_mix_node(node)
-        #     if mix_flow_node is None:
-        #         continue
-
-        #     # if there is a flow mix, op must be in [mul, add, matmul]
-        #     # element-wise op requires dim to be equal in every dim
-        #     if any(n in node.name for n in ["mul", "add"]):
-        #         for i in node.args:
-        #             if type(i) == type(mix_flow_node) and i != mix_flow_node:
-        #                 main_flow_var = i
-        #         # if mix flow is a broadcast in chunk dim,
-        #         # TODO: need to move that flow out of the chunk
-        #         mix_flow_node_dim = index_tracer.get_node_chunk_dim(
-        #             self.node_list[end_idx], end_dim, node
-        #         )
-        #         # TODO: we need to loop every dim
-        #         if isinstance(mix_flow_node_dim, list):
-        #             mix_flow_node_dim = mix_flow_node_dim[0]
-        #         if mix_flow_node_dim is None:
-        #             flow_block = True
-        #             break
-        #         if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1:
-        #             flow_block = False
-        #             for i in self._get_same_flow_node(
-        #                 chunk_info["inputs"], mix_flow_node
-        #             ):
-        #                 chunk_info["inputs"].remove(i)
-        #         # else, we need to chunk mix var as well
-        #         else:
-        #             # TODO chunk another value
-        #             flow_block = True
-        #             break
-        #     else:
-        #         raise NotImplementedError("%s not implemented" % node.name)
-        # if flow_block:
-        #     flow_block = True
-        #     return flow_block, chunk_info
-
-        inputs_dim = []
-        remove_inputs = []
-        for input_node in chunk_info["inputs"]:
-            input_dict = {}
-            for user in input_node.users.keys():
-                if _is_non_compute_node(user):
-                    continue
-                user_idx = _find_idx_by_name(user.name, self.node_list)
-                dim = None
-                if start_dim <= user_idx < end_idx:
-                    dim = index_tracer.get_node_chunk_dim(
-                        self.node_list[end_idx], end_dim, input_node
-                    )
-                    # TODO: we need to loop every dim
-                    if isinstance(dim, list):
-                        dim = dim[0]
-                elif user_idx == end_idx:
-                    dim = end_dim
-                # n has relation with chunk dim
-                if dim is not None and _get_node_shape(user)[dim] != 1:
-                    input_dict[user_idx] = dim
-            if len(input_dict) == 0:
-                remove_inputs.append(input_node)
-            else:
-                inputs_dim.append(input_dict)
-        chunk_info["inputs_dim"] = inputs_dim
-        for i in remove_inputs:
-            if i in chunk_info["inputs"]:
-                chunk_info["inputs"].remove(i)
-
-        duplicate_result, duplicate_dim = index_tracer.check_index_duplicate(
-            chunk_info, return_dim=True
-        )
-
-        # we need to log input nodes to avoid deleteing them in the loop
-        non_chunk_inputs = _find_chunk_all_input_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        for i in non_chunk_inputs:
-            if i not in chunk_info["inputs"]:
-                chunk_info["inputs_non_chunk"].append(i)
-
-        return flow_block, chunk_info
-
     def _assgin_single_node_flow(
         self,
         arg_node,

From 4f5e105af30fccb4b0595edd341bdd7a4b226aa9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 15:34:41 +0800
Subject: [PATCH 051/503] remove flow tracer

---
 chunk_codegen.py | 171 ++++++++---------------------------------------
 1 file changed, 27 insertions(+), 144 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index e2786d5e244f..838f53949de7 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -67,7 +67,7 @@ def _is_non_compute_node_except_placeholder_output(node):
 class IndexTracer(object):
     def __init__(self, gm) -> None:
         self.gm = gm
-        self.nodes_list = list(gm.graph.nodes)
+        self.node_list = list(gm.graph.nodes)
         self.idx_trace_list = self._init_idx_trace_list()
         self.idx_trace_equal = []
         self.idx_view_list = []
@@ -75,7 +75,7 @@ def __init__(self, gm) -> None:
 
     def _init_idx_trace_list(self):
         idx_trace_list = []
-        for n in self.nodes_list:
+        for n in self.node_list:
             if _get_node_shape(n) != None:
                 cur_trace = {
                     "idx": [None for _ in range(len(_get_node_shape(n)))],
@@ -136,7 +136,7 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False
         node_from_trace = self._find_trace_from_node(node_from)
         node_to_dim = self._transform_index(node_to, node_to_dim)
         node_to_trace = self._find_trace_from_node(node_to)
-        node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list)
+        node_from_idx = _find_idx_by_name(node_from.name, self.node_list)
         if init:
             node_to_trace["source"][node_to_dim] = {}
         # add dim to cur new source
@@ -210,7 +210,7 @@ def _find_trace_from_node(self, node):
             idx (list): idx of the node
             compute (list): computed idx of the node.
         """
-        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_idx = _find_idx_by_name(node.name, self.node_list)
         node_dict = self.idx_trace_list[node_idx]
         return node_dict
 
@@ -224,7 +224,7 @@ def _find_source_trace_from_node(self, node):
             idx (list): idx of the node
             compute (list): computed idx of the node.
         """
-        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_idx = _find_idx_by_name(node.name, self.node_list)
         node_dict = self.idx_trace_list[node_idx]
         return node_dict["source"]
 
@@ -237,7 +237,7 @@ def _find_idx_trace_from_node(self, node):
         Returns:
             idx (list): idx of the node
         """
-        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_idx = _find_idx_by_name(node.name, self.node_list)
         return self.idx_trace_list[node_idx]["idx"]
 
     def _find_compute_trace_from_node(self, node):
@@ -249,7 +249,7 @@ def _find_compute_trace_from_node(self, node):
         Returns:
             compute (list): computed idx of the node.
         """
-        node_idx = _find_idx_by_name(node.name, self.nodes_list)
+        node_idx = _find_idx_by_name(node.name, self.node_list)
         return self.idx_trace_list[node_idx]["compute"]
 
     def _assign_index_as_input(self, node, node_idx, input_node=None):
@@ -262,7 +262,7 @@ def _assign_index_as_input(self, node, node_idx, input_node=None):
         """
         if input_node == None:
             input_node = node.args[0]
-        input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list)
+        input_node_idx = _find_idx_by_name(input_node.name, self.node_list)
         input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"]
 
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
@@ -591,7 +591,7 @@ def _merge_equal_idx(self):
                     ]
 
     def trace_index(self):
-        for idx, node in enumerate(self.nodes_list):
+        for idx, node in enumerate(self.node_list):
             if node.op == "placeholder":
                 self._assign_all_index(node, idx)
             elif node.op == "call_method":
@@ -655,7 +655,7 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node
         Returns:
             bool: True if check pass
         """
-        start_node_idx = _find_idx_by_name(start_node.name, self.nodes_list)
+        start_node_idx = _find_idx_by_name(start_node.name, self.node_list)
         end_node_trace = self._find_trace_from_node(end_node)
         end_node_trace_source = end_node_trace["source"][end_dim]
         sorted_source = sorted(
@@ -690,14 +690,14 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
     def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
         node_from_source = self._find_source_trace_from_node(node_from)
         dim_source = node_from_source[node_from_dim]
-        node_to_idx = _find_idx_by_name(node_to.name, self.nodes_list)
+        node_to_idx = _find_idx_by_name(node_to.name, self.node_list)
         for k, v in dim_source.items():
             if k == node_to_idx:
                 return v
         return None
 
     def _find_inherit_dim(self, input_node, input_dim, node):
-        input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list)
+        input_node_idx = _find_idx_by_name(input_node.name, self.node_list)
         node_trace_source = self._find_source_trace_from_node(node)
         for node_dim in range(len(_get_node_shape(node))):
             if (
@@ -711,11 +711,11 @@ def check_index_duplicate(self, chunk_infos, return_dim=False):
         input_dim_after_node = {}
         for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
             for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
-                inherit_dim = self._find_inherit_dim(input_node, v, self.nodes_list[k])
+                inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k])
                 if inherit_dim:
                     input_dim_after_node[k] = inherit_dim
 
-        for node in self.nodes_list[
+        for node in self.node_list[
             chunk_infos["region"][0] : chunk_infos["region"][1] + 1
         ]:
             if _is_non_compute_node_except_placeholder(node):
@@ -746,124 +746,11 @@ def check_index_duplicate(self, chunk_infos, return_dim=False):
         else:
             return True
 
-
-class FlowTracer(object):
-    def __init__(self, gm) -> None:
-        self.gm = gm
-        self.node_list = list(gm.graph.nodes)
-        self.flow_trace = {}
-
-    def _add_trace(self, name):
-        self.flow_trace[name] = []
-
-    def _add_node(self, trace_name, node):
-        self.flow_trace[trace_name].append(
-            {"node": node, "inside_depend": [], "outside_depend": []}
-        )
-
-    def _add_inside_depend(self, flow_name, node, inside_depend_node):
-        for i in self.flow_trace[flow_name]:
-            if i["node"] == node:
-                i["inside_depend"].append(inside_depend_node)
-                return
-        raise RuntimeError("node not found")
-
-    def _add_outside_depend(
-        self, flow_name, node, outside_depend_node, outside_depend_trace
-    ):
-        for i in self.flow_trace[flow_name]:
-            if i["node"] == node:
-                i["outside_depend"].append({outside_depend_trace: outside_depend_node})
-                return
-        raise RuntimeError("node not found")
-
-    def _init_trace(self):
-        for i in self.node_list:
-            if i.op == "placeholder":
-                self._add_trace(i.name)
-                self._add_node(i.name, i)
-
-    def _find_flow_for_node(self, node):
-        if type(self.node_list[0]) != type(node):
-            return None
-        if _is_non_compute_node_except_placeholder(node):
-            return None
-        for name, trace in self.flow_trace.items():
-            for i in trace:
-                if node == i["node"]:
-                    return name
-        if any(i in node.name for i in ["ones_like"]):
-            self._add_trace(node.name)
-            self._add_node(node.name, node)
-            return node.name
-        raise RuntimeError("node not found")
-
-    def _find_first_valid_flow(self, flow):
-        for i in flow:
-            if i is not None:
-                return i
-        raise RuntimeError("invalid flow")
-
-    def find_node_flow(self, node):
-        for name, trace in self.flow_trace.items():
-            for i in trace:
-                if node == i["node"]:
-                    return name, i
-        raise RuntimeError("invalid node")
-
-    def _get_flow_mix_node(self, node):
-        if _is_non_compute_node(node):
-            return None
-        _, node_trace = self.find_node_flow(node)
-        if len(node_trace["outside_depend"]) == 0:
-            return None
-        elif len(node_trace["outside_depend"]) > 1:
-            raise NotImplementedError
-        vars = list(node_trace["outside_depend"][0].values())[0]
-        return vars
-
-    def _get_same_flow_node(self, node_list, node):
-        name, _ = self.find_node_flow(node)
-        result = []
-        for i in self.flow_trace[name]:
-            if i["node"] in node_list:
-                result.append(i["node"])
-        return result
-
-    def trace_flow(self):
-        # init trace
-        self._init_trace()
-
-        for node in self.node_list:
-            # skip if non compute node
-            if all(
-                type(arg) != type(node) or _is_non_compute_node_except_placeholder(arg)
-                for arg in node.args
-            ) or _is_non_compute_node(node):
-                continue
-
-            node_input_flows = [self._find_flow_for_node(arg) for arg in node.args]
-
-            node_domin_flow = self._find_first_valid_flow(node_input_flows)
-            self._add_node(node_domin_flow, node)
-            for node_input_flow, arg in zip(node_input_flows, node.args):
-                if node_input_flow is None:
-                    continue
-                elif node_input_flow == node_domin_flow:
-                    self._add_inside_depend(node_domin_flow, node, arg)
-                else:
-                    self._add_outside_depend(
-                        node_domin_flow, node, arg, node_input_flow
-                    )
-        return self.flow_trace
-
     def _assgin_single_node_flow(
         self,
         arg_node,
         start_idx,
         end_idx,
-        inputs,
-        index_tracer,
         cur_node_dim,
         cur_node_compute,
         cur_node_source,
@@ -871,7 +758,7 @@ def _assgin_single_node_flow(
         all_node_info,
         next_node_list,
     ):
-        arg_idx = _find_idx_by_name(arg_node.name, index_tracer.nodes_list)
+        arg_idx = _find_idx_by_name(arg_node.name, self.node_list)
         # arg in chunk range or be inputs
         if not (start_idx <= arg_idx < end_idx):
             return True
@@ -911,7 +798,7 @@ def _assgin_single_node_flow(
         return True
 
     def flow_search(
-        self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer
+        self, start_idx, start_dim, end_idx, end_dim
     ):
         inputs, outputs = _find_chunk_compute_input_and_output_nodes(
             self.node_list[start_idx : end_idx + 1]
@@ -920,7 +807,7 @@ def flow_search(
         if len(outputs) > 1:
             return None
 
-        cur_node_list = [index_tracer.nodes_list[end_idx]]  # start from the last node
+        cur_node_list = [self.node_list[end_idx]]  # start from the last node
         all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
 
         while len(cur_node_list) > 0:
@@ -930,12 +817,12 @@ def flow_search(
                 # get cur node info
                 cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
                 cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
-                cur_node_idx = _find_idx_by_name(cur_node.name, index_tracer.nodes_list)
+                cur_node_idx = _find_idx_by_name(cur_node.name, self.node_list)
                 if cur_node_chunk_dim:
-                    cur_node_compute = index_tracer._find_compute_trace_from_node(
+                    cur_node_compute = self._find_compute_trace_from_node(
                         cur_node
                     )
-                    cur_node_source = index_tracer._find_source_trace_from_node(
+                    cur_node_source = self._find_source_trace_from_node(
                         cur_node
                     )
                 else:
@@ -953,8 +840,6 @@ def flow_search(
                         arg,
                         start_idx,
                         end_idx,
-                        inputs,
-                        index_tracer,
                         cur_node_chunk_dim,
                         cur_node_compute,
                         cur_node_source,
@@ -970,7 +855,7 @@ def flow_search(
                         for arg in arg_list:
                             if not (
                                 start_idx
-                                <= _find_idx_by_name(arg.name, index_tracer.nodes_list)
+                                <= _find_idx_by_name(arg.name, self.node_list)
                                 < end_idx
                             ):
                                 continue
@@ -1029,7 +914,7 @@ def flow_search(
             if node_info["chunk_dim"] is None:
                 maybe_prepose_nodes.append(node)
         maybe_prepose_nodes.sort(
-            key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list),
+            key=lambda x: _find_idx_by_name(x.name, self.node_list),
             reverse=True,
         )  # from last node to first node
         prepose_nodes = []
@@ -1081,7 +966,7 @@ def flow_search(
                         maybe_prepose_nodes.remove(n)
         # sort by index
         prepose_nodes.sort(
-            key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list)
+            key=lambda x: _find_idx_by_name(x.name, self.node_list)
         )
         chunk_info["args"]["prepose_nodes"] = prepose_nodes
 
@@ -1226,9 +1111,9 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size):
             for k, v in input_node_dim.items():
                 # TODO: inherit dim should be list too, int now
                 inherit_dim = self.index_tracer._find_inherit_dim(
-                    input_node, v, self.index_tracer.nodes_list[k]
+                    input_node, v, self.index_tracer.node_list[k]
                 )
-                if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list):
+                if k == _find_idx_by_name(node.name, self.index_tracer.node_list):
                     chunk_ratio = float(chunk_size) / node_shape[inherit_dim]
                     return chunk_ratio
                 for dim, source in enumerate(node_source):
@@ -1412,8 +1297,6 @@ def __init__(self, gm) -> None:
         self.node_list = list(gm.graph.nodes)
         self.index_tracer = IndexTracer(gm)
         self.index_tracer.trace_index()
-        self.flow_tracer = FlowTracer(gm)
-        self.flow_tracer.trace_flow()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
 
     def _find_peak_node(self, mem_peak):
@@ -1517,8 +1400,8 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                     ):
                         continue
                     # flow search
-                    chunk_info = self.flow_tracer.flow_search(
-                        start_idx, start_dim, end_idx, end_dim, self.index_tracer
+                    chunk_info = self.index_tracer.flow_search(
+                        start_idx, start_dim, end_idx, end_dim
                     )
                     if chunk_info is None:
                         continue

From fa5e6fbf96448ebff1dc682e749a3f73a5a9c2b5 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 15:38:37 +0800
Subject: [PATCH 052/503] code style

---
 chunk_codegen.py | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 838f53949de7..e80b0fd9be77 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -65,9 +65,8 @@ def _is_non_compute_node_except_placeholder_output(node):
 
 
 class IndexTracer(object):
-    def __init__(self, gm) -> None:
-        self.gm = gm
-        self.node_list = list(gm.graph.nodes)
+    def __init__(self, node_list) -> None:
+        self.node_list = node_list
         self.idx_trace_list = self._init_idx_trace_list()
         self.idx_trace_equal = []
         self.idx_view_list = []
@@ -797,9 +796,7 @@ def _assgin_single_node_flow(
         next_node_list.append(arg_node)
         return True
 
-    def flow_search(
-        self, start_idx, start_dim, end_idx, end_dim
-    ):
+    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         inputs, outputs = _find_chunk_compute_input_and_output_nodes(
             self.node_list[start_idx : end_idx + 1]
         )
@@ -819,12 +816,8 @@ def flow_search(
                 cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
                 cur_node_idx = _find_idx_by_name(cur_node.name, self.node_list)
                 if cur_node_chunk_dim:
-                    cur_node_compute = self._find_compute_trace_from_node(
-                        cur_node
-                    )
-                    cur_node_source = self._find_source_trace_from_node(
-                        cur_node
-                    )
+                    cur_node_compute = self._find_compute_trace_from_node(cur_node)
+                    cur_node_source = self._find_source_trace_from_node(cur_node)
                 else:
                     cur_node_compute = cur_node_source = None
 
@@ -965,9 +958,7 @@ def flow_search(
                     if n in maybe_prepose_nodes:
                         maybe_prepose_nodes.remove(n)
         # sort by index
-        prepose_nodes.sort(
-            key=lambda x: _find_idx_by_name(x.name, self.node_list)
-        )
+        prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, self.node_list))
         chunk_info["args"]["prepose_nodes"] = prepose_nodes
 
         # we need to log input nodes to avoid deleteing them in the loop
@@ -1295,7 +1286,9 @@ class ChunkRegionSearch(object):
     def __init__(self, gm) -> None:
         self.gm = gm
         self.node_list = list(gm.graph.nodes)
-        self.index_tracer = IndexTracer(gm)
+        self.index_tracer = IndexTracer(
+            self.node_list
+        )  # node list shared in index tracer
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
 

From e0ae68e736cb56015fd1316113d52affaaf27749 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 15:49:04 +0800
Subject: [PATCH 053/503] code style

---
 chunk_codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index e80b0fd9be77..6e772aa8a56a 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1497,8 +1497,8 @@ def search_region(self):
             chunk_info = self._step_search(mem_peak, active_node, chunk_infos)
             if chunk_info is None:
                 break
-
             chunk_infos.append(chunk_info)
+
             (
                 mem_peak,
                 _,

From 884a228ea674b02998575776b0069b15de0b7a10 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 17:06:07 +0800
Subject: [PATCH 054/503] reorder nodes

---
 chunk_codegen.py | 127 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 101 insertions(+), 26 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 6e772aa8a56a..4b3b04d93b91 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -71,6 +71,7 @@ def __init__(self, node_list) -> None:
         self.idx_trace_equal = []
         self.idx_view_list = []
         self.idx_count = -1
+        self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))}
 
     def _init_idx_trace_list(self):
         idx_trace_list = []
@@ -973,6 +974,91 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
 
         return chunk_info
 
+    def _get_reorder_map(self, chunk_info):
+        reorder_map = {i: i for i in range(len(self.node_list))}
+
+        chunk_region_start = chunk_info["region"][0]
+        chunk_region_end = chunk_info["region"][1]
+        chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
+        chunk_prepose_nodes_idx = [
+            _find_idx_by_name(i.name, self.node_list) for i in chunk_prepose_nodes
+        ]
+        # put prepose nodes ahead
+        for idx, n in enumerate(chunk_prepose_nodes):
+            n_idx = chunk_prepose_nodes_idx[idx]
+            reorder_map[n_idx] = chunk_region_start + idx
+        # put other nodes after prepose nodes
+        for n in self.node_list[chunk_region_start : chunk_region_end + 1]:
+            if n in chunk_prepose_nodes:
+                continue
+            n_idx = _find_idx_by_name(n.name, self.node_list)
+            pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
+            reorder_map[n_idx] = n_idx + pos
+
+        return reorder_map
+
+    def _reorder_chunk_info(self, chunk_info, reorder_map):
+        # update chunk info
+        chunk_info["region"] = (
+            chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]),
+            chunk_info["region"][1],
+        )
+        for idx, input_dim in enumerate(chunk_info["inputs_dim"]):
+            new_input_dim = {}
+            for k, v in input_dim.items():
+                new_input_dim[reorder_map[k]] = v
+            chunk_info["inputs_dim"][idx] = new_input_dim
+        return chunk_info
+
+    def _update_all_reorder_map(self, reorder_map):
+        for origin_idx, map_idx in self.all_reorder_map.items():
+            self.all_reorder_map[origin_idx] = reorder_map[map_idx]
+
+    def _reorder_self_node_list(self, reorder_map):
+        new_node_list = [None for _ in range(len(self.node_list))]
+        for old_idx, new_idx in reorder_map.items():
+            new_node_list[new_idx] = self.node_list[old_idx]
+        self.node_list = new_node_list
+
+    def _reorder_idx_trace(self, reorder_map):
+        # reorder list
+        new_idx_trace_list = [None for _ in range(len(self.idx_trace_list))]
+        for old_idx, new_idx in reorder_map.items():
+            new_idx_trace_list[new_idx] = self.idx_trace_list[old_idx]
+        self.idx_trace_list = new_idx_trace_list
+        # update compute
+        for idx_trace in self.idx_trace_list:
+            compute = idx_trace["compute"]
+            for dim_compute in compute:
+                for idx, i in enumerate(dim_compute):
+                    dim_compute[idx] = reorder_map[i]
+        # update source
+        for idx_trace in self.idx_trace_list:
+            source = idx_trace["source"]
+            for dim_idx, dim_source in enumerate(source):
+                new_dim_source = {}
+                for k, v in dim_source.items():
+                    new_dim_source[reorder_map[k]] = v
+                source[dim_idx] = new_dim_source
+
+    def reorder_all(self, chunk_info):
+        if chunk_info is None:
+            return chunk_info
+        if len(chunk_info["args"]["prepose_nodes"]) == 0:
+            return chunk_info
+        reorder_map = self._get_reorder_map(chunk_info)
+        self._update_all_reorder_map(reorder_map)
+        self._reorder_idx_trace(reorder_map)
+        self._reorder_self_node_list(reorder_map)
+        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
+        return chunk_info
+
+    def reorder_node_list(self, node_list):
+        new_node_list = [None for _ in range(len(node_list))]
+        for old_idx, new_idx in self.all_reorder_map.items():
+            new_node_list[new_idx] = node_list[old_idx]
+        return new_node_list
+
 
 class MemoryEstimator(object):
     def __init__(self, index_tracer: IndexTracer) -> None:
@@ -1476,6 +1562,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
         best_chunk_region = self._search_best_chunk_region(
             possible_chunk_regions, chunk_regions
         )
+        best_chunk_region = self.index_tracer.reorder_all(best_chunk_region)
         return best_chunk_region
 
     def _stop_search(self, init_mem_peak, mem_peak):
@@ -1670,8 +1757,7 @@ def emit_code_with_chunk(
     chunk_outputs = [i["outputs"][0] for i in chunk_search]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search]
 
-    chunk_prepose_nodes = [i["args"]["prepose_nodes"] for i in chunk_search]
-
+    node_list = chunk_region_search.index_tracer.reorder_node_list(node_list)
     node_idx = 0
     region_idx = 0
     within_chunk_region = False
@@ -1682,12 +1768,6 @@ def emit_code_with_chunk(
         if node_idx in chunk_starts:
             within_chunk_region = True
             region_idx = chunk_starts.index(node_idx)
-            # add prepose nodes
-            for i in chunk_prepose_nodes[region_idx]:
-                prepose_node = node_list[_find_idx_by_name(i.name, node_list)]
-                emit_node_func(prepose_node, body)
-                delete_unused_value_func(prepose_node, body, chunk_inputs_names)
-            # add for loop
             body.append(
                 _gen_loop_start(
                     chunk_inputs[region_idx],
@@ -1697,24 +1777,19 @@ def emit_code_with_chunk(
             )
 
         if within_chunk_region:
-            if any(node.name == i.name for i in chunk_prepose_nodes[region_idx]):
-                pass
-            else:
-                emit_node_func(node, body)
-                # replace input var with chunk var
-                for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
-                    for idx, dim in chunk_inputs_dim[region_idx][
-                        input_node_idx
-                    ].items():
-                        if idx == node_idx:
-                            chunk_slice = _gen_chunk_slice_dim(
-                                dim, "chunk_idx", _get_node_shape(input_node)
-                            )
-                            body[-1] = _replace_name(
-                                body[-1], input_node.name, input_node.name + chunk_slice
-                            )
-                body[-1] = "    " + body[-1]
-                delete_unused_value_func(node, body, chunk_inputs_names)
+            emit_node_func(node, body)
+            # replace input var with chunk var
+            for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
+                for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
+                    if idx == node_idx:
+                        chunk_slice = _gen_chunk_slice_dim(
+                            dim, "chunk_idx", _get_node_shape(input_node)
+                        )
+                        body[-1] = _replace_name(
+                            body[-1], input_node.name, input_node.name + chunk_slice
+                        )
+            body[-1] = "    " + body[-1]
+            delete_unused_value_func(node, body, chunk_inputs_names)
         else:
             emit_node_func(node, body)
             if node_idx not in chunk_inputs:

From 51ef8384c153f46dcbb74c26eec523ad7cd0d51c Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 17:25:36 +0800
Subject: [PATCH 055/503] finish node reorder

---
 chunk_codegen.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 4b3b04d93b91..9623a9d9bbe2 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1238,7 +1238,7 @@ def _print_compute_op_mem_log(self, log, nodes, title=None):
 
     def estimate_chunk_inference_mem(
         self,
-        gm: torch.fx.GraphModule,
+        node_list,
         chunk_infos=None,
     ):
         act_memory = 0.0
@@ -1247,7 +1247,6 @@ def estimate_chunk_inference_mem(
         active_node_list = []
         active_node_list_log = []
         not_contiguous_list = []
-        node_list = list(gm.graph.nodes)
         user_to_last_uses = self._get_last_usr(node_list)
         user_to_last_uses_no_free_var = self._get_last_usr(node_list)
         _delete_free_var_from_last_use(user_to_last_uses_no_free_var)
@@ -1281,7 +1280,6 @@ def estimate_chunk_inference_mem(
                 ) / (1024**2)
 
             # determine chunk ratio for current node
-            # TODO: adapt to prepose node memory
             if chunk_within:
                 chunk_ratio = self._get_chunk_ratio(
                     node,
@@ -1371,10 +1369,7 @@ def estimate_chunk_inference_mem(
 class ChunkRegionSearch(object):
     def __init__(self, gm) -> None:
         self.gm = gm
-        self.node_list = list(gm.graph.nodes)
-        self.index_tracer = IndexTracer(
-            self.node_list
-        )  # node list shared in index tracer
+        self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
 
@@ -1385,7 +1380,7 @@ def _find_peak_node(self, mem_peak):
 
     def _get_free_var(self):
         free_var_idx = []
-        for idx, n in enumerate(self.node_list):
+        for idx, n in enumerate(self.index_tracer.node_list):
             if n.op == "placeholder":
                 free_var_idx.append(idx)
         return free_var_idx
@@ -1455,13 +1450,13 @@ def _is_not_compute(self, trace, chunk_range, dim_idx):
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
-        end_node = self.node_list[end_idx]
+        end_node = self.index_tracer.node_list[end_idx]
         chunk_infos = []
-        for end_dim, end_trace_idx in enumerate(end_trace["idx"]):
+        for end_dim, _ in enumerate(end_trace["idx"]):
             if len(start_traces) > 1:
                 continue
             for start_node, start_trace in start_traces.items():
-                for start_dim, start_trace_idx in enumerate(start_trace["idx"]):
+                for start_dim, _ in enumerate(start_trace["idx"]):
                     # dim size cannot be 1
                     if (
                         _get_node_shape(end_node)[end_dim] == 1
@@ -1494,7 +1489,7 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
         possible_chunk_region = []
         output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
         input_trace = []  # trace of a node's input nodes
-        for _, n in enumerate(self.node_list):
+        for _, n in enumerate(self.index_tracer.node_list):
             cur_trace = {}
             for arg in n.args:
                 if type(arg) == type(n) and not _is_non_compute_node_except_placeholder(
@@ -1507,8 +1502,8 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
             for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
                 if _is_non_compute_node(
-                    self.node_list[start_idx]
-                ) or _is_non_compute_node(self.node_list[end_idx]):
+                    self.index_tracer.node_list[start_idx]
+                ) or _is_non_compute_node(self.index_tracer.node_list[end_idx]):
                     continue
 
                 # select free dim
@@ -1577,7 +1572,9 @@ def search_region(self):
             init_mem_peak,
             _,
             active_node,
-        ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm)
+        ) = self.memory_estimator.estimate_chunk_inference_mem(
+            self.index_tracer.node_list
+        )
         mem_peak = init_mem_peak
 
         while True:
@@ -1590,7 +1587,9 @@ def search_region(self):
                 mem_peak,
                 _,
                 active_node,
-            ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm, chunk_infos)
+            ) = self.memory_estimator.estimate_chunk_inference_mem(
+                self.index_tracer.node_list, chunk_infos
+            )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
         return chunk_infos

From 9b1b890347f345f1c4de2a0991e250dcaf94365a Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 17:32:11 +0800
Subject: [PATCH 056/503] update run

---
 chunk_codegen_run.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py
index ae4653d6545b..3a3b3c599e3e 100644
--- a/chunk_codegen_run.py
+++ b/chunk_codegen_run.py
@@ -32,15 +32,25 @@ def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool:
 
 
 def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
+    # now_mem = torch.cuda.memory_allocated() / 1024**2
+    # with torch.no_grad():
+    #     node0 = node.clone()
+    #     pair0 = pair.clone()
+    #     model.graph(node0, pair0, now_mem)        
+    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
+    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    # print("\ncode now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem))
+    
+    torch.cuda.reset_peak_memory_stats()
     now_mem = torch.cuda.memory_allocated() / 1024**2
     with torch.no_grad():
-        node0 = node.clone()
-        pair0 = pair.clone()
-        node1, pair1 = gm(node0, pair0)        
+        node1 = node.clone()
+        pair1 = pair.clone()
+        gm(node1, pair1)        
     new_now_mem = torch.cuda.memory_allocated() / 1024**2
     new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem))
-    
+    print("gm now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem))
+            
     # test forward
     with torch.no_grad():
         non_fx_out = model(node, pair)

From 786a398a6bdea395e2ca8ddde87c87c8470d971b Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 23 Dec 2022 17:42:51 +0800
Subject: [PATCH 057/503] code style

---
 chunk_codegen.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 9623a9d9bbe2..f87a3a132e78 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -920,9 +920,13 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
 
             # loop cur node's all arg until out of chunk
             while len(tmp_cur_prepose_nodes) > 0:
+                if prepose_flag == False:
+                    break
                 tmp_next_prepose_nodes = []
                 tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes)
                 for cur_prepose_node in tmp_cur_prepose_nodes:
+                    if prepose_flag == False:
+                        break
                     for cur_prepose_node_arg in cur_prepose_node.args:
                         if type(cur_prepose_node_arg) != type(cur_prepose_node):
                             continue
@@ -942,8 +946,6 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
                             else:
                                 prepose_flag = False
                                 break
-                                break
-                                break
                         # non compute op
                         else:
                             tmp_next_prepose_nodes.append(cur_prepose_node_arg)

From 1b8a066592821870bb8f7a6fce338481efd5140b Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 26 Dec 2022 15:28:01 +0800
Subject: [PATCH 058/503] add chunk select class

---
 chunk_codegen.py | 80 +++++++++++++++++++++++++++++-------------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index f87a3a132e78..cdd0b1077487 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1368,12 +1368,60 @@ def estimate_chunk_inference_mem(
         return act_memory_peak_log, act_memory_after_node_log, active_node_list_log
 
 
+class ChunkSelector(object):
+    def __init__(self, index_tracer: IndexTracer, stratge) -> None:
+        self.index_tracer = index_tracer
+        assert stratge in ['min_memory', 'fit_memory']
+        self.stratge = stratge
+        self.max_memory = 800  # MB
+    
+    def _select_best_chunk_region(self, possible_chunk_regions, chunk_infos):
+        if self.stratge == 'min_memory':
+            best_region = self._select_min_memory_chunk_region(possible_chunk_regions, chunk_infos)
+        elif self.stratge == 'fit_memory':
+            pass
+        else:
+            raise RuntimeError()
+        return best_region
+    
+    def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
+        max_region_range = 0
+        best_region = None
+        while len(possible_chunk_regions) > 0:
+            for i in possible_chunk_regions:
+                if i["region"][1] - i["region"][0] > max_region_range:
+                    best_region = i
+                    max_region_range = i["region"][1] - i["region"][0]
+            if self._is_legal_region(best_region, chunk_infos):
+                break
+            possible_chunk_regions.remove(i)
+            max_region_range = 0
+            best_region = None
+        return best_region
+
+    def _is_legal_region(self, cur_chunk_info, chunk_infos):
+        (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
+        if cur_chunk_info in chunk_infos:
+            return False
+        if chunk_region_end < chunk_region_start:
+            return False
+        for i in chunk_infos:
+            region = i["region"]
+            if not (
+                (chunk_region_start > region[1] and chunk_region_end > region[1])
+                or (chunk_region_start < region[0] and chunk_region_end < region[0])
+            ):
+                return False
+        return True
+
+
 class ChunkRegionSearch(object):
     def __init__(self, gm) -> None:
         self.gm = gm
         self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
+        self.chunk_selector = ChunkSelector(self.index_tracer, stratge="min_memory")
 
     def _find_peak_node(self, mem_peak):
         max_value = max(mem_peak)
@@ -1516,36 +1564,6 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
                     possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
 
-    def _search_best_chunk_region(self, possible_chunk_regions, chunk_infos):
-        max_region_range = 0
-        best_region = None
-        while len(possible_chunk_regions) > 0:
-            for i in possible_chunk_regions:
-                if i["region"][1] - i["region"][0] > max_region_range:
-                    best_region = i
-                    max_region_range = i["region"][1] - i["region"][0]
-            if self._is_legal_region(best_region, chunk_infos):
-                break
-            possible_chunk_regions.remove(i)
-            max_region_range = 0
-            best_region = None
-        return best_region
-
-    def _is_legal_region(self, cur_chunk_info, chunk_infos):
-        (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
-        if cur_chunk_info in chunk_infos:
-            return False
-        if chunk_region_end < chunk_region_start:
-            return False
-        for i in chunk_infos:
-            region = i["region"]
-            if not (
-                (chunk_region_start > region[1] and chunk_region_end > region[1])
-                or (chunk_region_start < region[0] and chunk_region_end < region[0])
-            ):
-                return False
-        return True
-
     def _step_search(self, mem_peak, active_node, chunk_regions):
         peak_node = self._find_peak_node(mem_peak)
         max_chunk_region = self._search_max_chunk_region(
@@ -1556,7 +1574,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
         possible_chunk_regions = self._search_possible_chunk_regions(
             max_chunk_region, peak_node
         )
-        best_chunk_region = self._search_best_chunk_region(
+        best_chunk_region = self.chunk_selector._select_best_chunk_region(
             possible_chunk_regions, chunk_regions
         )
         best_chunk_region = self.index_tracer.reorder_all(best_chunk_region)

From 8f5a0edfab3d9c4636333cba2dcdbb7f2fa74181 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 26 Dec 2022 23:08:49 +0800
Subject: [PATCH 059/503] add chunk select

---
 chunk_codegen.py | 147 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 112 insertions(+), 35 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index cdd0b1077487..330f3dec611c 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -69,7 +69,7 @@ def __init__(self, node_list) -> None:
         self.node_list = node_list
         self.idx_trace_list = self._init_idx_trace_list()
         self.idx_trace_equal = []
-        self.idx_view_list = []
+        self.idx_view_list = {}
         self.idx_count = -1
         self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))}
 
@@ -576,7 +576,7 @@ def _assign_view_reshape_index(self, node, node_idx):
             "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in dim_to],
             "dim_to": dim_to,
         }
-        self.idx_view_list.append(view_dict)
+        self.idx_view_list[node] = view_dict
 
     def _merge_equal_idx(self):
         idx_equal = copy.deepcopy(self.idx_trace_equal)
@@ -702,7 +702,7 @@ def _find_inherit_dim(self, input_node, input_dim, node):
         for node_dim in range(len(_get_node_shape(node))):
             if (
                 input_node_idx in node_trace_source[node_dim]
-                and input_dim in node_trace_source[node_dim][input_node_idx]
+                and input_dim[0] in node_trace_source[node_dim][input_node_idx]
             ):
                 return node_dim
         return None
@@ -875,6 +875,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         remove_inputs = []
         for input_node in inputs:
             input_dict = {}
+            input_node_idx = _find_idx_by_name(input_node.name, self.node_list)
             for user in input_node.users.keys():
                 if _is_non_compute_node(user):
                     continue
@@ -882,7 +883,11 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
                 if start_idx <= user_idx <= end_idx:
                     chunk_dim = all_node_info[user]["chunk_dim"]
                     if chunk_dim is not None:
-                        input_dict[user_idx] = chunk_dim
+                        user_source = self._find_source_trace_from_node(user)[chunk_dim]
+                        if input_node_idx in user_source:
+                            input_dict[user_idx] = user_source[input_node_idx]
+                        else:
+                            return None
             if len(input_dict) == 0:
                 remove_inputs.append(input_node)
             else:
@@ -898,6 +903,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
             "inputs_dim": inputs_dim,
             "outputs": outputs,
             "outputs_dim": end_dim,
+            "node_chunk_dim": all_node_info,
             "args": {},
         }
 
@@ -974,6 +980,26 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
             if i not in chunk_info["inputs"]:
                 chunk_info["inputs_non_chunk"].append(i)
 
+        # reassgin reshape size, some size may have changed due to chunk
+        chunk_info = self._reassgin_reshape_size(chunk_info)
+        
+        return chunk_info
+    
+    def _reassgin_reshape_size(self, chunk_info):
+        chunk_region = chunk_info['region']
+        reshape_size = {}
+        for node in self.node_list[chunk_region[0]: chunk_region[1] + 1]:
+            if any(i in node.name for i in ['reshape', 'view']):
+                reshape_args = node.args[1:]
+                reshape_log = self.idx_view_list[node]
+                chunk_dim = chunk_info['node_chunk_dim'][node]['chunk_dim']
+                reshape_size[node.name] = {}
+                for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
+                    if reshape_arg_dim in reshape_log['dim_to']:
+                        continue
+                    if reshape_arg_dim == chunk_dim:
+                        reshape_size[node.name][reshape_arg.name] = "chunk_size"
+        chunk_info['reshape_size'] = reshape_size     
         return chunk_info
 
     def _get_reorder_map(self, chunk_info):
@@ -1183,23 +1209,15 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
                 not_contiguous_list.append(node)
         return mem
 
-    def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size):
+    def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size):
+        if node not in chunk_node_dim:
+            return 1.0
         node_shape = _get_node_shape(node)
-        node_source = self.index_tracer._find_source_trace_from_node(node)
-        for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim):
-            for k, v in input_node_dim.items():
-                # TODO: inherit dim should be list too, int now
-                inherit_dim = self.index_tracer._find_inherit_dim(
-                    input_node, v, self.index_tracer.node_list[k]
-                )
-                if k == _find_idx_by_name(node.name, self.index_tracer.node_list):
-                    chunk_ratio = float(chunk_size) / node_shape[inherit_dim]
-                    return chunk_ratio
-                for dim, source in enumerate(node_source):
-                    if k in source and inherit_dim in source[k]:
-                        chunk_ratio = float(chunk_size) / node_shape[dim]
-                        return chunk_ratio
-        return 1.0
+        chunk_dim = chunk_node_dim[node]['chunk_dim']
+        if chunk_dim is None:
+            return 1.0
+        else:
+            return float(chunk_size) / node_shape[chunk_dim]
 
     def _get_chunk_delete_node_size(
         self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names
@@ -1242,6 +1260,7 @@ def estimate_chunk_inference_mem(
         self,
         node_list,
         chunk_infos=None,
+        print_mem=False,
     ):
         act_memory = 0.0
         act_memory_peak_log = []
@@ -1271,6 +1290,7 @@ def estimate_chunk_inference_mem(
                 j.name for i in chunk_inputs_non_chunk for j in i
             ]
             chunk_outputs = [i["outputs"][0] for i in chunk_infos]
+            chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos]
 
         for idx, node in enumerate(node_list):
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
@@ -1285,8 +1305,7 @@ def estimate_chunk_inference_mem(
             if chunk_within:
                 chunk_ratio = self._get_chunk_ratio(
                     node,
-                    chunk_inputs[chunk_region_idx],
-                    chunk_inputs_dim[chunk_region_idx],
+                    chunk_node_dim[chunk_region_idx],
                     chunk_size,
                 )
 
@@ -1357,11 +1376,12 @@ def estimate_chunk_inference_mem(
             act_memory_after_node_log.append(act_memory)
             active_node_list_log.append(copy.deepcopy(active_node_list))
 
-        print("with chunk" if use_chunk else "without chunk")
-        # self._print_mem_log(act_memory_peak_log, node_list, "peak")
-        # self._print_mem_log(act_memory_after_node_log, node_list, "after")
-        self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
-        self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after")
+        if print_mem:
+            print("with chunk" if use_chunk else "without chunk")
+            # self._print_mem_log(act_memory_peak_log, node_list, "peak")
+            # self._print_mem_log(act_memory_after_node_log, node_list, "after")
+            self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
+            self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after")
 
         # param_memory = parameter_size(gm)
         # all_memory = act_memory + param_memory
@@ -1369,21 +1389,70 @@ def estimate_chunk_inference_mem(
 
 
 class ChunkSelector(object):
-    def __init__(self, index_tracer: IndexTracer, stratge) -> None:
+    def __init__(self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge):
         self.index_tracer = index_tracer
+        self.memory_estimator = memory_estimator
         assert stratge in ['min_memory', 'fit_memory']
         self.stratge = stratge
-        self.max_memory = 800  # MB
+        self.max_memory = 600  # MB
     
-    def _select_best_chunk_region(self, possible_chunk_regions, chunk_infos):
+    def _select_best_chunk_region(self, possible_chunk_regions, 
+            chunk_infos, peak_node, max_chunk_region, mem_peak):
         if self.stratge == 'min_memory':
             best_region = self._select_min_memory_chunk_region(possible_chunk_regions, chunk_infos)
         elif self.stratge == 'fit_memory':
-            pass
+            best_region = self._select_fit_memory_chunk_region(
+                possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak)
         else:
             raise RuntimeError()
         return best_region
     
+    def _select_fit_memory_chunk_region(self, possible_chunk_regions, 
+            chunk_infos, peak_node, max_chunk_region, mem_peak):
+        # stop chunk if max memory satisfy memory limit
+        if max(mem_peak) < self.max_memory:
+            return None
+        
+        # remove illegal regions
+        illegal_regions = []
+        for i in possible_chunk_regions:
+            if not self._is_legal_region(i, chunk_infos):
+                illegal_regions.append(i)
+        for i in illegal_regions:
+            if i in possible_chunk_regions:
+                possible_chunk_regions.remove(i)
+        
+        # get mem for chunk region
+        regions_dict = []
+        for region in possible_chunk_regions:
+            cur_chunk_infos = chunk_infos + [region]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                self.index_tracer.node_list, cur_chunk_infos)[0]
+            cur_chunk_region_peak = cur_mem_peak[max_chunk_region[0]: max_chunk_region[1] + 1]
+            cur_chunk_region_max_peak = max(cur_chunk_region_peak)
+            if cur_chunk_region_max_peak < self.max_memory:
+                regions_dict.append({
+                    "chunk_info": region,
+                    "chunk_max_mem": cur_chunk_region_max_peak,
+                    "chunk_len": self._get_compute_node_num(region['region'][0], region['region'][1]),
+                })
+        # no region found
+        if len(regions_dict) == 0:
+            return None
+        
+        # select the min chunk len
+        chunk_len = [i["chunk_len"] for i in regions_dict]
+        best_region_idx = chunk_len.index(min(chunk_len))
+        best_region = regions_dict[best_region_idx]["chunk_info"]
+        return best_region
+    
+    def _get_compute_node_num(self, start, end):
+        count = 0
+        for i in self.index_tracer.node_list[start: end+1]:
+            if _is_non_compute_node(i):
+                count += 1
+        return count
+    
     def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
         max_region_range = 0
         best_region = None
@@ -1421,7 +1490,7 @@ def __init__(self, gm) -> None:
         self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
-        self.chunk_selector = ChunkSelector(self.index_tracer, stratge="min_memory")
+        self.chunk_selector = ChunkSelector(self.index_tracer, self.memory_estimator, stratge="fit_memory")
 
     def _find_peak_node(self, mem_peak):
         max_value = max(mem_peak)
@@ -1575,7 +1644,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
             max_chunk_region, peak_node
         )
         best_chunk_region = self.chunk_selector._select_best_chunk_region(
-            possible_chunk_regions, chunk_regions
+            possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak
         )
         best_chunk_region = self.index_tracer.reorder_all(best_chunk_region)
         return best_chunk_region
@@ -1608,7 +1677,7 @@ def search_region(self):
                 _,
                 active_node,
             ) = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, chunk_infos
+                self.index_tracer.node_list, chunk_infos, print_mem=True
             )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
@@ -1736,6 +1805,13 @@ def _replace_name(context, name_from, name_to):
     return context
 
 
+def _replace_reshape_size(context, node_name, reshape_size_dict):
+    if node_name not in reshape_size_dict:
+        return context
+    for size_name, size_value in reshape_size_dict[node_name].items():  
+        context = context.replace(size_name, size_value)
+    return context
+
 def emit_code_with_chunk(
     body,
     ckpt_func,
@@ -1802,11 +1878,12 @@ def emit_code_with_chunk(
                 for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
                     if idx == node_idx:
                         chunk_slice = _gen_chunk_slice_dim(
-                            dim, "chunk_idx", _get_node_shape(input_node)
+                            dim[0], "chunk_idx", _get_node_shape(input_node)
                         )
                         body[-1] = _replace_name(
                             body[-1], input_node.name, input_node.name + chunk_slice
                         )
+            body[-1] = _replace_reshape_size(body[-1], node.name, chunk_search[region_idx]['reshape_size'])
             body[-1] = "    " + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
         else:

From 378a49dc6c259773cdc198841a75137f7c6edc7f Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 27 Dec 2022 09:48:59 +0800
Subject: [PATCH 060/503] code style

---
 chunk_codegen.py | 101 +++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 38 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 330f3dec611c..1255852d777d 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -982,24 +982,24 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
 
         # reassgin reshape size, some size may have changed due to chunk
         chunk_info = self._reassgin_reshape_size(chunk_info)
-        
+
         return chunk_info
-    
+
     def _reassgin_reshape_size(self, chunk_info):
-        chunk_region = chunk_info['region']
+        chunk_region = chunk_info["region"]
         reshape_size = {}
-        for node in self.node_list[chunk_region[0]: chunk_region[1] + 1]:
-            if any(i in node.name for i in ['reshape', 'view']):
+        for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]:
+            if any(i in node.name for i in ["reshape", "view"]):
                 reshape_args = node.args[1:]
                 reshape_log = self.idx_view_list[node]
-                chunk_dim = chunk_info['node_chunk_dim'][node]['chunk_dim']
+                chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
                 reshape_size[node.name] = {}
                 for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
-                    if reshape_arg_dim in reshape_log['dim_to']:
+                    if reshape_arg_dim in reshape_log["dim_to"]:
                         continue
                     if reshape_arg_dim == chunk_dim:
                         reshape_size[node.name][reshape_arg.name] = "chunk_size"
-        chunk_info['reshape_size'] = reshape_size     
+        chunk_info["reshape_size"] = reshape_size
         return chunk_info
 
     def _get_reorder_map(self, chunk_info):
@@ -1213,7 +1213,7 @@ def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size):
         if node not in chunk_node_dim:
             return 1.0
         node_shape = _get_node_shape(node)
-        chunk_dim = chunk_node_dim[node]['chunk_dim']
+        chunk_dim = chunk_node_dim[node]["chunk_dim"]
         if chunk_dim is None:
             return 1.0
         else:
@@ -1381,7 +1381,9 @@ def estimate_chunk_inference_mem(
             # self._print_mem_log(act_memory_peak_log, node_list, "peak")
             # self._print_mem_log(act_memory_after_node_log, node_list, "after")
             self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
-            self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after")
+            self._print_compute_op_mem_log(
+                act_memory_after_node_log, node_list, "after"
+            )
 
         # param_memory = parameter_size(gm)
         # all_memory = act_memory + param_memory
@@ -1389,30 +1391,41 @@ def estimate_chunk_inference_mem(
 
 
 class ChunkSelector(object):
-    def __init__(self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge):
+    def __init__(
+        self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge
+    ):
         self.index_tracer = index_tracer
         self.memory_estimator = memory_estimator
-        assert stratge in ['min_memory', 'fit_memory']
+        assert stratge in ["min_memory", "fit_memory"]
         self.stratge = stratge
         self.max_memory = 600  # MB
-    
-    def _select_best_chunk_region(self, possible_chunk_regions, 
-            chunk_infos, peak_node, max_chunk_region, mem_peak):
-        if self.stratge == 'min_memory':
-            best_region = self._select_min_memory_chunk_region(possible_chunk_regions, chunk_infos)
-        elif self.stratge == 'fit_memory':
+
+    def _select_best_chunk_region(
+        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
+    ):
+        if self.stratge == "min_memory":
+            best_region = self._select_min_memory_chunk_region(
+                possible_chunk_regions, chunk_infos
+            )
+        elif self.stratge == "fit_memory":
             best_region = self._select_fit_memory_chunk_region(
-                possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak)
+                possible_chunk_regions,
+                chunk_infos,
+                peak_node,
+                max_chunk_region,
+                mem_peak,
+            )
         else:
             raise RuntimeError()
         return best_region
-    
-    def _select_fit_memory_chunk_region(self, possible_chunk_regions, 
-            chunk_infos, peak_node, max_chunk_region, mem_peak):
+
+    def _select_fit_memory_chunk_region(
+        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
+    ):
         # stop chunk if max memory satisfy memory limit
         if max(mem_peak) < self.max_memory:
             return None
-        
+
         # remove illegal regions
         illegal_regions = []
         for i in possible_chunk_regions:
@@ -1421,38 +1434,45 @@ def _select_fit_memory_chunk_region(self, possible_chunk_regions,
         for i in illegal_regions:
             if i in possible_chunk_regions:
                 possible_chunk_regions.remove(i)
-        
+
         # get mem for chunk region
         regions_dict = []
         for region in possible_chunk_regions:
             cur_chunk_infos = chunk_infos + [region]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, cur_chunk_infos)[0]
-            cur_chunk_region_peak = cur_mem_peak[max_chunk_region[0]: max_chunk_region[1] + 1]
+                self.index_tracer.node_list, cur_chunk_infos
+            )[0]
+            cur_chunk_region_peak = cur_mem_peak[
+                max_chunk_region[0] : max_chunk_region[1] + 1
+            ]
             cur_chunk_region_max_peak = max(cur_chunk_region_peak)
             if cur_chunk_region_max_peak < self.max_memory:
-                regions_dict.append({
-                    "chunk_info": region,
-                    "chunk_max_mem": cur_chunk_region_max_peak,
-                    "chunk_len": self._get_compute_node_num(region['region'][0], region['region'][1]),
-                })
+                regions_dict.append(
+                    {
+                        "chunk_info": region,
+                        "chunk_max_mem": cur_chunk_region_max_peak,
+                        "chunk_len": self._get_compute_node_num(
+                            region["region"][0], region["region"][1]
+                        ),
+                    }
+                )
         # no region found
         if len(regions_dict) == 0:
             return None
-        
+
         # select the min chunk len
         chunk_len = [i["chunk_len"] for i in regions_dict]
         best_region_idx = chunk_len.index(min(chunk_len))
         best_region = regions_dict[best_region_idx]["chunk_info"]
         return best_region
-    
+
     def _get_compute_node_num(self, start, end):
         count = 0
-        for i in self.index_tracer.node_list[start: end+1]:
+        for i in self.index_tracer.node_list[start : end + 1]:
             if _is_non_compute_node(i):
                 count += 1
         return count
-    
+
     def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
         max_region_range = 0
         best_region = None
@@ -1490,7 +1510,9 @@ def __init__(self, gm) -> None:
         self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
-        self.chunk_selector = ChunkSelector(self.index_tracer, self.memory_estimator, stratge="fit_memory")
+        self.chunk_selector = ChunkSelector(
+            self.index_tracer, self.memory_estimator, stratge="fit_memory"
+        )
 
     def _find_peak_node(self, mem_peak):
         max_value = max(mem_peak)
@@ -1808,10 +1830,11 @@ def _replace_name(context, name_from, name_to):
 def _replace_reshape_size(context, node_name, reshape_size_dict):
     if node_name not in reshape_size_dict:
         return context
-    for size_name, size_value in reshape_size_dict[node_name].items():  
+    for size_name, size_value in reshape_size_dict[node_name].items():
         context = context.replace(size_name, size_value)
     return context
 
+
 def emit_code_with_chunk(
     body,
     ckpt_func,
@@ -1883,7 +1906,9 @@ def emit_code_with_chunk(
                         body[-1] = _replace_name(
                             body[-1], input_node.name, input_node.name + chunk_slice
                         )
-            body[-1] = _replace_reshape_size(body[-1], node.name, chunk_search[region_idx]['reshape_size'])
+            body[-1] = _replace_reshape_size(
+                body[-1], node.name, chunk_search[region_idx]["reshape_size"]
+            )
             body[-1] = "    " + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
         else:

From 6be89a3b82d370be152c93dd7277e234e68eaea6 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 27 Dec 2022 14:48:25 +0800
Subject: [PATCH 061/503] add chunksize in emit, fix bug in reassgin shape

---
 chunk_codegen.py | 56 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 1255852d777d..470768855779 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -988,6 +988,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
     def _reassgin_reshape_size(self, chunk_info):
         chunk_region = chunk_info["region"]
         reshape_size = {}
+        chunk_shape = _get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]]
         for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]:
             if any(i in node.name for i in ["reshape", "view"]):
                 reshape_args = node.args[1:]
@@ -998,7 +999,7 @@ def _reassgin_reshape_size(self, chunk_info):
                     if reshape_arg_dim in reshape_log["dim_to"]:
                         continue
                     if reshape_arg_dim == chunk_dim:
-                        reshape_size[node.name][reshape_arg.name] = "chunk_size"
+                        reshape_size[node.name][reshape_arg.name] = "min(chunk_size, %d - chunk_idx)" % chunk_shape
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
 
@@ -1276,7 +1277,6 @@ def estimate_chunk_inference_mem(
         chunk_within = False
         chunk_region_idx = None
         chunk_ratio = 1  # use it to estimate chunk mem
-        chunk_size = 1
         chunk_inputs_names = []
 
         if use_chunk:
@@ -1285,12 +1285,14 @@ def estimate_chunk_inference_mem(
             chunk_ends = [i[1] for i in chunk_regions]
             chunk_inputs = [i["inputs"] for i in chunk_infos]
             chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
-            chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]
             chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
                 j.name for i in chunk_inputs_non_chunk for j in i
             ]
             chunk_outputs = [i["outputs"][0] for i in chunk_infos]
             chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos]
+            chunk_sizes = [
+                i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos
+            ]
 
         for idx, node in enumerate(node_list):
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
@@ -1306,7 +1308,7 @@ def estimate_chunk_inference_mem(
                 chunk_ratio = self._get_chunk_ratio(
                     node,
                     chunk_node_dim[chunk_region_idx],
-                    chunk_size,
+                    chunk_sizes[chunk_region_idx],
                 )
 
             # if node is placeholder, just add the size of the node
@@ -1464,8 +1466,53 @@ def _select_fit_memory_chunk_region(
         chunk_len = [i["chunk_len"] for i in regions_dict]
         best_region_idx = chunk_len.index(min(chunk_len))
         best_region = regions_dict[best_region_idx]["chunk_info"]
+
+        # get max chunk size
+        best_region = self._get_fit_chunk_size(best_region, chunk_infos)
         return best_region
 
+    def _get_fit_chunk_size(self, chunk_info, chunk_infos):  # sets chunk_info["chunk_size"] in place and returns chunk_info
+        chunk_size = 1  # doubled before the first estimate, so size 1 itself is not measured here
+        chunk_info["chunk_size"] = chunk_size
+        cur_chunk_max_mem = 0
+        # double chunk_size until the estimated peak inside the region reaches max_memory
+        while cur_chunk_max_mem < self.max_memory:  # NOTE(review): no upper bound on chunk_size; relies on the estimate eventually reaching the limit — confirm
+            chunk_size *= 2
+            chunk_info["chunk_size"] = chunk_size
+            cur_chunk_infos = chunk_infos + [chunk_info]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                self.index_tracer.node_list, cur_chunk_infos
+            )[0]
+            cur_chunk_max_mem = max(
+                cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
+            )
+        # refine between the previous size (chunk_size // 2) and the first size that reached the limit
+        chunk_info["chunk_size"] = self._chunk_size_binary_search(
+            chunk_size // 2, chunk_size, chunk_info, chunk_infos
+        )
+        return chunk_info
+
+    def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos):  # searches between l and r; overwrites chunk_info["chunk_size"] while probing
+        if l >= 16:  # coarser step once the lower bound is at least 16
+            gap = 4
+        else:
+            gap = 1
+        while r >= l + gap:
+            mid = int(l + (r - l)/2)
+            chunk_info["chunk_size"] = mid
+            cur_chunk_infos = chunk_infos + [chunk_info]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                self.index_tracer.node_list, cur_chunk_infos
+            )[0]
+            cur_chunk_max_mem = max(
+                cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
+            )
+            if cur_chunk_max_mem >= self.max_memory:  # estimated region peak reached the limit: move the upper bound down
+                r = mid - gap
+            else:
+                l = mid + gap
+        return l  # NOTE(review): l can be mid + gap, a size that was never estimated above — confirm it stays under max_memory
+
     def _get_compute_node_num(self, start, end):
         count = 0
         for i in self.index_tracer.node_list[start : end + 1]:
@@ -1891,6 +1938,7 @@ def emit_code_with_chunk(
                     chunk_inputs[region_idx],
                     chunk_outputs[region_idx],
                     chunk_outputs_dim[region_idx],
+                    chunk_size=chunk_search[region_idx]["chunk_size"]
                 )
             )
 

From a2b4755ce96e2e8dea100bafd7790e22426aa548 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 27 Dec 2022 14:49:52 +0800
Subject: [PATCH 062/503] code style

---
 chunk_codegen.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 470768855779..3cd10350eaba 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -988,7 +988,9 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
     def _reassgin_reshape_size(self, chunk_info):
         chunk_region = chunk_info["region"]
         reshape_size = {}
-        chunk_shape = _get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]]
+        chunk_shape = _get_node_shape(chunk_info["outputs"][0])[
+            chunk_info["outputs_dim"]
+        ]
         for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]:
             if any(i in node.name for i in ["reshape", "view"]):
                 reshape_args = node.args[1:]
@@ -999,7 +1001,9 @@ def _reassgin_reshape_size(self, chunk_info):
                     if reshape_arg_dim in reshape_log["dim_to"]:
                         continue
                     if reshape_arg_dim == chunk_dim:
-                        reshape_size[node.name][reshape_arg.name] = "min(chunk_size, %d - chunk_idx)" % chunk_shape
+                        reshape_size[node.name][reshape_arg.name] = (
+                            "min(chunk_size, %d - chunk_idx)" % chunk_shape
+                        )
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
 
@@ -1498,7 +1502,7 @@ def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos):
         else:
             gap = 1
         while r >= l + gap:
-            mid = int(l + (r - l)/2)
+            mid = int(l + (r - l) / 2)
             chunk_info["chunk_size"] = mid
             cur_chunk_infos = chunk_infos + [chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
@@ -1938,7 +1942,7 @@ def emit_code_with_chunk(
                     chunk_inputs[region_idx],
                     chunk_outputs[region_idx],
                     chunk_outputs_dim[region_idx],
-                    chunk_size=chunk_search[region_idx]["chunk_size"]
+                    chunk_search[region_idx]["chunk_size"],
                 )
             )
 

From cb2dd1a10614c21ca78e1c0cea2f6f7aa882e712 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 27 Dec 2022 15:01:58 +0800
Subject: [PATCH 063/503] turn off print mem

---
 chunk_codegen.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 3cd10350eaba..6caed88d84d2 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1750,10 +1750,13 @@ def search_region(self):
                 _,
                 active_node,
             ) = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, chunk_infos, print_mem=True
+                self.index_tracer.node_list, chunk_infos
             )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
+        # self.memory_estimator.estimate_chunk_inference_mem(
+        #     self.index_tracer.node_list, chunk_infos, print_mem=True
+        # )
         return chunk_infos
 
 
From 69af93107f09db3fb90116144296ebc20adc7b52 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 11:28:25 +0800
Subject: [PATCH 064/503] add evoformer openfold init

---
 evoformer_openfold/evoformer.py   |  59 +++++++++
 evoformer_openfold/initializer.py |  29 +++++
 evoformer_openfold/kernel.py      |  19 +++
 evoformer_openfold/msa.py         |  95 +++++++++++++++
 evoformer_openfold/ops.py         | 176 +++++++++++++++++++++++++++
 evoformer_openfold/triangle.py    | 192 ++++++++++++++++++++++++++++++
 6 files changed, 570 insertions(+)
 create mode 100644 evoformer_openfold/evoformer.py
 create mode 100755 evoformer_openfold/initializer.py
 create mode 100644 evoformer_openfold/kernel.py
 create mode 100644 evoformer_openfold/msa.py
 create mode 100755 evoformer_openfold/ops.py
 create mode 100644 evoformer_openfold/triangle.py

diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py
new file mode 100644
index 000000000000..cfd2bb2a2529
--- /dev/null
+++ b/evoformer_openfold/evoformer.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+
+from .msa import MSAStack
+from .ops import OutProductMean
+from .triangle import PairStack
+
+
+def print_memory(init_mem, text=None):
+    now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem
+    max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem
+    print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem))
+    torch.cuda.reset_peak_memory_stats()
+
+
+class EvoformerBlock(nn.Module):
+
+    def __init__(self, d_node, d_pair):
+        super(EvoformerBlock, self).__init__()
+
+        self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15)
+        self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32)
+        self.pair_stack = PairStack(d_pair=d_pair)
+
+    def forward(self, node, pair):
+        node = self.msa_stack(node, pair)
+        pair = pair + self.communication(node)
+        pair = self.pair_stack(pair)
+        return node, pair
+
+
+class Evoformer(nn.Module):
+
+    def __init__(self, d_node, d_pair):
+        super(Evoformer, self).__init__()
+
+        self.blocks = nn.ModuleList()
+        for _ in range(1):
+            self.blocks.append(EvoformerBlock(d_node, d_pair))
+
+    def forward(self, node, pair):
+        for b in self.blocks:
+            node, pair = b(node, pair)
+        return node, pair
+
+
+def evoformer_tiny():
+    return Evoformer(d_node=64, d_pair=32)
+
+
+def evoformer_base():
+    return Evoformer(d_node=256, d_pair=128)
+
+
+def evoformer_large():
+    return Evoformer(d_node=512, d_pair=256)
+
+
+__all__ = ['Evoformer', 'evoformer_tiny', 'evoformer_base', 'evoformer_large']  # every factory defined above is exported
diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py
new file mode 100755
index 000000000000..c6ce0659e597
--- /dev/null
+++ b/evoformer_openfold/initializer.py
@@ -0,0 +1,29 @@
+import math
+
+import numpy as np
+import torch.nn as nn
+
+
+def glorot_uniform_af(x, gain=1.0):
+    """
+    initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different:
+    In PyTorch:
+    [feature_out, feature_in, n_head ...]
+    In Jax:
+    [... n_head, feature_in, feature_out]
+    However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like:
+    [feature_in, n_head, feature_out]
+
+    In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors
+    """
+    fan_in, fan_out = x.shape[-2:]
+    if len(x.shape) > 2:
+        receptive_field_size = np.prod(x.shape[:-2])
+        fan_in *= receptive_field_size
+        fan_out *= receptive_field_size
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    dev = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
+
+    nn.init.uniform_(x, -dev, dev)
+
+    return x
diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py
new file mode 100644
index 000000000000..26ab5dc53261
--- /dev/null
+++ b/evoformer_openfold/kernel.py
@@ -0,0 +1,19 @@
+import torch
+import torch.nn.functional as F
+
+
+def bias_sigmod_ele(y, bias, z):
+    return torch.sigmoid(y + bias) * z
+
+
+def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor,
+                     residual: torch.Tensor, prob: float) -> torch.Tensor:
+    out = (x + bias) * F.dropout(dropmask, p=prob, training=False)  # training=False: F.dropout returns dropmask unchanged, so prob has no effect
+    out = residual + out
+    return out
+
+
+def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor,
+                              dropout_mask: torch.Tensor, Z_raw: torch.Tensor,
+                              prob: float) -> torch.Tensor:
+    return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b))  # NOTE(review): training=True applies dropout on every call, unlike bias_dropout_add (training=False) — confirm intended
\ No newline at end of file
diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py
new file mode 100644
index 000000000000..cac456638a55
--- /dev/null
+++ b/evoformer_openfold/msa.py
@@ -0,0 +1,95 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.nn import LayerNorm
+
+from .kernel import bias_dropout_add
+from .ops import SelfAttention, Transition
+
+
+class MSARowAttentionWithPairBias(nn.Module):
+
+    def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15):
+        super(MSARowAttentionWithPairBias, self).__init__()
+        self.d_node = d_node
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+
+        self.layernormM = LayerNorm(d_node)
+        self.layernormZ = LayerNorm(d_pair)
+
+        _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True)
+
+        self.attention = SelfAttention(qkv_dim=d_node,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_node,
+                                       gating=True,
+                                       last_bias_fuse=True)
+
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True)
+
+    def forward(self, M_raw, Z):
+        ## Input projections
+        M = self.layernormM(M_raw)
+        Z = self.layernormZ(Z)
+        b = F.linear(Z, self.linear_b_weights)
+        b = b.permute(0, 3, 1, 2)
+        # b = rearrange(b, 'b q k h -> b h q k')
+
+        M = self.attention(M, b)
+        dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype)
+
+        return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop)
+
+
+class MSAColumnAttention(nn.Module):
+
+    def __init__(self, d_node, c=32, n_head=8):
+        super(MSAColumnAttention, self).__init__()
+        self.d_node = d_node
+        self.c = c
+        self.n_head = n_head
+
+        self.layernormM = LayerNorm(d_node)
+        self.attention = SelfAttention(qkv_dim=d_node,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_node,
+                                       gating=True)
+
+    def forward(self, M_raw):
+        M = M_raw.transpose(-2, -3)
+        M = self.layernormM(M)
+
+        M = self.attention(M)
+
+        M = M.transpose(-2, -3)
+        return M_raw + M
+
+
+class MSAStack(nn.Module):
+
+    def __init__(self, d_node, d_pair, p_drop=0.15):
+        super(MSAStack, self).__init__()
+
+        self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node,
+                                                                       d_pair=d_pair,
+                                                                       p_drop=p_drop)
+
+        self.MSAColumnAttention = MSAColumnAttention(d_node=d_node)
+        self.MSATransition = Transition(d=d_node)
+
+    def forward(self, node, pair):
+        node = self.MSARowAttentionWithPairBias(node, pair)
+        node = self.MSAColumnAttention(node)
+        node = self.MSATransition(node)
+
+        return node
diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py
new file mode 100755
index 000000000000..611b7b0fe777
--- /dev/null
+++ b/evoformer_openfold/ops.py
@@ -0,0 +1,176 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.nn import LayerNorm
+
+from .initializer import glorot_uniform_af
+from .kernel import bias_sigmod_ele
+
+
+class DropoutRowwise(nn.Module):
+
+    def __init__(self, p):
+        super(DropoutRowwise, self).__init__()
+        self.p = p
+        self.dropout = nn.Dropout(p=p)
+
+    def forward(self, x):
+        dropout_mask = torch.ones_like(x[:, 0:1, :, :])
+        dropout_mask = self.dropout(dropout_mask)
+        return dropout_mask * x
+
+
+class DropoutColumnwise(nn.Module):
+
+    def __init__(self, p):
+        super(DropoutColumnwise, self).__init__()
+        self.p = p
+        self.dropout = nn.Dropout(p=p)
+
+    def forward(self, x):
+        dropout_mask = torch.ones_like(x[:, :, 0:1, :])
+        dropout_mask = self.dropout(dropout_mask)
+        return dropout_mask * x
+
+
+class Transition(nn.Module):
+
+    def __init__(self, d, n=4):
+        super(Transition, self).__init__()
+        self.norm = LayerNorm(d)
+        self.linear1 = Linear(d, n * d, initializer='relu')
+        self.linear2 = Linear(n * d, d, initializer='zeros')
+
+    def forward(self, src):
+        x = self.norm(src)
+        x = self.linear2(F.relu(self.linear1(x)))
+        return src + x
+
+
+class OutProductMean(nn.Module):
+
+    def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32):
+        super(OutProductMean, self).__init__()
+
+        self.layernormM = LayerNorm(n_feat)
+        self.linear_a = Linear(n_feat, n_feat_proj)
+        self.linear_b = Linear(n_feat, n_feat_proj)
+
+        self.o_linear = Linear(n_feat_proj * n_feat_proj,
+                               n_feat_out,
+                               initializer='zero',
+                               use_bias=True)
+
+    def forward(self, M):
+        M = self.layernormM(M)
+        left_act = self.linear_a(M)
+        right_act = self.linear_b(M)
+
+        O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
+        # O = rearrange(O, 'b i j d e -> b i j (d e)')
+        O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1)
+        Z = self.o_linear(O)
+
+        return Z
+
+
+class Linear(nn.Linear):
+    """
+    A Linear layer with built-in nonstandard initializations. Called just
+    like torch.nn.Linear. Implements the initializers in 1.11.4, plus some
+    additional ones found in the code. `initializer` is 'linear', 'relu' or
+    'zeros' (alias 'zero'); any other value keeps nn.Linear's default weights.
+    """
+
+    def __init__(
+        self,
+        feature_in: int,
+        feature_out: int,
+        initializer: str = 'linear',
+        use_bias: bool = True,
+        bias_init: float = 0.,
+    ):
+        super(Linear, self).__init__(feature_in, feature_out, bias=use_bias)
+
+        self.use_bias = use_bias
+        if initializer == 'linear':
+            glorot_uniform_af(self.weight, gain=1.0)
+        elif initializer == 'relu':
+            glorot_uniform_af(self.weight, gain=2.0)
+        elif initializer in ('zero', 'zeros'):  # callers in this file pass 'zero'; it used to fall through silently
+            nn.init.zeros_(self.weight)
+        if self.use_bias:
+            with torch.no_grad():
+                self.bias.fill_(bias_init)
+
+
+class SelfAttention(nn.Module):
+    """
+    Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors
+    """
+
+    def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False):
+        super(SelfAttention, self).__init__()
+        self.qkv_dim = qkv_dim
+        self.c = c
+        self.n_head = n_head
+        self.out_dim = out_dim
+        self.gating = gating
+        self.last_bias_fuse = last_bias_fuse
+
+        self.scaling = self.c**(-0.5)
+
+        # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear')
+        self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+        self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+        self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+
+        if gating:
+            self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,)))
+            self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False)
+
+        self.o_linear = Linear(n_head * c,
+                               out_dim,
+                               initializer='zero',
+                               use_bias=(not last_bias_fuse))
+
+    def forward(self, in_data, nonbatched_bias=None):
+        """
+        :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim]
+        :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv]
+        :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv]
+        """
+
+        # qkv = self.to_qkv(in_data).chunk(3, dim=-1)
+        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv)
+
+        q = self.to_q(in_data)
+        k = self.to_k(in_data)
+        v = self.to_v(in_data)
+
+        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head),
+        #               [q, k, v])
+        q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4),
+                      [q, k, v])
+        
+        q = q * self.scaling
+
+        logits = torch.matmul(q, k.transpose(-1, -2))
+
+        if nonbatched_bias is not None:
+            logits += nonbatched_bias.unsqueeze(1)
+        weights = torch.softmax(logits, dim=-1)
+        # weights = softmax(logits)
+
+        weighted_avg = torch.matmul(weights, v)
+        # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)')
+        weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4)
+        weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1)
+
+        if self.gating:
+            gate_values = self.gating_linear(in_data)
+            weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg)
+
+        output = self.o_linear(weighted_avg)
+        return output
diff --git a/evoformer_openfold/triangle.py b/evoformer_openfold/triangle.py
new file mode 100644
index 000000000000..f479469c3836
--- /dev/null
+++ b/evoformer_openfold/triangle.py
@@ -0,0 +1,192 @@
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn import LayerNorm
+
+from .kernel import bias_dropout_add, bias_ele_dropout_residual
+from .ops import Linear, SelfAttention, Transition
+
+
+def permute_final_dims(tensor, inds):
+    zero_index = -1 * len(inds)
+    first_inds = list(range(len(tensor.shape[:zero_index])))
+    return tensor.permute(first_inds + [zero_index + i for i in inds])
+
+
+class TriangleMultiplicationOutgoing(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=128):
+        super(TriangleMultiplicationOutgoing, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+
+        self.layernorm1 = LayerNorm(d_pair)
+        self.left_projection = Linear(d_pair, c)
+        self.right_projection = Linear(d_pair, c)
+        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+
+        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
+        self.layernorm2 = LayerNorm(c)
+        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
+        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+        self.p_drop = p_drop
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        left_proj_act = self.left_projection(Z)
+        right_proj_act = self.right_projection(Z)
+
+        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
+        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
+
+        g = torch.sigmoid(self.output_gate(Z))
+        # p = torch.matmul(
+        #     permute_final_dims(left_proj_act, (2, 0, 1)),
+        #     permute_final_dims(right_proj_act, (2, 1, 0)),
+        # )
+        # ab = permute_final_dims(p, (1, 2, 0))
+
+        ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act)
+        ab = self.output_projection(self.layernorm2(ab))
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
+        return bias_ele_dropout_residual(ab,
+                                         self.output_bias,
+                                         g,
+                                         dropout_mask,
+                                         Z_raw,
+                                         prob=self.p_drop)
+
+
+class TriangleMultiplicationIncoming(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=128):
+        super(TriangleMultiplicationIncoming, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+
+        self.layernorm1 = LayerNorm(d_pair)
+        self.left_projection = Linear(d_pair, c)
+        self.right_projection = Linear(d_pair, c)
+        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+
+        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
+        self.layernorm2 = LayerNorm(c)
+        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
+        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+        self.p_drop = p_drop
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        left_proj_act = self.left_projection(Z)
+        right_proj_act = self.right_projection(Z)
+
+        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
+        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
+
+        g = torch.sigmoid(self.output_gate(Z))
+        # p = torch.matmul(
+        #     permute_final_dims(left_proj_act, (2, 1, 0)),
+        #     permute_final_dims(right_proj_act, (2, 0, 1)),
+        # )
+        # ab = permute_final_dims(p, (1, 2, 0))
+
+        ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act)
+        ab = self.output_projection(self.layernorm2(ab))
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
+        return bias_ele_dropout_residual(ab,
+                                         self.output_bias,
+                                         g,
+                                         dropout_mask,
+                                         Z_raw,
+                                         prob=self.p_drop)
+
+
+class TriangleAttentionStartingNode(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=32, n_head=4):
+        super(TriangleAttentionStartingNode, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+
+        self.layernorm1 = LayerNorm(d_pair)
+        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
+        self.attention = SelfAttention(qkv_dim=d_pair,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_pair,
+                                       gating=True,
+                                       last_bias_fuse=True)
+
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
+
+        Z = self.attention(Z, b)
+
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
+        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
+
+
+class TriangleAttentionEndingNode(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=32, n_head=4):
+        super(TriangleAttentionEndingNode, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+
+        self.layernorm1 = LayerNorm(d_pair)
+        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
+        self.attention = SelfAttention(qkv_dim=d_pair,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_pair,
+                                       gating=True,
+                                       last_bias_fuse=True)
+
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+    def forward(self, Z_raw):
+        Z = Z_raw.transpose(-2, -3)
+        Z = self.layernorm1(Z)
+        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
+
+        Z = self.attention(Z, b)
+
+        Z = Z.transpose(-2, -3)
+        dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype)
+        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
+
+
+class PairStack(nn.Module):
+
+    def __init__(self, d_pair, p_drop=0.25):
+        super(PairStack, self).__init__()
+
+        self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop)
+        self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop)
+        self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop)
+        self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop)
+        self.PairTransition = Transition(d=d_pair)
+
+    def forward(self, pair):
+        pair = self.TriangleMultiplicationOutgoing(pair)
+        pair = self.TriangleMultiplicationIncoming(pair)
+        pair = self.TriangleAttentionStartingNode(pair)
+        pair = self.TriangleAttentionEndingNode(pair)
+        pair = self.PairTransition(pair)
+        return pair

From fff493c2021a55754d574cc1457cb4c695e30354 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 11:48:11 +0800
Subject: [PATCH 065/503] init openfold

---
 evoformer_openfold/evoformer.py              |  59 --
 evoformer_openfold/initializer.py            |  29 -
 evoformer_openfold/kernel.py                 |  19 -
 evoformer_openfold/msa.py                    |  95 ---
 evoformer_openfold/ops.py                    | 176 -----
 evoformer_openfold/triangle.py               | 192 ------
 openfold/checkpointing.py                    |  84 +++
 openfold/dropout.py                          |  78 +++
 openfold/evoformer.py                        | 636 +++++++++++++++++++
 openfold/msa.py                              | 392 ++++++++++++
 openfold/outer_product_mean.py               | 129 ++++
 openfold/pair_transition.py                  |  99 +++
 openfold/primitives.py                       | 529 +++++++++++++++
 openfold/tensor_utils.py                     | 408 ++++++++++++
 openfold/triangular_attention.py             | 139 ++++
 openfold/triangular_multiplicative_update.py | 127 ++++
 16 files changed, 2621 insertions(+), 570 deletions(-)
 delete mode 100644 evoformer_openfold/evoformer.py
 delete mode 100755 evoformer_openfold/initializer.py
 delete mode 100644 evoformer_openfold/kernel.py
 delete mode 100644 evoformer_openfold/msa.py
 delete mode 100755 evoformer_openfold/ops.py
 delete mode 100644 evoformer_openfold/triangle.py
 create mode 100644 openfold/checkpointing.py
 create mode 100644 openfold/dropout.py
 create mode 100644 openfold/evoformer.py
 create mode 100644 openfold/msa.py
 create mode 100644 openfold/outer_product_mean.py
 create mode 100644 openfold/pair_transition.py
 create mode 100644 openfold/primitives.py
 create mode 100644 openfold/tensor_utils.py
 create mode 100644 openfold/triangular_attention.py
 create mode 100644 openfold/triangular_multiplicative_update.py

diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py
deleted file mode 100644
index cfd2bb2a2529..000000000000
--- a/evoformer_openfold/evoformer.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .msa import MSAStack
-from .ops import OutProductMean
-from .triangle import PairStack
-
-
-def print_memory(init_mem, text=None):
-    now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem
-    max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem
-    print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem))
-    torch.cuda.reset_peak_memory_stats()
-
-
-class EvoformerBlock(nn.Module):
-
-    def __init__(self, d_node, d_pair):
-        super(EvoformerBlock, self).__init__()
-
-        self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15)
-        self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32)
-        self.pair_stack = PairStack(d_pair=d_pair)
-
-    def forward(self, node, pair):
-        node = self.msa_stack(node, pair)
-        pair = pair + self.communication(node)
-        pair = self.pair_stack(pair)
-        return node, pair
-
-
-class Evoformer(nn.Module):
-
-    def __init__(self, d_node, d_pair):
-        super(Evoformer, self).__init__()
-
-        self.blocks = nn.ModuleList()
-        for _ in range(1):
-            self.blocks.append(EvoformerBlock(d_node, d_pair))
-
-    def forward(self, node, pair):
-        for b in self.blocks:
-            node, pair = b(node, pair)
-        return node, pair
-
-
-def evoformer_tiny():
-    return Evoformer(d_node=64, d_pair=32)
-
-
-def evoformer_base():
-    return Evoformer(d_node=256, d_pair=128)
-
-
-def evoformer_large():
-    return Evoformer(d_node=512, d_pair=256)
-
-
-__all__ = ['Evoformer', 'evoformer_base', 'evoformer_large']
diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py
deleted file mode 100755
index c6ce0659e597..000000000000
--- a/evoformer_openfold/initializer.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import math
-
-import numpy as np
-import torch.nn as nn
-
-
-def glorot_uniform_af(x, gain=1.0):
-    """
-    initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different:
-    In PyTorch:
-    [feature_out, feature_in, n_head ...]
-    In Jax:
-    [... n_head, feature_in, feature_out]
-    However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like:
-    [feature_in, n_head, feature_out]
-
-    In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors
-    """
-    fan_in, fan_out = x.shape[-2:]
-    if len(x.shape) > 2:
-        receptive_field_size = np.prod(x.shape[:-2])
-        fan_in *= receptive_field_size
-        fan_out *= receptive_field_size
-    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
-    dev = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
-
-    nn.init.uniform_(x, -dev, dev)
-
-    return x
diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py
deleted file mode 100644
index 26ab5dc53261..000000000000
--- a/evoformer_openfold/kernel.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import torch
-import torch.nn.functional as F
-
-
-def bias_sigmod_ele(y, bias, z):
-    return torch.sigmoid(y + bias) * z
-
-
-def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor,
-                     residual: torch.Tensor, prob: float) -> torch.Tensor:
-    out = (x + bias) * F.dropout(dropmask, p=prob, training=False)
-    out = residual + out
-    return out
-
-
-def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor,
-                              dropout_mask: torch.Tensor, Z_raw: torch.Tensor,
-                              prob: float) -> torch.Tensor:
-    return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b))
\ No newline at end of file
diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py
deleted file mode 100644
index cac456638a55..000000000000
--- a/evoformer_openfold/msa.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from torch.nn import LayerNorm
-
-from .kernel import bias_dropout_add
-from .ops import SelfAttention, Transition
-
-
-class MSARowAttentionWithPairBias(nn.Module):
-
-    def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15):
-        super(MSARowAttentionWithPairBias, self).__init__()
-        self.d_node = d_node
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernormM = LayerNorm(d_node)
-        self.layernormZ = LayerNorm(d_pair)
-
-        _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True)
-
-        self.attention = SelfAttention(qkv_dim=d_node,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_node,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True)
-
-    def forward(self, M_raw, Z):
-        ## Input projections
-        M = self.layernormM(M_raw)
-        Z = self.layernormZ(Z)
-        b = F.linear(Z, self.linear_b_weights)
-        b = b.permute(0, 3, 1, 2)
-        # b = rearrange(b, 'b q k h -> b h q k')
-
-        M = self.attention(M, b)
-        dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype)
-
-        return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop)
-
-
-class MSAColumnAttention(nn.Module):
-
-    def __init__(self, d_node, c=32, n_head=8):
-        super(MSAColumnAttention, self).__init__()
-        self.d_node = d_node
-        self.c = c
-        self.n_head = n_head
-
-        self.layernormM = LayerNorm(d_node)
-        self.attention = SelfAttention(qkv_dim=d_node,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_node,
-                                       gating=True)
-
-    def forward(self, M_raw):
-        M = M_raw.transpose(-2, -3)
-        M = self.layernormM(M)
-
-        M = self.attention(M)
-
-        M = M.transpose(-2, -3)
-        return M_raw + M
-
-
-class MSAStack(nn.Module):
-
-    def __init__(self, d_node, d_pair, p_drop=0.15):
-        super(MSAStack, self).__init__()
-
-        self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node,
-                                                                       d_pair=d_pair,
-                                                                       p_drop=p_drop)
-
-        self.MSAColumnAttention = MSAColumnAttention(d_node=d_node)
-        self.MSATransition = Transition(d=d_node)
-
-    def forward(self, node, pair):
-        node = self.MSARowAttentionWithPairBias(node, pair)
-        node = self.MSAColumnAttention(node)
-        node = self.MSATransition(node)
-
-        return node
diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py
deleted file mode 100755
index 611b7b0fe777..000000000000
--- a/evoformer_openfold/ops.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from torch.nn import LayerNorm
-
-from .initializer import glorot_uniform_af
-from .kernel import bias_sigmod_ele
-
-
-class DropoutRowwise(nn.Module):
-
-    def __init__(self, p):
-        super(DropoutRowwise, self).__init__()
-        self.p = p
-        self.dropout = nn.Dropout(p=p)
-
-    def forward(self, x):
-        dropout_mask = torch.ones_like(x[:, 0:1, :, :])
-        dropout_mask = self.dropout(dropout_mask)
-        return dropout_mask * x
-
-
-class DropoutColumnwise(nn.Module):
-
-    def __init__(self, p):
-        super(DropoutColumnwise, self).__init__()
-        self.p = p
-        self.dropout = nn.Dropout(p=p)
-
-    def forward(self, x):
-        dropout_mask = torch.ones_like(x[:, :, 0:1, :])
-        dropout_mask = self.dropout(dropout_mask)
-        return dropout_mask * x
-
-
-class Transition(nn.Module):
-
-    def __init__(self, d, n=4):
-        super(Transition, self).__init__()
-        self.norm = LayerNorm(d)
-        self.linear1 = Linear(d, n * d, initializer='relu')
-        self.linear2 = Linear(n * d, d, initializer='zeros')
-
-    def forward(self, src):
-        x = self.norm(src)
-        x = self.linear2(F.relu(self.linear1(x)))
-        return src + x
-
-
-class OutProductMean(nn.Module):
-
-    def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32):
-        super(OutProductMean, self).__init__()
-
-        self.layernormM = LayerNorm(n_feat)
-        self.linear_a = Linear(n_feat, n_feat_proj)
-        self.linear_b = Linear(n_feat, n_feat_proj)
-
-        self.o_linear = Linear(n_feat_proj * n_feat_proj,
-                               n_feat_out,
-                               initializer='zero',
-                               use_bias=True)
-
-    def forward(self, M):
-        M = self.layernormM(M)
-        left_act = self.linear_a(M)
-        right_act = self.linear_b(M)
-
-        O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
-        # O = rearrange(O, 'b i j d e -> b i j (d e)')
-        O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1)
-        Z = self.o_linear(O)
-
-        return Z
-
-
-class Linear(nn.Linear):
-    """
-    A Linear layer with built-in nonstandard initializations. Called just
-    like torch.nn.Linear.
-    Implements the initializers in 1.11.4, plus some additional ones found
-    in the code.
-    """
-
-    def __init__(
-        self,
-        feature_in: int,
-        feature_out: int,
-        initializer: str = 'linear',
-        use_bias: bool = True,
-        bias_init: float = 0.,
-    ):
-        super(Linear, self).__init__(feature_in, feature_out, bias=use_bias)
-
-        self.use_bias = use_bias
-        if initializer == 'linear':
-            glorot_uniform_af(self.weight, gain=1.0)
-        elif initializer == 'relu':
-            glorot_uniform_af(self.weight, gain=2.0)
-        elif initializer == 'zeros':
-            nn.init.zeros_(self.weight)
-        if self.use_bias:
-            with torch.no_grad():
-                self.bias.fill_(bias_init)
-
-
-class SelfAttention(nn.Module):
-    """
-    Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors
-    """
-
-    def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False):
-        super(SelfAttention, self).__init__()
-        self.qkv_dim = qkv_dim
-        self.c = c
-        self.n_head = n_head
-        self.out_dim = out_dim
-        self.gating = gating
-        self.last_bias_fuse = last_bias_fuse
-
-        self.scaling = self.c**(-0.5)
-
-        # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear')
-        self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-        self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-        self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-
-        if gating:
-            self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,)))
-            self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False)
-
-        self.o_linear = Linear(n_head * c,
-                               out_dim,
-                               initializer='zero',
-                               use_bias=(not last_bias_fuse))
-
-    def forward(self, in_data, nonbatched_bias=None):
-        """
-        :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim]
-        :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv]
-        :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv]
-        """
-
-        # qkv = self.to_qkv(in_data).chunk(3, dim=-1)
-        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv)
-
-        q = self.to_q(in_data)
-        k = self.to_k(in_data)
-        v = self.to_v(in_data)
-
-        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head),
-        #               [q, k, v])
-        q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4),
-                      [q, k, v])
-        
-        q = q * self.scaling
-
-        logits = torch.matmul(q, k.transpose(-1, -2))
-
-        if nonbatched_bias is not None:
-            logits += nonbatched_bias.unsqueeze(1)
-        weights = torch.softmax(logits, dim=-1)
-        # weights = softmax(logits)
-
-        weighted_avg = torch.matmul(weights, v)
-        # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)')
-        weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4)
-        weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1)
-
-        if self.gating:
-            gate_values = self.gating_linear(in_data)
-            weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg)
-
-        output = self.o_linear(weighted_avg)
-        return output
diff --git a/evoformer_openfold/triangle.py b/evoformer_openfold/triangle.py
deleted file mode 100644
index f479469c3836..000000000000
--- a/evoformer_openfold/triangle.py
+++ /dev/null
@@ -1,192 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-from torch.nn import LayerNorm
-
-from .kernel import bias_dropout_add, bias_ele_dropout_residual
-from .ops import Linear, SelfAttention, Transition
-
-
-def permute_final_dims(tensor, inds):
-    zero_index = -1 * len(inds)
-    first_inds = list(range(len(tensor.shape[:zero_index])))
-    return tensor.permute(first_inds + [zero_index + i for i in inds])
-
-
-class TriangleMultiplicationOutgoing(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=128):
-        super(TriangleMultiplicationOutgoing, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-
-        self.layernorm1 = LayerNorm(d_pair)
-        self.left_projection = Linear(d_pair, c)
-        self.right_projection = Linear(d_pair, c)
-        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-
-        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
-        self.layernorm2 = LayerNorm(c)
-        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
-        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-        self.p_drop = p_drop
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        left_proj_act = self.left_projection(Z)
-        right_proj_act = self.right_projection(Z)
-
-        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
-        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
-
-        g = torch.sigmoid(self.output_gate(Z))
-        # p = torch.matmul(
-        #     permute_final_dims(left_proj_act, (2, 0, 1)),
-        #     permute_final_dims(right_proj_act, (2, 1, 0)),
-        # )
-        # ab = permute_final_dims(p, (1, 2, 0))
-
-        ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act)
-        ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_ele_dropout_residual(ab,
-                                         self.output_bias,
-                                         g,
-                                         dropout_mask,
-                                         Z_raw,
-                                         prob=self.p_drop)
-
-
-class TriangleMultiplicationIncoming(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=128):
-        super(TriangleMultiplicationIncoming, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-
-        self.layernorm1 = LayerNorm(d_pair)
-        self.left_projection = Linear(d_pair, c)
-        self.right_projection = Linear(d_pair, c)
-        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-
-        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
-        self.layernorm2 = LayerNorm(c)
-        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
-        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-        self.p_drop = p_drop
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        left_proj_act = self.left_projection(Z)
-        right_proj_act = self.right_projection(Z)
-
-        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
-        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
-
-        g = torch.sigmoid(self.output_gate(Z))
-        # p = torch.matmul(
-        #     permute_final_dims(left_proj_act, (2, 1, 0)),
-        #     permute_final_dims(right_proj_act, (2, 0, 1)),
-        # )
-        # ab = permute_final_dims(p, (1, 2, 0))
-
-        ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act)
-        ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_ele_dropout_residual(ab,
-                                         self.output_bias,
-                                         g,
-                                         dropout_mask,
-                                         Z_raw,
-                                         prob=self.p_drop)
-
-
-class TriangleAttentionStartingNode(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=32, n_head=4):
-        super(TriangleAttentionStartingNode, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernorm1 = LayerNorm(d_pair)
-        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
-        self.attention = SelfAttention(qkv_dim=d_pair,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_pair,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
-
-        Z = self.attention(Z, b)
-
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
-
-
-class TriangleAttentionEndingNode(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=32, n_head=4):
-        super(TriangleAttentionEndingNode, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernorm1 = LayerNorm(d_pair)
-        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
-        self.attention = SelfAttention(qkv_dim=d_pair,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_pair,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-    def forward(self, Z_raw):
-        Z = Z_raw.transpose(-2, -3)
-        Z = self.layernorm1(Z)
-        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
-
-        Z = self.attention(Z, b)
-
-        Z = Z.transpose(-2, -3)
-        dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype)
-        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
-
-
-class PairStack(nn.Module):
-
-    def __init__(self, d_pair, p_drop=0.25):
-        super(PairStack, self).__init__()
-
-        self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop)
-        self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop)
-        self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop)
-        self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop)
-        self.PairTransition = Transition(d=d_pair)
-
-    def forward(self, pair):
-        pair = self.TriangleMultiplicationOutgoing(pair)
-        pair = self.TriangleMultiplicationIncoming(pair)
-        pair = self.TriangleAttentionStartingNode(pair)
-        pair = self.TriangleAttentionEndingNode(pair)
-        pair = self.PairTransition(pair)
-        return pair
diff --git a/openfold/checkpointing.py b/openfold/checkpointing.py
new file mode 100644
index 000000000000..83e77c638ec1
--- /dev/null
+++ b/openfold/checkpointing.py
@@ -0,0 +1,84 @@
+# Copyright 2021 AlQuraishi Laboratory
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.utils.checkpoint
+from typing import Any, Tuple, List, Callable, Optional
+
+
+BLOCK_ARG = Any
+BLOCK_ARGS = List[BLOCK_ARG]
+
+
+def get_checkpoint_fn():
+    checkpoint = torch.utils.checkpoint.checkpoint
+    # Returns PyTorch's activation-checkpoint function itself, not a wrapper.
+    return checkpoint
+
+
+@torch.jit.ignore
+def checkpoint_blocks(
+    blocks: List[Callable],
+    args: BLOCK_ARGS,
+    blocks_per_ckpt: Optional[int],
+) -> BLOCK_ARGS:
+    """
+    Chunk a list of blocks and run each chunk with activation
+    checkpointing. We define a "block" as a callable whose only inputs are
+    the outputs of the previous block.
+
+    Implements Subsection 1.11.8
+
+    Args:
+        blocks:
+            List of blocks
+        args:
+            Tuple of arguments for the first block (a non-tuple is wrapped)
+        blocks_per_ckpt:
+            Size of each chunk. A higher value means fewer checkpoints and
+            trades memory for speed. If None, no checkpointing is performed;
+            otherwise a value outside [1, len(blocks)] raises ValueError.
+    Returns:
+        The output of the final block, returned as a tuple
+    """
+    def wrap(a):
+        return (a,) if type(a) is not tuple else a
+    # NOTE: this local helper shadows the builtin exec() inside this function.
+    def exec(b, a):
+        for block in b:
+            a = wrap(block(*a))
+        return a
+
+    def chunker(s, e):
+        def exec_sliced(*a):
+            return exec(blocks[s:e], a)
+
+        return exec_sliced
+
+    # Avoids mishaps when the blocks take just one argument
+    args = wrap(args)
+
+    if blocks_per_ckpt is None:
+        return exec(blocks, args)
+    elif blocks_per_ckpt < 1 or blocks_per_ckpt > len(blocks):
+        raise ValueError("blocks_per_ckpt must be between 1 and len(blocks)")
+
+    checkpoint = get_checkpoint_fn() 
+
+    for s in range(0, len(blocks), blocks_per_ckpt):
+        e = s + blocks_per_ckpt
+        args = checkpoint(chunker(s, e), *args)
+        args = wrap(args)
+
+    return args
diff --git a/openfold/dropout.py b/openfold/dropout.py
new file mode 100644
index 000000000000..651b9775ef44
--- /dev/null
+++ b/openfold/dropout.py
@@ -0,0 +1,78 @@
+# Copyright 2021 AlQuraishi Laboratory
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+import torch.nn as nn
+from functools import partialmethod
+from typing import Union, List
+
+
+class Dropout(nn.Module):
+    """
+    Implementation of dropout with the ability to share the dropout mask
+    along a particular dimension.
+
+    If not in training mode, this module computes the identity function.
+    """
+
+    def __init__(self, r: float, batch_dim: Union[int, List[int]]):
+        """
+        Args:
+            r:
+                Dropout rate
+            batch_dim:
+                Dimension(s) along which the dropout mask is shared (size 1 in mask)
+        """
+        super(Dropout, self).__init__()
+
+        self.r = r
+        if type(batch_dim) == int:
+            batch_dim = [batch_dim]
+        self.batch_dim = batch_dim
+        self.dropout = nn.Dropout(self.r)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x:
+                Tensor multiplied in place by the dropout mask and returned.
+                Can have any shape compatible with self.batch_dim
+        """
+        shape = list(x.shape)
+        if self.batch_dim is not None:
+            for bd in self.batch_dim:
+                shape[bd] = 1
+        mask = x.new_ones(shape)
+        mask = self.dropout(mask)
+        x *= mask
+        return x
+
+
+class DropoutRowwise(Dropout):
+    """
+    Rowwise dropout as described in subsection 1.11.6: a Dropout whose
+    mask is shared along dimension -3 (batch_dim=-3).
+    """
+
+    __init__ = partialmethod(Dropout.__init__, batch_dim=-3)
+
+
+class DropoutColumnwise(Dropout):
+    """
+    Columnwise dropout as described in subsection 1.11.6: a Dropout whose
+    mask is shared along dimension -2 (batch_dim=-2).
+    """
+
+    __init__ = partialmethod(Dropout.__init__, batch_dim=-2)
diff --git a/openfold/evoformer.py b/openfold/evoformer.py
new file mode 100644
index 000000000000..21e422b04764
--- /dev/null
+++ b/openfold/evoformer.py
@@ -0,0 +1,636 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import torch
+import torch.nn as nn
+from typing import Tuple, Optional
+from functools import partial
+
+from openfold.primitives import Linear, LayerNorm
+from openfold.dropout import DropoutRowwise, DropoutColumnwise
+from openfold.msa import (
+    MSARowAttentionWithPairBias,
+    MSAColumnAttention,
+    MSAColumnGlobalAttention,
+)
+from openfold.outer_product_mean import OuterProductMean
+from openfold.pair_transition import PairTransition
+from openfold.triangular_attention import (
+    TriangleAttentionStartingNode,
+    TriangleAttentionEndingNode,
+)
+from openfold.triangular_multiplicative_update import (
+    TriangleMultiplicationOutgoing,
+    TriangleMultiplicationIncoming,
+)
+from openfold.checkpointing import checkpoint_blocks, get_checkpoint_fn
+from openfold.tensor_utils import chunk_layer
+
+
+class MSATransition(nn.Module):
+    """
+    Feed-forward network applied to MSA activations after attention.
+
+    Implements Algorithm 9
+    """
+    def __init__(self, c_m, n):
+        """
+        Args:
+            c_m:
+                MSA channel dimension
+            n:
+                Factor multiplied to c_m to obtain the hidden channel
+                dimension
+        """
+        super(MSATransition, self).__init__()
+
+        self.c_m = c_m
+        self.n = n
+
+        self.layer_norm = LayerNorm(self.c_m)
+        self.linear_1 = Linear(self.c_m, self.n * self.c_m, init="relu")
+        self.relu = nn.ReLU()
+        self.linear_2 = Linear(self.n * self.c_m, self.c_m, init="final")
+
+    def _transition(self, m, mask):
+        m = self.linear_1(m)
+        m = self.relu(m)
+        m = self.linear_2(m) * mask
+        return m
+
+    @torch.jit.ignore
+    def _chunk(self,
+        m: torch.Tensor,
+        mask: torch.Tensor,
+        chunk_size: int,
+    ) -> torch.Tensor:
+         return chunk_layer(
+             self._transition,
+             {"m": m, "mask": mask},
+             chunk_size=chunk_size,
+             no_batch_dims=len(m.shape[:-2]),
+         )
+
+    def forward(
+        self,
+        m: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        chunk_size: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            m:
+                [*, N_seq, N_res, C_m] MSA activation
+            mask:
+                [*, N_seq, N_res] MSA mask; defaults to all ones
+            chunk_size:
+                If not None, the transition is applied via chunk_layer
+        Returns:
+            [*, N_seq, N_res, C_m] MSA activation update
+        """
+        # DISCREPANCY: DeepMind forgets to apply the MSA mask here.
+        if mask is None:
+            mask = m.new_ones(m.shape[:-1])
+
+        # [*, N_seq, N_res, 1]
+        mask = mask.unsqueeze(-1)
+
+        m = self.layer_norm(m)
+
+        if chunk_size is not None:
+            m = self._chunk(m, mask, chunk_size)
+        else:
+            m = self._transition(m, mask)
+
+        return m
+
+
+class EvoformerBlockCore(nn.Module):
+    def __init__(
+        self,
+        c_m: int,
+        c_z: int,
+        c_hidden_opm: int,
+        c_hidden_mul: int,
+        c_hidden_pair_att: int,
+        no_heads_msa: int,
+        no_heads_pair: int,
+        transition_n: int,
+        pair_dropout: float,
+        inf: float,
+        eps: float,
+        _is_extra_msa_stack: bool = False,
+        is_multimer: bool = False,
+    ):
+        super(EvoformerBlockCore, self).__init__()
+        self.is_multimer = is_multimer
+        self.msa_transition = MSATransition(
+            c_m=c_m,
+            n=transition_n,
+        )
+
+        self.outer_product_mean = OuterProductMean(
+            c_m,
+            c_z,
+            c_hidden_opm,
+        )
+
+        self.tri_mul_out = TriangleMultiplicationOutgoing(
+            c_z,
+            c_hidden_mul,
+        )
+        self.tri_mul_in = TriangleMultiplicationIncoming(
+            c_z,
+            c_hidden_mul,
+        )
+
+        self.tri_att_start = TriangleAttentionStartingNode(
+            c_z,
+            c_hidden_pair_att,
+            no_heads_pair,
+            inf=inf,
+        )
+        self.tri_att_end = TriangleAttentionEndingNode(
+            c_z,
+            c_hidden_pair_att,
+            no_heads_pair,
+            inf=inf,
+        )
+
+        self.pair_transition = PairTransition(
+            c_z,
+            transition_n,
+        )
+
+        self.ps_dropout_row_layer = DropoutRowwise(pair_dropout)
+        self.ps_dropout_col_layer = DropoutColumnwise(pair_dropout)
+
+    def forward(
+        self,
+        m: torch.Tensor,
+        z: torch.Tensor,
+        msa_mask: torch.Tensor,
+        pair_mask: torch.Tensor,
+        chunk_size: Optional[int] = None,
+        _mask_trans: bool = True,
+    ) -> Tuple[torch.Tensor, torch.Tensor]: 
+        # DeepMind doesn't mask these transitions in the source, so _mask_trans
+        # should be disabled to better approximate the exact activations of
+        # the original.
+        msa_trans_mask = msa_mask if _mask_trans else None
+        pair_trans_mask = pair_mask if _mask_trans else None
+
+        m = m + self.msa_transition(
+            m, mask=msa_trans_mask, chunk_size=chunk_size
+        )
+        z = z + self.outer_product_mean(
+            m, mask=msa_mask, chunk_size=chunk_size
+        )
+        z = z + self.ps_dropout_row_layer(self.tri_mul_out(z, mask=pair_mask))
+        z = z + self.ps_dropout_row_layer(self.tri_mul_in(z, mask=pair_mask))
+        z = z + self.ps_dropout_row_layer(
+            self.tri_att_start(z, mask=pair_mask, chunk_size=chunk_size)
+        )
+        z = z + self.ps_dropout_col_layer(
+            self.tri_att_end(z, mask=pair_mask, chunk_size=chunk_size)
+        )
+        z = z + self.pair_transition(
+            z, mask=pair_trans_mask, chunk_size=chunk_size
+        )
+
+        return m, z
+
+
+class EvoformerBlock(nn.Module):
+    def __init__(self,
+        c_m: int,
+        c_z: int,
+        c_hidden_msa_att: int,
+        c_hidden_opm: int,
+        c_hidden_mul: int,
+        c_hidden_pair_att: int,
+        no_heads_msa: int,
+        no_heads_pair: int,
+        transition_n: int,
+        msa_dropout: float,
+        pair_dropout: float,
+        inf: float,
+        eps: float,
+        is_multimer: bool,
+    ):
+        super(EvoformerBlock, self).__init__()
+
+        self.msa_att_row = MSARowAttentionWithPairBias(
+            c_m=c_m,
+            c_z=c_z,
+            c_hidden=c_hidden_msa_att,
+            no_heads=no_heads_msa,
+            inf=inf,
+        )
+
+        self.msa_att_col = MSAColumnAttention(
+            c_m,
+            c_hidden_msa_att,
+            no_heads_msa,
+            inf=inf,
+        )
+
+        self.msa_dropout_layer = DropoutRowwise(msa_dropout)
+
+        self.core = EvoformerBlockCore(
+            c_m=c_m,
+            c_z=c_z,
+            c_hidden_opm=c_hidden_opm,
+            c_hidden_mul=c_hidden_mul,
+            c_hidden_pair_att=c_hidden_pair_att,
+            no_heads_msa=no_heads_msa,
+            no_heads_pair=no_heads_pair,
+            transition_n=transition_n,
+            pair_dropout=pair_dropout,
+            inf=inf,
+            eps=eps,
+        )
+        # NOTE(review): not used by this class's forward(); self.core builds its own.
+        self.outer_product_mean = OuterProductMean(
+            c_m,
+            c_z,
+            c_hidden_opm,
+        )
+        self.is_multimer = is_multimer
+
+    def forward(self,
+        m: torch.Tensor,
+        z: torch.Tensor,
+        msa_mask: torch.Tensor,
+        pair_mask: torch.Tensor,
+        chunk_size: Optional[int] = None,
+        _mask_trans: bool = True,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        m = m + self.msa_dropout_layer(
+            self.msa_att_row(m, z=z, mask=msa_mask, chunk_size=chunk_size)
+        )
+        m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size)
+        m, z = self.core(
+            m, 
+            z, 
+            msa_mask=msa_mask, 
+            pair_mask=pair_mask, 
+            chunk_size=chunk_size, 
+            _mask_trans=_mask_trans,
+        )
+
+        return m, z
+
+
+class ExtraMSABlock(nn.Module):
+    """ 
+        Almost identical to the standard EvoformerBlock, except in that the
+        ExtraMSABlock uses GlobalAttention for MSA column attention and
+        requires more fine-grained control over checkpointing. Separated from
+        its twin to preserve the TorchScript-ability of the latter.
+    """
+    def __init__(self,
+        c_m: int,
+        c_z: int,
+        c_hidden_msa_att: int,
+        c_hidden_opm: int,
+        c_hidden_mul: int,
+        c_hidden_pair_att: int,
+        no_heads_msa: int,
+        no_heads_pair: int,
+        transition_n: int,
+        msa_dropout: float,
+        pair_dropout: float,
+        inf: float,
+        eps: float,
+        ckpt: bool,
+        is_multimer: bool,
+    ):
+        super(ExtraMSABlock, self).__init__()
+        
+        self.ckpt = ckpt
+
+        self.msa_att_row = MSARowAttentionWithPairBias(
+            c_m=c_m,
+            c_z=c_z,
+            c_hidden=c_hidden_msa_att,
+            no_heads=no_heads_msa,
+            inf=inf,
+        )
+
+        self.msa_att_col = MSAColumnGlobalAttention(
+            c_in=c_m,
+            c_hidden=c_hidden_msa_att,
+            no_heads=no_heads_msa,
+            inf=inf,
+            eps=eps,
+        )
+
+        self.msa_dropout_layer = DropoutRowwise(msa_dropout)
+
+        self.core = EvoformerBlockCore(
+            c_m=c_m,
+            c_z=c_z,
+            c_hidden_opm=c_hidden_opm,
+            c_hidden_mul=c_hidden_mul,
+            c_hidden_pair_att=c_hidden_pair_att,
+            no_heads_msa=no_heads_msa,
+            no_heads_pair=no_heads_pair,
+            transition_n=transition_n,
+            pair_dropout=pair_dropout,
+            inf=inf,
+            eps=eps,
+        )
+        self.is_multimer = is_multimer
+
+    def forward(self,
+        m: torch.Tensor,
+        z: torch.Tensor,
+        msa_mask: torch.Tensor,
+        pair_mask: torch.Tensor,
+        chunk_size: Optional[int] = None,
+        _chunk_logits: Optional[int] = 1024,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        m = m + self.msa_dropout_layer(
+            self.msa_att_row(
+                m.clone(), 
+                z=z.clone(), 
+                mask=msa_mask, 
+                chunk_size=chunk_size,
+                _chunk_logits=_chunk_logits if torch.is_grad_enabled() else None,
+                _checkpoint_chunks=
+                    self.ckpt if torch.is_grad_enabled() else False,
+            )
+        )
+        # NOTE(review): _mask_trans is not passed to self.core, so its default applies.
+        def fn(m, z):
+            m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size)
+            m, z = self.core(
+                m, z, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size
+            )
+            
+            return m, z
+        # fn is checkpointed only when grads are enabled and self.ckpt is set.
+        if(torch.is_grad_enabled() and self.ckpt):
+            checkpoint_fn = get_checkpoint_fn()
+            m, z = checkpoint_fn(fn, m, z)
+        else:
+            m, z = fn(m, z)
+
+        return m, z
+
+
+class EvoformerStack(nn.Module):
+    """
+    Main Evoformer trunk.
+
+    Implements Algorithm 6.
+    """
+
+    def __init__(
+        self,
+        c_m: int,
+        c_z: int,
+        c_hidden_msa_att: int,
+        c_hidden_opm: int,
+        c_hidden_mul: int,
+        c_hidden_pair_att: int,
+        c_s: int,
+        no_heads_msa: int,
+        no_heads_pair: int,
+        no_blocks: int,
+        transition_n: int,
+        msa_dropout: float,
+        pair_dropout: float,
+        blocks_per_ckpt: int,
+        inf: float,
+        eps: float,
+        clear_cache_between_blocks: bool = False, 
+        is_multimer: bool = False,
+        **kwargs,
+    ):
+        """
+        Args:
+            c_m:
+                MSA channel dimension
+            c_z:
+                Pair channel dimension
+            c_hidden_msa_att:
+                Hidden dimension in MSA attention
+            c_hidden_opm:
+                Hidden dimension in outer product mean module
+            c_hidden_mul:
+                Hidden dimension in multiplicative updates
+            c_hidden_pair_att:
+                Hidden dimension in triangular attention
+            c_s:
+                Channel dimension of the output "single" embedding
+            no_heads_msa:
+                Number of heads used for MSA attention
+            no_heads_pair:
+                Number of heads used for pair attention
+            no_blocks:
+                Number of Evoformer blocks in the stack
+            transition_n:
+                Factor by which to multiply c_m to obtain the MSATransition
+                hidden dimension
+            msa_dropout:
+                Dropout rate for MSA activations
+            pair_dropout:
+                Dropout used for pair activations
+            blocks_per_ckpt:
+                Number of Evoformer blocks in each activation checkpoint
+            clear_cache_between_blocks:
+                Whether to clear CUDA's GPU memory cache between blocks of the
+                stack. Slows down each block but can reduce fragmentation
+        """
+        super(EvoformerStack, self).__init__()
+
+        self.blocks_per_ckpt = blocks_per_ckpt
+        self.clear_cache_between_blocks = clear_cache_between_blocks
+
+        self.blocks = nn.ModuleList()
+
+        for _ in range(no_blocks):
+            block = EvoformerBlock(
+                c_m=c_m,
+                c_z=c_z,
+                c_hidden_msa_att=c_hidden_msa_att,
+                c_hidden_opm=c_hidden_opm,
+                c_hidden_mul=c_hidden_mul,
+                c_hidden_pair_att=c_hidden_pair_att,
+                no_heads_msa=no_heads_msa,
+                no_heads_pair=no_heads_pair,
+                transition_n=transition_n,
+                msa_dropout=msa_dropout,
+                pair_dropout=pair_dropout,
+                inf=inf,
+                eps=eps,
+                is_multimer=is_multimer,
+            )
+            self.blocks.append(block)
+
+        self.linear = Linear(c_m, c_s)
+
+    def forward(self,
+        m: torch.Tensor,
+        z: torch.Tensor,
+        msa_mask: torch.Tensor,
+        pair_mask: torch.Tensor,
+        chunk_size: int,
+        _mask_trans: bool = True,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            m:
+                [*, N_seq, N_res, C_m] MSA embedding
+            z:
+                [*, N_res, N_res, C_z] pair embedding
+            msa_mask:
+                [*, N_seq, N_res] MSA mask
+            pair_mask:
+                [*, N_res, N_res] pair mask
+        Returns:
+            m:
+                [*, N_seq, N_res, C_m] MSA embedding
+            z:
+                [*, N_res, N_res, C_z] pair embedding
+            s:
+                [*, N_res, C_s] single embedding, computed from m[..., 0, :, :]
+        """
+        blocks = [
+            partial(
+                b,
+                msa_mask=msa_mask,
+                pair_mask=pair_mask,
+                chunk_size=chunk_size,
+                _mask_trans=_mask_trans,
+            )
+            for b in self.blocks
+        ]
+
+        if(self.clear_cache_between_blocks):
+            def block_with_cache_clear(block, *args):
+                torch.cuda.empty_cache()
+                return block(*args)
+
+            blocks = [partial(block_with_cache_clear, b) for b in blocks]
+        # Activation checkpointing is only requested in training mode.
+        m, z = checkpoint_blocks(
+            blocks,
+            args=(m, z),
+            blocks_per_ckpt=self.blocks_per_ckpt if self.training else None,
+        )
+
+        s = self.linear(m[..., 0, :, :])
+        
+        return m, z, s
+
+
+class ExtraMSAStack(nn.Module):
+    """
+    Implements Algorithm 18.
+    """
+
+    def __init__(self,
+        c_m: int,
+        c_z: int,
+        c_hidden_msa_att: int,
+        c_hidden_opm: int,
+        c_hidden_mul: int,
+        c_hidden_pair_att: int,
+        no_heads_msa: int,
+        no_heads_pair: int,
+        no_blocks: int,
+        transition_n: int,
+        msa_dropout: float,
+        pair_dropout: float,
+        inf: float,
+        eps: float,
+        ckpt: bool,
+        clear_cache_between_blocks: bool = False,
+        is_multimer: bool = False,
+        **kwargs,
+    ):
+        super(ExtraMSAStack, self).__init__()
+        
+        self.clear_cache_between_blocks = clear_cache_between_blocks
+        self.blocks = nn.ModuleList()
+        for _ in range(no_blocks):
+            block = ExtraMSABlock(
+                c_m=c_m,
+                c_z=c_z,
+                c_hidden_msa_att=c_hidden_msa_att,
+                c_hidden_opm=c_hidden_opm,
+                c_hidden_mul=c_hidden_mul,
+                c_hidden_pair_att=c_hidden_pair_att,
+                no_heads_msa=no_heads_msa,
+                no_heads_pair=no_heads_pair,
+                transition_n=transition_n,
+                msa_dropout=msa_dropout,
+                pair_dropout=pair_dropout,
+                inf=inf,
+                eps=eps,
+                ckpt=ckpt,
+                is_multimer=is_multimer,
+            )
+            self.blocks.append(block)
+
+    def forward(self,
+        m: torch.Tensor,
+        z: torch.Tensor,
+        chunk_size: int,
+        msa_mask: Optional[torch.Tensor] = None,
+        pair_mask: Optional[torch.Tensor] = None,
+        _mask_trans: bool = True,
+    ) -> torch.Tensor:
+        """
+        Args:
+            m:
+                [*, N_extra, N_res, C_m] extra MSA embedding
+            z:
+                [*, N_res, N_res, C_z] pair embedding
+            msa_mask:
+                Optional [*, N_extra, N_res] MSA mask
+            pair_mask:
+                Optional [*, N_res, N_res] pair mask
+        Returns:
+            [*, N_res, N_res, C_z] pair update
+        """ 
+        #checkpoint_fn = get_checkpoint_fn()
+        #blocks = [
+        #    partial(b, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size, _chunk_logits=None) for b in self.blocks
+        #]
+
+        #def dodo(b, *args):
+        #    torch.cuda.empty_cache()
+        #    return b(*args)
+
+        #blocks = [partial(dodo, b) for b in blocks]
+
+        #for b in blocks:
+        #    if(torch.is_grad_enabled()):
+        #        m, z = checkpoint_fn(b, *(m, z))
+        #    else:
+        #        m, z = b(m, z)
+        # NOTE(review): _mask_trans is accepted but never forwarded to the blocks.
+        for b in self.blocks:
+            m, z = b(m, z, msa_mask, pair_mask, chunk_size=chunk_size)
+
+            if(self.clear_cache_between_blocks):
+                torch.cuda.empty_cache()
+
+        return z
\ No newline at end of file
diff --git a/openfold/msa.py b/openfold/msa.py
new file mode 100644
index 000000000000..172b26def5f1
--- /dev/null
+++ b/openfold/msa.py
@@ -0,0 +1,392 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import torch
+import torch.nn as nn
+from typing import Optional, List, Tuple
+
+from openfold.primitives import (
+    Linear, 
+    LayerNorm,
+    Attention, 
+    GlobalAttention, 
+    _attention_chunked_trainable,
+)
+from openfold.checkpointing import get_checkpoint_fn
+from openfold.tensor_utils import (
+    chunk_layer,
+    permute_final_dims,
+    flatten_final_dims,
+)
+
+
+class MSAAttention(nn.Module):
+    def __init__(
+        self,
+        c_in,
+        c_hidden,
+        no_heads,
+        pair_bias=False,
+        c_z=None,
+        inf=1e9,
+    ):
+        """
+        Args:
+            c_in:
+                Input channel dimension
+            c_hidden:
+                Per-head hidden channel dimension
+            no_heads:
+                Number of attention heads
+            pair_bias:
+                Whether to use pair embedding bias
+            c_z:
+                Pair embedding channel dimension. Ignored unless pair_bias
+                is true
+            inf:
+                A large number to be used in computing the attention mask
+        """
+        super(MSAAttention, self).__init__()
+
+        self.c_in = c_in
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.pair_bias = pair_bias
+        self.c_z = c_z
+        self.inf = inf
+
+        self.layer_norm_m = LayerNorm(self.c_in)
+
+        self.layer_norm_z = None
+        self.linear_z = None
+        if self.pair_bias:
+            self.layer_norm_z = LayerNorm(self.c_z)
+            self.linear_z = Linear(
+                self.c_z, self.no_heads, bias=False, init="normal"
+            )
+        
+        self.mha = Attention(
+            self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads
+        )
+
+    @torch.jit.ignore
+    def _chunk(self, 
+        m: torch.Tensor,
+        biases: List[torch.Tensor],
+        chunk_size: int,
+    ) -> torch.Tensor:
+        return chunk_layer(
+            self.mha,
+            {"q_x": m, "kv_x": m, "biases": biases},
+            chunk_size=chunk_size,
+            no_batch_dims=len(m.shape[:-2]),
+        )
+
+    def _prep_inputs(self,
+        m: torch.Tensor,
+        z: Optional[torch.Tensor],
+        mask: Optional[torch.Tensor]
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # [*, N_seq, N_res, C_m]
+        m = self.layer_norm_m(m)
+
+        n_seq, n_res = m.shape[-3:-1]
+        if mask is None:
+            # [*, N_seq, N_res]
+            mask = m.new_ones(
+                m.shape[:-3] + (n_seq, n_res),
+            )
+
+        # [*, N_seq, 1, 1, N_res]
+        mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
+
+        # This step simply returns a larger view of the bias, and does not
+        # consume additional memory.
+        # [*, N_seq, no_heads, N_res, N_res]
+        #bias = bias.expand(
+        #    ((-1,) * len(bias.shape[:-4])) + (-1, self.no_heads, n_res, -1)
+        #)
+
+        if (self.pair_bias and 
+            z is not None and                       # For the 
+            self.layer_norm_z is not None and       # benefit of
+            self.linear_z is not None               # TorchScript
+        ):
+            # [*, N_res, N_res, C_z]
+            z = self.layer_norm_z(z)
+            
+            # [*, N_res, N_res, no_heads]
+            z = self.linear_z(z)
+            
+            # [*, 1, no_heads, N_res, N_res]
+            z = permute_final_dims(z, (2, 0, 1)).unsqueeze(-4)
+
+        return m, mask_bias, z
+
+    @torch.jit.ignore
+    def _chunked_msa_attn(self,
+        m: torch.Tensor,
+        z: Optional[torch.Tensor],
+        mask: Optional[torch.Tensor],
+        chunk_logits: int,
+        checkpoint: bool,
+    ) -> torch.Tensor:
+        MSA_DIM = -4
+
+        def _get_qkv(m, z):
+            m, mask_bias, z = self._prep_inputs(m, z, mask)
+            q, k, v = self.mha._prep_qkv(m, m)
+            return m, q, k, v, mask_bias, z
+
+        checkpoint_fn = get_checkpoint_fn()
+
+        if(torch.is_grad_enabled() and checkpoint):
+            m, q, k, v, mask_bias, z = checkpoint_fn(_get_qkv, m, z)
+        else:
+            m, q, k, v, mask_bias, z = _get_qkv(m, z)
+       
+        o = _attention_chunked_trainable(
+            query=q, 
+            key=k, 
+            value=v, 
+            biases=[mask_bias, z], 
+            chunk_size=chunk_logits, 
+            chunk_dim=MSA_DIM,
+            checkpoint=checkpoint,
+        )
+
+        if(torch.is_grad_enabled() and checkpoint):
+            # Storing an additional m here is far from ideal
+            m = checkpoint_fn(self.mha._wrap_up, o, m)
+        else:
+            m = self.mha._wrap_up(o, m)
+
+        return m
+
+    def forward(self, 
+        m: torch.Tensor, 
+        z: Optional[torch.Tensor] = None, 
+        mask: Optional[torch.Tensor] = None, 
+        chunk_size: Optional[int] = None,
+        _chunk_logits: Optional[int] = None,
+        _checkpoint_chunks: Optional[bool] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            m:
+                [*, N_seq, N_res, C_m] MSA embedding
+            z:
+                [*, N_res, N_res, C_z] pair embedding. Required only if
+                pair_bias is True
+            mask:
+                [*, N_seq, N_res] MSA mask
+            chunk_size:
+                Size of chunks into which the inputs are split along their
+                batch dimensions (smaller = less memory, slower); off by default
+            _chunk_logits:
+                If not None, use _chunked_msa_attn and ignore chunk_size
+        """
+        if(_chunk_logits is not None):
+            return self._chunked_msa_attn(
+                m=m, z=z, mask=mask, 
+                chunk_logits=_chunk_logits, checkpoint=_checkpoint_chunks
+            )           
+
+        m, mask_bias, z = self._prep_inputs(m, z, mask)
+
+        biases = [mask_bias]
+        if(z is not None):
+            biases.append(z)
+
+        if chunk_size is not None:
+            m = self._chunk(m, biases, chunk_size)
+        else:
+            m = self.mha(
+                q_x=m, 
+                kv_x=m, 
+                biases=biases 
+            )
+
+        return m
+
+
+class MSARowAttentionWithPairBias(MSAAttention):
+    """
+    Implements Algorithm 7 (an MSAAttention built with pair_bias=True).
+    """
+
+    def __init__(self, c_m, c_z, c_hidden, no_heads, inf=1e9):
+        """
+        Args:
+            c_m:
+                Input channel dimension
+            c_z:
+                Pair embedding channel dimension
+            c_hidden:
+                Per-head hidden channel dimension
+            no_heads:
+                Number of attention heads
+            inf:
+                Large number used to construct attention masks
+        """
+        super(MSARowAttentionWithPairBias, self).__init__(
+            c_m,
+            c_hidden,
+            no_heads,
+            pair_bias=True,
+            c_z=c_z,
+            inf=inf,
+        )
+
+
+class MSAColumnAttention(nn.Module):
+    """
+    Implements Algorithm 8.
+
+    By rights, this should also be a subclass of MSAAttention. Alas,
+    most inheritance isn't supported by TorchScript.
+    """
+
+    def __init__(self, c_m, c_hidden, no_heads, inf=1e9):
+        """
+        Args:
+            c_m:
+                MSA channel dimension
+            c_hidden:
+                Per-head hidden channel dimension
+            no_heads:
+                Number of attention heads
+            inf:
+                Large number used to construct attention masks
+        """
+        super(MSAColumnAttention, self).__init__()
+
+        self.c_m = c_m
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.inf = inf
+
+        self._msa_att = MSAAttention(
+            c_in=c_m,
+            c_hidden=c_hidden,
+            no_heads=no_heads,
+            pair_bias=False,
+            c_z=None,
+            inf=inf,
+        )
+
+    def forward(self,
+        m: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        chunk_size: Optional[int] = None
+    ) -> torch.Tensor:
+        """
+        Args:
+            m:
+                [*, N_seq, N_res, C_m] MSA embedding
+            mask:
+                [*, N_seq, N_res] MSA mask
+            chunk_size:
+                Size of chunks into which the inputs are split along their
+                batch dimensions. A low value decreases memory overhead at the
+                cost of slower execution. Chunking is not performed by default.
+        Returns:
+            MSA embedding update in the [*, N_seq, N_res, ...] layout of m
+        """
+        # [*, N_res, N_seq, C_in]
+        m = m.transpose(-2, -3)
+        if mask is not None:
+            mask = mask.transpose(-1, -2)
+
+        m = self._msa_att(m, mask=mask, chunk_size=chunk_size)
+
+        # [*, N_seq, N_res, C_in]
+        m = m.transpose(-2, -3)
+
+        return m
+
+
+class MSAColumnGlobalAttention(nn.Module):
+    """
+    Global attention applied column-wise to an MSA: the residue and sequence
+    axes are swapped, GlobalAttention is run on the layer-normed input, and the
+    axes are swapped back.
+    """
+
+    def __init__(self, c_in, c_hidden, no_heads, inf=1e9, eps=1e-10):
+        """
+        Args:
+            c_in: Channel dimension of the MSA embedding
+            c_hidden: Hidden channel dimension handed to GlobalAttention
+            no_heads: Number of attention heads
+            inf: Masking constant handed to GlobalAttention
+            eps: Denominator epsilon handed to GlobalAttention
+        """
+        super(MSAColumnGlobalAttention, self).__init__()
+
+        self.c_in = c_in
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.inf = inf
+        self.eps = eps
+
+        self.layer_norm_m = nn.LayerNorm(c_in)
+        self.global_attention = GlobalAttention(
+            c_in=c_in, c_hidden=c_hidden, no_heads=no_heads, inf=inf, eps=eps
+        )
+
+    @torch.jit.ignore
+    def _chunk(self, m: torch.Tensor, mask: torch.Tensor, chunk_size: int) -> torch.Tensor:
+        # Every dim of m except the last two is handed to chunk_layer as a batch dim.
+        batch_rank = len(m.shape[:-2])
+        return chunk_layer(
+            self.global_attention,
+            {"m": m, "mask": mask},
+            chunk_size=chunk_size,
+            no_batch_dims=batch_rank,
+        )
+
+    def forward(
+        self,
+        m: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        chunk_size: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            m: [*, N_seq, N_res, C_in] MSA embedding
+            mask: [*, N_seq, N_res] MSA mask; all ones when None
+            chunk_size: if not None, attention is evaluated through chunk_layer
+        Returns:
+            Attention output in the [*, N_seq, N_res, ...] layout of m
+        """
+        n_seq, n_res, c_in = m.shape[-3:]
+
+        if mask is None:
+            # [*, N_seq, N_res]
+            mask = torch.ones(m.shape[:-1], dtype=m.dtype, device=m.device).detach()
+
+        # [*, N_res, N_seq, C_in], layer-normed over the channel dim
+        m = self.layer_norm_m(m.transpose(-2, -3))
+        mask = mask.transpose(-1, -2)
+
+        if chunk_size is None:
+            m = self.global_attention(m=m, mask=mask)
+        else:
+            m = self._chunk(m, mask, chunk_size)
+
+        # [*, N_seq, N_res, C_in]
+        return m.transpose(-2, -3)
diff --git a/openfold/outer_product_mean.py b/openfold/outer_product_mean.py
new file mode 100644
index 000000000000..43d853833c66
--- /dev/null
+++ b/openfold/outer_product_mean.py
@@ -0,0 +1,129 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from openfold.primitives import Linear
+from openfold.tensor_utils import chunk_layer
+
+
+class OuterProductMean(nn.Module):
+    """
+    Implements Algorithm 10: a masked mean, over sequences, of projected outer products.
+    """
+
+    def __init__(self, c_m, c_z, c_hidden, eps=1e-3):
+        """
+        Args:
+            c_m:
+                MSA embedding channel dimension
+            c_z:
+                Pair embedding channel dimension
+            c_hidden:
+                Hidden channel dimension
+        """
+        super(OuterProductMean, self).__init__()
+
+        self.c_m = c_m
+        self.c_z = c_z
+        self.c_hidden = c_hidden
+        self.eps = eps  # added to the mask-count denominator in forward
+
+        self.layer_norm = nn.LayerNorm(c_m)
+        self.linear_1 = Linear(c_m, c_hidden)
+        self.linear_2 = Linear(c_m, c_hidden)
+        self.linear_out = Linear(c_hidden ** 2, c_z, init="final")
+
+    def _opm(self, a, b):  # outer product summed over the shared (sequence) axis, then projected
+        # [*, N_res, N_res, C, C]
+        outer = torch.einsum("...bac,...dae->...bdce", a, b)
+
+        # [*, N_res, N_res, C * C]
+        outer = outer.reshape(outer.shape[:-2] + (-1,))
+
+        # [*, N_res, N_res, C_z]
+        outer = self.linear_out(outer)
+
+        return outer
+
+    @torch.jit.ignore
+    def _chunk(self, 
+        a: torch.Tensor, 
+        b: torch.Tensor, 
+        chunk_size: int
+    ) -> torch.Tensor:
+        # Since the "batch dim" in this case is not a true batch dimension
+        # (in that the shape of the output depends on it), we need to
+        # iterate over it ourselves
+        a_reshape = a.reshape((-1,) + a.shape[-3:])  # all leading dims folded into one
+        b_reshape = b.reshape((-1,) + b.shape[-3:])
+        out = []
+        for a_prime, b_prime in zip(a_reshape, b_reshape):
+            outer = chunk_layer(
+                partial(self._opm, b=b_prime),  # b is fixed; only a is chunked
+                {"a": a_prime},
+                chunk_size=chunk_size,
+                no_batch_dims=1,
+            )
+            out.append(outer)
+        outer = torch.stack(out, dim=0)
+        outer = outer.reshape(a.shape[:-3] + outer.shape[1:])  # restore the leading dims of a
+
+        return outer
+
+    def forward(self, 
+        m: torch.Tensor, 
+        mask: Optional[torch.Tensor] = None,
+        chunk_size: Optional[int] = None  # if set, the outer product is computed via _chunk
+    ) -> torch.Tensor:
+        """
+        Args:
+            m:
+                [*, N_seq, N_res, C_m] MSA embedding
+            mask:
+                [*, N_seq, N_res] MSA mask
+        Returns:
+            [*, N_res, N_res, C_z] pair embedding update
+        """
+        if mask is None:
+            mask = m.new_ones(m.shape[:-1])  # no mask given: every position counts
+
+        # [*, N_seq, N_res, C_m]
+        m = self.layer_norm(m)
+
+        # [*, N_seq, N_res, C]
+        mask = mask.unsqueeze(-1)  # [*, N_seq, N_res, 1], broadcast over channels below
+        a = self.linear_1(m) * mask
+        b = self.linear_2(m) * mask
+
+        a = a.transpose(-2, -3)  # [*, N_res, N_seq, C]
+        b = b.transpose(-2, -3)  # [*, N_res, N_seq, C]
+
+        if chunk_size is not None:
+            outer = self._chunk(a, b, chunk_size)
+        else:
+            outer = self._opm(a, b)
+
+        # [*, N_res, N_res, 1]
+        norm = torch.einsum("...abc,...adc->...bdc", mask, mask)
+
+        # [*, N_res, N_res, C_z]
+        outer = outer / (self.eps + norm)
+
+        return outer
diff --git a/openfold/pair_transition.py b/openfold/pair_transition.py
new file mode 100644
index 000000000000..de76306418ee
--- /dev/null
+++ b/openfold/pair_transition.py
@@ -0,0 +1,99 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from openfold.primitives import Linear, LayerNorm
+from openfold.tensor_utils import chunk_layer
+
+
+class PairTransition(nn.Module):
+    """
+    Implements Algorithm 15.
+    """
+
+    def __init__(self, c_z, n):
+        """
+        Args:
+            c_z:
+                Pair transition channel dimension
+            n:
+                Factor by which c_z is multiplied to obtain hidden channel
+                dimension
+        """
+        super(PairTransition, self).__init__()
+
+        self.c_z = c_z
+        self.n = n
+
+        self.layer_norm = LayerNorm(self.c_z)
+        self.linear_1 = Linear(self.c_z, self.n * self.c_z, init="relu")
+        self.relu = nn.ReLU()
+        self.linear_2 = Linear(self.n * self.c_z, c_z, init="final")
+
+    def _transition(self, z, mask):  # linear_1 -> ReLU -> linear_2, output multiplied by mask
+        # [*, N_res, N_res, C_hidden]
+        z = self.linear_1(z)
+        z = self.relu(z)
+
+        # [*, N_res, N_res, C_z]
+        z = self.linear_2(z) * mask
+
+        return z
+
+    @torch.jit.ignore
+    def _chunk(self,
+        z: torch.Tensor,
+        mask: torch.Tensor,
+        chunk_size: int,
+    ) -> torch.Tensor:
+        return chunk_layer(
+            self._transition,
+            {"z": z, "mask": mask},
+            chunk_size=chunk_size,
+            no_batch_dims=len(z.shape[:-2]),  # every dim of z but the last two is a batch dim
+        )
+
+
+    def forward(self, 
+        z: torch.Tensor, 
+        mask: Optional[torch.Tensor] = None,  # shape of z minus its last dim; all ones when None
+        chunk_size: Optional[int] = None,  # if set, _transition is run through chunk_layer
+    ) -> torch.Tensor:
+        """
+        Args:
+            z:
+                [*, N_res, N_res, C_z] pair embedding
+        Returns:
+            [*, N_res, N_res, C_z] pair embedding update
+        """
+        # DISCREPANCY: DeepMind forgets to apply the mask in this module.
+        if mask is None:
+            mask = z.new_ones(z.shape[:-1])
+
+        # [*, N_res, N_res, 1]
+        mask = mask.unsqueeze(-1)
+
+        # [*, N_res, N_res, C_z]
+        z = self.layer_norm(z)
+
+        if chunk_size is not None:
+            z = self._chunk(z, mask, chunk_size)
+        else:
+            z = self._transition(z=z, mask=mask)
+
+        return z
diff --git a/openfold/primitives.py b/openfold/primitives.py
new file mode 100644
index 000000000000..bbc156f21d4a
--- /dev/null
+++ b/openfold/primitives.py
@@ -0,0 +1,529 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+import math
+from typing import Optional, Callable, List, Tuple, Sequence
+import numpy as np
+
+import torch
+import torch.nn as nn
+
+from openfold.checkpointing import get_checkpoint_fn
+from openfold.tensor_utils import (
+    permute_final_dims,
+    flatten_final_dims,
+    _chunk_slice,
+)
+
+
+def _prod(nums):  # product of the items of nums; 1 when nums is empty
+    out = 1
+    for n in nums:
+        out = out * n
+    return out
+
+
+def _calculate_fan(linear_weight_shape, fan="fan_in"):
+    """
+    Select fan-in, fan-out or their mean from a (fan_out, fan_in) weight shape.
+    """
+    fan_out, fan_in = linear_weight_shape
+
+    if fan == "fan_in":
+        return fan_in
+    if fan == "fan_out":
+        return fan_out
+    if fan == "fan_avg":
+        return (fan_in + fan_out) / 2
+    raise ValueError("Invalid fan option")
+
+
+def glorot_uniform_init_(weights):  # in-place Xavier/Glorot uniform init with gain 1
+    nn.init.xavier_uniform_(weights, gain=1)
+
+
+def final_init_(weights):  # zero the weights in place, outside autograd
+    with torch.no_grad():
+        weights.fill_(0.0)
+
+
+def gating_init_(weights):  # zero the weights in place; Linear sets the gate bias to 1 itself
+    with torch.no_grad():
+        weights.fill_(0.0)
+
+
+def normal_init_(weights):  # in-place Kaiming normal init using the "linear" gain
+    torch.nn.init.kaiming_normal_(weights, nonlinearity="linear")
+
+
+def ipa_point_weights_init_(weights):  # fill weights in place with a constant, outside autograd
+    with torch.no_grad():
+        softplus_inverse_1 = 0.541324854612918  # ln(e - 1): softplus of this value equals 1
+        weights.fill_(softplus_inverse_1)
+
+
+class Linear(nn.Linear):
+    """
+    A Linear layer with built-in nonstandard initializations. Called just
+    like torch.nn.Linear.
+
+    Implements the initializers in 1.11.4, plus some additional ones found
+    in the code.
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        bias: bool = True,
+        init: str = "default",
+        init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None,
+    ):
+        """
+        Args:
+            in_dim:
+                The final dimension of inputs to the layer
+            out_dim:
+                The final dimension of layer outputs
+            bias:
+                Whether to learn an additive bias. True by default
+            init:
+                The initializer to use. Choose from:
+
+                "default": Same as "normal" in this file (no truncation is applied)
+                "relu": Same as "normal" in this file (NOTE(review): no He gain - intended?)
+                "glorot": Fan-average Glorot uniform initialization
+                "gating": Weights=0, Bias=1
+                "normal": Normal initialization with std=1/sqrt(fan_in)
+                "final": Weights=0, Bias=0
+
+                Overridden by init_fn if the latter is not None.
+            init_fn:
+                A custom initializer taking weight and bias as inputs.
+                Overrides init if not None.
+        """
+        super(Linear, self).__init__(in_dim, out_dim, bias=bias)
+
+        if bias:
+            with torch.no_grad():
+                self.bias.fill_(0)  # bias starts at zero for every init mode
+
+        if init_fn is not None:
+            init_fn(self.weight, self.bias)  # self.bias is None when bias=False
+        else:
+            if init == "default":
+                normal_init_(self.weight)
+            elif init == "relu":
+                normal_init_(self.weight)
+            elif init == "glorot":
+                glorot_uniform_init_(self.weight)
+            elif init == "gating":
+                gating_init_(self.weight)
+                if bias:
+                    with torch.no_grad():
+                        self.bias.fill_(1.0)
+            elif init == "normal":
+                normal_init_(self.weight)
+            elif init == "final":
+                final_init_(self.weight)
+            else:
+                raise ValueError("Invalid init string.")
+
+
+class LayerNorm(nn.Module):
+    """Layer norm over the last dim (size c_in), computed with nn.functional.layer_norm."""
+    def __init__(self, c_in, eps=1e-5):
+        super(LayerNorm, self).__init__()
+
+        self.c_in = (c_in,)  # stored as the normalized_shape tuple passed to layer_norm
+        self.eps = eps
+
+        self.weight = nn.Parameter(torch.ones(c_in))
+        self.bias = nn.Parameter(torch.zeros(c_in))
+
+    def forward(self, x):
+        out = nn.functional.layer_norm(
+            x,
+            self.c_in,
+            self.weight,
+            self.bias,
+            self.eps,
+        )
+
+        return out
+
+
+@torch.jit.ignore
+def softmax(t: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    """
+        Plain torch.nn.functional.softmax over dim; no dtype casting is done
+        here. Marked torch.jit.ignore, so TorchScript leaves it as a Python call.
+    """
+    s = torch.nn.functional.softmax(t, dim=dim)
+
+    return s
+
+
+#@torch.jit.script
+def _attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
+               biases: List[torch.Tensor]) -> torch.Tensor:  # softmax(q @ k + biases) @ v per head
+    # [*, H, Q, C_hidden]
+    query = permute_final_dims(query, (1, 0, 2))
+
+    # [*, H, C_hidden, K]
+    key = permute_final_dims(key, (1, 2, 0))
+
+    # [*, H, V, C_hidden]
+    value = permute_final_dims(value, (1, 0, 2))
+
+    # [*, H, Q, K]
+    a = torch.matmul(query, key)
+
+    for b in biases:
+        a += b  # in place, so each bias has to broadcast to [*, H, Q, K]
+
+    a = softmax(a, -1)
+
+    # [*, H, Q, C_hidden]
+    a = torch.matmul(a, value)
+
+    # [*, Q, H, C_hidden]
+    a = a.transpose(-2, -3)
+
+    return a
+
+
+@torch.jit.ignore
+def _attention_chunked_trainable(
+    query,
+    key,
+    value,
+    biases,
+    chunk_size,
+    chunk_dim,
+    checkpoint,
+):
+    """
+    Run _attention over chunk_size-wide slices of query/key/value along chunk_dim and
+    concatenate the results. With checkpoint set, at most two biases are accepted.
+    """
+    if (checkpoint and len(biases) > 2):
+        raise ValueError("Checkpointed version permits only two bias terms")
+
+    def _checkpointable_attention(q, k, v, b1, b2):
+        # The checkpoint function takes positional tensors, so biases travel as two slots.
+        bs = [b for b in [b1, b2] if b is not None]
+        return _attention(q, k, v, bs)
+
+    o_chunks = []
+    checkpoint_fn = get_checkpoint_fn()
+    for start in range(0, query.shape[chunk_dim], chunk_size):
+        end = start + chunk_size
+        idx = [slice(None)] * len(query.shape)
+        idx[chunk_dim] = slice(start, end)
+        idx_tup = tuple(idx)
+        q_chunk = query[idx_tup]
+        k_chunk = key[idx_tup]
+        v_chunk = value[idx_tup]
+
+        def _slice_bias(b):
+            # A bias of size 1 along chunk_dim is broadcast, so it is taken whole.
+            idx[chunk_dim] = (slice(start, end) if b.shape[chunk_dim] != 1 else slice(None))
+            return b[tuple(idx)]
+
+        if (checkpoint):
+            bias_1_chunk, bias_2_chunk = [
+                _slice_bias(b) if b is not None else None for b in (biases + [None, None])[:2]
+            ]
+            o_chunk = checkpoint_fn(_checkpointable_attention, q_chunk, k_chunk, v_chunk,
+                                    bias_1_chunk, bias_2_chunk)
+        else:
+            bias_chunks = [_slice_bias(b) for b in biases]
+            o_chunk = _attention(q_chunk, k_chunk, v_chunk, bias_chunks)
+        o_chunks.append(o_chunk)
+    return torch.cat(o_chunks, dim=chunk_dim)
+
+
+class Attention(nn.Module):
+    """
+    Standard multi-head attention using AlphaFold's default layer
+    initialization. Allows multiple bias vectors.
+    """
+
+    def __init__(
+        self,
+        c_q: int,
+        c_k: int,
+        c_v: int,
+        c_hidden: int,
+        no_heads: int,
+        gating: bool = True,
+    ):
+        """
+        Args:
+            c_q:
+                Input dimension of query data
+            c_k:
+                Input dimension of key data
+            c_v:
+                Input dimension of value data
+            c_hidden:
+                Per-head hidden dimension
+            no_heads:
+                Number of attention heads
+            gating:
+                Whether the output should be gated using query data
+        """
+        super(Attention, self).__init__()
+
+        self.c_q = c_q
+        self.c_k = c_k
+        self.c_v = c_v
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.gating = gating
+
+        # DISCREPANCY: c_hidden is not the per-head channel dimension, as
+        # stated in the supplement, but the overall channel dimension.
+        # NOTE(review): the projections below are c_hidden * no_heads wide, so c_hidden acts per head.
+        self.linear_q = Linear(self.c_q, self.c_hidden * self.no_heads, bias=False, init="glorot")
+        self.linear_k = Linear(self.c_k, self.c_hidden * self.no_heads, bias=False, init="glorot")
+        self.linear_v = Linear(self.c_v, self.c_hidden * self.no_heads, bias=False, init="glorot")
+        self.linear_o = Linear(self.c_hidden * self.no_heads, self.c_q, init="final")
+
+        self.linear_g = None
+        if self.gating:
+            self.linear_g = Linear(self.c_q, self.c_hidden * self.no_heads, init="gating")
+
+        self.sigmoid = nn.Sigmoid()
+
+    def _prep_qkv(self, q_x: torch.Tensor,
+                  kv_x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # [*, Q/K/V, H * C_hidden]
+        q = self.linear_q(q_x)
+        k = self.linear_k(kv_x)
+        v = self.linear_v(kv_x)
+
+        # [*, Q/K, H, C_hidden]
+        q = q.view(q.shape[:-1] + (self.no_heads, -1))
+        k = k.view(k.shape[:-1] + (self.no_heads, -1))
+        v = v.view(v.shape[:-1] + (self.no_heads, -1))
+
+        q /= math.sqrt(self.c_hidden)  # queries are scaled in place by 1/sqrt(c_hidden)
+
+        return q, k, v
+
+    def _wrap_up(self, o: torch.Tensor, q_x: torch.Tensor) -> torch.Tensor:
+        if (self.linear_g is not None):  # gating enabled: multiply by sigmoid(linear_g(q_x))
+            g = self.sigmoid(self.linear_g(q_x))
+
+            # [*, Q, H, C_hidden]
+            g = g.view(g.shape[:-1] + (self.no_heads, -1))
+            o = o * g
+
+        # [*, Q, H * C_hidden]
+        o = flatten_final_dims(o, 2)
+
+        # [*, Q, C_q]
+        o = self.linear_o(o)
+
+        return o
+
+    def forward(
+        self,
+        q_x: torch.Tensor,
+        kv_x: torch.Tensor,
+        biases: Optional[List[torch.Tensor]] = None,
+        use_lma: bool = False,
+        q_chunk_size: Optional[int] = None,
+        kv_chunk_size: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            q_x:
+                [*, Q, C_q] query data
+            kv_x:
+                [*, K, C_k] key data
+            biases:
+                List of biases that broadcast to [*, H, Q, K]
+            use_lma:
+                Whether to use low-memory attention
+            q_chunk_size:
+                Query chunk size (for LMA)
+            kv_chunk_size:
+                Key/Value chunk size (for LMA)
+        Returns:
+            [*, Q, C_q] attention update
+        """
+        if (biases is None):
+            biases = []
+        if (use_lma and (q_chunk_size is None or kv_chunk_size is None)):
+            raise ValueError("If use_lma is specified, q_chunk_size and kv_chunk_size must "
+                             "be provided")
+
+        q, k, v = self._prep_qkv(q_x, kv_x)
+
+        if (use_lma):
+            # _lma slices biases along Q and K, so broadcast dims are expanded to full size first
+            biases = [b.expand(b.shape[:-2] + (q_x.shape[-2],) + (kv_x.shape[-2],)) for b in biases]
+            o = _lma(q, k, v, biases, q_chunk_size, kv_chunk_size)
+        else:
+            o = _attention(q, k, v, biases)
+
+        o = self._wrap_up(o, q_x)
+
+        return o
+
+
+class GlobalAttention(nn.Module):
+    """Attention whose query is the masked mean of m over dim -2; k/v are shared by all heads."""
+    def __init__(self, c_in, c_hidden, no_heads, inf, eps):
+        super(GlobalAttention, self).__init__()
+
+        self.c_in = c_in
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.inf = inf
+        self.eps = eps
+
+        self.linear_q = Linear(c_in, c_hidden * no_heads, bias=False, init="glorot")
+
+        self.linear_k = Linear(
+            c_in,
+            c_hidden,
+            bias=False,
+            init="glorot",
+        )
+        self.linear_v = Linear(
+            c_in,
+            c_hidden,
+            bias=False,
+            init="glorot",
+        )
+        self.linear_g = Linear(c_in, c_hidden * no_heads, init="gating")
+        self.linear_o = Linear(c_hidden * no_heads, c_in, init="final")
+
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, m: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        # [*, N_res, C_in]
+        q = torch.sum(m * mask.unsqueeze(-1),
+                      dim=-2) / (torch.sum(mask, dim=-1)[..., None] + self.eps)
+
+        # [*, N_res, H * C_hidden]
+        q = self.linear_q(q)
+        q *= (self.c_hidden**(-0.5))  # in-place scaling by 1/sqrt(c_hidden)
+
+        # [*, N_res, H, C_hidden]
+        q = q.view(q.shape[:-1] + (self.no_heads, -1))
+
+        # [*, N_res, N_seq, C_hidden]
+        k = self.linear_k(m)
+        v = self.linear_v(m)
+
+        # [*, N_res, H, N_seq]
+        a = torch.matmul(
+            q,
+            k.transpose(-1, -2),  # [*, N_res, C_hidden, N_seq]
+        )
+        bias = (self.inf * (mask - 1))[..., :, None, :]  # -inf where mask == 0, 0 where mask == 1
+        a += bias
+        a = softmax(a)
+
+        # [*, N_res, H, C_hidden]
+        o = torch.matmul(
+            a,
+            v,
+        )
+
+        # [*, N_res, N_seq, H * C_hidden]
+        g = self.sigmoid(self.linear_g(m))
+
+        # [*, N_res, N_seq, H, C_hidden]
+        g = g.view(g.shape[:-1] + (self.no_heads, -1))
+
+        # [*, N_res, N_seq, H, C_hidden]
+        o = o.unsqueeze(-3) * g
+
+        # [*, N_res, N_seq, H * C_hidden]
+        o = o.reshape(o.shape[:-2] + (-1,))
+
+        # [*, N_res, N_seq, C_in]
+        m = self.linear_o(o)
+
+        return m
+
+
+def _lma(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    biases: List[torch.Tensor],
+    q_chunk_size: int,
+    kv_chunk_size: int,
+):  # attention computed over query chunks and key/value chunks; returns a tensor shaped like q
+    no_q, no_kv = q.shape[-3], k.shape[-3]
+
+    # [*, Q, H, C_hidden]
+    o = q.new_zeros(q.shape)
+    for q_s in range(0, no_q, q_chunk_size):
+        q_chunk = q[..., q_s:q_s + q_chunk_size, :, :]
+        large_bias_chunks = [b[..., q_s:q_s + q_chunk_size, :] for b in biases]
+
+        maxes = []
+        weights = []
+        values = []
+        for kv_s in range(0, no_kv, kv_chunk_size):
+            k_chunk = k[..., kv_s:kv_s + kv_chunk_size, :, :]
+            v_chunk = v[..., kv_s:kv_s + kv_chunk_size, :, :]
+            small_bias_chunks = [b[..., kv_s:kv_s + kv_chunk_size] for b in large_bias_chunks]
+
+            a = torch.einsum(
+                "...qhd,...khd->...hqk",
+                q_chunk,
+                k_chunk,
+            )
+
+            for b in small_bias_chunks:
+                a += b
+
+            a = a.transpose(-2, -3)  # [*, q, H, k]
+
+            max_a = torch.max(a, dim=-1, keepdim=True)[0]
+            exp_a = torch.exp(a - max_a)  # exponentials taken relative to this chunk's own maximum
+            exp_v = torch.einsum("...vhf,...qhv->...qhf", v_chunk, exp_a)
+
+            maxes.append(max_a.detach().squeeze(-1))  # maxima are detached from autograd
+            weights.append(torch.sum(exp_a, dim=-1))
+            values.append(exp_v)
+
+        chunk_max = torch.stack(maxes, dim=-3)
+        chunk_weights = torch.stack(weights, dim=-3)
+        chunk_values = torch.stack(values, dim=-4)
+
+        global_max = torch.max(chunk_max, dim=-3, keepdim=True)[0]
+        max_diffs = torch.exp(chunk_max - global_max)  # rescales each kv chunk to the shared maximum
+        chunk_values *= max_diffs.unsqueeze(-1)
+        chunk_weights *= max_diffs
+
+        all_values = torch.sum(chunk_values, dim=-4)
+        all_weights = torch.sum(chunk_weights.unsqueeze(-1), dim=-4)
+
+        q_chunk_out = all_values / all_weights  # softmax numerator over its normaliser
+
+        o[..., q_s:q_s + q_chunk_size, :, :] = q_chunk_out
+
+    return o
diff --git a/openfold/tensor_utils.py b/openfold/tensor_utils.py
new file mode 100644
index 000000000000..7e5e8e4b6b5e
--- /dev/null
+++ b/openfold/tensor_utils.py
@@ -0,0 +1,408 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+import torch
+import torch.nn as nn
+from typing import Tuple, List, Callable, Any, Dict, Sequence, Optional
+
+
+def permute_final_dims(tensor: torch.Tensor, inds: List[int]):  # permute the last len(inds) dims
+    zero_index = -1 * len(inds)
+    first_inds = list(range(len(tensor.shape[:zero_index])))  # leading dims keep their order
+    return tensor.permute(first_inds + [zero_index + i for i in inds])
+
+
+def flatten_final_dims(t: torch.Tensor, no_dims: int):  # merge the last no_dims dims into one
+    return t.reshape(t.shape[:-no_dims] + (-1,))
+
+
+def masked_mean(mask, value, dim, eps=1e-4):  # mask-weighted mean of value over dim
+    mask = mask.expand(*value.shape)  # broadcast mask to the full shape of value
+    return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim))
+
+
+def pts_to_distogram(pts, min_bin=2.3125, max_bin=21.6875, no_bins=64):
+    """
+    Bucketize the pairwise Euclidean distances between the points in pts, using
+    no_bins - 1 evenly spaced boundaries running from min_bin to max_bin.
+    """
+    edges = torch.linspace(min_bin, max_bin, no_bins - 1, device=pts.device)
+    deltas = pts.unsqueeze(-2) - pts.unsqueeze(-3)
+    return torch.bucketize(torch.sqrt(torch.sum(deltas ** 2, dim=-1)), edges)
+
+
+def dict_multimap(fn, dicts):
+    """
+    Apply fn to the list of values stored under each key across dicts, recursing
+    where dicts[0] holds a nested dict. The key set is taken from dicts[0].
+    """
+    merged = {}
+    for key, sample in dicts[0].items():
+        column = [entry[key] for entry in dicts]
+        merged[key] = dict_multimap(fn, column) if type(sample) is dict else fn(column)
+
+    return merged
+
+
+def one_hot(x, v_bins):  # one-hot encode, per element of x, the nearest entry of v_bins
+    reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),))
+    diffs = x[..., None] - reshaped_bins
+    am = torch.argmin(torch.abs(diffs), dim=-1)  # index of the closest bin
+    return nn.functional.one_hot(am, num_classes=len(v_bins)).float()
+
+
+def batched_gather(data, inds, dim=0, no_batch_dims=0):
+    """Gather data along dim using inds, batched over the first no_batch_dims dims of data."""
+    ranges = []
+    for i, s in enumerate(data.shape[:no_batch_dims]):
+        r = torch.arange(s)
+        r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1))))
+        ranges.append(r)
+
+    remaining_dims = [slice(None)] * (len(data.shape) - no_batch_dims)
+    remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds
+    ranges.extend(remaining_dims)
+    # Index with a tuple: a list mixing slices and tensors is not a valid multi-dim index.
+    return data[tuple(ranges)]
+
+
+# With tree_map, a poor man's JAX tree_map
+def dict_map(fn, dic, leaf_type):  # apply fn to every leaf_type leaf of a possibly nested dict
+    new_dict = {}
+    for k, v in dic.items():
+        if type(v) is dict:
+            new_dict[k] = dict_map(fn, v, leaf_type)
+        else:
+            new_dict[k] = tree_map(fn, v, leaf_type)  # lists, tuples and leaves go via tree_map
+
+    return new_dict
+
+
+def tree_map(fn, tree, leaf_type):
+    """Apply fn to each leaf_type leaf of a nested dict/list/tuple; ValueError on other types."""
+    if isinstance(tree, dict):
+        return dict_map(fn, tree, leaf_type)
+    elif isinstance(tree, list):
+        return [tree_map(fn, x, leaf_type) for x in tree]
+    elif isinstance(tree, tuple):
+        return tuple([tree_map(fn, x, leaf_type) for x in tree])
+    elif isinstance(tree, leaf_type):
+        return fn(tree)
+    else:
+        raise ValueError(f"Not supported: {type(tree)}")
+
+
+tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)  # tree_map with tensors as leaves
+
+def _fetch_dims(tree):
+    """Collect the shapes of all tensors in a nested dict/list/tuple, in traversal order."""
+    shapes = []
+    if isinstance(tree, dict):
+        for v in tree.values():
+            shapes.extend(_fetch_dims(v))
+    elif isinstance(tree, (list, tuple)):
+        for t in tree:
+            shapes.extend(_fetch_dims(t))
+    elif isinstance(tree, torch.Tensor):  # isinstance, so tensor subclasses count as in tree_map
+        shapes.append(tree.shape)
+    else:
+        raise ValueError("Not supported")
+
+    return shapes
+
+
+@torch.jit.ignore
+def _flat_idx_to_idx(
+    flat_idx: int,
+    dims: Tuple[int],
+) -> Tuple[int]:
+    """Unravel flat_idx into a multi-dimensional index over dims (last dim varies fastest)."""
+    coords = []
+    for extent in reversed(dims):
+        flat_idx, remainder = divmod(flat_idx, extent)
+        coords.append(remainder)
+    return tuple(reversed(coords))
+
+
+@torch.jit.ignore
+def _get_minimal_slice_set(
+    start: Sequence[int],
+    end: Sequence[int],
+    dims: Sequence[int],
+    start_edges: Optional[Sequence[bool]] = None,
+    end_edges: Optional[Sequence[bool]] = None,
+) -> Sequence[Tuple[slice, ...]]:
+    """
+        Produces an ordered sequence of tensor slices that, when used in
+        sequence on a tensor with shape dims, yields tensors that contain every
+        leaf in the contiguous range [start, end]. Care is taken to yield a
+        short sequence of slices, and perhaps even the shortest possible (I'm
+        pretty sure it's the latter).
+
+        start and end hold one index per entry of dims; end is INCLUSIVE.
+    """
+    # start_edges and end_edges both indicate whether, starting from any given
+    # dimension, the start/end index is at the top/bottom edge of the
+    # corresponding tensor, modeled as a tree
+    def reduce_edge_list(l):  # in place: l[k] becomes the product of l[k:]
+        tally = 1
+        for i in range(len(l)):
+            reversed_idx = -1 * (i + 1)
+            l[reversed_idx] *= tally
+            tally = l[reversed_idx]
+
+    if(start_edges is None):
+        start_edges = [s == 0 for s in start]
+        reduce_edge_list(start_edges)
+    if(end_edges is None):
+        end_edges = [e == (d - 1) for e,d in zip(end, dims)]
+        reduce_edge_list(end_edges)        
+
+    # Base cases. Either start/end are empty and we're done, or the final,
+    # one-dimensional tensor can be simply sliced
+    if(len(start) == 0):
+        return [tuple()]
+    elif(len(start) == 1):
+        return [(slice(start[0], end[0] + 1),)]
+
+    slices = []
+    path = []
+ 
+    # Dimensions common to start and end can be selected directly
+    for s,e in zip(start, end):
+        if(s == e):
+            path.append(slice(s, s + 1))
+        else:
+            break
+
+    path = tuple(path)
+    divergence_idx = len(path)
+
+    # start == end, and we're done
+    if(divergence_idx == len(dims)):
+        return [tuple(path)]
+
+    def upper():  # slices from start to the end of the subtree at start[divergence_idx]
+        sdi = start[divergence_idx]
+        return [
+            path + (slice(sdi, sdi + 1),) + s for s in 
+            _get_minimal_slice_set(
+                start[divergence_idx + 1:],
+                [d - 1 for d in dims[divergence_idx + 1:]],
+                dims[divergence_idx + 1:],
+                start_edges=start_edges[divergence_idx + 1:],
+                end_edges=[1 for _ in end_edges[divergence_idx + 1:]]
+            )
+        ]
+
+    def lower():  # slices from the start of the subtree at end[divergence_idx] up to end
+        edi = end[divergence_idx]
+        return [
+            path + (slice(edi, edi + 1),) + s for s in 
+            _get_minimal_slice_set(
+                [0 for _ in start[divergence_idx + 1:]],
+                end[divergence_idx + 1:],
+                dims[divergence_idx + 1:],
+                start_edges=[1 for _ in start_edges[divergence_idx + 1:]],
+                end_edges=end_edges[divergence_idx + 1:],
+            )
+        ]
+
+    # If both start and end are at the edges of the subtree rooted at
+    # divergence_idx, we can just select the whole subtree at once
+    if(start_edges[divergence_idx] and end_edges[divergence_idx]):
+        slices.append(
+            path + (slice(start[divergence_idx], end[divergence_idx] + 1),)
+        )
+    # If just start is at the edge, we can grab almost all of the subtree,
+    # treating only the ragged bottom edge as an edge case
+    elif(start_edges[divergence_idx]):
+        slices.append(
+            path + (slice(start[divergence_idx], end[divergence_idx]),)
+        )
+        slices.extend(lower())
+    # Analogous to the previous case, but the top is ragged this time
+    elif(end_edges[divergence_idx]):
+        slices.extend(upper())
+        slices.append(
+            path + (slice(start[divergence_idx] + 1, end[divergence_idx] + 1),)
+        )
+    # If both sides of the range are ragged, we need to handle both sides
+    # separately. If there's contiguous meat in between them, we can index it
+    # in one big chunk
+    else:
+        slices.extend(upper())
+        middle_ground = end[divergence_idx] - start[divergence_idx]
+        if(middle_ground > 1):
+            slices.append(
+                path + (slice(start[divergence_idx] + 1, end[divergence_idx]),)
+            )
+        slices.extend(lower())
+
+    return [tuple(s) for s in slices]
+
+
+@torch.jit.ignore
+def _chunk_slice(
+    t: torch.Tensor,
+    flat_start: int,
+    flat_end: int,
+    no_batch_dims: int,
+) -> torch.Tensor:
+    """
+        Returns rows [flat_start, flat_end) of t, counted as if its first
+        no_batch_dims dimensions were flattened into one. Same result as
+
+            t.reshape((-1,) + t.shape[no_batch_dims:])[flat_start:flat_end]
+
+        but t itself is never reshaped, which can be memory-intensive in
+        certain situations. Only the selected sub-tensors, whose size scales
+        with the chunk size (flat_end - flat_start), are flattened.
+    """
+
+    batch_shape = t.shape[:no_batch_dims]
+    trailing_shape = t.shape[no_batch_dims:]
+
+    # flat_end is exclusive while _get_minimal_slice_set expects an
+    # inclusive end index, hence the - 1
+    first = list(_flat_idx_to_idx(flat_start, batch_shape))
+    last = list(_flat_idx_to_idx(flat_end - 1, batch_shape))
+
+    # Index t once per slice tuple and flatten each piece's batch dims
+    pieces = [
+        t[index].view((-1,) + trailing_shape)
+        for index in _get_minimal_slice_set(first, last, batch_shape)
+    ]
+
+    # The slice tuples come back in order, so concatenating the pieces
+    # yields the contiguous flat range
+    return torch.cat(pieces)
+
+
+def chunk_layer(
+    layer: Callable,
+    inputs: Dict[str, Any],
+    chunk_size: int,
+    no_batch_dims: int,
+    low_mem: bool = False,
+) -> Any:
+    """
+    Implements the "chunking" procedure described in section 1.11.8.
+
+    Layer outputs and inputs are assumed to be simple "pytrees,"
+    consisting only of (arbitrarily nested) lists, tuples, and dicts with
+    torch.Tensor leaves.
+
+    Args:
+        layer:
+            The layer to be applied chunk-wise
+        inputs:
+            A (non-nested) dictionary of keyworded inputs. All leaves must
+            be tensors and must share the same batch dimensions.
+        chunk_size:
+            The number of sub-batches per chunk. If multiple batch
+            dimensions are specified, a "sub-batch" is defined as a single
+            indexing of all batch dimensions simultaneously (s.t. the
+            number of sub-batches is the product of the batch dimensions).
+        no_batch_dims:
+            How many of the initial dimensions of each input tensor can
+            be considered batch dimensions.
+        low_mem:
+            Avoids flattening potentially large input tensors. Unnecessary
+            in most cases, and is ever so slightly slower than the default
+            setting.
+    Returns:
+        The reassembled output of the layer on the inputs.
+    """
+    if not (len(inputs) > 0):
+        raise ValueError("Must provide at least one input")
+
+    initial_dims = [shape[:no_batch_dims] for shape in _fetch_dims(inputs)]
+    orig_batch_dims = tuple([max(s) for s in zip(*initial_dims)])
+
+    def _prep_inputs(t):
+        # TODO: make this more memory efficient. This sucks
+        if(not low_mem):
+            if not sum(t.shape[:no_batch_dims]) == no_batch_dims:
+                t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
+            t = t.reshape(-1, *t.shape[no_batch_dims:])
+        else:
+            t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
+        return t
+
+    prepped_inputs = tensor_tree_map(_prep_inputs, inputs)
+
+    flat_batch_dim = 1
+    for d in orig_batch_dims:
+        flat_batch_dim *= d
+
+    no_chunks = flat_batch_dim // chunk_size + (
+        flat_batch_dim % chunk_size != 0
+    )
+
+    i = 0
+    out = None
+    for _ in range(no_chunks):
+        # Chunk the input
+        if(not low_mem):
+            select_chunk = (
+                lambda t: t[i : i + chunk_size] if t.shape[0] != 1 else t
+            )
+        else:
+            select_chunk = (
+                partial(
+                    _chunk_slice,
+                    flat_start=i,
+                    flat_end=min(flat_batch_dim, i + chunk_size),
+                    no_batch_dims=len(orig_batch_dims)
+                )
+            )
+
+        chunks = tensor_tree_map(select_chunk, prepped_inputs)
+
+        # Run the layer on the chunk
+        output_chunk = layer(**chunks)
+
+        # Allocate space for the output
+        if out is None:
+            allocate = lambda t: t.new_zeros((flat_batch_dim,) + t.shape[1:])
+            out = tensor_tree_map(allocate, output_chunk)
+
+        # Put the chunk in its pre-allocated space. The walk accepts the
+        # same containers as tree_map (dict, list and tuple, nested in
+        # any way), so every output tree that could be allocated above
+        # can also be filled here
+        def assign(dst, src):
+            if isinstance(dst, dict):
+                for k, v in dst.items():
+                    assign(v, src[k])
+            elif isinstance(dst, (list, tuple)):
+                for d1, d2 in zip(dst, src):
+                    assign(d1, d2)
+            elif isinstance(dst, torch.Tensor):
+                # Leaf: copy this chunk's rows into the output buffer
+                dst[i : i + chunk_size] = src
+            else:
+                raise ValueError("Not supported")
+
+        assign(out, output_chunk)
+
+        i += chunk_size
+
+    reshape = lambda t: t.view(orig_batch_dims + t.shape[1:])
+    out = tensor_tree_map(reshape, out)
+
+    return out
diff --git a/openfold/triangular_attention.py b/openfold/triangular_attention.py
new file mode 100644
index 000000000000..6d3e37f4c681
--- /dev/null
+++ b/openfold/triangular_attention.py
@@ -0,0 +1,139 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partialmethod, partial
+import math
+from typing import Optional, List
+
+import torch
+import torch.nn as nn
+
+from openfold.primitives import Linear, LayerNorm, Attention
+from openfold.tensor_utils import (
+    chunk_layer,
+    permute_final_dims,
+    flatten_final_dims,
+)
+
+
+class TriangleAttention(nn.Module):
+    def __init__(
+        self, c_in, c_hidden, no_heads, starting, inf=1e9
+    ):
+        """
+        Args:
+            c_in: Input channel dimension
+            c_hidden: Overall hidden channel dimension (not per-head)
+            no_heads: Number of attention heads
+            starting: If False, forward() swaps the I and J axes of x (and
+                of the mask) before attention and swaps x back afterwards
+            inf: Scale of the additive bias, inf * (mask - 1), built from mask
+        """
+        super(TriangleAttention, self).__init__()
+
+        self.c_in = c_in
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.starting = starting
+        self.inf = inf
+
+        self.layer_norm = LayerNorm(self.c_in)
+
+        self.linear = Linear(c_in, self.no_heads, bias=False, init="normal")
+
+        self.mha = Attention(
+            self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads
+        )
+
+    @torch.jit.ignore
+    def _chunk(self,
+        x: torch.Tensor,
+        biases: List[torch.Tensor],
+        chunk_size: int,
+    ) -> torch.Tensor:
+        mha_inputs = {
+            "q_x": x,
+            "kv_x": x,
+            "biases": biases,
+        }
+        return chunk_layer(
+            partial(self.mha),  # no arguments are bound: called exactly like self.mha
+            mha_inputs,
+            chunk_size=chunk_size,
+            no_batch_dims=len(x.shape[:-2]),  # every dim of x except the last two
+        )
+
+    def forward(self, 
+        x: torch.Tensor, 
+        mask: Optional[torch.Tensor] = None,
+        chunk_size: Optional[int] = None
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: [*, I, J, C_in] input tensor (e.g. the pair representation)
+            mask: Optional [*, I, J] mask (all ones if None)
+            chunk_size: If not None, attention is run through chunk_layer
+        Returns: [*, I, J, C_in] output tensor
+        """
+        if mask is None:
+            # [*, I, J]
+            mask = x.new_ones(
+                x.shape[:-1],
+            )
+
+        # Shape annotations assume self.starting. Else, I and J are flipped
+        if not self.starting:
+            x = x.transpose(-2, -3)
+            mask = mask.transpose(-1, -2)
+
+        # [*, I, J, C_in]
+        x = self.layer_norm(x)
+
+        # [*, I, 1, 1, J]
+        mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
+
+        # [*, H, I, J]
+        triangle_bias = permute_final_dims(self.linear(x), (2, 0, 1))
+
+        # [*, 1, H, I, J]
+        triangle_bias = triangle_bias.unsqueeze(-4)
+
+        biases = [mask_bias, triangle_bias]
+
+        if chunk_size is not None:
+            x = self._chunk(x, biases, chunk_size)
+        else:
+            x = self.mha(q_x=x, kv_x=x, biases=biases)
+
+        # Undo the I/J swap applied before the attention
+        if not self.starting:
+            x = x.transpose(-2, -3)
+
+        return x
+
+
+class TriangleAttentionStartingNode(TriangleAttention):
+    """
+    Implements Algorithm 13: TriangleAttention with starting=True.
+    """
+
+    __init__ = partialmethod(TriangleAttention.__init__, starting=True)
+
+
+class TriangleAttentionEndingNode(TriangleAttention):
+    """
+    Implements Algorithm 14: TriangleAttention with starting=False.
+    """
+
+    __init__ = partialmethod(TriangleAttention.__init__, starting=False)
diff --git a/openfold/triangular_multiplicative_update.py b/openfold/triangular_multiplicative_update.py
new file mode 100644
index 000000000000..2406e2bac2cf
--- /dev/null
+++ b/openfold/triangular_multiplicative_update.py
@@ -0,0 +1,127 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partialmethod
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from openfold.primitives import Linear, LayerNorm
+from openfold.tensor_utils import permute_final_dims
+
+
+class TriangleMultiplicativeUpdate(nn.Module):
+    """
+    Implements Algorithms 11 and 12.
+    """
+    def __init__(self, c_z, c_hidden, _outgoing=True):
+        """
+        Args:
+            c_z: Input channel dimension
+            c_hidden: Hidden channel dimension
+            _outgoing: Stored as self._outgoing; this class never reads it
+                (the subclasses pick the direction in _combine_projections)
+        """
+        super(TriangleMultiplicativeUpdate, self).__init__()
+        self.c_z = c_z
+        self.c_hidden = c_hidden
+        self._outgoing = _outgoing
+
+        self.linear_a_p = Linear(self.c_z, self.c_hidden)
+        self.linear_a_g = Linear(self.c_z, self.c_hidden, init="gating")
+        self.linear_b_p = Linear(self.c_z, self.c_hidden)
+        self.linear_b_g = Linear(self.c_z, self.c_hidden, init="gating")
+        self.linear_g = Linear(self.c_z, self.c_z, init="gating")
+        self.linear_z = Linear(self.c_hidden, self.c_z, init="final")
+
+        self.layer_norm_in = LayerNorm(self.c_z)
+        self.layer_norm_out = LayerNorm(self.c_hidden)
+
+        self.sigmoid = nn.Sigmoid()
+
+    def _combine_projections(self,
+        a: torch.Tensor,
+        b: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError("This method needs to be overridden")
+
+    def forward(self, 
+        z: torch.Tensor, 
+        mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """
+        Args:
+            z:
+                [*, N_res, N_res, C_z] input tensor
+            mask:
+                [*, N_res, N_res] input mask
+        Returns:
+            [*, N_res, N_res, C_z] output tensor
+        """
+        if mask is None:
+            mask = z.new_ones(z.shape[:-1])
+
+        mask = mask.unsqueeze(-1)
+
+        z = self.layer_norm_in(z)
+        a = self.linear_a_p(z) * self.sigmoid(self.linear_a_g(z))
+        a = a * mask
+        b = self.linear_b_p(z) * self.sigmoid(self.linear_b_g(z))
+        b = b * mask
+        x = self._combine_projections(a, b)  # supplied by the subclass
+        x = self.layer_norm_out(x)
+        x = self.linear_z(x)
+        g = self.sigmoid(self.linear_g(z))  # gate computed from the normalized input
+        z = x * g
+
+        return z
+
+
+class TriangleMultiplicationOutgoing(TriangleMultiplicativeUpdate):
+    """
+    Implements Algorithm 11 on top of TriangleMultiplicativeUpdate.
+    """
+    def _combine_projections(self,
+        a: torch.Tensor,  # [*, N_i, N_k, C]
+        b: torch.Tensor,  # [*, N_j, N_k, C]
+    ):
+        # Permute the final three dims of both operands before multiplying
+        a_perm = permute_final_dims(a, (2, 0, 1))
+        b_perm = permute_final_dims(b, (2, 1, 0))
+
+        # [*, C, N_i, N_j]
+        prod = torch.matmul(a_perm, b_perm)
+        # [*, N_i, N_j, C]
+        return permute_final_dims(prod, (1, 2, 0))
+
+
+class TriangleMultiplicationIncoming(TriangleMultiplicativeUpdate):
+    """
+    Implements Algorithm 12 on top of TriangleMultiplicativeUpdate.
+    """
+    def _combine_projections(self,
+        a: torch.Tensor,  # [*, N_k, N_i, C]
+        b: torch.Tensor,  # [*, N_k, N_j, C]
+    ):
+        # Permute the final three dims of both operands before multiplying
+        a_perm = permute_final_dims(a, (2, 1, 0))
+        b_perm = permute_final_dims(b, (2, 0, 1))
+
+        # [*, C, N_i, N_j]
+        prod = torch.matmul(a_perm, b_perm)
+        # [*, N_i, N_j, C]
+        return permute_final_dims(prod, (1, 2, 0))
+

From 1d7ca02301c9ff71953070ea963b8e107fa4ccb6 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 14:28:38 +0800
Subject: [PATCH 066/503] add benchmark

---
 autochunk_benchmark.py | 79 ++++++++++++++++++++++++++++++++++++++++++
 chunk_codegen.py       | 16 +++++----
 2 files changed, 89 insertions(+), 6 deletions(-)
 create mode 100644 autochunk_benchmark.py

diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
new file mode 100644
index 000000000000..a34464212e02
--- /dev/null
+++ b/autochunk_benchmark.py
@@ -0,0 +1,79 @@
+import copy
+import torch
+import torch.nn.functional as F
+import pytest
+import torch.fx
+import torch.multiprocessing as mp
+from torch.fx import GraphModule
+from colossalai.fx import ColoTracer
+import colossalai
+from colossalai.utils import free_port
+from colossalai.core import global_context as gpc
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata
+from colossalai.fx.profiler import MetaTensor
+from evoformer.evoformer import evoformer_base
+from chunk_codegen import ChunkCodeGen
+import time
+
+
+def _benchmark_evoformer(model: torch.nn.Module, node, pair):
+    loop = 10
+    with torch.no_grad():
+        for _ in range(loop // 4):
+            model(node, pair)
+        torch.cuda.synchronize()
+        time1 = time.time()
+        for _ in range(loop):
+            model(node, pair)
+        torch.cuda.synchronize()
+        time2 = time.time()
+    return (time2 - time1) / loop
+
+
+def benchmark_evoformer():
+    # data
+    msa_len = 300
+    pair_len = 800
+    node = torch.randn(1, msa_len, pair_len, 256).cuda()
+    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
+
+    # build gm model
+    max_memory = 3000  # MB
+    model = evoformer_base().cuda()
+    # trace the module and replace codegen
+    graph = ColoTracer().trace(
+        model,
+        meta_args={
+            "node": node.to(torch.device("meta")),
+            "pair": pair.to(torch.device("meta")),
+        },
+    )
+    gm_prop = torch.fx.symbolic_trace(model)  # must use symbolic_trace
+    interp = MetaInfoProp(gm_prop)
+    interp.propagate(
+        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
+    )
+    # now run it twice to get meta info in graph module, not necessary
+    gm = torch.fx.GraphModule(model, graph)
+    interp = MetaInfoProp(gm)
+    interp.propagate(
+        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
+    )
+    # set code_gen
+    codegen = ChunkCodeGen(gm_prop, max_memory)
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph)
+    gm.recompile()
+    # print
+    code = graph.python_code("self").src
+    print(code)
+
+    time_gm = _benchmark_evoformer(gm, node, pair)
+    print("gm %.4fs" % time_gm)
+    time_openfold = _benchmark_evoformer(model, node, pair)
+    print("openfold %.4fs" % time_openfold)
+
+
+if __name__ == "__main__":
+    benchmark_evoformer()
diff --git a/chunk_codegen.py b/chunk_codegen.py
index 6caed88d84d2..033db50dbccb 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1398,13 +1398,14 @@ def estimate_chunk_inference_mem(
 
 class ChunkSelector(object):
     def __init__(
-        self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge
+        self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge, max_memory=None
     ):
         self.index_tracer = index_tracer
         self.memory_estimator = memory_estimator
         assert stratge in ["min_memory", "fit_memory"]
+        assert (stratge == "fit_memory" and max_memory is not None) or stratge != "fit_memory"
         self.stratge = stratge
-        self.max_memory = 600  # MB
+        self.max_memory = max_memory  # MB
 
     def _select_best_chunk_region(
         self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
@@ -1556,13 +1557,13 @@ def _is_legal_region(self, cur_chunk_info, chunk_infos):
 
 
 class ChunkRegionSearch(object):
-    def __init__(self, gm) -> None:
+    def __init__(self, gm, max_memory=None) -> None:
         self.gm = gm
         self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
         self.chunk_selector = ChunkSelector(
-            self.index_tracer, self.memory_estimator, stratge="fit_memory"
+            self.index_tracer, self.memory_estimator, stratge="fit_memory", max_memory=max_memory
         )
 
     def _find_peak_node(self, mem_peak):
@@ -1897,6 +1898,7 @@ def emit_code_with_chunk(
     delete_unused_value_func,
     meta_nodes,
     meta_graph,
+    max_memory=None,
 ):
     """Emit code with nested activation checkpoint
     When we detect some of the node.activation_checkpoint is a List, we will use
@@ -1912,7 +1914,7 @@ def emit_code_with_chunk(
     node_list = list(nodes)
 
     # find the chunk regions
-    chunk_region_search = ChunkRegionSearch(meta_graph)
+    chunk_region_search = ChunkRegionSearch(meta_graph, max_memory)
     chunk_search = chunk_region_search.search_region()
 
     chunk_regions = [i["region"] for i in chunk_search]
@@ -1989,9 +1991,10 @@ def emit_code_with_chunk(
 if CODEGEN_AVAILABLE:
 
     class ChunkCodeGen(CodeGen):
-        def __init__(self, meta_graph):
+        def __init__(self, meta_graph, max_memory=None):
             super().__init__()
             self.meta_graph = meta_graph
+            self.max_memory = max_memory
             self.meta_node = list(meta_graph.graph.nodes)
 
         def _gen_python_code(
@@ -2230,6 +2233,7 @@ def emit_node(node: Node, body):
                 delete_unused_values,
                 self.meta_node,
                 self.meta_graph,
+                self.max_memory
             )
 
             if len(body) == 0:

From 5a916c0adb320b4a1cfc96e8a40364fb62a0a463 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 14:42:29 +0800
Subject: [PATCH 067/503] add print

---
 autochunk_benchmark.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
index a34464212e02..0c55a3a8848c 100644
--- a/autochunk_benchmark.py
+++ b/autochunk_benchmark.py
@@ -1,24 +1,21 @@
-import copy
+import time
+
 import torch
-import torch.nn.functional as F
-import pytest
 import torch.fx
-import torch.multiprocessing as mp
-from torch.fx import GraphModule
+
+from chunk_codegen import ChunkCodeGen
 from colossalai.fx import ColoTracer
-import colossalai
-from colossalai.utils import free_port
-from colossalai.core import global_context as gpc
 from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
 from evoformer.evoformer import evoformer_base
-from chunk_codegen import ChunkCodeGen
-import time
 
 
-def _benchmark_evoformer(model: torch.nn.Module, node, pair):
-    loop = 10
+def _benchmark_evoformer(model: torch.nn.Module, node, pair, title):
+    torch.cuda.reset_peak_memory_stats()
+    now_mem = torch.cuda.memory_allocated() / 1024**2
+
+    loop = 16
     with torch.no_grad():
         for _ in range(loop // 4):
             model(node, pair)
@@ -28,7 +25,12 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair):
             model(node, pair)
         torch.cuda.synchronize()
         time2 = time.time()
-    return (time2 - time1) / loop
+
+    new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    print(
+        "%s: time %.4fs, mem %dMB"
+        % (title, (time2 - time1) / loop, new_max_mem - now_mem)
+    )
 
 
 def benchmark_evoformer():
@@ -69,10 +71,8 @@ def benchmark_evoformer():
     code = graph.python_code("self").src
     print(code)
 
-    time_gm = _benchmark_evoformer(gm, node, pair)
-    print("gm %.4fs" % time_gm)
-    time_openfold = _benchmark_evoformer(model, node, pair)
-    print("openfold %.4fs" % time_openfold)
+    _benchmark_evoformer(gm, node, pair, "autochunk")
+    _benchmark_evoformer(model, node, pair, "openfold")
 
 
 if __name__ == "__main__":

From 7a23deb58455b112cf187776857e2a262d0b737e Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 14:47:16 +0800
Subject: [PATCH 068/503] code style

---
 autochunk_benchmark.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
index 0c55a3a8848c..f8e603f4ee63 100644
--- a/autochunk_benchmark.py
+++ b/autochunk_benchmark.py
@@ -34,15 +34,23 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair, title):
 
 
 def benchmark_evoformer():
-    # data
+    # init data and model
     msa_len = 300
     pair_len = 800
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
+    model = evoformer_base().cuda()
 
-    # build gm model
+    # build autochunk model
     max_memory = 3000  # MB
-    model = evoformer_base().cuda()
+    autochunk = _build_autochunk(model, max_memory, node, pair)
+
+    # benchmark
+    _benchmark_evoformer(model, node, pair, "openfold")
+    _benchmark_evoformer(autochunk, node, pair, "autochunk")
+
+
+def _build_autochunk(model, max_memory, node, pair):
     # trace the module and replace codegen
     graph = ColoTracer().trace(
         model,
@@ -70,9 +78,7 @@ def benchmark_evoformer():
     # print
     code = graph.python_code("self").src
     print(code)
-
-    _benchmark_evoformer(gm, node, pair, "autochunk")
-    _benchmark_evoformer(model, node, pair, "openfold")
+    return gm
 
 
 if __name__ == "__main__":

From efe6fe3a33c4b8c50c2e964188fef72d1f269cfd Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 14:47:47 +0800
Subject: [PATCH 069/503] code style

---
 autochunk_benchmark.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
index f8e603f4ee63..20f615b216f7 100644
--- a/autochunk_benchmark.py
+++ b/autochunk_benchmark.py
@@ -33,23 +33,6 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair, title):
     )
 
 
-def benchmark_evoformer():
-    # init data and model
-    msa_len = 300
-    pair_len = 800
-    node = torch.randn(1, msa_len, pair_len, 256).cuda()
-    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-    model = evoformer_base().cuda()
-
-    # build autochunk model
-    max_memory = 3000  # MB
-    autochunk = _build_autochunk(model, max_memory, node, pair)
-
-    # benchmark
-    _benchmark_evoformer(model, node, pair, "openfold")
-    _benchmark_evoformer(autochunk, node, pair, "autochunk")
-
-
 def _build_autochunk(model, max_memory, node, pair):
     # trace the module and replace codegen
     graph = ColoTracer().trace(
@@ -81,5 +64,22 @@ def _build_autochunk(model, max_memory, node, pair):
     return gm
 
 
+def benchmark_evoformer():
+    # init data and model
+    msa_len = 300
+    pair_len = 800
+    node = torch.randn(1, msa_len, pair_len, 256).cuda()
+    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
+    model = evoformer_base().cuda()
+
+    # build autochunk model
+    max_memory = 3000  # MB
+    autochunk = _build_autochunk(model, max_memory, node, pair)
+
+    # benchmark
+    _benchmark_evoformer(model, node, pair, "openfold")
+    _benchmark_evoformer(autochunk, node, pair, "autochunk")
+
+
 if __name__ == "__main__":
     benchmark_evoformer()

From 289f3a45c24233fec28af6d5651b3099b55ace8b Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 15:01:15 +0800
Subject: [PATCH 070/503] init openfold

---
 evoformer_openfold/evoformer.py   |  59 +++++++++
 evoformer_openfold/initializer.py |  29 +++++
 evoformer_openfold/kernel.py      |  19 +++
 evoformer_openfold/msa.py         |  95 +++++++++++++++
 evoformer_openfold/ops.py         | 176 +++++++++++++++++++++++++++
 evoformer_openfold/triangle.py    | 192 ++++++++++++++++++++++++++++++
 6 files changed, 570 insertions(+)
 create mode 100644 evoformer_openfold/evoformer.py
 create mode 100755 evoformer_openfold/initializer.py
 create mode 100644 evoformer_openfold/kernel.py
 create mode 100644 evoformer_openfold/msa.py
 create mode 100755 evoformer_openfold/ops.py
 create mode 100644 evoformer_openfold/triangle.py

diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py
new file mode 100644
index 000000000000..cfd2bb2a2529
--- /dev/null
+++ b/evoformer_openfold/evoformer.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+
+from .msa import MSAStack
+from .ops import OutProductMean
+from .triangle import PairStack
+
+
+def print_memory(init_mem, text=None):
+    now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem  # current usage, relative to init_mem
+    max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem  # peak usage, relative to init_mem
+    print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem))
+    torch.cuda.reset_peak_memory_stats()  # restart peak tracking for the next call
+
+
+class EvoformerBlock(nn.Module):
+    """MSA stack, outer-product-mean communication, then the pair stack."""
+
+    def __init__(self, d_node, d_pair):
+        super().__init__()
+        self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15)
+        self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32)
+        self.pair_stack = PairStack(d_pair=d_pair)
+
+    def forward(self, node, pair):
+        node = self.msa_stack(node, pair)
+        # Residual update of pair from the new node, then the pair stack
+        pair = self.pair_stack(pair + self.communication(node))
+        return node, pair
+
+
+class Evoformer(nn.Module):
+    """Applies n_blocks EvoformerBlocks in sequence (default 1, as before)."""
+
+    def __init__(self, d_node, d_pair, n_blocks=1):
+        super(Evoformer, self).__init__()
+        self.blocks = nn.ModuleList(
+            [EvoformerBlock(d_node, d_pair) for _ in range(n_blocks)]
+        )
+
+    def forward(self, node, pair):
+        for b in self.blocks:
+            node, pair = b(node, pair)
+        return node, pair
+
+
+def evoformer_tiny():
+    return Evoformer(d_node=64, d_pair=32)  # smallest preset
+
+
+def evoformer_base():
+    return Evoformer(d_node=256, d_pair=128)  # 4x the widths of evoformer_tiny
+
+
+def evoformer_large():
+    return Evoformer(d_node=512, d_pair=256)  # 2x the widths of evoformer_base
+
+
+__all__ = ['Evoformer', 'evoformer_tiny', 'evoformer_base', 'evoformer_large']
diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py
new file mode 100755
index 000000000000..c6ce0659e597
--- /dev/null
+++ b/evoformer_openfold/initializer.py
@@ -0,0 +1,29 @@
+import math
+
+import numpy as np
+import torch.nn as nn
+
+
+def glorot_uniform_af(x, gain=1.0):
+    """
+    initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different:
+    In PyTorch:
+    [feature_out, feature_in, n_head ...]
+    In Jax:
+    [... n_head, feature_in, feature_out]
+    However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like:
+    [feature_in, n_head, feature_out]
+
+    In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors
+    """
+    fan_in, fan_out = x.shape[-2:]
+    if len(x.shape) > 2:
+        receptive_field_size = np.prod(x.shape[:-2])
+        fan_in *= receptive_field_size
+        fan_out *= receptive_field_size
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    dev = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
+
+    nn.init.uniform_(x, -dev, dev)
+
+    return x
diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py
new file mode 100644
index 000000000000..26ab5dc53261
--- /dev/null
+++ b/evoformer_openfold/kernel.py
@@ -0,0 +1,19 @@
+import torch
+import torch.nn.functional as F
+
+
+def bias_sigmod_ele(y, bias, z):
+    return torch.sigmoid(y + bias) * z
+
+
+def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor,
+                     residual: torch.Tensor, prob: float) -> torch.Tensor:
+    out = (x + bias) * F.dropout(dropmask, p=prob, training=False)
+    out = residual + out
+    return out
+
+
+def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor,
+                              dropout_mask: torch.Tensor, Z_raw: torch.Tensor,
+                              prob: float) -> torch.Tensor:
+    return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b))
\ No newline at end of file
diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py
new file mode 100644
index 000000000000..cac456638a55
--- /dev/null
+++ b/evoformer_openfold/msa.py
@@ -0,0 +1,95 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.nn import LayerNorm
+
+from .kernel import bias_dropout_add
+from .ops import SelfAttention, Transition
+
+
+class MSARowAttentionWithPairBias(nn.Module):
+
+    def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15):
+        super(MSARowAttentionWithPairBias, self).__init__()
+        self.d_node = d_node
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+
+        self.layernormM = LayerNorm(d_node)
+        self.layernormZ = LayerNorm(d_pair)
+
+        _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True)
+
+        self.attention = SelfAttention(qkv_dim=d_node,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_node,
+                                       gating=True,
+                                       last_bias_fuse=True)
+
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True)
+
+    def forward(self, M_raw, Z):
+        ## Input projections
+        M = self.layernormM(M_raw)
+        Z = self.layernormZ(Z)
+        b = F.linear(Z, self.linear_b_weights)
+        b = b.permute(0, 3, 1, 2)
+        # b = rearrange(b, 'b q k h -> b h q k')
+
+        M = self.attention(M, b)
+        dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype)
+
+        return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop)
+
+
+class MSAColumnAttention(nn.Module):
+
+    def __init__(self, d_node, c=32, n_head=8):
+        super(MSAColumnAttention, self).__init__()
+        self.d_node = d_node
+        self.c = c
+        self.n_head = n_head
+
+        self.layernormM = LayerNorm(d_node)
+        self.attention = SelfAttention(qkv_dim=d_node,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_node,
+                                       gating=True)
+
+    def forward(self, M_raw):
+        M = M_raw.transpose(-2, -3)
+        M = self.layernormM(M)
+
+        M = self.attention(M)
+
+        M = M.transpose(-2, -3)
+        return M_raw + M
+
+
+class MSAStack(nn.Module):
+
+    def __init__(self, d_node, d_pair, p_drop=0.15):
+        super(MSAStack, self).__init__()
+
+        self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node,
+                                                                       d_pair=d_pair,
+                                                                       p_drop=p_drop)
+
+        self.MSAColumnAttention = MSAColumnAttention(d_node=d_node)
+        self.MSATransition = Transition(d=d_node)
+
+    def forward(self, node, pair):
+        node = self.MSARowAttentionWithPairBias(node, pair)
+        node = self.MSAColumnAttention(node)
+        node = self.MSATransition(node)
+
+        return node
diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py
new file mode 100755
index 000000000000..611b7b0fe777
--- /dev/null
+++ b/evoformer_openfold/ops.py
@@ -0,0 +1,176 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.nn import LayerNorm
+
+from .initializer import glorot_uniform_af
+from .kernel import bias_sigmod_ele
+
+
+class DropoutRowwise(nn.Module):
+
+    def __init__(self, p):
+        super(DropoutRowwise, self).__init__()
+        self.p = p
+        self.dropout = nn.Dropout(p=p)
+
+    def forward(self, x):
+        dropout_mask = torch.ones_like(x[:, 0:1, :, :])
+        dropout_mask = self.dropout(dropout_mask)
+        return dropout_mask * x
+
+
+class DropoutColumnwise(nn.Module):
+
+    def __init__(self, p):
+        super(DropoutColumnwise, self).__init__()
+        self.p = p
+        self.dropout = nn.Dropout(p=p)
+
+    def forward(self, x):
+        dropout_mask = torch.ones_like(x[:, :, 0:1, :])
+        dropout_mask = self.dropout(dropout_mask)
+        return dropout_mask * x
+
+
+class Transition(nn.Module):
+
+    def __init__(self, d, n=4):
+        super(Transition, self).__init__()
+        self.norm = LayerNorm(d)
+        self.linear1 = Linear(d, n * d, initializer='relu')
+        self.linear2 = Linear(n * d, d, initializer='zeros')
+
+    def forward(self, src):
+        x = self.norm(src)
+        x = self.linear2(F.relu(self.linear1(x)))
+        return src + x
+
+
+class OutProductMean(nn.Module):
+
+    def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32):
+        super(OutProductMean, self).__init__()
+
+        self.layernormM = LayerNorm(n_feat)
+        self.linear_a = Linear(n_feat, n_feat_proj)
+        self.linear_b = Linear(n_feat, n_feat_proj)
+
+        self.o_linear = Linear(n_feat_proj * n_feat_proj,
+                               n_feat_out,
+                               initializer='zero',
+                               use_bias=True)
+
+    def forward(self, M):
+        M = self.layernormM(M)
+        left_act = self.linear_a(M)
+        right_act = self.linear_b(M)
+
+        O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
+        # O = rearrange(O, 'b i j d e -> b i j (d e)')
+        O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1)
+        Z = self.o_linear(O)
+
+        return Z
+
+
+class Linear(nn.Linear):
+    """
+    A Linear layer with built-in nonstandard initializations. Called just
+    like torch.nn.Linear.
+    Implements the initializers in 1.11.4, plus some additional ones found
+    in the code.
+    """
+
+    def __init__(
+        self,
+        feature_in: int,
+        feature_out: int,
+        initializer: str = 'linear',
+        use_bias: bool = True,
+        bias_init: float = 0.,
+    ):
+        super(Linear, self).__init__(feature_in, feature_out, bias=use_bias)
+
+        self.use_bias = use_bias
+        if initializer == 'linear':
+            glorot_uniform_af(self.weight, gain=1.0)
+        elif initializer == 'relu':
+            glorot_uniform_af(self.weight, gain=2.0)
+        elif initializer == 'zeros':
+            nn.init.zeros_(self.weight)
+        if self.use_bias:
+            with torch.no_grad():
+                self.bias.fill_(bias_init)
+
+
+class SelfAttention(nn.Module):
+    """
+    Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors
+    """
+
+    def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False):
+        super(SelfAttention, self).__init__()
+        self.qkv_dim = qkv_dim
+        self.c = c
+        self.n_head = n_head
+        self.out_dim = out_dim
+        self.gating = gating
+        self.last_bias_fuse = last_bias_fuse
+
+        self.scaling = self.c**(-0.5)
+
+        # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear')
+        self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+        self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+        self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
+
+        if gating:
+            self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,)))
+            self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False)
+
+        self.o_linear = Linear(n_head * c,
+                               out_dim,
+                               initializer='zero',
+                               use_bias=(not last_bias_fuse))
+
+    def forward(self, in_data, nonbatched_bias=None):
+        """
+        :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim]
+        :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv]
+        :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv]
+        """
+
+        # qkv = self.to_qkv(in_data).chunk(3, dim=-1)
+        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv)
+
+        q = self.to_q(in_data)
+        k = self.to_k(in_data)
+        v = self.to_v(in_data)
+
+        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head),
+        #               [q, k, v])
+        q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4),
+                      [q, k, v])
+        
+        q = q * self.scaling
+
+        logits = torch.matmul(q, k.transpose(-1, -2))
+
+        if nonbatched_bias is not None:
+            logits += nonbatched_bias.unsqueeze(1)
+        weights = torch.softmax(logits, dim=-1)
+        # weights = softmax(logits)
+
+        weighted_avg = torch.matmul(weights, v)
+        # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)')
+        weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4)
+        weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1)
+
+        if self.gating:
+            gate_values = self.gating_linear(in_data)
+            weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg)
+
+        output = self.o_linear(weighted_avg)
+        return output
diff --git a/evoformer_openfold/triangle.py b/evoformer_openfold/triangle.py
new file mode 100644
index 000000000000..f479469c3836
--- /dev/null
+++ b/evoformer_openfold/triangle.py
@@ -0,0 +1,192 @@
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn import LayerNorm
+
+from .kernel import bias_dropout_add, bias_ele_dropout_residual
+from .ops import Linear, SelfAttention, Transition
+
+
+def permute_final_dims(tensor, inds):
+    zero_index = -1 * len(inds)
+    first_inds = list(range(len(tensor.shape[:zero_index])))
+    return tensor.permute(first_inds + [zero_index + i for i in inds])
+
+
+class TriangleMultiplicationOutgoing(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=128):
+        super(TriangleMultiplicationOutgoing, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+
+        self.layernorm1 = LayerNorm(d_pair)
+        self.left_projection = Linear(d_pair, c)
+        self.right_projection = Linear(d_pair, c)
+        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+
+        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
+        self.layernorm2 = LayerNorm(c)
+        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
+        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+        self.p_drop = p_drop
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        left_proj_act = self.left_projection(Z)
+        right_proj_act = self.right_projection(Z)
+
+        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
+        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
+
+        g = torch.sigmoid(self.output_gate(Z))
+        # p = torch.matmul(
+        #     permute_final_dims(left_proj_act, (2, 0, 1)),
+        #     permute_final_dims(right_proj_act, (2, 1, 0)),
+        # )
+        # ab = permute_final_dims(p, (1, 2, 0))
+
+        ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act)
+        ab = self.output_projection(self.layernorm2(ab))
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
+        return bias_ele_dropout_residual(ab,
+                                         self.output_bias,
+                                         g,
+                                         dropout_mask,
+                                         Z_raw,
+                                         prob=self.p_drop)
+
+
+class TriangleMultiplicationIncoming(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=128):
+        super(TriangleMultiplicationIncoming, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+
+        self.layernorm1 = LayerNorm(d_pair)
+        self.left_projection = Linear(d_pair, c)
+        self.right_projection = Linear(d_pair, c)
+        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
+
+        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
+        self.layernorm2 = LayerNorm(c)
+        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
+        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+        self.p_drop = p_drop
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        left_proj_act = self.left_projection(Z)
+        right_proj_act = self.right_projection(Z)
+
+        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
+        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
+
+        g = torch.sigmoid(self.output_gate(Z))
+        # p = torch.matmul(
+        #     permute_final_dims(left_proj_act, (2, 1, 0)),
+        #     permute_final_dims(right_proj_act, (2, 0, 1)),
+        # )
+        # ab = permute_final_dims(p, (1, 2, 0))
+
+        ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act)
+        ab = self.output_projection(self.layernorm2(ab))
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
+        return bias_ele_dropout_residual(ab,
+                                         self.output_bias,
+                                         g,
+                                         dropout_mask,
+                                         Z_raw,
+                                         prob=self.p_drop)
+
+
+class TriangleAttentionStartingNode(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=32, n_head=4):
+        super(TriangleAttentionStartingNode, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+
+        self.layernorm1 = LayerNorm(d_pair)
+        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
+        self.attention = SelfAttention(qkv_dim=d_pair,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_pair,
+                                       gating=True,
+                                       last_bias_fuse=True)
+
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+    def forward(self, Z_raw):
+        Z = self.layernorm1(Z_raw)
+        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
+
+        Z = self.attention(Z, b)
+
+        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
+        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
+
+
+class TriangleAttentionEndingNode(nn.Module):
+
+    def __init__(self, d_pair, p_drop, c=32, n_head=4):
+        super(TriangleAttentionEndingNode, self).__init__()
+        self.d_pair = d_pair
+        self.c = c
+        self.n_head = n_head
+        self.p_drop = p_drop
+
+        self.layernorm1 = LayerNorm(d_pair)
+        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
+                                              std=1.0 / math.sqrt(d_pair))
+        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
+        self.attention = SelfAttention(qkv_dim=d_pair,
+                                       c=c,
+                                       n_head=n_head,
+                                       out_dim=d_pair,
+                                       gating=True,
+                                       last_bias_fuse=True)
+
+        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
+
+    def forward(self, Z_raw):
+        Z = Z_raw.transpose(-2, -3)
+        Z = self.layernorm1(Z)
+        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
+
+        Z = self.attention(Z, b)
+
+        Z = Z.transpose(-2, -3)
+        dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype)
+        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
+
+
+class PairStack(nn.Module):
+
+    def __init__(self, d_pair, p_drop=0.25):
+        super(PairStack, self).__init__()
+
+        self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop)
+        self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop)
+        self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop)
+        self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop)
+        self.PairTransition = Transition(d=d_pair)
+
+    def forward(self, pair):
+        pair = self.TriangleMultiplicationOutgoing(pair)
+        pair = self.TriangleMultiplicationIncoming(pair)
+        pair = self.TriangleAttentionStartingNode(pair)
+        pair = self.TriangleAttentionEndingNode(pair)
+        pair = self.PairTransition(pair)
+        return pair

From 5c4df01af3076069867a66c5fc7a8086e6c55c0a Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 15:54:08 +0800
Subject: [PATCH 071/503] update openfold

---
 openfold/evoformer.py | 29 ++++++-------------
 openfold/msa.py       | 67 ++-----------------------------------------
 2 files changed, 12 insertions(+), 84 deletions(-)

diff --git a/openfold/evoformer.py b/openfold/evoformer.py
index 21e422b04764..7fbcd8a76b4d 100644
--- a/openfold/evoformer.py
+++ b/openfold/evoformer.py
@@ -182,33 +182,28 @@ def forward(
         self,
         m: torch.Tensor,
         z: torch.Tensor,
-        msa_mask: torch.Tensor,
-        pair_mask: torch.Tensor,
         chunk_size: Optional[int] = None,
-        _mask_trans: bool = True,
     ) -> Tuple[torch.Tensor, torch.Tensor]: 
         # DeepMind doesn't mask these transitions in the source, so _mask_trans
         # should be disabled to better approximate the exact activations of
         # the original.
-        msa_trans_mask = msa_mask if _mask_trans else None
-        pair_trans_mask = pair_mask if _mask_trans else None
 
         m = m + self.msa_transition(
-            m, mask=msa_trans_mask, chunk_size=chunk_size
+            m, chunk_size=chunk_size
         )
         z = z + self.outer_product_mean(
-            m, mask=msa_mask, chunk_size=chunk_size
+            m, chunk_size=chunk_size
         )
-        z = z + self.ps_dropout_row_layer(self.tri_mul_out(z, mask=pair_mask))
-        z = z + self.ps_dropout_row_layer(self.tri_mul_in(z, mask=pair_mask))
+        z = z + self.ps_dropout_row_layer(self.tri_mul_out(z))
+        z = z + self.ps_dropout_row_layer(self.tri_mul_in(z))
         z = z + self.ps_dropout_row_layer(
-            self.tri_att_start(z, mask=pair_mask, chunk_size=chunk_size)
+            self.tri_att_start(z, chunk_size=chunk_size)
         )
         z = z + self.ps_dropout_col_layer(
-            self.tri_att_end(z, mask=pair_mask, chunk_size=chunk_size)
+            self.tri_att_end(z, chunk_size=chunk_size)
         )
         z = z + self.pair_transition(
-            z, mask=pair_trans_mask, chunk_size=chunk_size
+            z, chunk_size=chunk_size
         )
 
         return m, z
@@ -274,22 +269,16 @@ def __init__(self,
     def forward(self,
         m: torch.Tensor,
         z: torch.Tensor,
-        msa_mask: torch.Tensor,
-        pair_mask: torch.Tensor,
         chunk_size: Optional[int] = None,
-        _mask_trans: bool = True,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         m = m + self.msa_dropout_layer(
-            self.msa_att_row(m, z=z, mask=msa_mask, chunk_size=chunk_size)
+            self.msa_att_row(m, z=z, chunk_size=chunk_size)
         )
-        m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size)
+        m = m + self.msa_att_col(m, chunk_size=chunk_size)
         m, z = self.core(
             m, 
             z, 
-            msa_mask=msa_mask, 
-            pair_mask=pair_mask, 
             chunk_size=chunk_size, 
-            _mask_trans=_mask_trans,
         )
 
         return m, z
diff --git a/openfold/msa.py b/openfold/msa.py
index 172b26def5f1..00b822e7f390 100644
--- a/openfold/msa.py
+++ b/openfold/msa.py
@@ -136,45 +136,6 @@ def _prep_inputs(self,
 
         return m, mask_bias, z
 
-    @torch.jit.ignore
-    def _chunked_msa_attn(self,
-        m: torch.Tensor,
-        z: Optional[torch.Tensor],
-        mask: Optional[torch.Tensor],
-        chunk_logits: int,
-        checkpoint: bool,
-    ) -> torch.Tensor:
-        MSA_DIM = -4
-
-        def _get_qkv(m, z):
-            m, mask_bias, z = self._prep_inputs(m, z, mask)
-            q, k, v = self.mha._prep_qkv(m, m)
-            return m, q, k, v, mask_bias, z
-
-        checkpoint_fn = get_checkpoint_fn()
-
-        if(torch.is_grad_enabled() and checkpoint):
-            m, q, k, v, mask_bias, z = checkpoint_fn(_get_qkv, m, z)
-        else:
-            m, q, k, v, mask_bias, z = _get_qkv(m, z)
-       
-        o = _attention_chunked_trainable(
-            query=q, 
-            key=k, 
-            value=v, 
-            biases=[mask_bias, z], 
-            chunk_size=chunk_logits, 
-            chunk_dim=MSA_DIM,
-            checkpoint=checkpoint,
-        )
-
-        if(torch.is_grad_enabled() and checkpoint):
-            # Storing an additional m here is far from ideal
-            m = checkpoint_fn(self.mha._wrap_up, o, m)
-        else:
-            m = self.mha._wrap_up(o, m)
-
-        return m
 
     def forward(self, 
         m: torch.Tensor, 
@@ -199,12 +160,6 @@ def forward(self,
                 cost of slower execution. Chunking is not performed by default.
                 
         """
-        if(_chunk_logits is not None):
-            return self._chunked_msa_attn(
-                m=m, z=z, mask=mask, 
-                chunk_logits=_chunk_logits, checkpoint=_checkpoint_chunks
-            )           
-
         m, mask_bias, z = self._prep_inputs(m, z, mask)
 
         biases = [mask_bias]
@@ -306,15 +261,11 @@ def forward(self,
         """ 
         # [*, N_res, N_seq, C_in]
         m = m.transpose(-2, -3)
-        if mask is not None:
-            mask = mask.transpose(-1, -2)
 
-        m = self._msa_att(m, mask=mask, chunk_size=chunk_size)
+        m = self._msa_att(m, chunk_size=chunk_size)
 
         # [*, N_seq, N_res, C_in]
         m = m.transpose(-2, -3)
-        if mask is not None:
-            mask = mask.transpose(-1, -2)
 
         return m
 
@@ -344,12 +295,10 @@ def __init__(
     @torch.jit.ignore
     def _chunk(self,
         m: torch.Tensor,
-        mask: torch.Tensor,
         chunk_size: int,
     ) -> torch.Tensor:
         mha_input = {
             "m": m,
-            "mask": mask,
         }
         return chunk_layer(
             self.global_attention,
@@ -361,30 +310,20 @@ def _chunk(self,
     def forward(
         self, 
         m: torch.Tensor, 
-        mask: Optional[torch.Tensor] = None, 
         chunk_size: Optional[int] = None,
     ) -> torch.Tensor:
         n_seq, n_res, c_in = m.shape[-3:]
 
-        if mask is None:
-            # [*, N_seq, N_res]
-            mask = torch.ones(
-                m.shape[:-1],
-                dtype=m.dtype,
-                device=m.device,
-            ).detach()
-
         # [*, N_res, N_seq, C_in]
         m = m.transpose(-2, -3)
-        mask = mask.transpose(-1, -2)
 
         # [*, N_res, N_seq, C_in]
         m = self.layer_norm_m(m)
 
         if chunk_size is not None:
-            m = self._chunk(m, mask, chunk_size) 
+            m = self._chunk(m, chunk_size) 
         else:
-            m = self.global_attention(m=m, mask=mask)
+            m = self.global_attention(m=m)
 
         # [*, N_seq, N_res, C_in]
         m = m.transpose(-2, -3)

From f7d8092c84eef1a5dfd976f883a6d38d5b11bd68 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 16:01:05 +0800
Subject: [PATCH 072/503] align openfold

---
 autochunk_benchmark.py            |  41 ++++++-
 evoformer_openfold/evoformer.py   |  59 ---------
 evoformer_openfold/initializer.py |  29 -----
 evoformer_openfold/kernel.py      |  19 ---
 evoformer_openfold/msa.py         |  95 ---------------
 evoformer_openfold/ops.py         | 176 ---------------------------
 evoformer_openfold/triangle.py    | 192 -----------------------------
 openfold/evoformer.py             | 194 ------------------------------
 8 files changed, 36 insertions(+), 769 deletions(-)
 delete mode 100644 evoformer_openfold/evoformer.py
 delete mode 100755 evoformer_openfold/initializer.py
 delete mode 100644 evoformer_openfold/kernel.py
 delete mode 100644 evoformer_openfold/msa.py
 delete mode 100755 evoformer_openfold/ops.py
 delete mode 100644 evoformer_openfold/triangle.py

diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
index 20f615b216f7..679016438c59 100644
--- a/autochunk_benchmark.py
+++ b/autochunk_benchmark.py
@@ -9,20 +9,27 @@
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
 from evoformer.evoformer import evoformer_base
+from openfold.evoformer import EvoformerBlock
 
 
-def _benchmark_evoformer(model: torch.nn.Module, node, pair, title):
+def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None):
     torch.cuda.reset_peak_memory_stats()
     now_mem = torch.cuda.memory_allocated() / 1024**2
 
     loop = 16
     with torch.no_grad():
         for _ in range(loop // 4):
-            model(node, pair)
+            if chunk_size:
+                model(node, pair, chunk_size)
+            else:
+                model(node, pair)
         torch.cuda.synchronize()
         time1 = time.time()
         for _ in range(loop):
-            model(node, pair)
+            if chunk_size:
+                model(node, pair, chunk_size)
+            else:
+                model(node, pair)
         torch.cuda.synchronize()
         time2 = time.time()
 
@@ -64,6 +71,26 @@ def _build_autochunk(model, max_memory, node, pair):
     return gm
 
 
+def _build_openfold():
+    model = EvoformerBlock(
+        c_m=256,
+        c_z=128,
+        c_hidden_msa_att=32,
+        c_hidden_opm=32,
+        c_hidden_mul=128,
+        c_hidden_pair_att=32,
+        no_heads_msa=8,
+        no_heads_pair=4,
+        transition_n=4,
+        msa_dropout=0.15,
+        pair_dropout=0.15,
+        inf=1e4,
+        eps=1e-4,
+        is_multimer=False,
+    ).cuda()
+    return model
+
+
 def benchmark_evoformer():
     # init data and model
     msa_len = 300
@@ -74,10 +101,14 @@ def benchmark_evoformer():
 
     # build autochunk model
     max_memory = 3000  # MB
-    autochunk = _build_autochunk(model, max_memory, node, pair)
+    autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair)
+
+    # build openfold
+    openfold = _build_openfold()
 
     # benchmark
-    _benchmark_evoformer(model, node, pair, "openfold")
+    _benchmark_evoformer(model, node, pair, "base")
+    _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=4)
     _benchmark_evoformer(autochunk, node, pair, "autochunk")
 
 
diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py
deleted file mode 100644
index cfd2bb2a2529..000000000000
--- a/evoformer_openfold/evoformer.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .msa import MSAStack
-from .ops import OutProductMean
-from .triangle import PairStack
-
-
-def print_memory(init_mem, text=None):
-    now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem
-    max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem
-    print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem))
-    torch.cuda.reset_peak_memory_stats()
-
-
-class EvoformerBlock(nn.Module):
-
-    def __init__(self, d_node, d_pair):
-        super(EvoformerBlock, self).__init__()
-
-        self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15)
-        self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32)
-        self.pair_stack = PairStack(d_pair=d_pair)
-
-    def forward(self, node, pair):
-        node = self.msa_stack(node, pair)
-        pair = pair + self.communication(node)
-        pair = self.pair_stack(pair)
-        return node, pair
-
-
-class Evoformer(nn.Module):
-
-    def __init__(self, d_node, d_pair):
-        super(Evoformer, self).__init__()
-
-        self.blocks = nn.ModuleList()
-        for _ in range(1):
-            self.blocks.append(EvoformerBlock(d_node, d_pair))
-
-    def forward(self, node, pair):
-        for b in self.blocks:
-            node, pair = b(node, pair)
-        return node, pair
-
-
-def evoformer_tiny():
-    return Evoformer(d_node=64, d_pair=32)
-
-
-def evoformer_base():
-    return Evoformer(d_node=256, d_pair=128)
-
-
-def evoformer_large():
-    return Evoformer(d_node=512, d_pair=256)
-
-
-__all__ = ['Evoformer', 'evoformer_base', 'evoformer_large']
diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py
deleted file mode 100755
index c6ce0659e597..000000000000
--- a/evoformer_openfold/initializer.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import math
-
-import numpy as np
-import torch.nn as nn
-
-
-def glorot_uniform_af(x, gain=1.0):
-    """
-    initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different:
-    In PyTorch:
-    [feature_out, feature_in, n_head ...]
-    In Jax:
-    [... n_head, feature_in, feature_out]
-    However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like:
-    [feature_in, n_head, feature_out]
-
-    In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors
-    """
-    fan_in, fan_out = x.shape[-2:]
-    if len(x.shape) > 2:
-        receptive_field_size = np.prod(x.shape[:-2])
-        fan_in *= receptive_field_size
-        fan_out *= receptive_field_size
-    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
-    dev = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
-
-    nn.init.uniform_(x, -dev, dev)
-
-    return x
diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py
deleted file mode 100644
index 26ab5dc53261..000000000000
--- a/evoformer_openfold/kernel.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import torch
-import torch.nn.functional as F
-
-
-def bias_sigmod_ele(y, bias, z):
-    return torch.sigmoid(y + bias) * z
-
-
-def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor,
-                     residual: torch.Tensor, prob: float) -> torch.Tensor:
-    out = (x + bias) * F.dropout(dropmask, p=prob, training=False)
-    out = residual + out
-    return out
-
-
-def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor,
-                              dropout_mask: torch.Tensor, Z_raw: torch.Tensor,
-                              prob: float) -> torch.Tensor:
-    return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b))
\ No newline at end of file
diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py
deleted file mode 100644
index cac456638a55..000000000000
--- a/evoformer_openfold/msa.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from torch.nn import LayerNorm
-
-from .kernel import bias_dropout_add
-from .ops import SelfAttention, Transition
-
-
-class MSARowAttentionWithPairBias(nn.Module):
-
-    def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15):
-        super(MSARowAttentionWithPairBias, self).__init__()
-        self.d_node = d_node
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernormM = LayerNorm(d_node)
-        self.layernormZ = LayerNorm(d_pair)
-
-        _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True)
-
-        self.attention = SelfAttention(qkv_dim=d_node,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_node,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True)
-
-    def forward(self, M_raw, Z):
-        ## Input projections
-        M = self.layernormM(M_raw)
-        Z = self.layernormZ(Z)
-        b = F.linear(Z, self.linear_b_weights)
-        b = b.permute(0, 3, 1, 2)
-        # b = rearrange(b, 'b q k h -> b h q k')
-
-        M = self.attention(M, b)
-        dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype)
-
-        return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop)
-
-
-class MSAColumnAttention(nn.Module):
-
-    def __init__(self, d_node, c=32, n_head=8):
-        super(MSAColumnAttention, self).__init__()
-        self.d_node = d_node
-        self.c = c
-        self.n_head = n_head
-
-        self.layernormM = LayerNorm(d_node)
-        self.attention = SelfAttention(qkv_dim=d_node,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_node,
-                                       gating=True)
-
-    def forward(self, M_raw):
-        M = M_raw.transpose(-2, -3)
-        M = self.layernormM(M)
-
-        M = self.attention(M)
-
-        M = M.transpose(-2, -3)
-        return M_raw + M
-
-
-class MSAStack(nn.Module):
-
-    def __init__(self, d_node, d_pair, p_drop=0.15):
-        super(MSAStack, self).__init__()
-
-        self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node,
-                                                                       d_pair=d_pair,
-                                                                       p_drop=p_drop)
-
-        self.MSAColumnAttention = MSAColumnAttention(d_node=d_node)
-        self.MSATransition = Transition(d=d_node)
-
-    def forward(self, node, pair):
-        node = self.MSARowAttentionWithPairBias(node, pair)
-        node = self.MSAColumnAttention(node)
-        node = self.MSATransition(node)
-
-        return node
diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py
deleted file mode 100755
index 611b7b0fe777..000000000000
--- a/evoformer_openfold/ops.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from torch.nn import LayerNorm
-
-from .initializer import glorot_uniform_af
-from .kernel import bias_sigmod_ele
-
-
-class DropoutRowwise(nn.Module):
-
-    def __init__(self, p):
-        super(DropoutRowwise, self).__init__()
-        self.p = p
-        self.dropout = nn.Dropout(p=p)
-
-    def forward(self, x):
-        dropout_mask = torch.ones_like(x[:, 0:1, :, :])
-        dropout_mask = self.dropout(dropout_mask)
-        return dropout_mask * x
-
-
-class DropoutColumnwise(nn.Module):
-
-    def __init__(self, p):
-        super(DropoutColumnwise, self).__init__()
-        self.p = p
-        self.dropout = nn.Dropout(p=p)
-
-    def forward(self, x):
-        dropout_mask = torch.ones_like(x[:, :, 0:1, :])
-        dropout_mask = self.dropout(dropout_mask)
-        return dropout_mask * x
-
-
-class Transition(nn.Module):
-
-    def __init__(self, d, n=4):
-        super(Transition, self).__init__()
-        self.norm = LayerNorm(d)
-        self.linear1 = Linear(d, n * d, initializer='relu')
-        self.linear2 = Linear(n * d, d, initializer='zeros')
-
-    def forward(self, src):
-        x = self.norm(src)
-        x = self.linear2(F.relu(self.linear1(x)))
-        return src + x
-
-
-class OutProductMean(nn.Module):
-
-    def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32):
-        super(OutProductMean, self).__init__()
-
-        self.layernormM = LayerNorm(n_feat)
-        self.linear_a = Linear(n_feat, n_feat_proj)
-        self.linear_b = Linear(n_feat, n_feat_proj)
-
-        self.o_linear = Linear(n_feat_proj * n_feat_proj,
-                               n_feat_out,
-                               initializer='zero',
-                               use_bias=True)
-
-    def forward(self, M):
-        M = self.layernormM(M)
-        left_act = self.linear_a(M)
-        right_act = self.linear_b(M)
-
-        O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
-        # O = rearrange(O, 'b i j d e -> b i j (d e)')
-        O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1)
-        Z = self.o_linear(O)
-
-        return Z
-
-
-class Linear(nn.Linear):
-    """
-    A Linear layer with built-in nonstandard initializations. Called just
-    like torch.nn.Linear.
-    Implements the initializers in 1.11.4, plus some additional ones found
-    in the code.
-    """
-
-    def __init__(
-        self,
-        feature_in: int,
-        feature_out: int,
-        initializer: str = 'linear',
-        use_bias: bool = True,
-        bias_init: float = 0.,
-    ):
-        super(Linear, self).__init__(feature_in, feature_out, bias=use_bias)
-
-        self.use_bias = use_bias
-        if initializer == 'linear':
-            glorot_uniform_af(self.weight, gain=1.0)
-        elif initializer == 'relu':
-            glorot_uniform_af(self.weight, gain=2.0)
-        elif initializer == 'zeros':
-            nn.init.zeros_(self.weight)
-        if self.use_bias:
-            with torch.no_grad():
-                self.bias.fill_(bias_init)
-
-
-class SelfAttention(nn.Module):
-    """
-    Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors
-    """
-
-    def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False):
-        super(SelfAttention, self).__init__()
-        self.qkv_dim = qkv_dim
-        self.c = c
-        self.n_head = n_head
-        self.out_dim = out_dim
-        self.gating = gating
-        self.last_bias_fuse = last_bias_fuse
-
-        self.scaling = self.c**(-0.5)
-
-        # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear')
-        self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-        self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-        self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-
-        if gating:
-            self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,)))
-            self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False)
-
-        self.o_linear = Linear(n_head * c,
-                               out_dim,
-                               initializer='zero',
-                               use_bias=(not last_bias_fuse))
-
-    def forward(self, in_data, nonbatched_bias=None):
-        """
-        :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim]
-        :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv]
-        :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv]
-        """
-
-        # qkv = self.to_qkv(in_data).chunk(3, dim=-1)
-        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv)
-
-        q = self.to_q(in_data)
-        k = self.to_k(in_data)
-        v = self.to_v(in_data)
-
-        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head),
-        #               [q, k, v])
-        q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4),
-                      [q, k, v])
-        
-        q = q * self.scaling
-
-        logits = torch.matmul(q, k.transpose(-1, -2))
-
-        if nonbatched_bias is not None:
-            logits += nonbatched_bias.unsqueeze(1)
-        weights = torch.softmax(logits, dim=-1)
-        # weights = softmax(logits)
-
-        weighted_avg = torch.matmul(weights, v)
-        # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)')
-        weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4)
-        weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1)
-
-        if self.gating:
-            gate_values = self.gating_linear(in_data)
-            weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg)
-
-        output = self.o_linear(weighted_avg)
-        return output
diff --git a/evoformer_openfold/triangle.py b/evoformer_openfold/triangle.py
deleted file mode 100644
index f479469c3836..000000000000
--- a/evoformer_openfold/triangle.py
+++ /dev/null
@@ -1,192 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-from torch.nn import LayerNorm
-
-from .kernel import bias_dropout_add, bias_ele_dropout_residual
-from .ops import Linear, SelfAttention, Transition
-
-
-def permute_final_dims(tensor, inds):
-    zero_index = -1 * len(inds)
-    first_inds = list(range(len(tensor.shape[:zero_index])))
-    return tensor.permute(first_inds + [zero_index + i for i in inds])
-
-
-class TriangleMultiplicationOutgoing(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=128):
-        super(TriangleMultiplicationOutgoing, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-
-        self.layernorm1 = LayerNorm(d_pair)
-        self.left_projection = Linear(d_pair, c)
-        self.right_projection = Linear(d_pair, c)
-        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-
-        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
-        self.layernorm2 = LayerNorm(c)
-        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
-        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-        self.p_drop = p_drop
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        left_proj_act = self.left_projection(Z)
-        right_proj_act = self.right_projection(Z)
-
-        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
-        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
-
-        g = torch.sigmoid(self.output_gate(Z))
-        # p = torch.matmul(
-        #     permute_final_dims(left_proj_act, (2, 0, 1)),
-        #     permute_final_dims(right_proj_act, (2, 1, 0)),
-        # )
-        # ab = permute_final_dims(p, (1, 2, 0))
-
-        ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act)
-        ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_ele_dropout_residual(ab,
-                                         self.output_bias,
-                                         g,
-                                         dropout_mask,
-                                         Z_raw,
-                                         prob=self.p_drop)
-
-
-class TriangleMultiplicationIncoming(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=128):
-        super(TriangleMultiplicationIncoming, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-
-        self.layernorm1 = LayerNorm(d_pair)
-        self.left_projection = Linear(d_pair, c)
-        self.right_projection = Linear(d_pair, c)
-        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-
-        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
-        self.layernorm2 = LayerNorm(c)
-        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
-        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-        self.p_drop = p_drop
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        left_proj_act = self.left_projection(Z)
-        right_proj_act = self.right_projection(Z)
-
-        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
-        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
-
-        g = torch.sigmoid(self.output_gate(Z))
-        # p = torch.matmul(
-        #     permute_final_dims(left_proj_act, (2, 1, 0)),
-        #     permute_final_dims(right_proj_act, (2, 0, 1)),
-        # )
-        # ab = permute_final_dims(p, (1, 2, 0))
-
-        ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act)
-        ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_ele_dropout_residual(ab,
-                                         self.output_bias,
-                                         g,
-                                         dropout_mask,
-                                         Z_raw,
-                                         prob=self.p_drop)
-
-
-class TriangleAttentionStartingNode(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=32, n_head=4):
-        super(TriangleAttentionStartingNode, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernorm1 = LayerNorm(d_pair)
-        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
-        self.attention = SelfAttention(qkv_dim=d_pair,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_pair,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
-
-        Z = self.attention(Z, b)
-
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
-
-
-class TriangleAttentionEndingNode(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=32, n_head=4):
-        super(TriangleAttentionEndingNode, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernorm1 = LayerNorm(d_pair)
-        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
-        self.attention = SelfAttention(qkv_dim=d_pair,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_pair,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-    def forward(self, Z_raw):
-        Z = Z_raw.transpose(-2, -3)
-        Z = self.layernorm1(Z)
-        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
-
-        Z = self.attention(Z, b)
-
-        Z = Z.transpose(-2, -3)
-        dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype)
-        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
-
-
-class PairStack(nn.Module):
-
-    def __init__(self, d_pair, p_drop=0.25):
-        super(PairStack, self).__init__()
-
-        self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop)
-        self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop)
-        self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop)
-        self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop)
-        self.PairTransition = Transition(d=d_pair)
-
-    def forward(self, pair):
-        pair = self.TriangleMultiplicationOutgoing(pair)
-        pair = self.TriangleMultiplicationIncoming(pair)
-        pair = self.TriangleAttentionStartingNode(pair)
-        pair = self.TriangleAttentionEndingNode(pair)
-        pair = self.PairTransition(pair)
-        return pair
diff --git a/openfold/evoformer.py b/openfold/evoformer.py
index 7fbcd8a76b4d..ffd4c982987a 100644
--- a/openfold/evoformer.py
+++ b/openfold/evoformer.py
@@ -284,104 +284,6 @@ def forward(self,
         return m, z
 
 
-class ExtraMSABlock(nn.Module):
-    """ 
-        Almost identical to the standard EvoformerBlock, except in that the
-        ExtraMSABlock uses GlobalAttention for MSA column attention and
-        requires more fine-grained control over checkpointing. Separated from
-        its twin to preserve the TorchScript-ability of the latter.
-    """
-    def __init__(self,
-        c_m: int,
-        c_z: int,
-        c_hidden_msa_att: int,
-        c_hidden_opm: int,
-        c_hidden_mul: int,
-        c_hidden_pair_att: int,
-        no_heads_msa: int,
-        no_heads_pair: int,
-        transition_n: int,
-        msa_dropout: float,
-        pair_dropout: float,
-        inf: float,
-        eps: float,
-        ckpt: bool,
-        is_multimer: bool,
-    ):
-        super(ExtraMSABlock, self).__init__()
-        
-        self.ckpt = ckpt
-
-        self.msa_att_row = MSARowAttentionWithPairBias(
-            c_m=c_m,
-            c_z=c_z,
-            c_hidden=c_hidden_msa_att,
-            no_heads=no_heads_msa,
-            inf=inf,
-        )
-
-        self.msa_att_col = MSAColumnGlobalAttention(
-            c_in=c_m,
-            c_hidden=c_hidden_msa_att,
-            no_heads=no_heads_msa,
-            inf=inf,
-            eps=eps,
-        )
-
-        self.msa_dropout_layer = DropoutRowwise(msa_dropout)
-
-        self.core = EvoformerBlockCore(
-            c_m=c_m,
-            c_z=c_z,
-            c_hidden_opm=c_hidden_opm,
-            c_hidden_mul=c_hidden_mul,
-            c_hidden_pair_att=c_hidden_pair_att,
-            no_heads_msa=no_heads_msa,
-            no_heads_pair=no_heads_pair,
-            transition_n=transition_n,
-            pair_dropout=pair_dropout,
-            inf=inf,
-            eps=eps,
-        )
-        self.is_multimer = is_multimer
-
-    def forward(self,
-        m: torch.Tensor,
-        z: torch.Tensor,
-        msa_mask: torch.Tensor,
-        pair_mask: torch.Tensor,
-        chunk_size: Optional[int] = None,
-        _chunk_logits: Optional[int] = 1024,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        m = m + self.msa_dropout_layer(
-            self.msa_att_row(
-                m.clone(), 
-                z=z.clone(), 
-                mask=msa_mask, 
-                chunk_size=chunk_size,
-                _chunk_logits=_chunk_logits if torch.is_grad_enabled() else None,
-                _checkpoint_chunks=
-                    self.ckpt if torch.is_grad_enabled() else False,
-            )
-        )
-
-        def fn(m, z):
-            m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size)
-            m, z = self.core(
-                m, z, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size
-            )
-            
-            return m, z
-
-        if(torch.is_grad_enabled() and self.ckpt):
-            checkpoint_fn = get_checkpoint_fn()
-            m, z = checkpoint_fn(fn, m, z)
-        else:
-            m, z = fn(m, z)
-
-        return m, z
-
-
 class EvoformerStack(nn.Module):
     """
     Main Evoformer trunk.
@@ -527,99 +429,3 @@ def block_with_cache_clear(block, *args):
         s = self.linear(m[..., 0, :, :])
         
         return m, z, s
-
-
-class ExtraMSAStack(nn.Module):
-    """
-    Implements Algorithm 18.
-    """
-
-    def __init__(self,
-        c_m: int,
-        c_z: int,
-        c_hidden_msa_att: int,
-        c_hidden_opm: int,
-        c_hidden_mul: int,
-        c_hidden_pair_att: int,
-        no_heads_msa: int,
-        no_heads_pair: int,
-        no_blocks: int,
-        transition_n: int,
-        msa_dropout: float,
-        pair_dropout: float,
-        inf: float,
-        eps: float,
-        ckpt: bool,
-        clear_cache_between_blocks: bool = False,
-        is_multimer: bool = False,
-        **kwargs,
-    ):
-        super(ExtraMSAStack, self).__init__()
-        
-        self.clear_cache_between_blocks = clear_cache_between_blocks
-        self.blocks = nn.ModuleList()
-        for _ in range(no_blocks):
-            block = ExtraMSABlock(
-                c_m=c_m,
-                c_z=c_z,
-                c_hidden_msa_att=c_hidden_msa_att,
-                c_hidden_opm=c_hidden_opm,
-                c_hidden_mul=c_hidden_mul,
-                c_hidden_pair_att=c_hidden_pair_att,
-                no_heads_msa=no_heads_msa,
-                no_heads_pair=no_heads_pair,
-                transition_n=transition_n,
-                msa_dropout=msa_dropout,
-                pair_dropout=pair_dropout,
-                inf=inf,
-                eps=eps,
-                ckpt=ckpt,
-                is_multimer=is_multimer,
-            )
-            self.blocks.append(block)
-
-    def forward(self,
-        m: torch.Tensor,
-        z: torch.Tensor,
-        chunk_size: int,
-        msa_mask: Optional[torch.Tensor] = None,
-        pair_mask: Optional[torch.Tensor] = None,
-        _mask_trans: bool = True,
-    ) -> torch.Tensor:
-        """
-        Args:
-            m:
-                [*, N_extra, N_res, C_m] extra MSA embedding
-            z:
-                [*, N_res, N_res, C_z] pair embedding
-            msa_mask:
-                Optional [*, N_extra, N_res] MSA mask
-            pair_mask:
-                Optional [*, N_res, N_res] pair mask
-        Returns:
-            [*, N_res, N_res, C_z] pair update
-        """ 
-        #checkpoint_fn = get_checkpoint_fn()
-        #blocks = [
-        #    partial(b, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size, _chunk_logits=None) for b in self.blocks
-        #]
-
-        #def dodo(b, *args):
-        #    torch.cuda.empty_cache()
-        #    return b(*args)
-
-        #blocks = [partial(dodo, b) for b in blocks]
-
-        #for b in blocks:
-        #    if(torch.is_grad_enabled()):
-        #        m, z = checkpoint_fn(b, *(m, z))
-        #    else:
-        #        m, z = b(m, z)
-
-        for b in self.blocks:
-            m, z = b(m, z, msa_mask, pair_mask, chunk_size=chunk_size)
-
-            if(self.clear_cache_between_blocks):
-                torch.cuda.empty_cache()
-
-        return z
\ No newline at end of file

From f5515e9978564bddc0ff97c06c7a6933668e7cef Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 29 Dec 2022 16:55:47 +0800
Subject: [PATCH 073/503] use max_mem to control strategy

---
 chunk_codegen.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 033db50dbccb..1c8be65d490a 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1398,14 +1398,18 @@ def estimate_chunk_inference_mem(
 
 class ChunkSelector(object):
     def __init__(
-        self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge, max_memory=None
+        self,
+        index_tracer: IndexTracer,
+        memory_estimator: MemoryEstimator,
+        max_memory=None,
     ):
         self.index_tracer = index_tracer
         self.memory_estimator = memory_estimator
-        assert stratge in ["min_memory", "fit_memory"]
-        assert (stratge == "fit_memory" and max_memory is not None) or stratge != "fit_memory"
-        self.stratge = stratge
-        self.max_memory = max_memory  # MB
+        if max_memory is not None:
+            self.stratge = "fit_memory"
+            self.max_memory = max_memory  # MB
+        else:
+            self.stratge = "min_memory"
 
     def _select_best_chunk_region(
         self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
@@ -1538,6 +1542,8 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
             possible_chunk_regions.remove(i)
             max_region_range = 0
             best_region = None
+        if best_region is not None:
+            best_region["chunk_size"] = 2
         return best_region
 
     def _is_legal_region(self, cur_chunk_info, chunk_infos):
@@ -1563,7 +1569,7 @@ def __init__(self, gm, max_memory=None) -> None:
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
         self.chunk_selector = ChunkSelector(
-            self.index_tracer, self.memory_estimator, stratge="fit_memory", max_memory=max_memory
+            self.index_tracer, self.memory_estimator, max_memory=max_memory
         )
 
     def _find_peak_node(self, mem_peak):
@@ -2233,7 +2239,7 @@ def emit_node(node: Node, body):
                 delete_unused_values,
                 self.meta_node,
                 self.meta_graph,
-                self.max_memory
+                self.max_memory,
             )
 
             if len(body) == 0:

From e5a5fbb8a94313722542b72f601b8433eef1e5dc Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sat, 31 Dec 2022 01:00:06 +0800
Subject: [PATCH 074/503] update source add

---
 chunk_codegen.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 1c8be65d490a..de58a61b943b 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -133,24 +133,28 @@ def _inherit_all_computation(self, node_from, node_to):
 
     def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False):
         node_from_dim = self._transform_index(node_from, node_from_dim)
-        node_from_trace = self._find_trace_from_node(node_from)
+        node_from_trace_source = self._find_source_trace_from_node(node_from)
         node_to_dim = self._transform_index(node_to, node_to_dim)
-        node_to_trace = self._find_trace_from_node(node_to)
+        node_to_trace_source = self._find_source_trace_from_node(node_to)
         node_from_idx = _find_idx_by_name(node_from.name, self.node_list)
         if init:
-            node_to_trace["source"][node_to_dim] = {}
+            node_to_trace_source[node_to_dim] = {}
         # add dim to cur new source
-        if node_from_idx not in node_to_trace["source"][node_to_dim]:
-            node_to_trace["source"][node_to_dim][node_from_idx] = [node_from_dim]
+        if node_from_idx not in node_to_trace_source[node_to_dim]:
+            node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim]
         else:
-            if node_from_dim not in node_to_trace["source"][node_to_dim][node_from_idx]:
-                node_to_trace["source"][node_to_dim][node_from_idx].append(
+            if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]:
+                node_to_trace_source[node_to_dim][node_from_idx].append(
                     node_from_dim
                 )
         # update inputs source
-        node_to_trace["source"][node_to_dim].update(
-            node_from_trace["source"][node_from_dim]
-        )
+        for node_idx, node_dim in node_from_trace_source[node_from_dim].items():
+            if node_idx not in node_to_trace_source[node_to_dim]:
+                node_to_trace_source[node_to_dim][node_idx] = copy.deepcopy(node_dim)
+            else:
+                for d in node_dim:
+                    if d not in node_to_trace_source[node_to_dim][node_idx]:
+                        node_to_trace_source[node_to_dim][node_idx].append(d)
 
     def _mark_computation_from_node(self, node_from, node_to, exclude=None):
         if exclude == None:
@@ -1761,9 +1765,9 @@ def search_region(self):
             )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
-        # self.memory_estimator.estimate_chunk_inference_mem(
-        #     self.index_tracer.node_list, chunk_infos, print_mem=True
-        # )
+        self.memory_estimator.estimate_chunk_inference_mem(
+            self.index_tracer.node_list, chunk_infos, print_mem=True
+        )
         return chunk_infos
 
 
From 966e4ea0cbf1cd17696aa90b6b9bd4a6999cfba4 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sat, 31 Dec 2022 02:20:07 +0800
Subject: [PATCH 075/503] add reorder in mem estimator

---
 chunk_codegen.py | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index de58a61b943b..e20d151da1fb 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1040,11 +1040,13 @@ def _reorder_chunk_info(self, chunk_info, reorder_map):
             chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]),
             chunk_info["region"][1],
         )
+        new_inputs_dim = []
         for idx, input_dim in enumerate(chunk_info["inputs_dim"]):
             new_input_dim = {}
             for k, v in input_dim.items():
                 new_input_dim[reorder_map[k]] = v
-            chunk_info["inputs_dim"][idx] = new_input_dim
+            new_inputs_dim.append(new_input_dim)
+        chunk_info["inputs_dim"] = new_inputs_dim
         return chunk_info
 
     def _update_all_reorder_map(self, reorder_map):
@@ -1095,11 +1097,24 @@ def reorder_node_list(self, node_list):
         for old_idx, new_idx in self.all_reorder_map.items():
             new_node_list[new_idx] = node_list[old_idx]
         return new_node_list
+    
+    def tmp_reorder(self, node_list, chunk_info):
+        if len(chunk_info["args"]["prepose_nodes"]) == 0:
+            return node_list, chunk_info
+        reorder_map = self._get_reorder_map(chunk_info)
+        
+        # new tmp node list
+        new_node_list = [None for _ in range(len(node_list))]
+        for old_idx, new_idx in reorder_map.items():
+            new_node_list[new_idx] = node_list[old_idx]
+    
+        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
+        return new_node_list, chunk_info
 
 
 class MemoryEstimator(object):
     def __init__(self, index_tracer: IndexTracer) -> None:
-        self.index_tracer = index_tracer
+        pass
 
     def _get_meta_node_size(self, x):
         x = x.meta["tensor_meta"]
@@ -1453,9 +1468,11 @@ def _select_fit_memory_chunk_region(
         # get mem for chunk region
         regions_dict = []
         for region in possible_chunk_regions:
-            cur_chunk_infos = chunk_infos + [region]
+            cur_region = region.copy()
+            cur_node_list, cur_region = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_region)
+            cur_chunk_infos = chunk_infos + [cur_region]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, cur_chunk_infos
+                cur_node_list, cur_chunk_infos
             )[0]
             cur_chunk_region_peak = cur_mem_peak[
                 max_chunk_region[0] : max_chunk_region[1] + 1
@@ -1492,9 +1509,11 @@ def _get_fit_chunk_size(self, chunk_info, chunk_infos):
         while cur_chunk_max_mem < self.max_memory:
             chunk_size *= 2
             chunk_info["chunk_size"] = chunk_size
-            cur_chunk_infos = chunk_infos + [chunk_info]
+            cur_chunk_info = chunk_info.copy()
+            cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info)
+            cur_chunk_infos = chunk_infos + [cur_chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, cur_chunk_infos
+                cur_node_list, cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
                 cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
@@ -1511,11 +1530,13 @@ def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos):
         else:
             gap = 1
         while r >= l + gap:
-            mid = int(l + (r - l) / 2)
+            mid = int((l + r) / 2 + 0.5)
             chunk_info["chunk_size"] = mid
-            cur_chunk_infos = chunk_infos + [chunk_info]
+            cur_chunk_info = chunk_info.copy()
+            cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info)
+            cur_chunk_infos = chunk_infos + [cur_chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, cur_chunk_infos
+                cur_node_list, cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
                 cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
@@ -1529,7 +1550,7 @@ def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos):
     def _get_compute_node_num(self, start, end):
         count = 0
         for i in self.index_tracer.node_list[start : end + 1]:
-            if _is_non_compute_node(i):
+            if not _is_non_compute_node(i):
                 count += 1
         return count
 
@@ -1547,7 +1568,7 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
             max_region_range = 0
             best_region = None
         if best_region is not None:
-            best_region["chunk_size"] = 2
+            best_region["chunk_size"] = 1
         return best_region
 
     def _is_legal_region(self, cur_chunk_info, chunk_infos):

From 80efd70c725b00c236b80b68393c0d13ec457b0b Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sat, 31 Dec 2022 13:44:46 +0800
Subject: [PATCH 076/503] improve reorder efficiency

---
 chunk_codegen.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index e20d151da1fb..7c334c617c7b 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1486,6 +1486,8 @@ def _select_fit_memory_chunk_region(
                         "chunk_len": self._get_compute_node_num(
                             region["region"][0], region["region"][1]
                         ),
+                        "reorder_chunk_info": cur_region,
+                        "reorder_node_list": cur_node_list
                     }
                 )
         # no region found
@@ -1495,48 +1497,47 @@ def _select_fit_memory_chunk_region(
         # select the min chunk len
         chunk_len = [i["chunk_len"] for i in regions_dict]
         best_region_idx = chunk_len.index(min(chunk_len))
-        best_region = regions_dict[best_region_idx]["chunk_info"]
+        best_region = regions_dict[best_region_idx]
 
         # get max chunk size
         best_region = self._get_fit_chunk_size(best_region, chunk_infos)
         return best_region
 
-    def _get_fit_chunk_size(self, chunk_info, chunk_infos):
+    def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
         chunk_size = 1
-        chunk_info["chunk_size"] = chunk_size
+        reorder_chunk_info = chunk_region_dict['reorder_chunk_info']
+        reorder_chunk_info["chunk_size"] = chunk_size
         cur_chunk_max_mem = 0
         # search a region
         while cur_chunk_max_mem < self.max_memory:
             chunk_size *= 2
-            chunk_info["chunk_size"] = chunk_size
-            cur_chunk_info = chunk_info.copy()
-            cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info)
-            cur_chunk_infos = chunk_infos + [cur_chunk_info]
+            reorder_chunk_info["chunk_size"] = chunk_size
+            cur_chunk_infos = chunk_infos + [reorder_chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                cur_node_list, cur_chunk_infos
+                chunk_region_dict['reorder_node_list'], cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
-                cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
+                cur_mem_peak[reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1] + 1]
             )
         # search exact size
+        chunk_info = chunk_region_dict["chunk_info"]
         chunk_info["chunk_size"] = self._chunk_size_binary_search(
-            chunk_size // 2, chunk_size, chunk_info, chunk_infos
+            chunk_size // 2, chunk_size, chunk_region_dict, chunk_infos
         )
         return chunk_info
 
-    def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos):
+    def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos):
         if l >= 16:
             gap = 4
         else:
             gap = 1
+        chunk_info = chunk_region_dict['reorder_chunk_info']
         while r >= l + gap:
             mid = int((l + r) / 2 + 0.5)
             chunk_info["chunk_size"] = mid
-            cur_chunk_info = chunk_info.copy()
-            cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info)
-            cur_chunk_infos = chunk_infos + [cur_chunk_info]
+            cur_chunk_infos = chunk_infos + [chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                cur_node_list, cur_chunk_infos
+                chunk_region_dict['reorder_node_list'], cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
                 cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
@@ -1904,7 +1905,7 @@ def _find_idx_by_name(name, nodes_list):
 
 
 def _replace_name(context, name_from, name_to):
-    patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ",")]
+    patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")]
     for p in patterns:
         source = p[0] + name_from + p[1]
         target = p[0] + name_to + p[1]

From 5f24f4fd55956904d024d8835029ffcd0cc203a5 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Sat, 31 Dec 2022 16:29:43 +0800
Subject: [PATCH 077/503] support ones_like, add prompt if fit mode search fails

---
 chunk_codegen.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 7c334c617c7b..6f8ff2b23ff0 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1406,9 +1406,9 @@ def estimate_chunk_inference_mem(
             # self._print_mem_log(act_memory_peak_log, node_list, "peak")
             # self._print_mem_log(act_memory_after_node_log, node_list, "after")
             self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
-            self._print_compute_op_mem_log(
-                act_memory_after_node_log, node_list, "after"
-            )
+            # self._print_compute_op_mem_log(
+            #     act_memory_after_node_log, node_list, "after"
+            # )
 
         # param_memory = parameter_size(gm)
         # all_memory = act_memory + param_memory
@@ -1465,6 +1465,9 @@ def _select_fit_memory_chunk_region(
             if i in possible_chunk_regions:
                 possible_chunk_regions.remove(i)
 
+        if len(possible_chunk_regions) == 0:
+            return None
+
         # get mem for chunk region
         regions_dict = []
         for region in possible_chunk_regions:
@@ -1492,7 +1495,7 @@ def _select_fit_memory_chunk_region(
                 )
         # no region found
         if len(regions_dict) == 0:
-            return None
+            raise RuntimeError("Search failed. Try a larger memory threshold.")
 
         # select the min chunk len
         chunk_len = [i["chunk_len"] for i in regions_dict]
@@ -1995,6 +1998,14 @@ def emit_code_with_chunk(
                         body[-1] = _replace_name(
                             body[-1], input_node.name, input_node.name + chunk_slice
                         )
+            # ones like
+            if "ones_like" in node.name:
+                chunk_slice = _gen_chunk_slice_dim(
+                    chunk_search[region_idx]["node_chunk_dim"][chunk_region_search.index_tracer.node_list[node_idx]]["chunk_dim"], "chunk_idx", _get_node_shape(node)
+                )
+                body[-1] = _replace_name(
+                    body[-1], node.args[0].name, node.args[0].name + chunk_slice
+                )
             body[-1] = _replace_reshape_size(
                 body[-1], node.name, chunk_search[region_idx]["reshape_size"]
             )

From 7fd3b45af21345cff9334682e277d7669c730814 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 2 Jan 2023 00:04:47 +0800
Subject: [PATCH 078/503] fix a bug in ones like, don't gen chunk if dim size is
 1

---
 autochunk_benchmark.py |  4 ++--
 chunk_codegen.py       | 41 +++++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
index 679016438c59..3b48d7e461fe 100644
--- a/autochunk_benchmark.py
+++ b/autochunk_benchmark.py
@@ -16,9 +16,9 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=N
     torch.cuda.reset_peak_memory_stats()
     now_mem = torch.cuda.memory_allocated() / 1024**2
 
-    loop = 16
+    loop = 3
     with torch.no_grad():
-        for _ in range(loop // 4):
+        for _ in range(loop // 2 + 1):
             if chunk_size:
                 model(node, pair, chunk_size)
             else:
diff --git a/chunk_codegen.py b/chunk_codegen.py
index 6f8ff2b23ff0..6f21f26f37e1 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -144,9 +144,7 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False
             node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim]
         else:
             if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]:
-                node_to_trace_source[node_to_dim][node_from_idx].append(
-                    node_from_dim
-                )
+                node_to_trace_source[node_to_dim][node_from_idx].append(node_from_dim)
         # update inputs source
         for node_idx, node_dim in node_from_trace_source[node_from_dim].items():
             if node_idx not in node_to_trace_source[node_to_dim]:
@@ -1097,17 +1095,17 @@ def reorder_node_list(self, node_list):
         for old_idx, new_idx in self.all_reorder_map.items():
             new_node_list[new_idx] = node_list[old_idx]
         return new_node_list
-    
+
     def tmp_reorder(self, node_list, chunk_info):
         if len(chunk_info["args"]["prepose_nodes"]) == 0:
             return node_list, chunk_info
         reorder_map = self._get_reorder_map(chunk_info)
-        
+
         # new tmp node list
         new_node_list = [None for _ in range(len(node_list))]
         for old_idx, new_idx in reorder_map.items():
             new_node_list[new_idx] = node_list[old_idx]
-    
+
         chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
         return new_node_list, chunk_info
 
@@ -1472,7 +1470,9 @@ def _select_fit_memory_chunk_region(
         regions_dict = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
-            cur_node_list, cur_region = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_region)
+            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
+                self.index_tracer.node_list, cur_region
+            )
             cur_chunk_infos = chunk_infos + [cur_region]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
                 cur_node_list, cur_chunk_infos
@@ -1490,7 +1490,7 @@ def _select_fit_memory_chunk_region(
                             region["region"][0], region["region"][1]
                         ),
                         "reorder_chunk_info": cur_region,
-                        "reorder_node_list": cur_node_list
+                        "reorder_node_list": cur_node_list,
                     }
                 )
         # no region found
@@ -1508,7 +1508,7 @@ def _select_fit_memory_chunk_region(
 
     def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
         chunk_size = 1
-        reorder_chunk_info = chunk_region_dict['reorder_chunk_info']
+        reorder_chunk_info = chunk_region_dict["reorder_chunk_info"]
         reorder_chunk_info["chunk_size"] = chunk_size
         cur_chunk_max_mem = 0
         # search a region
@@ -1517,10 +1517,13 @@ def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
             reorder_chunk_info["chunk_size"] = chunk_size
             cur_chunk_infos = chunk_infos + [reorder_chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                chunk_region_dict['reorder_node_list'], cur_chunk_infos
+                chunk_region_dict["reorder_node_list"], cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
-                cur_mem_peak[reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1] + 1]
+                cur_mem_peak[
+                    reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1]
+                    + 1
+                ]
             )
         # search exact size
         chunk_info = chunk_region_dict["chunk_info"]
@@ -1534,13 +1537,13 @@ def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos):
             gap = 4
         else:
             gap = 1
-        chunk_info = chunk_region_dict['reorder_chunk_info']
+        chunk_info = chunk_region_dict["reorder_chunk_info"]
         while r >= l + gap:
             mid = int((l + r) / 2 + 0.5)
             chunk_info["chunk_size"] = mid
             cur_chunk_infos = chunk_infos + [chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                chunk_region_dict['reorder_node_list'], cur_chunk_infos
+                chunk_region_dict["reorder_node_list"], cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
                 cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
@@ -2000,8 +2003,18 @@ def emit_code_with_chunk(
                         )
             # ones like
             if "ones_like" in node.name:
+                chunk_dim = chunk_search[region_idx]["node_chunk_dim"][
+                    chunk_region_search.index_tracer.node_list[node_idx]
+                ]["chunk_dim"]
+                if (
+                    _get_node_shape(
+                        chunk_region_search.index_tracer.node_list[node_idx]
+                    )[chunk_dim]
+                    == 1
+                ):
+                    continue
                 chunk_slice = _gen_chunk_slice_dim(
-                    chunk_search[region_idx]["node_chunk_dim"][chunk_region_search.index_tracer.node_list[node_idx]]["chunk_dim"], "chunk_idx", _get_node_shape(node)
+                    chunk_dim, "chunk_idx", _get_node_shape(node)
                 )
                 body[-1] = _replace_name(
                     body[-1], node.args[0].name, node.args[0].name + chunk_slice

From 9c5e028a62b003136d2402b99b728eaefcc528cd Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 2 Jan 2023 00:27:11 +0800
Subject: [PATCH 079/503] fix bug again

---
 chunk_codegen.py | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 6f21f26f37e1..21ecc343a959 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -2003,22 +2003,25 @@ def emit_code_with_chunk(
                         )
             # ones like
             if "ones_like" in node.name:
-                chunk_dim = chunk_search[region_idx]["node_chunk_dim"][
-                    chunk_region_search.index_tracer.node_list[node_idx]
-                ]["chunk_dim"]
-                if (
-                    _get_node_shape(
-                        chunk_region_search.index_tracer.node_list[node_idx]
-                    )[chunk_dim]
-                    == 1
-                ):
-                    continue
-                chunk_slice = _gen_chunk_slice_dim(
-                    chunk_dim, "chunk_idx", _get_node_shape(node)
-                )
-                body[-1] = _replace_name(
-                    body[-1], node.args[0].name, node.args[0].name + chunk_slice
-                )
+                meta_node = chunk_region_search.index_tracer.node_list[node_idx]
+                chunk_dim = chunk_search[region_idx]["node_chunk_dim"][meta_node][
+                    "chunk_dim"
+                ]
+                if _get_node_shape(meta_node)[chunk_dim] != 1:
+                    source_node = meta_node.args[0].args[0]
+                    if (
+                        source_node not in chunk_search[region_idx]["node_chunk_dim"]
+                        or chunk_search[region_idx]["node_chunk_dim"][source_node][
+                            "chunk_dim"
+                        ]
+                        is None
+                    ):
+                        chunk_slice = _gen_chunk_slice_dim(
+                            chunk_dim, "chunk_idx", _get_node_shape(node)
+                        )
+                        body[-1] = _replace_name(
+                            body[-1], node.args[0].name, node.args[0].name + chunk_slice
+                        )
             body[-1] = _replace_reshape_size(
                 body[-1], node.name, chunk_search[region_idx]["reshape_size"]
             )

From 55cb713f36e8080313225577dde97e4d35e18108 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 5 Jan 2023 11:29:22 +0800
Subject: [PATCH 080/503] update min memory strategy, reduce mem usage by 30%

---
 chunk_codegen.py | 65 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/chunk_codegen.py b/chunk_codegen.py
index 21ecc343a959..41fcb5a3c2f4 100644
--- a/chunk_codegen.py
+++ b/chunk_codegen.py
@@ -1433,7 +1433,11 @@ def _select_best_chunk_region(
     ):
         if self.stratge == "min_memory":
             best_region = self._select_min_memory_chunk_region(
-                possible_chunk_regions, chunk_infos
+                possible_chunk_regions,
+                chunk_infos,
+                peak_node,
+                max_chunk_region,
+                mem_peak,
             )
         elif self.stratge == "fit_memory":
             best_region = self._select_fit_memory_chunk_region(
@@ -1561,19 +1565,52 @@ def _get_compute_node_num(self, start, end):
                 count += 1
         return count
 
-    def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
-        max_region_range = 0
-        best_region = None
-        while len(possible_chunk_regions) > 0:
-            for i in possible_chunk_regions:
-                if i["region"][1] - i["region"][0] > max_region_range:
-                    best_region = i
-                    max_region_range = i["region"][1] - i["region"][0]
-            if self._is_legal_region(best_region, chunk_infos):
-                break
-            possible_chunk_regions.remove(i)
-            max_region_range = 0
-            best_region = None
+    def _select_min_memory_chunk_region(
+        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
+    ):
+        # remove illegal regions
+        illegal_regions = []
+        for i in possible_chunk_regions:
+            if not self._is_legal_region(i, chunk_infos):
+                illegal_regions.append(i)
+        for i in illegal_regions:
+            if i in possible_chunk_regions:
+                possible_chunk_regions.remove(i)
+
+        if len(possible_chunk_regions) == 0:
+            return None
+
+        # get mem for chunk region
+        regions_dict = []
+        for region in possible_chunk_regions:
+            cur_region = region.copy()
+            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
+                self.index_tracer.node_list, cur_region
+            )
+            cur_chunk_infos = chunk_infos + [cur_region]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                cur_node_list, cur_chunk_infos
+            )[0]
+            cur_chunk_region_peak = cur_mem_peak[
+                max_chunk_region[0] : max_chunk_region[1] + 1
+            ]
+            cur_chunk_region_max_peak = max(cur_chunk_region_peak)
+            regions_dict.append(
+                {
+                    "chunk_info": region,
+                    "chunk_max_mem": cur_chunk_region_max_peak,
+                    "chunk_len": self._get_compute_node_num(
+                        region["region"][0], region["region"][1]
+                    ),
+                    "reorder_chunk_info": cur_region,
+                    "reorder_node_list": cur_node_list,
+                }
+            )
+
+        # select the min mem
+        chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict]
+        best_region_idx = chunk_max_mem.index(min(chunk_max_mem))
+        best_region = regions_dict[best_region_idx]["chunk_info"]
         if best_region is not None:
             best_region["chunk_size"] = 1
         return best_region

From b5a3a4a65f1a3196faaaf0affe2c3d6ff8f7acb1 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <liuyuliang@luchentech.com>
Date: Thu, 5 Jan 2023 17:21:29 +0800
Subject: [PATCH 081/503] [device] find best logical mesh

---
 colossalai/device/alpha_beta_profiler.py      | 193 +++++++++++++++++-
 tests/test_device/test_extract_alpha_beta.py  |  39 ++++
 .../test_search_logical_device_mesh.py        |  36 ++++
 3 files changed, 265 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_device/test_extract_alpha_beta.py
 create mode 100644 tests/test_device/test_search_logical_device_mesh.py

diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py
index 324acacb8b4a..9c66cb85de5c 100644
--- a/colossalai/device/alpha_beta_profiler.py
+++ b/colossalai/device/alpha_beta_profiler.py
@@ -21,7 +21,7 @@ class AlphaBetaProfiler:
         # multi-process with multi-gpu in mpi style.
         >>> physical_devices = [0, 1, 4, 5]
         >>> ab_profiler = AlphaBetaProfiler(physical_devices)
-        >>> ab_dict = profiler.profile_ab()
+        >>> ab_dict = ab_profiler.alpha_beta_dict
         >>> print(ab_dict)
         {(0, 1): (1.9641406834125518e-05, 4.74049549614719e-12), (0, 4): (1.9506998360157013e-05, 6.97421973297474e-11), (0, 5): (2.293858677148819e-05, 7.129930361393644e-11),
          (1, 4): (1.9010603427886962e-05, 7.077968863788975e-11), (1, 5): (1.9807778298854827e-05, 6.928845708992215e-11), (4, 5): (1.8681809306144713e-05, 4.7522367291330524e-12),
@@ -31,13 +31,16 @@ class AlphaBetaProfiler:
 
     def __init__(self,
                  physical_devices: List[int],
+                 alpha_beta_dict: Dict[Tuple[int, int], Tuple[float, float]] = None,
                  ctype: str = 'a',
                  warmup: int = 5,
                  repeat: int = 25,
-                 latency_iters: int = 5):
+                 latency_iters: int = 5,
+                 homogeneous_tolerance: float = 0.1):
         '''
         Args:
             physical_devices: A list of device id, each element inside it is the global rank of that device.
+            alpha_beta_dict: A dict which maps a process group to alpha-beta value pairs.
             ctype: 'a' for all-reduce, 'b' for broadcast.
             warmup: Number of warmup iterations.
             repeat: Number of iterations to measure.
@@ -49,8 +52,13 @@ def __init__(self,
         self.warmup = warmup
         self.repeat = repeat
         self.latency_iters = latency_iters
+        self.homogeneous_tolerance = homogeneous_tolerance
         self.process_group_dict = None
         self._init_profiling()
+        if alpha_beta_dict is None:
+            self.alpha_beta_dict = self.profile_ab()
+        else:
+            self.alpha_beta_dict = alpha_beta_dict
 
     def _init_profiling(self):
         # Create process group list based on its global rank
@@ -139,7 +147,7 @@ def profile_latency(self, process_group, pg_handler):
 
         return latency
 
-    def profile_bandwidth(self, process_group, pg_handler, maxbytes):
+    def profile_bandwidth(self, process_group, pg_handler, maxbytes=(1 * GB)):
         '''
         This function is used to profile the bandwidth of the given process group.
 
@@ -159,6 +167,7 @@ def profile_ab(self):
         '''
         alpha_beta_dict: Dict[Tuple[int], Tuple[float]] = {}
         rank = dist.get_rank()
+        global_pg_handler = dist.new_group(self.physical_devices)
 
         def get_max_nbytes(process_group: Tuple[int], pg_handler: dist.ProcessGroup):
             assert rank in process_group
@@ -197,3 +206,181 @@ def get_max_nbytes(process_group: Tuple[int], pg_handler: dist.ProcessGroup):
         alpha_beta_dict.update(symmetry_ab_dict)
 
         return alpha_beta_dict
+
+    def search_best_logical_mesh(self):
+        '''
+        This method is used to search the best logical mesh for the given device list.
+
+        The best logical mesh is searched in following steps:
+            1. detect homogeneous device groups, we assume that the devices in the alpha_beta_dict
+                are homogeneous if the beta value is close enough.
+            2. Find the best homogeneous device group that contains all the physical devices. The best homogeneous
+                device group is the one with the lowest beta value among the groups that contain all the physical
+                devices. We require the group to contain all the physical devices because devices outside the
+                group would decrease the bandwidth of the group.
+            3. If the best homogeneous device group is found, we will construct the largest ring for each device
+                based on the best homogeneous device group, and the best logical mesh will be the union of all the
+                rings. Otherwise, the best logical mesh will be the balanced logical mesh, such as shape (2, 2) for
+                4 devices.
+
+        Returns:
+            best_logical_mesh: The best logical mesh for the given device list.
+
+        Usage:
+            >>> physical_devices = [0, 1, 2, 3]
+            >>> ab_profiler = AlphaBetaProfiler(physical_devices)
+            >>> best_logical_mesh = ab_profiler.search_best_logical_mesh()
+            >>> print(best_logical_mesh)
+            [[0, 1], [2, 3]]
+        '''
+
+        def _power_of_two(integer):
+            return integer & (integer - 1) == 0
+
+        def _detect_homogeneous_device(alpha_beta_dict):
+            '''
+            Group the process groups in alpha_beta_dict into buckets of similar beta values.
+
+            A process group joins the first existing bucket whose key base_beta satisfies
+                (1 - tol) * base_beta <= beta <= (1 + tol) * base_beta, with tol = self.homogeneous_tolerance;
+                otherwise it opens a new bucket keyed by its own beta. The bucket dict is returned.
+            '''
+            homogeneous_device_dict: Dict[float, List[Tuple[int]]] = {}
+            lower_ratio = 1 - self.homogeneous_tolerance
+            upper_ratio = 1 + self.homogeneous_tolerance
+
+            for process_group, (_, beta) in alpha_beta_dict.items():
+                # buckets are scanned in insertion order, so the first matching bucket wins.
+                match_beta = None
+                for beta_value in homogeneous_device_dict:
+                    if beta_value * lower_ratio <= beta <= beta_value * upper_ratio:
+                        match_beta = beta_value
+                        break
+
+                if match_beta is not None:
+                    homogeneous_device_dict[match_beta].append(process_group)
+                else:
+                    # no bucket is close enough (this also covers the very first
+                    # process group, when the dict is still empty): open a new bucket.
+                    homogeneous_device_dict[beta] = [process_group]
+
+            return homogeneous_device_dict
+
+        def _check_contain_all_devices(homogeneous_group: List[Tuple[int]]):
+            '''
+            This function is used to check whether the homogeneous_group contains all physical devices.
+            '''
+            flatten_mesh = []
+            for process_group in homogeneous_group:
+                flatten_mesh.extend(process_group)
+            non_duplicated_flatten_mesh = set(flatten_mesh)
+            return len(non_duplicated_flatten_mesh) == len(self.physical_devices)
+
+        def _construct_largest_ring(homogeneous_group: List[Tuple[int]]):
+            '''
+            This function is used to construct the largest ring in the homogeneous_group for each rank.
+            '''
+            # Construct the ring
+            ring = []
+            ranks_in_ring = []
+            for rank in self.physical_devices:
+                if rank in ranks_in_ring:
+                    continue
+                stable_status = False
+                ring_for_rank = []
+                ring_for_rank.append(rank)
+                check_rank_list = [rank]
+                rank_to_check_list = []
+
+                while not stable_status:
+                    stable_status = True
+                    check_rank_list.extend(rank_to_check_list)
+                    rank_to_check_list = []
+                    for i in range(len(check_rank_list)):
+                        check_rank = check_rank_list.pop()
+                        for process_group in homogeneous_group:
+                            if check_rank in process_group:
+                                rank_to_append = process_group[0] if process_group[1] == check_rank else process_group[1]
+                                if rank_to_append not in ring_for_rank:
+                                    stable_status = False
+                                    rank_to_check_list.append(rank_to_append)
+                                    ring_for_rank.append(rank_to_append)
+
+                ring.append(ring_for_rank)
+                ranks_in_ring.extend(ring_for_rank)
+
+            return ring
+
+        assert _power_of_two(self.world_size)
+        power_of_two = int(math.log2(self.world_size))
+        median = power_of_two // 2
+        balanced_logical_mesh_shape = (2**median, 2**(power_of_two - median))
+        row_size, column_size = balanced_logical_mesh_shape[0], balanced_logical_mesh_shape[1]
+        balanced_logical_mesh = []
+        for row_index in range(row_size):
+            balanced_logical_mesh.append([])
+            for column_index in range(column_size):
+                balanced_logical_mesh[row_index].append(self.physical_devices[row_index * column_size + column_index])
+
+        homogeneous_device_dict = _detect_homogeneous_device(self.alpha_beta_dict)
+        beta_list = [b for b in homogeneous_device_dict.keys()]
+        beta_list.sort()
+        beta_list.reverse()
+        homogeneous_types = len(beta_list)
+        best_logical_mesh = None
+        if homogeneous_types >= 2:
+            for _ in range(homogeneous_types - 1):
+                lowest_beta = beta_list.pop()
+                best_homogeneous_group = homogeneous_device_dict[lowest_beta]
+                # if the best homogeneous group contains all physical devices,
+                # we will build the logical device mesh based on it. Otherwise,
+                # we will check next level homogeneous group.
+                if _check_contain_all_devices(best_homogeneous_group):
+                    # We choose the largest ring for each rank to maximize the bus utilization.
+                    best_logical_mesh = _construct_largest_ring(best_homogeneous_group)
+                    break
+
+        if homogeneous_types == 1 or best_logical_mesh is None:
+            # in this case, we use balanced logical mesh as the best
+            # logical mesh.
+            best_logical_mesh = balanced_logical_mesh
+
+        return best_logical_mesh
+
+    def extract_alpha_beta_for_device_mesh(self):
+        '''
+        Extract the mesh_alpha list and mesh_beta list based on the
+            best logical mesh, which will be used to initialize the device mesh.
+
+        Usage:
+            >>> physical_devices = [0, 1, 2, 3]
+            >>> ab_profiler = AlphaBetaProfiler(physical_devices)
+            >>> mesh_alpha, mesh_beta = ab_profiler.extract_alpha_beta_for_device_mesh()
+            >>> print(mesh_alpha)
+            [2.5917552411556242e-05, 0.00010312341153621673]
+            >>> print(mesh_beta)
+            [5.875573704655635e-11, 4.7361584445959614e-12]
+        '''
+        best_logical_mesh = self.search_best_logical_mesh()
+
+        first_axis = [row[0] for row in best_logical_mesh]
+        second_axis = best_logical_mesh[0]
+
+        # init process group for both axes
+        first_axis_process_group = dist.new_group(first_axis)
+        second_axis_process_group = dist.new_group(second_axis)
+
+        # extract alpha and beta for both axes
+        def _extract_alpha_beta(pg, pg_handler):
+            latency = self.profile_latency(pg, pg_handler)
+            bandwidth = self.profile_bandwidth(pg, pg_handler)
+            broadcast_object = [latency, bandwidth]
+            dist.broadcast_object_list(broadcast_object, src=pg[0])
+            return broadcast_object
+
+        first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group)
+        second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group)
+        mesh_alpha = [first_latency, second_latency]
+        mesh_beta = [1 / first_bandwidth, 1 / second_bandwidth]
+
+        return mesh_alpha, mesh_beta
diff --git a/tests/test_device/test_extract_alpha_beta.py b/tests/test_device/test_extract_alpha_beta.py
new file mode 100644
index 000000000000..e32bebdd908e
--- /dev/null
+++ b/tests/test_device/test_extract_alpha_beta.py
@@ -0,0 +1,39 @@
+from functools import partial
+
+import pytest
+import torch.multiprocessing as mp
+
+from colossalai.device import AlphaBetaProfiler
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+
+
+def check_extract_alpha_beta(rank, physical_devices, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    profiler = AlphaBetaProfiler(physical_devices)
+
+    mesh_alpha, mesh_beta = profiler.extract_alpha_beta_for_device_mesh()
+    for alpha in mesh_alpha:
+        assert alpha > 0 and alpha < 1e-3
+    for beta in mesh_beta:
+        assert beta > 0 and beta < 1e-10
+
+
+@pytest.mark.skip(reason="Skip because assertion may fail for CI devices")
+@pytest.mark.dist
+@parameterize('physical_devices', [[0, 1, 2, 3], [0, 3]])
+@rerun_if_address_is_in_use()
+def test_profile_alpha_beta(physical_devices):
+    world_size = 4
+    run_func = partial(check_extract_alpha_beta,
+                       physical_devices=physical_devices,
+                       world_size=world_size,
+                       port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_profile_alpha_beta()
diff --git a/tests/test_device/test_search_logical_device_mesh.py b/tests/test_device/test_search_logical_device_mesh.py
new file mode 100644
index 000000000000..591eafb2a50d
--- /dev/null
+++ b/tests/test_device/test_search_logical_device_mesh.py
@@ -0,0 +1,36 @@
+from functools import partial
+
+import pytest
+import torch.multiprocessing as mp
+
+from colossalai.device import AlphaBetaProfiler
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+
+
+def check_alpha_beta(rank, physical_devices, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    profiler = AlphaBetaProfiler(physical_devices)
+    best_logical_mesh = profiler.search_best_logical_mesh()
+
+    if physical_devices == [0, 1, 2, 3]:
+        assert best_logical_mesh == [[0, 1], [2, 3]]
+    elif physical_devices == [0, 3]:
+        assert best_logical_mesh == [[0, 3]]
+
+
+@pytest.mark.skip(reason="Skip because assertion may fail for CI devices")
+@pytest.mark.dist
+@parameterize('physical_devices', [[0, 1, 2, 3], [0, 3]])
+@rerun_if_address_is_in_use()
+def test_profile_alpha_beta(physical_devices):
+    world_size = 4
+    run_func = partial(check_alpha_beta, physical_devices=physical_devices, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_profile_alpha_beta()

From 71e72c48907195096ef02be73e1c5b0feea2653d Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 5 Jan 2023 17:54:25 +0800
Subject: [PATCH 082/503] last version of benchmark

---
 autochunk_benchmark.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
index 3b48d7e461fe..c938485efc05 100644
--- a/autochunk_benchmark.py
+++ b/autochunk_benchmark.py
@@ -93,22 +93,24 @@ def _build_openfold():
 
 def benchmark_evoformer():
     # init data and model
-    msa_len = 300
-    pair_len = 800
+    msa_len = 256
+    pair_len = 2048
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
     model = evoformer_base().cuda()
 
     # build autochunk model
-    max_memory = 3000  # MB
+    max_memory = 10000  # MB fit memory mode
+    # max_memory = None  # min memory mode
     autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair)
 
     # build openfold
+    chunk_size = 64
     openfold = _build_openfold()
 
     # benchmark
     _benchmark_evoformer(model, node, pair, "base")
-    _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=4)
+    _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size)
     _benchmark_evoformer(autochunk, node, pair, "autochunk")
 
 
From 27ab5240965fc9cc0ec74ff48356abcbf098bd74 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 11:07:57 +0800
Subject: [PATCH 083/503] refactor structure

---
 .../chunk_codegen.py                          | 41 ++++++++-----------
 .../evoformer}/evoformer.py                   |  0
 .../evoformer}/initializer.py                 |  0
 {evoformer => autochunk/evoformer}/kernel.py  |  0
 {evoformer => autochunk/evoformer}/msa.py     |  0
 {evoformer => autochunk/evoformer}/ops.py     |  0
 .../evoformer}/triangle.py                    |  0
 .../openfold}/checkpointing.py                |  0
 {openfold => autochunk/openfold}/dropout.py   |  0
 {openfold => autochunk/openfold}/evoformer.py |  0
 {openfold => autochunk/openfold}/msa.py       |  0
 .../openfold}/outer_product_mean.py           |  0
 .../openfold}/pair_transition.py              |  0
 .../openfold}/primitives.py                   |  0
 .../openfold}/tensor_utils.py                 |  0
 .../openfold}/triangular_attention.py         |  0
 .../triangular_multiplicative_update.py       |  0
 autochunk_benchmark.py                        | 18 ++++----
 chunk_codegen_run.py => autochunk_test.py     |  4 +-
 19 files changed, 29 insertions(+), 34 deletions(-)
 rename chunk_codegen.py => autochunk/chunk_codegen.py (98%)
 rename {evoformer => autochunk/evoformer}/evoformer.py (100%)
 rename {evoformer => autochunk/evoformer}/initializer.py (100%)
 rename {evoformer => autochunk/evoformer}/kernel.py (100%)
 rename {evoformer => autochunk/evoformer}/msa.py (100%)
 rename {evoformer => autochunk/evoformer}/ops.py (100%)
 rename {evoformer => autochunk/evoformer}/triangle.py (100%)
 rename {openfold => autochunk/openfold}/checkpointing.py (100%)
 rename {openfold => autochunk/openfold}/dropout.py (100%)
 rename {openfold => autochunk/openfold}/evoformer.py (100%)
 rename {openfold => autochunk/openfold}/msa.py (100%)
 rename {openfold => autochunk/openfold}/outer_product_mean.py (100%)
 rename {openfold => autochunk/openfold}/pair_transition.py (100%)
 rename {openfold => autochunk/openfold}/primitives.py (100%)
 rename {openfold => autochunk/openfold}/tensor_utils.py (100%)
 rename {openfold => autochunk/openfold}/triangular_attention.py (100%)
 rename {openfold => autochunk/openfold}/triangular_multiplicative_update.py (100%)
 rename chunk_codegen_run.py => autochunk_test.py (97%)

diff --git a/chunk_codegen.py b/autochunk/chunk_codegen.py
similarity index 98%
rename from chunk_codegen.py
rename to autochunk/chunk_codegen.py
index 41fcb5a3c2f4..7a5d06689247 100644
--- a/chunk_codegen.py
+++ b/autochunk/chunk_codegen.py
@@ -1967,13 +1967,11 @@ def _replace_reshape_size(context, node_name, reshape_size_dict):
 
 def emit_code_with_chunk(
     body,
-    ckpt_func,
     nodes,
     emit_node_func,
     delete_unused_value_func,
-    meta_nodes,
-    meta_graph,
-    max_memory=None,
+    chunk_region_search,
+    chunk_infos
 ):
     """Emit code with nested activation checkpoint
     When we detect some of the node.activation_checkpoint is a List, we will use
@@ -1988,23 +1986,19 @@ def emit_code_with_chunk(
     """
     node_list = list(nodes)
 
-    # find the chunk regions
-    chunk_region_search = ChunkRegionSearch(meta_graph, max_memory)
-    chunk_search = chunk_region_search.search_region()
-
-    chunk_regions = [i["region"] for i in chunk_search]
+    chunk_regions = [i["region"] for i in chunk_infos]
     chunk_starts = [i[0] for i in chunk_regions]
     chunk_ends = [i[1] for i in chunk_regions]
 
-    chunk_inputs = [i["inputs"] for i in chunk_search]
-    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_search]
-    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_search]
+    chunk_inputs = [i["inputs"] for i in chunk_infos]
+    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
+    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]
     chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
         j.name for i in chunk_inputs_non_chunk for j in i
     ]
 
-    chunk_outputs = [i["outputs"][0] for i in chunk_search]
-    chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search]
+    chunk_outputs = [i["outputs"][0] for i in chunk_infos]
+    chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos]
 
     node_list = chunk_region_search.index_tracer.reorder_node_list(node_list)
     node_idx = 0
@@ -2022,7 +2016,7 @@ def emit_code_with_chunk(
                     chunk_inputs[region_idx],
                     chunk_outputs[region_idx],
                     chunk_outputs_dim[region_idx],
-                    chunk_search[region_idx]["chunk_size"],
+                    chunk_infos[region_idx]["chunk_size"],
                 )
             )
 
@@ -2041,14 +2035,14 @@ def emit_code_with_chunk(
             # ones like
             if "ones_like" in node.name:
                 meta_node = chunk_region_search.index_tracer.node_list[node_idx]
-                chunk_dim = chunk_search[region_idx]["node_chunk_dim"][meta_node][
+                chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][
                     "chunk_dim"
                 ]
                 if _get_node_shape(meta_node)[chunk_dim] != 1:
                     source_node = meta_node.args[0].args[0]
                     if (
-                        source_node not in chunk_search[region_idx]["node_chunk_dim"]
-                        or chunk_search[region_idx]["node_chunk_dim"][source_node][
+                        source_node not in chunk_infos[region_idx]["node_chunk_dim"]
+                        or chunk_infos[region_idx]["node_chunk_dim"][source_node][
                             "chunk_dim"
                         ]
                         is None
@@ -2060,7 +2054,7 @@ def emit_code_with_chunk(
                             body[-1], node.args[0].name, node.args[0].name + chunk_slice
                         )
             body[-1] = _replace_reshape_size(
-                body[-1], node.name, chunk_search[region_idx]["reshape_size"]
+                body[-1], node.name, chunk_infos[region_idx]["reshape_size"]
             )
             body[-1] = "    " + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
@@ -2092,6 +2086,9 @@ def __init__(self, meta_graph, max_memory=None):
             self.meta_graph = meta_graph
             self.max_memory = max_memory
             self.meta_node = list(meta_graph.graph.nodes)
+            # find the chunk regions
+            self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory)
+            self.chunk_infos = self.chunk_region_search.search_region()
 
         def _gen_python_code(
             self, nodes, root_module: str, namespace: _Namespace
@@ -2323,13 +2320,11 @@ def emit_node(node: Node, body):
             # will use nested type of activation checkpoint codegen
             emit_code_with_chunk(
                 body,
-                ckpt_func,
                 nodes,
                 emit_node,
                 delete_unused_values,
-                self.meta_node,
-                self.meta_graph,
-                self.max_memory,
+                self.chunk_region_search,
+                self.chunk_infos
             )
 
             if len(body) == 0:
diff --git a/evoformer/evoformer.py b/autochunk/evoformer/evoformer.py
similarity index 100%
rename from evoformer/evoformer.py
rename to autochunk/evoformer/evoformer.py
diff --git a/evoformer/initializer.py b/autochunk/evoformer/initializer.py
similarity index 100%
rename from evoformer/initializer.py
rename to autochunk/evoformer/initializer.py
diff --git a/evoformer/kernel.py b/autochunk/evoformer/kernel.py
similarity index 100%
rename from evoformer/kernel.py
rename to autochunk/evoformer/kernel.py
diff --git a/evoformer/msa.py b/autochunk/evoformer/msa.py
similarity index 100%
rename from evoformer/msa.py
rename to autochunk/evoformer/msa.py
diff --git a/evoformer/ops.py b/autochunk/evoformer/ops.py
similarity index 100%
rename from evoformer/ops.py
rename to autochunk/evoformer/ops.py
diff --git a/evoformer/triangle.py b/autochunk/evoformer/triangle.py
similarity index 100%
rename from evoformer/triangle.py
rename to autochunk/evoformer/triangle.py
diff --git a/openfold/checkpointing.py b/autochunk/openfold/checkpointing.py
similarity index 100%
rename from openfold/checkpointing.py
rename to autochunk/openfold/checkpointing.py
diff --git a/openfold/dropout.py b/autochunk/openfold/dropout.py
similarity index 100%
rename from openfold/dropout.py
rename to autochunk/openfold/dropout.py
diff --git a/openfold/evoformer.py b/autochunk/openfold/evoformer.py
similarity index 100%
rename from openfold/evoformer.py
rename to autochunk/openfold/evoformer.py
diff --git a/openfold/msa.py b/autochunk/openfold/msa.py
similarity index 100%
rename from openfold/msa.py
rename to autochunk/openfold/msa.py
diff --git a/openfold/outer_product_mean.py b/autochunk/openfold/outer_product_mean.py
similarity index 100%
rename from openfold/outer_product_mean.py
rename to autochunk/openfold/outer_product_mean.py
diff --git a/openfold/pair_transition.py b/autochunk/openfold/pair_transition.py
similarity index 100%
rename from openfold/pair_transition.py
rename to autochunk/openfold/pair_transition.py
diff --git a/openfold/primitives.py b/autochunk/openfold/primitives.py
similarity index 100%
rename from openfold/primitives.py
rename to autochunk/openfold/primitives.py
diff --git a/openfold/tensor_utils.py b/autochunk/openfold/tensor_utils.py
similarity index 100%
rename from openfold/tensor_utils.py
rename to autochunk/openfold/tensor_utils.py
diff --git a/openfold/triangular_attention.py b/autochunk/openfold/triangular_attention.py
similarity index 100%
rename from openfold/triangular_attention.py
rename to autochunk/openfold/triangular_attention.py
diff --git a/openfold/triangular_multiplicative_update.py b/autochunk/openfold/triangular_multiplicative_update.py
similarity index 100%
rename from openfold/triangular_multiplicative_update.py
rename to autochunk/openfold/triangular_multiplicative_update.py
diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py
index c938485efc05..c34b5217e5d4 100644
--- a/autochunk_benchmark.py
+++ b/autochunk_benchmark.py
@@ -3,13 +3,13 @@
 import torch
 import torch.fx
 
-from chunk_codegen import ChunkCodeGen
+from autochunk.chunk_codegen import ChunkCodeGen
 from colossalai.fx import ColoTracer
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
-from evoformer.evoformer import evoformer_base
-from openfold.evoformer import EvoformerBlock
+from autochunk.evoformer.evoformer import evoformer_base
+from autochunk.openfold.evoformer import EvoformerBlock
 
 
 def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None):
@@ -94,23 +94,23 @@ def _build_openfold():
 def benchmark_evoformer():
     # init data and model
     msa_len = 256
-    pair_len = 2048
+    pair_len = 1024
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
     model = evoformer_base().cuda()
 
     # build autochunk model
-    max_memory = 10000  # MB fit memory mode
-    # max_memory = None  # min memory mode
+    # max_memory = 10000  # MB fit memory mode
+    max_memory = None  # min memory mode
     autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair)
 
     # build openfold
     chunk_size = 64
-    openfold = _build_openfold()
+    # openfold = _build_openfold()
 
     # benchmark
-    _benchmark_evoformer(model, node, pair, "base")
-    _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size)
+    # _benchmark_evoformer(model, node, pair, "base")
+    # _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size)
     _benchmark_evoformer(autochunk, node, pair, "autochunk")
 
 
diff --git a/chunk_codegen_run.py b/autochunk_test.py
similarity index 97%
rename from chunk_codegen_run.py
rename to autochunk_test.py
index 3a3b3c599e3e..63f393531d5c 100644
--- a/chunk_codegen_run.py
+++ b/autochunk_test.py
@@ -12,8 +12,8 @@
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata
 from colossalai.fx.profiler import MetaTensor
-from evoformer.evoformer import evoformer_base
-from chunk_codegen import ChunkCodeGen
+from autochunk.evoformer.evoformer import evoformer_base
+from autochunk.chunk_codegen import ChunkCodeGen
 with_codegen = True
 
 
From efb1c64c30cf2ee35dad03bfd3829f014d204a8d Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 11:39:26 +0800
Subject: [PATCH 084/503] restruct dir

---
 .../autochunk}/chunk_codegen.py                |  0
 .../test_autochunk/autochunk_benchmark.py      | 14 +++++++-------
 .../test_autochunk/autochunk_test.py           |  4 ++--
 .../test_autochunk}/evoformer/evoformer.py     |  0
 .../test_autochunk}/evoformer/initializer.py   |  0
 .../test_autochunk}/evoformer/kernel.py        |  0
 .../test_autochunk}/evoformer/msa.py           |  0
 .../test_autochunk}/evoformer/ops.py           |  0
 .../test_autochunk}/evoformer/triangle.py      |  0
 .../test_autochunk}/openfold/checkpointing.py  |  0
 .../test_autochunk}/openfold/dropout.py        |  0
 .../test_autochunk}/openfold/evoformer.py      | 18 +++++++++---------
 .../test_autochunk}/openfold/msa.py            |  6 +++---
 .../openfold/outer_product_mean.py             |  4 ++--
 .../openfold/pair_transition.py                |  4 ++--
 .../test_autochunk}/openfold/primitives.py     |  4 ++--
 .../test_autochunk}/openfold/tensor_utils.py   |  0
 .../openfold/triangular_attention.py           |  4 ++--
 .../triangular_multiplicative_update.py        |  4 ++--
 19 files changed, 31 insertions(+), 31 deletions(-)
 rename {autochunk => colossalai/autochunk}/chunk_codegen.py (100%)
 rename autochunk_benchmark.py => tests/test_autochunk/autochunk_benchmark.py (89%)
 rename autochunk_test.py => tests/test_autochunk/autochunk_test.py (96%)
 rename {autochunk => tests/test_autochunk}/evoformer/evoformer.py (100%)
 rename {autochunk => tests/test_autochunk}/evoformer/initializer.py (100%)
 rename {autochunk => tests/test_autochunk}/evoformer/kernel.py (100%)
 rename {autochunk => tests/test_autochunk}/evoformer/msa.py (100%)
 rename {autochunk => tests/test_autochunk}/evoformer/ops.py (100%)
 rename {autochunk => tests/test_autochunk}/evoformer/triangle.py (100%)
 rename {autochunk => tests/test_autochunk}/openfold/checkpointing.py (100%)
 rename {autochunk => tests/test_autochunk}/openfold/dropout.py (100%)
 rename {autochunk => tests/test_autochunk}/openfold/evoformer.py (96%)
 rename {autochunk => tests/test_autochunk}/openfold/msa.py (98%)
 rename {autochunk => tests/test_autochunk}/openfold/outer_product_mean.py (97%)
 rename {autochunk => tests/test_autochunk}/openfold/pair_transition.py (96%)
 rename {autochunk => tests/test_autochunk}/openfold/primitives.py (99%)
 rename {autochunk => tests/test_autochunk}/openfold/tensor_utils.py (100%)
 rename {autochunk => tests/test_autochunk}/openfold/triangular_attention.py (97%)
 rename {autochunk => tests/test_autochunk}/openfold/triangular_multiplicative_update.py (97%)

diff --git a/autochunk/chunk_codegen.py b/colossalai/autochunk/chunk_codegen.py
similarity index 100%
rename from autochunk/chunk_codegen.py
rename to colossalai/autochunk/chunk_codegen.py
diff --git a/autochunk_benchmark.py b/tests/test_autochunk/autochunk_benchmark.py
similarity index 89%
rename from autochunk_benchmark.py
rename to tests/test_autochunk/autochunk_benchmark.py
index c34b5217e5d4..8df6d9ff4564 100644
--- a/autochunk_benchmark.py
+++ b/tests/test_autochunk/autochunk_benchmark.py
@@ -3,13 +3,13 @@
 import torch
 import torch.fx
 
-from autochunk.chunk_codegen import ChunkCodeGen
+from colossalai.autochunk.chunk_codegen import ChunkCodeGen
 from colossalai.fx import ColoTracer
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
-from autochunk.evoformer.evoformer import evoformer_base
-from autochunk.openfold.evoformer import EvoformerBlock
+from tests.test_autochunk.evoformer.evoformer import evoformer_base
+from tests.test_autochunk.openfold.evoformer import EvoformerBlock
 
 
 def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None):
@@ -94,7 +94,7 @@ def _build_openfold():
 def benchmark_evoformer():
     # init data and model
     msa_len = 256
-    pair_len = 1024
+    pair_len = 256
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
     model = evoformer_base().cuda()
@@ -106,11 +106,11 @@ def benchmark_evoformer():
 
     # build openfold
     chunk_size = 64
-    # openfold = _build_openfold()
+    openfold = _build_openfold()
 
     # benchmark
-    # _benchmark_evoformer(model, node, pair, "base")
-    # _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size)
+    _benchmark_evoformer(model, node, pair, "base")
+    _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size)
     _benchmark_evoformer(autochunk, node, pair, "autochunk")
 
 
diff --git a/autochunk_test.py b/tests/test_autochunk/autochunk_test.py
similarity index 96%
rename from autochunk_test.py
rename to tests/test_autochunk/autochunk_test.py
index 63f393531d5c..5e9aaca15f9f 100644
--- a/autochunk_test.py
+++ b/tests/test_autochunk/autochunk_test.py
@@ -12,8 +12,8 @@
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata
 from colossalai.fx.profiler import MetaTensor
-from autochunk.evoformer.evoformer import evoformer_base
-from autochunk.chunk_codegen import ChunkCodeGen
+from tests.test_autochunk.evoformer.evoformer import evoformer_base
+from ...colossalai.autochunk.chunk_codegen import ChunkCodeGen
 with_codegen = True
 
 
diff --git a/autochunk/evoformer/evoformer.py b/tests/test_autochunk/evoformer/evoformer.py
similarity index 100%
rename from autochunk/evoformer/evoformer.py
rename to tests/test_autochunk/evoformer/evoformer.py
diff --git a/autochunk/evoformer/initializer.py b/tests/test_autochunk/evoformer/initializer.py
similarity index 100%
rename from autochunk/evoformer/initializer.py
rename to tests/test_autochunk/evoformer/initializer.py
diff --git a/autochunk/evoformer/kernel.py b/tests/test_autochunk/evoformer/kernel.py
similarity index 100%
rename from autochunk/evoformer/kernel.py
rename to tests/test_autochunk/evoformer/kernel.py
diff --git a/autochunk/evoformer/msa.py b/tests/test_autochunk/evoformer/msa.py
similarity index 100%
rename from autochunk/evoformer/msa.py
rename to tests/test_autochunk/evoformer/msa.py
diff --git a/autochunk/evoformer/ops.py b/tests/test_autochunk/evoformer/ops.py
similarity index 100%
rename from autochunk/evoformer/ops.py
rename to tests/test_autochunk/evoformer/ops.py
diff --git a/autochunk/evoformer/triangle.py b/tests/test_autochunk/evoformer/triangle.py
similarity index 100%
rename from autochunk/evoformer/triangle.py
rename to tests/test_autochunk/evoformer/triangle.py
diff --git a/autochunk/openfold/checkpointing.py b/tests/test_autochunk/openfold/checkpointing.py
similarity index 100%
rename from autochunk/openfold/checkpointing.py
rename to tests/test_autochunk/openfold/checkpointing.py
diff --git a/autochunk/openfold/dropout.py b/tests/test_autochunk/openfold/dropout.py
similarity index 100%
rename from autochunk/openfold/dropout.py
rename to tests/test_autochunk/openfold/dropout.py
diff --git a/autochunk/openfold/evoformer.py b/tests/test_autochunk/openfold/evoformer.py
similarity index 96%
rename from autochunk/openfold/evoformer.py
rename to tests/test_autochunk/openfold/evoformer.py
index ffd4c982987a..b53ec1aa51e5 100644
--- a/autochunk/openfold/evoformer.py
+++ b/tests/test_autochunk/openfold/evoformer.py
@@ -19,25 +19,25 @@
 from typing import Tuple, Optional
 from functools import partial
 
-from openfold.primitives import Linear, LayerNorm
-from openfold.dropout import DropoutRowwise, DropoutColumnwise
-from openfold.msa import (
+from .primitives import Linear, LayerNorm
+from .dropout import DropoutRowwise, DropoutColumnwise
+from .msa import (
     MSARowAttentionWithPairBias,
     MSAColumnAttention,
     MSAColumnGlobalAttention,
 )
-from openfold.outer_product_mean import OuterProductMean
-from openfold.pair_transition import PairTransition
-from openfold.triangular_attention import (
+from .outer_product_mean import OuterProductMean
+from .pair_transition import PairTransition
+from .triangular_attention import (
     TriangleAttentionStartingNode,
     TriangleAttentionEndingNode,
 )
-from openfold.triangular_multiplicative_update import (
+from .triangular_multiplicative_update import (
     TriangleMultiplicationOutgoing,
     TriangleMultiplicationIncoming,
 )
-from openfold.checkpointing import checkpoint_blocks, get_checkpoint_fn
-from openfold.tensor_utils import chunk_layer
+from .checkpointing import checkpoint_blocks, get_checkpoint_fn
+from .tensor_utils import chunk_layer
 
 
 class MSATransition(nn.Module):
diff --git a/autochunk/openfold/msa.py b/tests/test_autochunk/openfold/msa.py
similarity index 98%
rename from autochunk/openfold/msa.py
rename to tests/test_autochunk/openfold/msa.py
index 00b822e7f390..7c137286feab 100644
--- a/autochunk/openfold/msa.py
+++ b/tests/test_autochunk/openfold/msa.py
@@ -18,15 +18,15 @@
 import torch.nn as nn
 from typing import Optional, List, Tuple
 
-from openfold.primitives import (
+from .primitives import (
     Linear, 
     LayerNorm,
     Attention, 
     GlobalAttention, 
     _attention_chunked_trainable,
 )
-from openfold.checkpointing import get_checkpoint_fn
-from openfold.tensor_utils import (
+from .checkpointing import get_checkpoint_fn
+from .tensor_utils import (
     chunk_layer,
     permute_final_dims,
     flatten_final_dims,
diff --git a/autochunk/openfold/outer_product_mean.py b/tests/test_autochunk/openfold/outer_product_mean.py
similarity index 97%
rename from autochunk/openfold/outer_product_mean.py
rename to tests/test_autochunk/openfold/outer_product_mean.py
index 43d853833c66..daadf1c272cf 100644
--- a/autochunk/openfold/outer_product_mean.py
+++ b/tests/test_autochunk/openfold/outer_product_mean.py
@@ -19,8 +19,8 @@
 import torch
 import torch.nn as nn
 
-from openfold.primitives import Linear
-from openfold.tensor_utils import chunk_layer
+from .primitives import Linear
+from .tensor_utils import chunk_layer
 
 
 class OuterProductMean(nn.Module):
diff --git a/autochunk/openfold/pair_transition.py b/tests/test_autochunk/openfold/pair_transition.py
similarity index 96%
rename from autochunk/openfold/pair_transition.py
rename to tests/test_autochunk/openfold/pair_transition.py
index de76306418ee..7d09914dc3cc 100644
--- a/autochunk/openfold/pair_transition.py
+++ b/tests/test_autochunk/openfold/pair_transition.py
@@ -17,8 +17,8 @@
 import torch
 import torch.nn as nn
 
-from openfold.primitives import Linear, LayerNorm
-from openfold.tensor_utils import chunk_layer
+from .primitives import Linear, LayerNorm
+from .tensor_utils import chunk_layer
 
 
 class PairTransition(nn.Module):
diff --git a/autochunk/openfold/primitives.py b/tests/test_autochunk/openfold/primitives.py
similarity index 99%
rename from autochunk/openfold/primitives.py
rename to tests/test_autochunk/openfold/primitives.py
index bbc156f21d4a..32a9d487c441 100644
--- a/autochunk/openfold/primitives.py
+++ b/tests/test_autochunk/openfold/primitives.py
@@ -21,8 +21,8 @@
 import torch
 import torch.nn as nn
 
-from openfold.checkpointing import get_checkpoint_fn
-from openfold.tensor_utils import (
+from .checkpointing import get_checkpoint_fn
+from .tensor_utils import (
     permute_final_dims,
     flatten_final_dims,
     _chunk_slice,
diff --git a/autochunk/openfold/tensor_utils.py b/tests/test_autochunk/openfold/tensor_utils.py
similarity index 100%
rename from autochunk/openfold/tensor_utils.py
rename to tests/test_autochunk/openfold/tensor_utils.py
diff --git a/autochunk/openfold/triangular_attention.py b/tests/test_autochunk/openfold/triangular_attention.py
similarity index 97%
rename from autochunk/openfold/triangular_attention.py
rename to tests/test_autochunk/openfold/triangular_attention.py
index 6d3e37f4c681..12d09c502daf 100644
--- a/autochunk/openfold/triangular_attention.py
+++ b/tests/test_autochunk/openfold/triangular_attention.py
@@ -20,8 +20,8 @@
 import torch
 import torch.nn as nn
 
-from openfold.primitives import Linear, LayerNorm, Attention
-from openfold.tensor_utils import (
+from .primitives import Linear, LayerNorm, Attention
+from .tensor_utils import (
     chunk_layer,
     permute_final_dims,
     flatten_final_dims,
diff --git a/autochunk/openfold/triangular_multiplicative_update.py b/tests/test_autochunk/openfold/triangular_multiplicative_update.py
similarity index 97%
rename from autochunk/openfold/triangular_multiplicative_update.py
rename to tests/test_autochunk/openfold/triangular_multiplicative_update.py
index 2406e2bac2cf..29f7062c3212 100644
--- a/autochunk/openfold/triangular_multiplicative_update.py
+++ b/tests/test_autochunk/openfold/triangular_multiplicative_update.py
@@ -19,8 +19,8 @@
 import torch
 import torch.nn as nn
 
-from openfold.primitives import Linear, LayerNorm
-from openfold.tensor_utils import permute_final_dims
+from .primitives import Linear, LayerNorm
+from .tensor_utils import permute_final_dims
 
 
 class TriangleMultiplicativeUpdate(nn.Module):

From 06a5355d98c0069e3305679a04846637917078e9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 11:44:01 +0800
Subject: [PATCH 085/503] update test

---
 tests/test_autochunk/autochunk_test.py | 111 ++++++++++++-------------
 1 file changed, 52 insertions(+), 59 deletions(-)

diff --git a/tests/test_autochunk/autochunk_test.py b/tests/test_autochunk/autochunk_test.py
index 5e9aaca15f9f..caa2d9a80254 100644
--- a/tests/test_autochunk/autochunk_test.py
+++ b/tests/test_autochunk/autochunk_test.py
@@ -1,76 +1,60 @@
-import copy
-import torch
-import torch.nn.functional as F
 import pytest
+import torch
 import torch.fx
 import torch.multiprocessing as mp
-from torch.fx import GraphModule
-from colossalai.fx import ColoTracer
+
 import colossalai
-from colossalai.utils import free_port
+from colossalai.autochunk.chunk_codegen import ChunkCodeGen
 from colossalai.core import global_context as gpc
+from colossalai.fx import ColoTracer
 from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
+from colossalai.utils import free_port
 from tests.test_autochunk.evoformer.evoformer import evoformer_base
-from ...colossalai.autochunk.chunk_codegen import ChunkCodeGen
-with_codegen = True
-
-
-def _is_all_gradient_close(m: torch.nn.Module, gm: GraphModule) -> bool:
-    for m_p, gm_p in zip(m.parameters(), gm.parameters()):
-        if m_p.grad is not None and not torch.allclose(m_p.grad, gm_p.grad):
-            return False
-    return True
-
-
-def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool:
-    for m_p, gm_p in zip(m.parameters(), gm.parameters()):
-        if m_p.grad is not None and not torch.allclose(m_p.data, gm_p.data):
-            return False
-    return True
-
-
-def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
-    # now_mem = torch.cuda.memory_allocated() / 1024**2
-    # with torch.no_grad():
-    #     node0 = node.clone()
-    #     pair0 = pair.clone()
-    #     model.graph(node0, pair0, now_mem)        
-    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
-    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print("\ncode now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem))
-    
+
+
+def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     torch.cuda.reset_peak_memory_stats()
     now_mem = torch.cuda.memory_allocated() / 1024**2
     with torch.no_grad():
         node1 = node.clone()
         pair1 = pair.clone()
-        gm(node1, pair1)        
+        gm(node1, pair1)
     new_now_mem = torch.cuda.memory_allocated() / 1024**2
     new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    print("gm now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem))
-            
+    print(
+        "autochunk now mem:%.2f max mem:%.2f"
+        % (new_now_mem - now_mem, new_max_mem - now_mem)
+    )
+
     # test forward
     with torch.no_grad():
         non_fx_out = model(node, pair)
         fx_out = gm(node, pair)
 
-    assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[0] - fx_out[0]))
-    assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[1] - fx_out[1]))
-
-    # test barckward
-    # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum()
-    # loss0.backward()
-    # loss1 = fx_out[0].sum() + fx_out[1].sum()
-    # loss1.backward()
-    # assert _is_all_param_close(model, gm)
-    # assert _is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one"
+    assert torch.allclose(
+        non_fx_out[0], fx_out[0], atol=1e-4
+    ), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+        torch.abs(non_fx_out[0] - fx_out[0])
+    )
+    assert torch.allclose(
+        non_fx_out[1], fx_out[1], atol=1e-4
+    ), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+        torch.abs(non_fx_out[1] - fx_out[1])
+    )
 
 
 def _run_offload_codegen(rank):
     # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
-    colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl')
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
 
     # build model and input
     model = evoformer_base().cuda()
@@ -78,15 +62,25 @@ def _run_offload_codegen(rank):
     pair = torch.randn(1, 300, 300, 128).cuda()
 
     # trace the module and replace codegen
-    graph = ColoTracer().trace(model, meta_args={'node': node.to(torch.device('meta')), 'pair': pair.to(torch.device('meta'))})
-    gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace
-    interp = MetaInfoProp(gm_prop) 
-    interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0'))
+    graph = ColoTracer().trace(
+        model,
+        meta_args={
+            "node": node.to(torch.device("meta")),
+            "pair": pair.to(torch.device("meta")),
+        },
+    )
+    gm_prop = torch.fx.symbolic_trace(model)  # must use symbolic_trace
+    interp = MetaInfoProp(gm_prop)
+    interp.propagate(
+        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
+    )
 
     # now run it twice to get meta info in graph module, not necessary
     gm = torch.fx.GraphModule(model, graph)
     interp = MetaInfoProp(gm)
-    interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0'))
+    interp.propagate(
+        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
+    )
 
     codegen = ChunkCodeGen(gm_prop)
     graph.set_codegen(codegen)
@@ -94,15 +88,14 @@ def _run_offload_codegen(rank):
     gm.recompile()
 
     # assert we have all the components
-    code = graph.python_code("self").src
-    print(code)
+    # code = graph.python_code("self").src
+    # print(code)
 
-    _test_fwd_and_bwd(model, gm, node, pair)
+    _test_fwd(model, gm, node, pair)
     gpc.destroy()
 
 
-@pytest.mark.skipif(not with_codegen, reason='torch version is lower than 1.12.0')
-def test_act_ckpt_codegen():
+def test_autochunk():
     mp.spawn(_run_offload_codegen, nprocs=1)
 
 
From d1f07731824c425c26197c7c82425445c8c3df3e Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 11:48:33 +0800
Subject: [PATCH 086/503] rename

---
 .../{autochunk_benchmark.py => benchmark_autochunk.py}            | 0
 tests/test_autochunk/{autochunk_test.py => test_autochunk.py}     | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/test_autochunk/{autochunk_benchmark.py => benchmark_autochunk.py} (100%)
 rename tests/test_autochunk/{autochunk_test.py => test_autochunk.py} (100%)

diff --git a/tests/test_autochunk/autochunk_benchmark.py b/tests/test_autochunk/benchmark_autochunk.py
similarity index 100%
rename from tests/test_autochunk/autochunk_benchmark.py
rename to tests/test_autochunk/benchmark_autochunk.py
diff --git a/tests/test_autochunk/autochunk_test.py b/tests/test_autochunk/test_autochunk.py
similarity index 100%
rename from tests/test_autochunk/autochunk_test.py
rename to tests/test_autochunk/test_autochunk.py

From 1a6d2a740be33d769111ed03104bb5fa73b2ad50 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 14:14:45 +0800
Subject: [PATCH 087/503] take apart chunk code gen

---
 colossalai/autochunk/autochunk_codegen.py   |  497 ++++
 colossalai/autochunk/chunk_codegen.py       | 2364 -------------------
 colossalai/autochunk/chunk_region_search.py |  211 ++
 colossalai/autochunk/chunk_selector.py      |  221 ++
 colossalai/autochunk/index_tracer.py        | 1056 +++++++++
 colossalai/autochunk/memory_estiamtor.py    |  318 +++
 colossalai/autochunk/utils.py               |   95 +
 tests/test_autochunk/benchmark_autochunk.py |   12 +-
 tests/test_autochunk/test_autochunk.py      |    4 +-
 9 files changed, 2408 insertions(+), 2370 deletions(-)
 create mode 100644 colossalai/autochunk/autochunk_codegen.py
 delete mode 100644 colossalai/autochunk/chunk_codegen.py
 create mode 100644 colossalai/autochunk/chunk_region_search.py
 create mode 100644 colossalai/autochunk/chunk_selector.py
 create mode 100644 colossalai/autochunk/index_tracer.py
 create mode 100644 colossalai/autochunk/memory_estiamtor.py
 create mode 100644 colossalai/autochunk/utils.py

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
new file mode 100644
index 000000000000..58a8c375136e
--- /dev/null
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -0,0 +1,497 @@
+from typing import Any, Callable, Dict, Iterable, List, Tuple
+
+import torch
+from torch.fx.graph import (
+    CodeGen,
+    PythonCode,
+    _custom_builtins,
+    _CustomBuiltin,
+    _format_target,
+    _is_from_torch,
+    _Namespace,
+    _origin_type_map,
+    inplace_methods,
+    magic_methods,
+)
+from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
+
+import colossalai
+
+from .chunk_region_search import ChunkRegionSearch
+from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape
+
+CODEGEN_AVAILABLE = True
+__all__ = ["AutoChunkCodeGen"]
+
+
+def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
+    new_shape = "["
+    for idx, i in enumerate(shape):
+        if idx == chunk_dim:
+            new_shape += "%s:%s + chunk_size" % (chunk_idx_name, chunk_idx_name)
+        else:
+            new_shape += ":"
+        new_shape += ", "
+    new_shape = new_shape[:-2] + "]"
+    return new_shape
+
+
+def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2):
+    input_node = chunk_input[0]
+    out_shape = get_node_shape(chunk_output)
+    out_str = str(list(out_shape))
+    context = (
+        "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range"
+        % (out_str, input_node.name, input_node.name, chunk_size)
+    )
+    context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim])
+    return context
+
+
+def _gen_loop_end(
+    chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list
+):
+    chunk_outputs_name = chunk_outputs.name
+    chunk_outputs_idx = find_idx_by_name(chunk_outputs_name, node_list)
+    chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
+    chunk_slice = _gen_chunk_slice_dim(
+        chunk_outputs_dim, "chunk_idx", chunk_output_shape
+    )
+    context = "    chunk_result%s = %s;  %s = None\n" % (
+        chunk_slice,
+        chunk_outputs_name,
+        chunk_outputs_name,
+    )
+    context += (
+        chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
+    )
+
+    # determine if it's the last use for chunk input
+    for chunk_input in chunk_inputs + chunk_non_compute_inputs:
+        if all(
+            [
+                find_idx_by_name(user.name, node_list) <= chunk_outputs_idx
+                for user in chunk_input.users.keys()
+            ]
+        ):
+            context += ";  %s = None" % chunk_input.name
+
+    context += "\n"
+    return context
+
+
+def _replace_name(context, name_from, name_to):
+    patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")]
+    for p in patterns:
+        source = p[0] + name_from + p[1]
+        target = p[0] + name_to + p[1]
+        if source in context:
+            context = context.replace(source, target)
+    return context
+
+
+def _replace_reshape_size(context, node_name, reshape_size_dict):
+    if node_name not in reshape_size_dict:
+        return context
+    for size_name, size_value in reshape_size_dict[node_name].items():
+        context = context.replace(size_name, size_value)
+    return context
+
+
+def emit_code_with_chunk(
+    body,
+    nodes,
+    emit_node_func,
+    delete_unused_value_func,
+    chunk_region_search,
+    chunk_infos,
+):
+    """Emit forward code, wrapping each chunk region in a ``for chunk_idx`` loop.
+    Nodes inside a region are emitted indented, with chunked inputs replaced by
+    their ``chunk_idx`` slices; nodes outside a region are emitted unchanged.
+
+    Args:
+        body: list of code strings, appended to in place
+        nodes: graph.nodes
+        emit_node_func: function to emit node
+        delete_unused_value_func: function to remove the unused value
+        chunk_region_search, chunk_infos: region searcher and the regions it found
+    """
+    node_list = list(nodes)
+
+    chunk_regions = [i["region"] for i in chunk_infos]
+    chunk_starts = [i[0] for i in chunk_regions]
+    chunk_ends = [i[1] for i in chunk_regions]
+
+    chunk_inputs = [i["inputs"] for i in chunk_infos]
+    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
+    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]
+    chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
+        j.name for i in chunk_inputs_non_chunk for j in i
+    ]
+
+    chunk_outputs = [i["outputs"][0] for i in chunk_infos]
+    chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos]
+
+    node_list = chunk_region_search.index_tracer.reorder_node_list(node_list)
+    node_idx = 0
+    region_idx = 0
+    within_chunk_region = False
+
+    while node_idx < len(node_list):
+        node = node_list[node_idx]
+
+        if node_idx in chunk_starts:
+            within_chunk_region = True
+            region_idx = chunk_starts.index(node_idx)
+            body.append(
+                _gen_loop_start(
+                    chunk_inputs[region_idx],
+                    chunk_outputs[region_idx],
+                    chunk_outputs_dim[region_idx],
+                    chunk_infos[region_idx]["chunk_size"],
+                )
+            )
+
+        if within_chunk_region:
+            emit_node_func(node, body)
+            # replace input var with chunk var
+            for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
+                for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
+                    if idx == node_idx:
+                        chunk_slice = _gen_chunk_slice_dim(
+                            dim[0], "chunk_idx", get_node_shape(input_node)
+                        )
+                        body[-1] = _replace_name(
+                            body[-1], input_node.name, input_node.name + chunk_slice
+                        )
+            # ones_like: slice its argument when that argument's source has no chunk dim
+            if "ones_like" in node.name:
+                meta_node = chunk_region_search.index_tracer.node_list[node_idx]
+                chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][
+                    "chunk_dim"
+                ]
+                if get_node_shape(meta_node)[chunk_dim] != 1:
+                    source_node = meta_node.args[0].args[0]
+                    if (
+                        source_node not in chunk_infos[region_idx]["node_chunk_dim"]
+                        or chunk_infos[region_idx]["node_chunk_dim"][source_node][
+                            "chunk_dim"
+                        ]
+                        is None
+                    ):
+                        chunk_slice = _gen_chunk_slice_dim(
+                            chunk_dim, "chunk_idx", get_node_shape(node)
+                        )
+                        body[-1] = _replace_name(
+                            body[-1], node.args[0].name, node.args[0].name + chunk_slice
+                        )
+            body[-1] = _replace_reshape_size(
+                body[-1], node.name, chunk_infos[region_idx]["reshape_size"]
+            )
+            body[-1] = "    " + body[-1]
+            delete_unused_value_func(node, body, chunk_inputs_names)
+        else:
+            emit_node_func(node, body)
+            if node_idx not in chunk_inputs:
+                delete_unused_value_func(node, body, chunk_inputs_names)
+
+        if node_idx in chunk_ends:
+            body.append(
+                _gen_loop_end(
+                    chunk_inputs[region_idx],
+                    chunk_inputs_non_chunk[region_idx],
+                    chunk_outputs[region_idx],
+                    chunk_outputs_dim[region_idx],
+                    node_list,
+                )
+            )
+            within_chunk_region = False
+
+        node_idx += 1
+
+
+if CODEGEN_AVAILABLE:
+
+    class AutoChunkCodeGen(CodeGen):
+        def __init__(self, meta_graph, max_memory=None):
+            super().__init__()
+            self.meta_graph = meta_graph
+            self.max_memory = max_memory
+            self.meta_node = list(meta_graph.graph.nodes)
+            # find the chunk regions
+            self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory)
+            self.chunk_infos = self.chunk_region_search.search_region()
+
+        def _gen_python_code(
+            self, nodes, root_module: str, namespace: _Namespace
+        ) -> PythonCode:
+            free_vars: List[str] = []
+            body: List[str] = []
+            globals_: Dict[str, Any] = {}
+            wrapped_fns: Dict[str, None] = {}
+
+            # Wrap string in list to pass by reference
+            maybe_return_annotation: List[str] = [""]
+
+            def add_global(name_hint: str, obj: Any):
+                """Add an obj to be tracked as a global.
+
+                We call this for names that reference objects external to the
+                Graph, like functions or types.
+
+                Returns: the global name that should be used to reference 'obj' in generated source.
+                """
+                if (
+                    _is_from_torch(obj) and obj != torch.device
+                ):  # to support registering torch.device
+                    # HACK: workaround for how torch custom ops are registered. We
+                    # can't import them like normal modules so they must retain their
+                    # fully qualified name.
+                    return _get_qualified_name(obj)
+
+                # normalize the name hint to get a proper identifier
+                global_name = namespace.create_name(name_hint, obj)
+
+                if global_name in globals_:
+                    assert globals_[global_name] is obj
+                    return global_name
+                globals_[global_name] = obj
+                return global_name
+
+            # set _custom_builtins here so that we needn't import colossalai in forward
+            _custom_builtins["colossalai"] = _CustomBuiltin(
+                "import colossalai", colossalai
+            )
+
+            # Pre-fill the globals table with registered builtins.
+            for name, (_, obj) in _custom_builtins.items():
+                add_global(name, obj)
+
+            def type_repr(o: Any):
+                if o == ():
+                    # Empty tuple is used for empty tuple type annotation Tuple[()]
+                    return "()"
+
+                typename = _type_repr(o)
+
+                if hasattr(o, "__origin__"):
+                    # This is a generic type, e.g. typing.List[torch.Tensor]
+                    origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
+                    origin_typename = add_global(_type_repr(origin_type), origin_type)
+
+                    if hasattr(o, "__args__"):
+                        # Assign global names for each of the inner type variables.
+                        args = [type_repr(arg) for arg in o.__args__]
+
+                        if len(args) == 0:
+                            # Bare type, such as `typing.Tuple` with no subscript
+                            # This code-path used in Python < 3.9
+                            return origin_typename
+
+                        return f'{origin_typename}[{",".join(args)}]'
+                    else:
+                        # Bare type, such as `typing.Tuple` with no subscript
+                        # This code-path used in Python 3.9+
+                        return origin_typename
+
+                # Common case: this is a regular module name like 'foo.bar.baz'
+                return add_global(typename, o)
+
+            def _format_args(
+                args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
+            ) -> str:
+                def _get_repr(arg):
+                    # Handle NamedTuples (if it has `_fields`) via add_global.
+                    if isinstance(arg, tuple) and hasattr(arg, "_fields"):
+                        qualified_name = _get_qualified_name(type(arg))
+                        global_name = add_global(qualified_name, type(arg))
+                        return f"{global_name}{repr(tuple(arg))}"
+                    return repr(arg)
+
+                args_s = ", ".join(_get_repr(a) for a in args)
+                kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items())
+                if args_s and kwargs_s:
+                    return f"{args_s}, {kwargs_s}"
+                return args_s or kwargs_s
+
+            # Run through reverse nodes and record the first instance of a use
+            # of a given node. This represents the *last* use of the node in the
+            # execution order of the program, which we will use to free unused
+            # values
+            node_to_last_use: Dict[Node, Node] = {}
+            user_to_last_uses: Dict[Node, List[Node]] = {}
+
+            def register_last_uses(n: Node, user: Node):
+                if n not in node_to_last_use:
+                    node_to_last_use[n] = user
+                    user_to_last_uses.setdefault(user, []).append(n)
+
+            for node in reversed(nodes):
+                map_arg(node.args, lambda n: register_last_uses(n, node))
+                map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+            delete_free_var_from_last_use(user_to_last_uses)
+
+            # NOTE: we add a variable to distinguish body and ckpt_func
+            def delete_unused_values(user: Node, body, to_keep=[]):
+                """
+                Delete values after their last use. This ensures that values that are
+                not used in the remainder of the code are freed and the memory usage
+                of the code is optimal.
+                """
+                if user.op == "placeholder":
+                    return
+                if user.op == "output":
+                    body.append("\n")
+                    return
+                nodes_to_delete = user_to_last_uses.get(user, [])
+                nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
+                if len(nodes_to_delete):
+                    to_delete_str = " = ".join(
+                        [repr(n) for n in nodes_to_delete] + ["None"]
+                    )
+                    body.append(f";  {to_delete_str}\n")
+                else:
+                    body.append("\n")
+
+            # NOTE: we add a variable to distinguish body and ckpt_func
+            def emit_node(node: Node, body):
+                maybe_type_annotation = (
+                    "" if node.type is None else f" : {type_repr(node.type)}"
+                )
+                if node.op == "placeholder":
+                    assert isinstance(node.target, str)
+                    maybe_default_arg = (
+                        "" if not node.args else f" = {repr(node.args[0])}"
+                    )
+                    free_vars.append(
+                        f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
+                    )
+                    raw_name = node.target.replace("*", "")
+                    if raw_name != repr(node):
+                        body.append(f"{repr(node)} = {raw_name}\n")
+                    return
+                elif node.op == "call_method":
+                    assert isinstance(node.target, str)
+                    body.append(
+                        f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
+                        f"({_format_args(node.args[1:], node.kwargs)})"
+                    )
+                    return
+                elif node.op == "call_function":
+                    assert callable(node.target)
+                    # pretty print operators
+                    if (
+                        node.target.__module__ == "_operator"
+                        and node.target.__name__ in magic_methods
+                    ):
+                        assert isinstance(node.args, tuple)
+                        body.append(
+                            f"{repr(node)}{maybe_type_annotation} = "
+                            f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
+                        )
+                        return
+
+                    # pretty print inplace operators; required for jit.script to work properly
+                    # not currently supported in normal FX graphs, but generated by torchdynamo
+                    if (
+                        node.target.__module__ == "_operator"
+                        and node.target.__name__ in inplace_methods
+                    ):
+                        body.append(
+                            f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
+                            f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
+                        )
+                        return
+
+                    qualified_name = _get_qualified_name(node.target)
+                    global_name = add_global(qualified_name, node.target)
+                    # special case for getattr: node.args could be 2-argument or 3-argument
+                    # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
+                    if (
+                        global_name == "getattr"
+                        and isinstance(node.args, tuple)
+                        and isinstance(node.args[1], str)
+                        and node.args[1].isidentifier()
+                        and len(node.args) == 2
+                    ):
+                        body.append(
+                            f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
+                        )
+                        return
+                    body.append(
+                        f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
+                    )
+                    if node.meta.get("is_wrapped", False):
+                        wrapped_fns.setdefault(global_name)
+                    return
+                elif node.op == "call_module":
+                    assert isinstance(node.target, str)
+                    body.append(
+                        f"{repr(node)}{maybe_type_annotation} = "
+                        f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
+                    )
+                    return
+                elif node.op == "get_attr":
+                    assert isinstance(node.target, str)
+                    body.append(
+                        f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
+                    )
+                    return
+                elif node.op == "output":
+                    if node.type is not None:
+                        maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
+                    body.append(self.generate_output(node.args[0]))
+                    return
+                raise NotImplementedError(f"node: {node.op} {node.target}")
+
+            # Modified for activation checkpointing
+            ckpt_func = []
+
+            # if any node has a list of labels for activation_checkpoint, we
+            # will use nested type of activation checkpoint codegen
+            emit_code_with_chunk(
+                body,
+                nodes,
+                emit_node,
+                delete_unused_values,
+                self.chunk_region_search,
+                self.chunk_infos,
+            )
+
+            if len(body) == 0:
+                # If the Graph has no non-placeholder nodes, no lines for the body
+                # have been emitted. To continue to have valid Python code, emit a
+                # single pass statement
+                body.append("pass\n")
+
+            if len(wrapped_fns) > 0:
+                wrap_name = add_global("wrap", torch.fx.wrap)
+                wrap_stmts = "\n".join(
+                    [f'{wrap_name}("{name}")' for name in wrapped_fns]
+                )
+            else:
+                wrap_stmts = ""
+
+            if self._body_transformer:
+                body = self._body_transformer(body)
+
+            for name, value in self.additional_globals():
+                add_global(name, value)
+
+            # as we need colossalai.utils.checkpoint, we need to import colossalai
+            # in forward function
+            prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
+            prologue = "".join(ckpt_func) + prologue
+            prologue = prologue
+
+            code = "".join(body)
+            code = "\n".join("    " + line for line in code.split("\n"))
+            fn_code = f"""
+{wrap_stmts}
+
+{prologue}
+{code}"""
+            # print(fn_code)
+            return PythonCode(fn_code, globals_)
diff --git a/colossalai/autochunk/chunk_codegen.py b/colossalai/autochunk/chunk_codegen.py
deleted file mode 100644
index 7a5d06689247..000000000000
--- a/colossalai/autochunk/chunk_codegen.py
+++ /dev/null
@@ -1,2364 +0,0 @@
-import colossalai
-import torch
-import copy
-from typing import List, Callable, Any, Tuple, Dict, Iterable
-
-from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name
-from torch.fx.graph import (
-    _Namespace,
-    PythonCode,
-    _custom_builtins,
-    _is_from_torch,
-    _format_target,
-    magic_methods,
-    CodeGen,
-    _origin_type_map,
-    inplace_methods,
-    _CustomBuiltin,
-)
-from colossalai.fx.profiler import (
-    calculate_fwd_out,
-    calculate_fwd_tmp,
-    parameter_size,
-    activation_size,
-)
-
-CODEGEN_AVAILABLE = True
-__all__ = ["ChunkCodeGen"]
-
-
-def _delete_free_var_from_last_use(user_to_last_uses):
-    for key, value in user_to_last_uses.items():
-        for n in value:
-            if n.op == "placeholder":
-                user_to_last_uses[key].remove(n)
-
-
-def _get_node_shape(node):
-    if hasattr(node.meta["tensor_meta"], "shape"):
-        return node.meta["tensor_meta"].shape
-    return None
-
-
-def _is_non_compute_node(node):
-    if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(
-        i in node.name for i in ["getitem", "getattr"]
-    ):
-        return True
-    return False
-
-
-def _is_non_compute_node_except_placeholder(node):
-    if any(i in node.op for i in ["get_attr", "output"]) or any(
-        i in node.name for i in ["getitem", "getattr"]
-    ):
-        return True
-    return False
-
-
-def _is_non_compute_node_except_placeholder_output(node):
-    if any(i in node.op for i in ["get_attr"]) or any(
-        i in node.name for i in ["getitem", "getattr"]
-    ):
-        return True
-    return False
-
-
-class IndexTracer(object):
-    def __init__(self, node_list) -> None:
-        self.node_list = node_list
-        self.idx_trace_list = self._init_idx_trace_list()
-        self.idx_trace_equal = []
-        self.idx_view_list = {}
-        self.idx_count = -1
-        self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))}
-
-    def _init_idx_trace_list(self):
-        idx_trace_list = []
-        for n in self.node_list:
-            if _get_node_shape(n) != None:
-                cur_trace = {
-                    "idx": [None for _ in range(len(_get_node_shape(n)))],
-                    "compute": [[] for _ in range(len(_get_node_shape(n)))],
-                    "source": [{} for _ in range(len(_get_node_shape(n)))],
-                }
-            else:
-                cur_trace = {"idx": [], "compute": [], "source": []}
-            idx_trace_list.append(cur_trace)
-        return idx_trace_list
-
-    def _add_index(self):
-        """
-        Update the count and return it. To record the idx number.
-
-        Returns:
-            idx_count: int
-        """
-        self.idx_count += 1
-        return self.idx_count
-
-    def _del_dim(self, idx, dim_idx):
-        self.idx_trace_list[idx]["idx"].pop(dim_idx)
-        self.idx_trace_list[idx]["compute"].pop(dim_idx)
-        self.idx_trace_list[idx]["source"].pop(dim_idx)
-
-    def _add_dim(self, node_idx, dim_idx):
-        self.idx_trace_list[node_idx]["idx"].insert(dim_idx, self._add_index())
-        self.idx_trace_list[node_idx]["compute"].insert(dim_idx, [])
-        self.idx_trace_list[node_idx]["source"].insert(dim_idx, {})
-
-    def _transform_index(self, node, node_dim):
-        node_idx = self._find_idx_trace_from_node(node)
-        dims = list(range(len(node_idx)))
-        return dims[node_dim]
-
-    def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim):
-        node_from_dim = self._transform_index(node_from, node_from_dim)
-        node_to_dim = self._transform_index(node_to, node_to_dim)
-        node_from_trace = self._find_trace_from_node(node_from)
-        node_to_trace = self._find_trace_from_node(node_to)
-        node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim]
-        node_to_trace["compute"][node_to_dim] = copy.deepcopy(
-            node_from_trace["compute"][node_from_dim]
-        )
-        self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True)
-
-    def _inherit_all_computation(self, node_from, node_to):
-        node_from_compute = self._find_compute_trace_from_node(node_from)
-        node_to_compute = self._find_compute_trace_from_node(node_to)
-        assert len(node_from_compute) == len(node_to_compute)
-        for i in range(len(node_from_compute)):
-            self._add_source(node_from, i, node_to, i)
-            node_to_compute[i] = copy.deepcopy(node_from_compute[i])
-
-    def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False):
-        node_from_dim = self._transform_index(node_from, node_from_dim)
-        node_from_trace_source = self._find_source_trace_from_node(node_from)
-        node_to_dim = self._transform_index(node_to, node_to_dim)
-        node_to_trace_source = self._find_source_trace_from_node(node_to)
-        node_from_idx = _find_idx_by_name(node_from.name, self.node_list)
-        if init:
-            node_to_trace_source[node_to_dim] = {}
-        # add dim to cur new source
-        if node_from_idx not in node_to_trace_source[node_to_dim]:
-            node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim]
-        else:
-            if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]:
-                node_to_trace_source[node_to_dim][node_from_idx].append(node_from_dim)
-        # update inputs source
-        for node_idx, node_dim in node_from_trace_source[node_from_dim].items():
-            if node_idx not in node_to_trace_source[node_to_dim]:
-                node_to_trace_source[node_to_dim][node_idx] = copy.deepcopy(node_dim)
-            else:
-                for d in node_dim:
-                    if d not in node_to_trace_source[node_to_dim][node_idx]:
-                        node_to_trace_source[node_to_dim][node_idx].append(d)
-
-    def _mark_computation_from_node(self, node_from, node_to, exclude=None):
-        if exclude == None:
-            exclude = []
-        else:
-            exclude = [self._transform_index(node_to, i) for i in exclude]
-        node_from_compute = self._find_compute_trace_from_node(node_from)
-        node_to_compute = self._find_compute_trace_from_node(node_to)
-        # assert len(node_from_compute) == len(node_to_compute)
-        for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1):
-            if self._transform_index(node_to, i) in exclude:
-                continue
-            self._add_source(node_from, i, node_to, i)
-            for j in node_from_compute[i]:
-                if j not in node_to_compute[i]:
-                    node_to_compute[i].append(j)
-
-    def _mark_idx_equal(self, node1, dim1, node2, dim2):
-        """
-        Mark 2 index to be equal.
-
-        Args:
-            idx1 (int): index count.
-            idx2 (int): index count.
-        """
-        # node1_idx = _find_idx_by_name(node1.name, self.nodes_list)
-        # node2_idx = _find_idx_by_name(node2.name, self.nodes_list)
-        # if node1_idx > node2_idx:
-        #     self._add_source(node2, dim2, node1, dim1)
-        # else:
-        #     self._add_source(node1, dim1, node2, dim2)
-
-    def _mark_computation(self, node, idx, dim):
-        """
-        Mark some dims of node as computed.
-
-        Args:
-            node (node)
-            idx (int): node index
-            dim (list or int): dims to be marked as computed
-        """
-        if isinstance(dim, int):
-            dim = [dim]
-        dims = list(range(len(_get_node_shape(node))))
-        for d in dim:
-            cur_dim = dims[d]
-            if idx not in self.idx_trace_list[idx]["compute"][cur_dim]:
-                self.idx_trace_list[idx]["compute"][cur_dim].append(idx)
-
-    def _find_trace_from_node(self, node):
-        """
-        Find node idx and compute trace by the node.
-
-        Args:
-            node (node)
-        Returns:
-            idx (list): idx of the node
-            compute (list): computed idx of the node.
-        """
-        node_idx = _find_idx_by_name(node.name, self.node_list)
-        node_dict = self.idx_trace_list[node_idx]
-        return node_dict
-
-    def _find_source_trace_from_node(self, node):
-        """
-        Find node source trace by the node.
-
-        Args:
-            node (node)
-        Returns:
-            idx (list): idx of the node
-            compute (list): computed idx of the node.
-        """
-        node_idx = _find_idx_by_name(node.name, self.node_list)
-        node_dict = self.idx_trace_list[node_idx]
-        return node_dict["source"]
-
-    def _find_idx_trace_from_node(self, node):
-        """
-        Find node idx trace by the node.
-
-        Args:
-            node (node)
-        Returns:
-            idx (list): idx of the node
-        """
-        node_idx = _find_idx_by_name(node.name, self.node_list)
-        return self.idx_trace_list[node_idx]["idx"]
-
-    def _find_compute_trace_from_node(self, node):
-        """
-        Find node compute trace by the node.
-
-        Args:
-            node (node)
-        Returns:
-            compute (list): computed idx of the node.
-        """
-        node_idx = _find_idx_by_name(node.name, self.node_list)
-        return self.idx_trace_list[node_idx]["compute"]
-
-    def _assign_index_as_input(self, node, node_idx, input_node=None):
-        """
-        Assign node's trace as its input node.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        if input_node == None:
-            input_node = node.args[0]
-        input_node_idx = _find_idx_by_name(input_node.name, self.node_list)
-        input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"]
-
-        new_idx_trace = copy.deepcopy(input_node_idx_trace)
-        self.idx_trace_list[node_idx]["idx"] = new_idx_trace
-
-        self._inherit_all_computation(input_node, node)
-
-    def _assign_all_index(self, node, node_idx):
-        """
-        Add new index for all node's dims.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        shape = node.meta["tensor_meta"].shape
-        new_trace = []
-        for _ in shape:
-            new_trace.append(self._add_index())
-        self.idx_trace_list[node_idx]["idx"] = new_trace
-
-    def _assign_transpose_index(self, node, node_idx):
-        """
-        Assign index for transpose op.
-        1. swap input's dim according to transpose args
-        2. inherit input's computation
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        input_node = node.args[0]
-        tranpose_dim = node.args[1:]
-
-        self._assign_index_as_input(node, node_idx, input_node)
-        self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0])
-        self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1])
-
-    def _assign_permute_index(self, node, node_idx):
-        """
-        Assign index for permute op.
-        1. swap input's dim according to permute args
-        2. inherit input's computation
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        permute_dim = node.args[1:]
-        input_node = node.args[0]
-
-        self._assign_index_as_input(node, node_idx, input_node)
-        for idx, d in enumerate(permute_dim):
-            self._inherit_index(input_node, d, node, idx)
-
-    def _assign_linear_index(self, node, node_idx):
-        """
-        Assign index for linear op.
-        1. copy trace from input node and change last index accroding to weight
-        2. mark equal for input node last index, weight first dim and bias dim.
-        3. inherit input's computation, mark computation for last dim.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        if len(node.args) == 2:
-            input_node, weight = node.args
-            bias = None
-        else:
-            input_node, weight, bias = node.args
-
-        self._assign_index_as_input(node, node_idx)
-        self._inherit_index(weight, 1, node, -1)
-
-        self._mark_computation(node, node_idx, [-1])
-        self._mark_idx_equal(input_node, -1, weight, 0)
-
-        if bias:
-            self._mark_idx_equal(input_node, -1, bias, 0)
-
-    def _assign_matmul_index(self, node, node_idx):
-        """
-        Assign index for matmul op.
-        1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length)
-        2. mark equal for input matmul_left -1 index and matmul_right -2 dim.
-        3. inherit matmul_left and matmul_right computation, mark computation for last dim.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        matmul_left, matmul_right = node.args
-
-        assert len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right))
-        self._assign_index_as_input(node, node_idx, matmul_left)
-        self._inherit_index(matmul_right, -1, node, -1)
-
-        self._mark_computation_from_node(matmul_right, node, [-1, -2])
-        self._mark_computation(node, node_idx, [-1])
-        self._mark_idx_equal(matmul_left, -1, matmul_right, -2)
-
-    def _assign_layernorm_index(self, node, idx):
-        """
-        Assign index for layernorm op.
-        1. assign index as input node
-        2. inherit computation and mark last 2 dims as computed.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._assign_index_as_input(node, idx)
-        self._mark_computation(node, idx, [-1])
-
-    def _assign_elementwise_index(self, node, idx):
-        """
-        Assign index for element-wise op (eg. relu sigmoid add mul).
-        1. assign index as input node
-        2. inherit computation from all input nodes.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._assign_index_as_input(node, idx)
-        nodes_in = []
-        for node_in in node.args:
-            if type(node_in) == type(node):
-                nodes_in.append(node_in)
-                self._mark_computation_from_node(node_in, node)
-        assert len(nodes_in) <= 2
-        if len(nodes_in) == 2:
-            node_in0_shape = _get_node_shape(nodes_in[0])
-            node_in1_shape = _get_node_shape(nodes_in[1])
-            for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1):
-                if node_in0_shape[i] == node_in1_shape[i]:
-                    self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i)
-
-    def _assgin_no_change_index(self, node, idx):
-        self._assign_index_as_input(node, idx)
-        for node_in in node.args:
-            if type(node_in) == type(node):
-                self._mark_computation_from_node(node_in, node)
-
-    def _assign_einsum_index(self, node, idx):
-        """
-        Assign index for einsum op.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        patterns = node.args[0]
-        input_nodes = node.args[1:]
-
-        patterns = patterns.replace(" ", "")
-        left, right = patterns.split("->")
-        left = left.split(",")
-
-        all_index = []
-        for i in left:
-            for c in i:
-                all_index.append(c)
-        all_index = set(all_index)
-        free_index = set([i for i in right])
-        sum_index = all_index - free_index
-
-        for right_idx, right_indice in enumerate(right):
-            for left_idx, left_str in enumerate(left):
-                if right_indice in left_str:
-                    source_idx = left_str.index(right_indice)
-                    self._inherit_index(
-                        input_nodes[left_idx], source_idx, node, right_idx
-                    )
-
-        # for i in sum_index:
-        #     for left_idx, left_str in enumerate(left):
-        #         if i in left_str:
-        #             self._mark_computation(node, idx, left_str.index(i))
-        #             break
-
-    def _assign_softmax_index(self, node, idx):
-        """
-        Assign index for softmax op.
-        1. assign index as input node
-        2. inherit computation and mark softmax dim as computed.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._assign_index_as_input(node, idx)
-        self._mark_computation(node, idx, [node.kwargs["dim"]])
-
-    def _assign_unsqueeze_index(self, node, node_idx):
-        """
-        Assign index for unsqueeze op.
-        1. assign new index for unsqueeze dim
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._del_dim(node_idx, -1)
-        self._assign_index_as_input(node, node_idx)
-        self._add_dim(node_idx, node.args[1])
-
-    def _assign_dropout_index(self, node, node_idx):
-        """
-        Assign index for unsqueeze op.
-        1. assign new index for unsqueeze dim
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._assign_index_as_input(node, node_idx)
-
-    def _assign_ones_like_index(self, node, node_idx):
-        """
-        Assign index for oneslike op.
-        1. assign new index for all dim
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._assign_all_index(node, node_idx)
-
-    def _assign_view_reshape_index(self, node, node_idx):
-        """
-        Assign index for view and reshape op.
-        1. get origin shape and target shape by meta info.
-        2. compute the real value of -1 in target shape.
-        3. determine changed dim, and assgin index for generated dim.
-        4. log changed dim and generated dim for restore
-        5. inherit computation.
-        6. TODO: look into view list to see whether the view is associated with other,
-           if so assgin equal dim according to previous view.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        # get data, turn into number
-        origin_node = node.args[0]
-        origin_shape = origin_node.meta["tensor_meta"].shape
-        target_shape = []
-        for i in range(1, len(node.args)):
-            if isinstance(node.args[i], int):
-                target_shape.append(node.args[i])
-            else:
-                target_shape.append(node.args[i].meta["fwd_out"][0])
-
-        # compute the value of -1
-        if -1 in target_shape:
-            origin_product = 1
-            for i in origin_shape:
-                origin_product *= i
-            target_product = -1
-            for i in target_shape:
-                target_product *= i
-            shape_idx = target_shape.index(-1)
-            target_shape[shape_idx] = origin_product // target_product
-
-        # determine changed dim
-        len_diff = len(origin_shape) - len(target_shape)
-        if len_diff == 1:
-            # dim merge
-            dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)]
-            dim_to = [dim_equal.index(False)]
-            dim_from = [dim_equal.index(False), dim_equal.index(False) + 1]
-            self._add_dim(node_idx, -1)
-        elif len_diff == -1:
-            # dim expand
-            dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])]
-            dim_from = [dim_equal.index(False)]
-            dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
-            self._del_dim(node_idx, -1)
-        else:
-            raise NotImplementedError(
-                "shape"
-                + str(origin_shape)
-                + "and"
-                + str(target_shape)
-                + "view not implemented"
-            )
-
-        # get new index
-        origin_trace = self._find_idx_trace_from_node(origin_node)
-        self._assign_index_as_input(node, node_idx, origin_node)
-        dim_from.reverse()
-        for i in dim_from:
-            self._del_dim(node_idx, i)
-        for i in dim_to:
-            self._add_dim(node_idx, i)
-
-        # inherit computation
-        compute_log = self._find_compute_trace_from_node(origin_node)
-        for i in dim_from:
-            if origin_trace[i] in compute_log:
-                for j in dim_to:
-                    self._mark_computation(node, node_idx, [j])
-                break
-
-        # log view, not used now
-        view_dict = {
-            "idx_from": [origin_trace[i] for i in dim_from],
-            "dim_from": dim_from,
-            "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in dim_to],
-            "dim_to": dim_to,
-        }
-        self.idx_view_list[node] = view_dict
-
-    def _merge_equal_idx(self):
-        idx_equal = copy.deepcopy(self.idx_trace_equal)
-        idx_equal.reverse()
-        for idx in idx_equal:
-            merge_to = min(idx)
-            merge_from = max(idx)
-            for trace in self.idx_trace_list:
-                if merge_from in trace["idx"]:
-                    trace["idx"] = [
-                        merge_to if i == merge_from else i for i in trace["idx"]
-                    ]
-
-    def trace_index(self):
-        for idx, node in enumerate(self.node_list):
-            if node.op == "placeholder":
-                self._assign_all_index(node, idx)
-            elif node.op == "call_method":
-                if "transpose" in node.name:
-                    self._assign_transpose_index(node, idx)
-                elif "permute" in node.name:
-                    self._assign_permute_index(node, idx)
-                elif "view" in node.name or "reshape" in node.name:
-                    self._assign_view_reshape_index(node, idx)
-                elif "unsqueeze" in node.name:
-                    self._assign_unsqueeze_index(node, idx)
-                elif any(i in node.name for i in ["to", "contiguous"]):
-                    self._assgin_no_change_index(node, idx)
-                else:
-                    raise NotImplementedError(node.name, "method not implemented yet!")
-            elif node.op == "call_function":
-                if "linear" in node.name:
-                    self._assign_linear_index(node, idx)
-                elif "matmul" in node.name:
-                    self._assign_matmul_index(node, idx)
-                elif "softmax" in node.name:
-                    self._assign_softmax_index(node, idx)
-                elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu"]):
-                    self._assign_elementwise_index(node, idx)
-                elif "ones_like" in node.name:
-                    self._assign_ones_like_index(node, idx)
-                elif "dropout" in node.name:
-                    self._assign_dropout_index(node, idx)
-                elif "einsum" in node.name:
-                    self._assign_einsum_index(node, idx)
-                elif "getattr" in node.name:
-                    continue  # get attr like shape
-                elif "getitem" in node.name:
-                    continue  # get item in list
-                else:
-                    raise NotImplementedError(
-                        node.name, "function not implemented yet!"
-                    )
-            elif node.op == "call_module":
-                if any(n in node.name for n in ["layernorm", "norm"]):
-                    self._assign_layernorm_index(node, idx)
-                else:
-                    raise NotImplementedError(node.name, "module not implemented yet!")
-            elif node.op == "get_attr":
-                self._assign_all_index(node, idx)  # get param
-            elif node.op == "output":
-                continue
-            else:
-                raise NotImplementedError(node.op, "op not implemented yet!")
-        # self._merge_equal_idx()
-
-    def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
-        """
-        Check 2 given index: one index should be source of the other
-        Args:
-            start_idx(int): start node chunk dim
-            start_node(node): start node
-            end_idx(int): end node chunk dim
-            end_node(node): end node
-
-        Returns:
-            bool: True if check pass
-        """
-        start_node_idx = _find_idx_by_name(start_node.name, self.node_list)
-        end_node_trace = self._find_trace_from_node(end_node)
-        end_node_trace_source = end_node_trace["source"][end_dim]
-        sorted_source = sorted(
-            end_node_trace_source.items(), key=lambda d: d[0], reverse=True
-        )
-        for node_idx, node_dim in sorted_source:
-            if node_idx == start_node_idx and start_dim in node_dim:
-                return True
-            # it means we meet a node outside the loop, and the node is not input node
-            if node_idx < start_idx:
-                return False
-        return False
-
-    def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
-        """
-        Check 2 given index: check they haven't been computed in the source trace.
-        Args:
-            start_idx(int): start node chunk dim
-            start_node(node): start node
-            end_idx(int): end node chunk dim
-            end_node(node): end node
-
-        Returns:
-            bool: True if check pass
-        """
-        end_node_trace = self._find_trace_from_node(end_node)
-        end_node_compute = end_node_trace["compute"][end_dim]
-        if any(start_idx <= i <= end_idx for i in end_node_compute):
-            return False
-        return True
-
-    def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
-        node_from_source = self._find_source_trace_from_node(node_from)
-        dim_source = node_from_source[node_from_dim]
-        node_to_idx = _find_idx_by_name(node_to.name, self.node_list)
-        for k, v in dim_source.items():
-            if k == node_to_idx:
-                return v
-        return None
-
-    def _find_inherit_dim(self, input_node, input_dim, node):
-        input_node_idx = _find_idx_by_name(input_node.name, self.node_list)
-        node_trace_source = self._find_source_trace_from_node(node)
-        for node_dim in range(len(_get_node_shape(node))):
-            if (
-                input_node_idx in node_trace_source[node_dim]
-                and input_dim[0] in node_trace_source[node_dim][input_node_idx]
-            ):
-                return node_dim
-        return None
-
-    def check_index_duplicate(self, chunk_infos, return_dim=False):
-        input_dim_after_node = {}
-        for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
-            for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
-                inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k])
-                if inherit_dim:
-                    input_dim_after_node[k] = inherit_dim
-
-        for node in self.node_list[
-            chunk_infos["region"][0] : chunk_infos["region"][1] + 1
-        ]:
-            if _is_non_compute_node_except_placeholder(node):
-                continue
-            count = 0
-            duplicate_dims = []
-            node_trace_source = self._find_source_trace_from_node(node)
-            for node_dim in range(len(_get_node_shape(node))):
-                duplicate_dim = []
-                duplicate_flag = False
-                dim_source = node_trace_source[node_dim]
-                for k, v in dim_source.items():
-                    if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]:
-                        if k in input_dim_after_node and input_dim_after_node[k] in v:
-                            duplicate_flag = True
-                            duplicate_dim.append((k, v))
-                duplicate_dims.append(duplicate_dim)
-                if duplicate_flag:
-                    count += 1
-
-            if count > 1:
-                if return_dim:
-                    return False, duplicate_dims
-                else:
-                    return False
-        if return_dim:
-            return True, None
-        else:
-            return True
-
-    def _assgin_single_node_flow(
-        self,
-        arg_node,
-        start_idx,
-        end_idx,
-        cur_node_dim,
-        cur_node_compute,
-        cur_node_source,
-        cur_node_fix_dim,
-        all_node_info,
-        next_node_list,
-    ):
-        arg_idx = _find_idx_by_name(arg_node.name, self.node_list)
-        # arg in chunk range or be inputs
-        if not (start_idx <= arg_idx < end_idx):
-            return True
-
-        # find arg dim
-        if cur_node_dim is not None:
-            # dim is computed
-            if arg_idx in cur_node_compute[cur_node_dim]:
-                return False
-            if arg_idx not in cur_node_source[cur_node_dim]:
-                arg_dim = None
-            else:
-                arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
-        else:
-            arg_dim = None
-
-        # get fix dim
-        arg_fix_dim = []
-        if cur_node_dim is not None:
-            for i in cur_node_fix_dim:
-                fix_dim_source = cur_node_source[i]
-                if arg_idx in fix_dim_source:
-                    arg_fix_dim.append(fix_dim_source[arg_idx][0])
-
-        # if already in node_info, arg dim must be same
-        if arg_node in all_node_info:
-            if all_node_info[arg_node]["chunk_dim"] != arg_dim:
-                return False
-            all_node_info[arg_node]["fix_dim"] = list(
-                set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
-            )
-        # else add it to list
-        else:
-            all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim}
-
-        next_node_list.append(arg_node)
-        return True
-
-    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = _find_chunk_compute_input_and_output_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        # only single ouput
-        if len(outputs) > 1:
-            return None
-
-        cur_node_list = [self.node_list[end_idx]]  # start from the last node
-        all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
-
-        while len(cur_node_list) > 0:
-            next_node_list = []
-
-            for cur_node in cur_node_list:
-                # get cur node info
-                cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
-                cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
-                cur_node_idx = _find_idx_by_name(cur_node.name, self.node_list)
-                if cur_node_chunk_dim:
-                    cur_node_compute = self._find_compute_trace_from_node(cur_node)
-                    cur_node_source = self._find_source_trace_from_node(cur_node)
-                else:
-                    cur_node_compute = cur_node_source = None
-
-                # get all valid args
-                arg_list = []
-                for arg in cur_node.args:
-                    if type(arg) != type(cur_node):
-                        continue
-                    if _is_non_compute_node(arg):
-                        continue
-                    arg_list.append(arg)
-                    flow_flag = self._assgin_single_node_flow(
-                        arg,
-                        start_idx,
-                        end_idx,
-                        cur_node_chunk_dim,
-                        cur_node_compute,
-                        cur_node_source,
-                        cur_node_fix_dim,
-                        all_node_info,
-                        next_node_list,
-                    )
-                    if flow_flag == False:
-                        return None
-
-                if len(arg_list) == 2:
-                    if any(i in cur_node.name for i in ["add", "mul"]):
-                        for arg in arg_list:
-                            if not (
-                                start_idx
-                                <= _find_idx_by_name(arg.name, self.node_list)
-                                < end_idx
-                            ):
-                                continue
-                            arg_chunk_dim = all_node_info[arg]["chunk_dim"]
-                            arg_fix_dim = all_node_info[arg]["fix_dim"]
-                            arg_shape = _get_node_shape(arg)
-                            # add all dim as fix dim except chunk dim
-                            for i, shape in enumerate(arg_shape):
-                                if shape != 1 and i != cur_node_chunk_dim:
-                                    if i == arg_chunk_dim:
-                                        return None
-                                    if i not in arg_fix_dim:
-                                        arg_fix_dim.append(i)
-                    elif "einsum" in cur_node.name:
-                        pass
-                    elif "matmul" in cur_node.name:
-                        pass
-                    else:
-                        raise NotImplementedError()
-            cur_node_list = next_node_list
-
-        inputs_dim = []
-        remove_inputs = []
-        for input_node in inputs:
-            input_dict = {}
-            input_node_idx = _find_idx_by_name(input_node.name, self.node_list)
-            for user in input_node.users.keys():
-                if _is_non_compute_node(user):
-                    continue
-                user_idx = _find_idx_by_name(user.name, self.node_list)
-                if start_idx <= user_idx <= end_idx:
-                    chunk_dim = all_node_info[user]["chunk_dim"]
-                    if chunk_dim is not None:
-                        user_source = self._find_source_trace_from_node(user)[chunk_dim]
-                        if input_node_idx in user_source:
-                            input_dict[user_idx] = user_source[input_node_idx]
-                        else:
-                            return None
-            if len(input_dict) == 0:
-                remove_inputs.append(input_node)
-            else:
-                inputs_dim.append(input_dict)
-        for i in remove_inputs:
-            if i in inputs:
-                inputs.remove(i)
-
-        chunk_info = {
-            "region": (start_idx, end_idx),
-            "inputs": inputs,
-            "inputs_non_chunk": [],
-            "inputs_dim": inputs_dim,
-            "outputs": outputs,
-            "outputs_dim": end_dim,
-            "node_chunk_dim": all_node_info,
-            "args": {},
-        }
-
-        # move useless nodes ahead of loop
-        # get all possible prepose nodes
-        maybe_prepose_nodes = []
-        for node, node_info in all_node_info.items():
-            if node_info["chunk_dim"] is None:
-                maybe_prepose_nodes.append(node)
-        maybe_prepose_nodes.sort(
-            key=lambda x: _find_idx_by_name(x.name, self.node_list),
-            reverse=True,
-        )  # from last node to first node
-        prepose_nodes = []
-        # set every node as root, search its args, if all legal, turn root and args as prepose nodes
-        while len(maybe_prepose_nodes) > 0:
-            tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]]
-            tmp_cur_related_prepose_nodes = []
-            prepose_flag = True
-
-            # loop cur node's all arg until out of chunk
-            while len(tmp_cur_prepose_nodes) > 0:
-                if prepose_flag == False:
-                    break
-                tmp_next_prepose_nodes = []
-                tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes)
-                for cur_prepose_node in tmp_cur_prepose_nodes:
-                    if prepose_flag == False:
-                        break
-                    for cur_prepose_node_arg in cur_prepose_node.args:
-                        if type(cur_prepose_node_arg) != type(cur_prepose_node):
-                            continue
-                        # out of loop
-                        if not (
-                            start_idx
-                            <= _find_idx_by_name(
-                                cur_prepose_node_arg.name, self.node_list
-                            )
-                            < end_idx
-                        ):
-                            continue
-                        # compute op in loop
-                        elif cur_prepose_node_arg in all_node_info:
-                            if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None:
-                                tmp_next_prepose_nodes.append(cur_prepose_node_arg)
-                            else:
-                                prepose_flag = False
-                                break
-                        # non compute op
-                        else:
-                            tmp_next_prepose_nodes.append(cur_prepose_node_arg)
-                tmp_cur_prepose_nodes = tmp_next_prepose_nodes
-
-            if prepose_flag == False:
-                maybe_prepose_nodes.remove(maybe_prepose_nodes[0])
-                continue
-            else:
-                for n in tmp_cur_related_prepose_nodes:
-                    if n not in prepose_nodes:
-                        prepose_nodes.append(n)
-                    if n in maybe_prepose_nodes:
-                        maybe_prepose_nodes.remove(n)
-        # sort by index
-        prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, self.node_list))
-        chunk_info["args"]["prepose_nodes"] = prepose_nodes
-
-        # we need to log input nodes to avoid deleteing them in the loop
-        chunk_node_list = self.node_list[start_idx : end_idx + 1]
-        # also need to get some prepose node's arg out of non_chunk_inputs
-        for n in prepose_nodes:
-            chunk_node_list.remove(n)
-        non_chunk_inputs = _find_chunk_all_input_nodes(chunk_node_list)
-        for i in non_chunk_inputs:
-            if i not in chunk_info["inputs"]:
-                chunk_info["inputs_non_chunk"].append(i)
-
-        # reassgin reshape size, some size may have changed due to chunk
-        chunk_info = self._reassgin_reshape_size(chunk_info)
-
-        return chunk_info
-
-    def _reassgin_reshape_size(self, chunk_info):
-        chunk_region = chunk_info["region"]
-        reshape_size = {}
-        chunk_shape = _get_node_shape(chunk_info["outputs"][0])[
-            chunk_info["outputs_dim"]
-        ]
-        for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]:
-            if any(i in node.name for i in ["reshape", "view"]):
-                reshape_args = node.args[1:]
-                reshape_log = self.idx_view_list[node]
-                chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
-                reshape_size[node.name] = {}
-                for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
-                    if reshape_arg_dim in reshape_log["dim_to"]:
-                        continue
-                    if reshape_arg_dim == chunk_dim:
-                        reshape_size[node.name][reshape_arg.name] = (
-                            "min(chunk_size, %d - chunk_idx)" % chunk_shape
-                        )
-        chunk_info["reshape_size"] = reshape_size
-        return chunk_info
-
-    def _get_reorder_map(self, chunk_info):
-        reorder_map = {i: i for i in range(len(self.node_list))}
-
-        chunk_region_start = chunk_info["region"][0]
-        chunk_region_end = chunk_info["region"][1]
-        chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
-        chunk_prepose_nodes_idx = [
-            _find_idx_by_name(i.name, self.node_list) for i in chunk_prepose_nodes
-        ]
-        # put prepose nodes ahead
-        for idx, n in enumerate(chunk_prepose_nodes):
-            n_idx = chunk_prepose_nodes_idx[idx]
-            reorder_map[n_idx] = chunk_region_start + idx
-        # put other nodes after prepose nodes
-        for n in self.node_list[chunk_region_start : chunk_region_end + 1]:
-            if n in chunk_prepose_nodes:
-                continue
-            n_idx = _find_idx_by_name(n.name, self.node_list)
-            pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
-            reorder_map[n_idx] = n_idx + pos
-
-        return reorder_map
-
-    def _reorder_chunk_info(self, chunk_info, reorder_map):
-        # update chunk info
-        chunk_info["region"] = (
-            chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]),
-            chunk_info["region"][1],
-        )
-        new_inputs_dim = []
-        for idx, input_dim in enumerate(chunk_info["inputs_dim"]):
-            new_input_dim = {}
-            for k, v in input_dim.items():
-                new_input_dim[reorder_map[k]] = v
-            new_inputs_dim.append(new_input_dim)
-        chunk_info["inputs_dim"] = new_inputs_dim
-        return chunk_info
-
-    def _update_all_reorder_map(self, reorder_map):
-        for origin_idx, map_idx in self.all_reorder_map.items():
-            self.all_reorder_map[origin_idx] = reorder_map[map_idx]
-
-    def _reorder_self_node_list(self, reorder_map):
-        new_node_list = [None for _ in range(len(self.node_list))]
-        for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = self.node_list[old_idx]
-        self.node_list = new_node_list
-
-    def _reorder_idx_trace(self, reorder_map):
-        # reorder list
-        new_idx_trace_list = [None for _ in range(len(self.idx_trace_list))]
-        for old_idx, new_idx in reorder_map.items():
-            new_idx_trace_list[new_idx] = self.idx_trace_list[old_idx]
-        self.idx_trace_list = new_idx_trace_list
-        # update compute
-        for idx_trace in self.idx_trace_list:
-            compute = idx_trace["compute"]
-            for dim_compute in compute:
-                for idx, i in enumerate(dim_compute):
-                    dim_compute[idx] = reorder_map[i]
-        # update source
-        for idx_trace in self.idx_trace_list:
-            source = idx_trace["source"]
-            for dim_idx, dim_source in enumerate(source):
-                new_dim_source = {}
-                for k, v in dim_source.items():
-                    new_dim_source[reorder_map[k]] = v
-                source[dim_idx] = new_dim_source
-
-    def reorder_all(self, chunk_info):
-        if chunk_info is None:
-            return chunk_info
-        if len(chunk_info["args"]["prepose_nodes"]) == 0:
-            return chunk_info
-        reorder_map = self._get_reorder_map(chunk_info)
-        self._update_all_reorder_map(reorder_map)
-        self._reorder_idx_trace(reorder_map)
-        self._reorder_self_node_list(reorder_map)
-        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
-        return chunk_info
-
-    def reorder_node_list(self, node_list):
-        new_node_list = [None for _ in range(len(node_list))]
-        for old_idx, new_idx in self.all_reorder_map.items():
-            new_node_list[new_idx] = node_list[old_idx]
-        return new_node_list
-
-    def tmp_reorder(self, node_list, chunk_info):
-        if len(chunk_info["args"]["prepose_nodes"]) == 0:
-            return node_list, chunk_info
-        reorder_map = self._get_reorder_map(chunk_info)
-
-        # new tmp node list
-        new_node_list = [None for _ in range(len(node_list))]
-        for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = node_list[old_idx]
-
-        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
-        return new_node_list, chunk_info
-
-
-class MemoryEstimator(object):
-    def __init__(self, index_tracer: IndexTracer) -> None:
-        pass
-
-    def _get_meta_node_size(self, x):
-        x = x.meta["tensor_meta"]
-        x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
-        return x
-
-    def _get_output_node(self, n):
-        fwd_out = {
-            x.uuid: x
-            for x in n.meta["fwd_out"]
-            if isinstance(x, torch.Tensor) and hasattr(x, "uuid")
-        }
-        out_size = activation_size(fwd_out)
-        out_node = [n.name] if out_size > 0 else []
-        # if any(i in n.name for i in ['transpose', 'permute', 'view']):
-        #     out_size = 0
-        return out_size, out_node
-
-    def _get_output_node_size(self, n):
-        return self._get_output_node(n)[0]
-
-    def _add_active_node(self, n, active_list):
-        new_active = self._get_output_node(n)[1]
-        if n.op == "placeholder":
-            new_active.append(n.name)
-        for i in new_active:
-            if i not in active_list:
-                active_list.append(i)
-
-    def _get_delete_node(self, user, user_to_last_uses, to_keep=None):
-        delete_size = 0
-        delete_node = []
-        if user.op not in ("output",):
-            nodes_to_delete = user_to_last_uses.get(user, [])
-            if to_keep is not None:
-                keep_list = []
-                for n in nodes_to_delete:
-                    if n.name in to_keep:
-                        keep_list.append(n)
-                for n in keep_list:
-                    if n in nodes_to_delete:
-                        nodes_to_delete.remove(n)
-            if len(nodes_to_delete):
-                out_node = [self._get_output_node(i) for i in nodes_to_delete]
-                delete_size = sum([i[0] for i in out_node])
-                for i in range(len(out_node)):
-                    if out_node[i][0] > 0:
-                        delete_node.append(out_node[i][1][0])
-                    elif nodes_to_delete[i].op == "placeholder":
-                        delete_node.append(nodes_to_delete[i].name)
-                    # elif any(j in nodes_to_delete[i].name for j in ['transpose', 'permute', 'view']):
-                    #     delete_node.append(nodes_to_delete[i].name)
-        return delete_size, delete_node
-
-    def _get_delete_node_size(self, user, user_to_last_uses, to_keep):
-        return self._get_delete_node(user, user_to_last_uses, to_keep)[0]
-
-    def _remove_deactive_node(self, user, user_to_last_uses, active_list):
-        delete_node = self._get_delete_node(user, user_to_last_uses)[1]
-        for i in delete_node:
-            if i in active_list:
-                active_list.remove(i)
-
-    def _get_chunk_inputs_size(
-        self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx
-    ):
-        nodes_to_delete = []
-        for chunk_input in chunk_inputs + chunk_inputs_non_chunk:
-            chunk_input_users = chunk_input.users.keys()
-            chunk_input_users_idx = [
-                _find_idx_by_name(i.name, node_list) for i in chunk_input_users
-            ]
-            if all(i <= chunk_end_idx for i in chunk_input_users_idx):
-                if chunk_input not in nodes_to_delete:
-                    nodes_to_delete.append(chunk_input)
-        out_node = [self._get_output_node(i) for i in nodes_to_delete]
-        delete_size = sum([i[0] for i in out_node])
-        return delete_size
-
-    def _get_last_usr(self, nodes):
-        node_to_last_use: Dict[Node, Node] = {}
-        user_to_last_uses: Dict[Node, List[Node]] = {}
-
-        def register_last_uses(n: Node, user: Node):
-            if n not in node_to_last_use:
-                node_to_last_use[n] = user
-                user_to_last_uses.setdefault(user, []).append(n)
-
-        for node in reversed(nodes):
-            map_arg(node.args, lambda n: register_last_uses(n, node))
-            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-        return user_to_last_uses
-
-    def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
-        mem = 0
-        not_contiguous_ops = ["permute"]
-        inherit_contiguous_ops = ["transpose", "view"]
-
-        if node.op == "call_function" and any(
-            n in node.name for n in ["matmul", "reshape"]
-        ):
-            for n in node.args:
-                if n in not_contiguous_list:
-                    # matmul won't change origin tensor, but create a tmp copy
-                    mem += self._get_output_node_size(n)
-        elif node.op == "call_module":
-            for n in node.args:
-                if n in not_contiguous_list:
-                    # module will just make origin tensor to contiguous
-                    if delete:
-                        not_contiguous_list.remove(n)
-        elif node.op == "call_method" and any(
-            i in node.name for i in not_contiguous_ops
-        ):
-            if node not in not_contiguous_list:
-                not_contiguous_list.append(node)
-        return mem
-
-    def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size):
-        if node not in chunk_node_dim:
-            return 1.0
-        node_shape = _get_node_shape(node)
-        chunk_dim = chunk_node_dim[node]["chunk_dim"]
-        if chunk_dim is None:
-            return 1.0
-        else:
-            return float(chunk_size) / node_shape[chunk_dim]
-
-    def _get_chunk_delete_node_size(
-        self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names
-    ):
-        # if any(j in user.name for j in ['transpose', 'permute', 'view']):
-        #     return 0
-        if user.op in ("placeholder", "output"):
-            return 0
-        nodes_to_delete = user_to_last_uses.get(user, [])
-        delete_size = 0
-        for n in nodes_to_delete:
-            if n.name in chunk_inputs_names:
-                continue
-            delete_size += self._get_output_node_size(n) * chunk_ratio
-        return delete_size
-
-    def _print_mem_log(self, log, nodes, title=None):
-        if title:
-            print(title)
-        for idx, (l, n) in enumerate(zip(log, nodes)):
-            print("%s:%.2f \t" % (n.name, l), end="")
-            if (idx + 1) % 3 == 0:
-                print("")
-        print("\n")
-
-    def _print_compute_op_mem_log(self, log, nodes, title=None):
-        if title:
-            print(title)
-        for idx, (l, n) in enumerate(zip(log, nodes)):
-            if n.op in ["placeholder", "get_attr", "output"]:
-                continue
-            if any(i in n.name for i in ["getitem", "getattr"]):
-                continue
-            print("%s:%.2f \t" % (n.name, l), end="")
-            if (idx + 1) % 3 == 0:
-                print("")
-        print("\n")
-
-    def estimate_chunk_inference_mem(
-        self,
-        node_list,
-        chunk_infos=None,
-        print_mem=False,
-    ):
-        act_memory = 0.0
-        act_memory_peak_log = []
-        act_memory_after_node_log = []
-        active_node_list = []
-        active_node_list_log = []
-        not_contiguous_list = []
-        user_to_last_uses = self._get_last_usr(node_list)
-        user_to_last_uses_no_free_var = self._get_last_usr(node_list)
-        _delete_free_var_from_last_use(user_to_last_uses_no_free_var)
-
-        use_chunk = True if chunk_infos is not None else False
-        chunk_within = False
-        chunk_region_idx = None
-        chunk_ratio = 1  # use it to estimate chunk mem
-        chunk_inputs_names = []
-
-        if use_chunk:
-            chunk_regions = [i["region"] for i in chunk_infos]
-            chunk_starts = [i[0] for i in chunk_regions]
-            chunk_ends = [i[1] for i in chunk_regions]
-            chunk_inputs = [i["inputs"] for i in chunk_infos]
-            chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
-            chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
-                j.name for i in chunk_inputs_non_chunk for j in i
-            ]
-            chunk_outputs = [i["outputs"][0] for i in chunk_infos]
-            chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos]
-            chunk_sizes = [
-                i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos
-            ]
-
-        for idx, node in enumerate(node_list):
-            # if node in chunk start nodes, change chunk ratio and add chunk_tensor
-            if use_chunk and idx in chunk_starts:
-                chunk_within = True
-                chunk_region_idx = chunk_starts.index(idx)
-                act_memory += self._get_output_node_size(
-                    chunk_outputs[chunk_region_idx]
-                ) / (1024**2)
-
-            # determine chunk ratio for current node
-            if chunk_within:
-                chunk_ratio = self._get_chunk_ratio(
-                    node,
-                    chunk_node_dim[chunk_region_idx],
-                    chunk_sizes[chunk_region_idx],
-                )
-
-            # if node is placeholder, just add the size of the node
-            if node.op == "placeholder":
-                act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2)
-                act_memory_peak_log.append(act_memory)
-            # skip output
-            elif node.op == "output":
-                continue
-            # no change for non compute node
-            elif _is_non_compute_node_except_placeholder(node):
-                act_memory_peak_log.append(act_memory)
-            # node is a compute op
-            # calculate tmp, output node and delete node memory
-            else:
-                # forward memory
-                # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose
-                act_memory += (
-                    self._get_contiguous_memory(node, not_contiguous_list)
-                    * chunk_ratio
-                    / (1024**2)
-                )
-                act_memory += (
-                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
-                )
-                # record max act memory
-                act_memory_peak_log.append(act_memory)
-                # delete useless memory
-                act_memory -= (
-                    self._get_contiguous_memory(node, not_contiguous_list, delete=True)
-                    * chunk_ratio
-                    / (1024**2)
-                )
-                # delete unused vars not in chunk_input_list
-                # we can't delete input nodes until chunk ends
-                if chunk_within:
-                    act_memory -= self._get_chunk_delete_node_size(
-                        node,
-                        user_to_last_uses_no_free_var,
-                        chunk_ratio,
-                        chunk_inputs_names,
-                    ) / (1024**2)
-                else:
-                    act_memory -= self._get_delete_node_size(
-                        node, user_to_last_uses_no_free_var, chunk_inputs_names
-                    ) / (1024**2)
-
-            # log active node, only effective without chunk
-            self._add_active_node(node, active_node_list)
-            self._remove_deactive_node(node, user_to_last_uses, active_node_list)
-
-            # if node in chunk end nodes, restore chunk settings
-            if use_chunk and idx in chunk_ends:
-                act_memory -= (
-                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
-                )
-                act_memory -= self._get_chunk_inputs_size(
-                    chunk_inputs[chunk_region_idx],
-                    chunk_inputs_non_chunk[chunk_region_idx],
-                    node_list,
-                    chunk_regions[chunk_region_idx][1],
-                ) / (1024**2)
-                chunk_within = False
-                chunk_ratio = 1
-                chunk_region_idx = None
-
-            act_memory_after_node_log.append(act_memory)
-            active_node_list_log.append(copy.deepcopy(active_node_list))
-
-        if print_mem:
-            print("with chunk" if use_chunk else "without chunk")
-            # self._print_mem_log(act_memory_peak_log, node_list, "peak")
-            # self._print_mem_log(act_memory_after_node_log, node_list, "after")
-            self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
-            # self._print_compute_op_mem_log(
-            #     act_memory_after_node_log, node_list, "after"
-            # )
-
-        # param_memory = parameter_size(gm)
-        # all_memory = act_memory + param_memory
-        return act_memory_peak_log, act_memory_after_node_log, active_node_list_log
-
-
-class ChunkSelector(object):
-    def __init__(
-        self,
-        index_tracer: IndexTracer,
-        memory_estimator: MemoryEstimator,
-        max_memory=None,
-    ):
-        self.index_tracer = index_tracer
-        self.memory_estimator = memory_estimator
-        if max_memory is not None:
-            self.stratge = "fit_memory"
-            self.max_memory = max_memory  # MB
-        else:
-            self.stratge = "min_memory"
-
-    def _select_best_chunk_region(
-        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
-    ):
-        if self.stratge == "min_memory":
-            best_region = self._select_min_memory_chunk_region(
-                possible_chunk_regions,
-                chunk_infos,
-                peak_node,
-                max_chunk_region,
-                mem_peak,
-            )
-        elif self.stratge == "fit_memory":
-            best_region = self._select_fit_memory_chunk_region(
-                possible_chunk_regions,
-                chunk_infos,
-                peak_node,
-                max_chunk_region,
-                mem_peak,
-            )
-        else:
-            raise RuntimeError()
-        return best_region
-
-    def _select_fit_memory_chunk_region(
-        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
-    ):
-        # stop chunk if max memory satisfy memory limit
-        if max(mem_peak) < self.max_memory:
-            return None
-
-        # remove illegal regions
-        illegal_regions = []
-        for i in possible_chunk_regions:
-            if not self._is_legal_region(i, chunk_infos):
-                illegal_regions.append(i)
-        for i in illegal_regions:
-            if i in possible_chunk_regions:
-                possible_chunk_regions.remove(i)
-
-        if len(possible_chunk_regions) == 0:
-            return None
-
-        # get mem for chunk region
-        regions_dict = []
-        for region in possible_chunk_regions:
-            cur_region = region.copy()
-            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
-                self.index_tracer.node_list, cur_region
-            )
-            cur_chunk_infos = chunk_infos + [cur_region]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                cur_node_list, cur_chunk_infos
-            )[0]
-            cur_chunk_region_peak = cur_mem_peak[
-                max_chunk_region[0] : max_chunk_region[1] + 1
-            ]
-            cur_chunk_region_max_peak = max(cur_chunk_region_peak)
-            if cur_chunk_region_max_peak < self.max_memory:
-                regions_dict.append(
-                    {
-                        "chunk_info": region,
-                        "chunk_max_mem": cur_chunk_region_max_peak,
-                        "chunk_len": self._get_compute_node_num(
-                            region["region"][0], region["region"][1]
-                        ),
-                        "reorder_chunk_info": cur_region,
-                        "reorder_node_list": cur_node_list,
-                    }
-                )
-        # no region found
-        if len(regions_dict) == 0:
-            raise RuntimeError("Search failed. Try a larger memory threshold.")
-
-        # select the min chunk len
-        chunk_len = [i["chunk_len"] for i in regions_dict]
-        best_region_idx = chunk_len.index(min(chunk_len))
-        best_region = regions_dict[best_region_idx]
-
-        # get max chunk size
-        best_region = self._get_fit_chunk_size(best_region, chunk_infos)
-        return best_region
-
-    def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
-        chunk_size = 1
-        reorder_chunk_info = chunk_region_dict["reorder_chunk_info"]
-        reorder_chunk_info["chunk_size"] = chunk_size
-        cur_chunk_max_mem = 0
-        # search a region
-        while cur_chunk_max_mem < self.max_memory:
-            chunk_size *= 2
-            reorder_chunk_info["chunk_size"] = chunk_size
-            cur_chunk_infos = chunk_infos + [reorder_chunk_info]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                chunk_region_dict["reorder_node_list"], cur_chunk_infos
-            )[0]
-            cur_chunk_max_mem = max(
-                cur_mem_peak[
-                    reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1]
-                    + 1
-                ]
-            )
-        # search exact size
-        chunk_info = chunk_region_dict["chunk_info"]
-        chunk_info["chunk_size"] = self._chunk_size_binary_search(
-            chunk_size // 2, chunk_size, chunk_region_dict, chunk_infos
-        )
-        return chunk_info
-
-    def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos):
-        if l >= 16:
-            gap = 4
-        else:
-            gap = 1
-        chunk_info = chunk_region_dict["reorder_chunk_info"]
-        while r >= l + gap:
-            mid = int((l + r) / 2 + 0.5)
-            chunk_info["chunk_size"] = mid
-            cur_chunk_infos = chunk_infos + [chunk_info]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                chunk_region_dict["reorder_node_list"], cur_chunk_infos
-            )[0]
-            cur_chunk_max_mem = max(
-                cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
-            )
-            if cur_chunk_max_mem >= self.max_memory:
-                r = mid - gap
-            else:
-                l = mid + gap
-        return l
-
-    def _get_compute_node_num(self, start, end):
-        count = 0
-        for i in self.index_tracer.node_list[start : end + 1]:
-            if not _is_non_compute_node(i):
-                count += 1
-        return count
-
-    def _select_min_memory_chunk_region(
-        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
-    ):
-        # remove illegal regions
-        illegal_regions = []
-        for i in possible_chunk_regions:
-            if not self._is_legal_region(i, chunk_infos):
-                illegal_regions.append(i)
-        for i in illegal_regions:
-            if i in possible_chunk_regions:
-                possible_chunk_regions.remove(i)
-
-        if len(possible_chunk_regions) == 0:
-            return None
-
-        # get mem for chunk region
-        regions_dict = []
-        for region in possible_chunk_regions:
-            cur_region = region.copy()
-            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
-                self.index_tracer.node_list, cur_region
-            )
-            cur_chunk_infos = chunk_infos + [cur_region]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
-                cur_node_list, cur_chunk_infos
-            )[0]
-            cur_chunk_region_peak = cur_mem_peak[
-                max_chunk_region[0] : max_chunk_region[1] + 1
-            ]
-            cur_chunk_region_max_peak = max(cur_chunk_region_peak)
-            regions_dict.append(
-                {
-                    "chunk_info": region,
-                    "chunk_max_mem": cur_chunk_region_max_peak,
-                    "chunk_len": self._get_compute_node_num(
-                        region["region"][0], region["region"][1]
-                    ),
-                    "reorder_chunk_info": cur_region,
-                    "reorder_node_list": cur_node_list,
-                }
-            )
-
-        # select the min mem
-        chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict]
-        best_region_idx = chunk_max_mem.index(min(chunk_max_mem))
-        best_region = regions_dict[best_region_idx]["chunk_info"]
-        if best_region is not None:
-            best_region["chunk_size"] = 1
-        return best_region
-
-    def _is_legal_region(self, cur_chunk_info, chunk_infos):
-        (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
-        if cur_chunk_info in chunk_infos:
-            return False
-        if chunk_region_end < chunk_region_start:
-            return False
-        for i in chunk_infos:
-            region = i["region"]
-            if not (
-                (chunk_region_start > region[1] and chunk_region_end > region[1])
-                or (chunk_region_start < region[0] and chunk_region_end < region[0])
-            ):
-                return False
-        return True
-
-
-class ChunkRegionSearch(object):
-    def __init__(self, gm, max_memory=None) -> None:
-        self.gm = gm
-        self.index_tracer = IndexTracer(list(gm.graph.nodes))
-        self.index_tracer.trace_index()
-        self.memory_estimator = MemoryEstimator(self.index_tracer)
-        self.chunk_selector = ChunkSelector(
-            self.index_tracer, self.memory_estimator, max_memory=max_memory
-        )
-
-    def _find_peak_node(self, mem_peak):
-        max_value = max(mem_peak)
-        max_idx = mem_peak.index(max_value)
-        return max_idx
-
-    def _get_free_var(self):
-        free_var_idx = []
-        for idx, n in enumerate(self.index_tracer.node_list):
-            if n.op == "placeholder":
-                free_var_idx.append(idx)
-        return free_var_idx
-
-    def _get_min_free_var(self, active_node_list, free_vars):
-        min_len = 999
-        for idx, n in enumerate(active_node_list):
-            if idx in free_vars:
-                continue
-            if len(n) < min_len:
-                min_len = len(n)
-        return min_len
-
-    def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
-        free_vars = self._get_free_var()
-        free_var_num = len(free_vars)
-        active_node_num = [len(i) for i in active_node]
-        min_active_node_num = min(active_node_num[free_var_num:])
-        threshold = max(free_var_num, min_active_node_num)
-
-        # from peak_node to free_var
-        inside_flag = False
-        chunk_region_start = free_var_num
-        for i in range(peak_node, -1, -1):
-            if active_node_num[i] <= threshold:
-                inside_flag = True
-            if inside_flag and active_node_num[i] > threshold:
-                chunk_region_start = i + 1
-                break
-
-        # from peak_node to len-2
-        inside_flag = False
-        chunk_region_end = len(active_node) - 1
-        for i in range(peak_node, len(active_node)):
-            if active_node_num[i] <= threshold:
-                inside_flag = True
-            if inside_flag and active_node_num[i] > threshold:
-                chunk_region_end = i
-                break
-
-        for i in chunk_regions:
-            region = i["region"]
-            if chunk_region_start >= region[0] and chunk_region_end <= region[1]:
-                return None
-            elif (
-                region[0] <= chunk_region_start <= region[1]
-                and chunk_region_end > region[1]
-            ):
-                chunk_region_start = region[1] + 1
-            elif (
-                region[0] <= chunk_region_end <= region[1]
-                and chunk_region_start < region[0]
-            ):
-                chunk_region_end = region[0] - 1
-        return chunk_region_start, chunk_region_end
-
-    def _is_not_compute(self, trace, chunk_range, dim_idx):
-        if trace["idx"][dim_idx] not in trace["compute"]:
-            return True
-        if trace["idx"][dim_idx] in trace["compute"] and all(
-            i < chunk_range[0] or i > chunk_range[1]
-            for i in trace["compute"][trace["idx"][dim_idx]]
-        ):
-            return True
-        return False
-
-    def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
-        start_traces = input_trace[start_idx]
-        end_trace = output_trace[end_idx]
-        end_node = self.index_tracer.node_list[end_idx]
-        chunk_infos = []
-        for end_dim, _ in enumerate(end_trace["idx"]):
-            if len(start_traces) > 1:
-                continue
-            for start_node, start_trace in start_traces.items():
-                for start_dim, _ in enumerate(start_trace["idx"]):
-                    # dim size cannot be 1
-                    if (
-                        _get_node_shape(end_node)[end_dim] == 1
-                        or _get_node_shape(start_node)[start_dim] == 1
-                    ):
-                        continue
-                    # check index source align
-                    if not self.index_tracer.check_index_source(
-                        start_dim, start_node, start_idx, end_dim, end_node
-                    ):
-                        continue
-                    # check index copmute
-                    if not self.index_tracer.check_index_compute(
-                        start_idx, end_dim, end_node, end_idx
-                    ):
-                        continue
-                    # flow search
-                    chunk_info = self.index_tracer.flow_search(
-                        start_idx, start_dim, end_idx, end_dim
-                    )
-                    if chunk_info is None:
-                        continue
-                    # check index copmute
-                    if not self.index_tracer.check_index_duplicate(chunk_info):
-                        continue
-                    chunk_infos.append(chunk_info)
-        return chunk_infos
-
-    def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
-        possible_chunk_region = []
-        output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
-        input_trace = []  # trace of a node's input nodes
-        for _, n in enumerate(self.index_tracer.node_list):
-            cur_trace = {}
-            for arg in n.args:
-                if type(arg) == type(n) and not _is_non_compute_node_except_placeholder(
-                    arg
-                ):
-                    cur_trace[arg] = self.index_tracer._find_trace_from_node(arg)
-            input_trace.append(cur_trace)
-
-        for start_idx in range(max_chunk_region[0], peak_node + 1):
-            for end_idx in range(peak_node, max_chunk_region[1] + 1):
-                # skip non compute nodes
-                if _is_non_compute_node(
-                    self.index_tracer.node_list[start_idx]
-                ) or _is_non_compute_node(self.index_tracer.node_list[end_idx]):
-                    continue
-
-                # select free dim
-                chunk_info = self._find_free_dim(
-                    input_trace, output_trace, start_idx, end_idx
-                )
-                if len(chunk_info) > 0:
-                    possible_chunk_region.extend(chunk_info)
-        return possible_chunk_region
-
-    def _step_search(self, mem_peak, active_node, chunk_regions):
-        peak_node = self._find_peak_node(mem_peak)
-        max_chunk_region = self._search_max_chunk_region(
-            active_node, peak_node, chunk_regions
-        )
-        if max_chunk_region == None:
-            return None
-        possible_chunk_regions = self._search_possible_chunk_regions(
-            max_chunk_region, peak_node
-        )
-        best_chunk_region = self.chunk_selector._select_best_chunk_region(
-            possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak
-        )
-        best_chunk_region = self.index_tracer.reorder_all(best_chunk_region)
-        return best_chunk_region
-
-    def _stop_search(self, init_mem_peak, mem_peak):
-        sorted_init_mem_peak = sorted(init_mem_peak)
-        if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]:
-            return True
-        return False
-
-    def search_region(self):
-        chunk_infos = []
-        (
-            init_mem_peak,
-            _,
-            active_node,
-        ) = self.memory_estimator.estimate_chunk_inference_mem(
-            self.index_tracer.node_list
-        )
-        mem_peak = init_mem_peak
-
-        while True:
-            chunk_info = self._step_search(mem_peak, active_node, chunk_infos)
-            if chunk_info is None:
-                break
-            chunk_infos.append(chunk_info)
-
-            (
-                mem_peak,
-                _,
-                active_node,
-            ) = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, chunk_infos
-            )
-            if self._stop_search(init_mem_peak, mem_peak):
-                break
-        self.memory_estimator.estimate_chunk_inference_mem(
-            self.index_tracer.node_list, chunk_infos, print_mem=True
-        )
-        return chunk_infos
-
-
-def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
-    new_shape = "["
-    for idx, i in enumerate(shape):
-        if idx == chunk_dim:
-            new_shape += "%s:%s + chunk_size" % (chunk_idx_name, chunk_idx_name)
-        else:
-            new_shape += ":"
-        new_shape += ", "
-    new_shape = new_shape[:-2] + "]"
-    return new_shape
-
-
-def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2):
-    input_node = chunk_input[0]
-    out_shape = _get_node_shape(chunk_output)
-    out_str = str(list(out_shape))
-    context = (
-        "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range"
-        % (out_str, input_node.name, input_node.name, chunk_size)
-    )
-    context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim])
-    return context
-
-
-def _gen_loop_end(
-    chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list
-):
-    chunk_outputs_name = chunk_outputs.name
-    chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list)
-    chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
-    chunk_slice = _gen_chunk_slice_dim(
-        chunk_outputs_dim, "chunk_idx", chunk_output_shape
-    )
-    context = "    chunk_result%s = %s;  %s = None\n" % (
-        chunk_slice,
-        chunk_outputs_name,
-        chunk_outputs_name,
-    )
-    context += (
-        chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
-    )
-
-    # determine if its the last use for chunk input
-    for chunk_input in chunk_inputs + chunk_non_compute_inputs:
-        if all(
-            [
-                _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx
-                for user in chunk_input.users.keys()
-            ]
-        ):
-            context += ";  %s = None" % chunk_input.name
-
-    context += "\n"
-    return context
-
-
-def _find_chunk_all_input_nodes(nodes: List[Node]):
-    """
-    Find non-compute input and output node names.
-    input nodes are nodes used in the list
-    output nodes are nodes will use nodes in the list
-    """
-    input_nodes = []
-    for node in nodes:
-        for input_node in node._input_nodes.keys():
-            if input_node not in nodes and input_node not in input_nodes:
-                input_nodes.append(input_node)
-    return input_nodes
-
-
-def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
-    """
-    Find non-compute input and output node names.
-    input nodes are nodes used in the list
-    output nodes are nodes will use nodes in the list
-    """
-    input_nodes = []
-    output_nodes = []
-
-    # if a node has an input node which is not in the node list
-    # we treat that input node as the input of the checkpoint function
-    for node in nodes:
-        for input_node in node._input_nodes.keys():
-            if (
-                input_node not in nodes
-                and input_node not in input_nodes
-                and not _is_non_compute_node_except_placeholder(input_node)
-            ):
-                input_nodes.append(input_node)
-
-    # if a node has a user node which is not in the node list
-    # we treat that user node as the node receiving the current node output
-    for node in nodes:
-        for output_node in node.users.keys():
-            if (
-                output_node not in nodes
-                and node not in output_nodes
-                and not _is_non_compute_node_except_placeholder_output(output_node)
-            ):
-                output_nodes.append(node)
-
-    return input_nodes, output_nodes
-
-
-def _find_idx_by_name(name, nodes_list):
-    for idx, node in enumerate(nodes_list):
-        if node.name == name:
-            return idx
-    raise RuntimeError("name %s not found in node list" % name)
-
-
-def _replace_name(context, name_from, name_to):
-    patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")]
-    for p in patterns:
-        source = p[0] + name_from + p[1]
-        target = p[0] + name_to + p[1]
-        if source in context:
-            context = context.replace(source, target)
-    return context
-
-
-def _replace_reshape_size(context, node_name, reshape_size_dict):
-    if node_name not in reshape_size_dict:
-        return context
-    for size_name, size_value in reshape_size_dict[node_name].items():
-        context = context.replace(size_name, size_value)
-    return context
-
-
-def emit_code_with_chunk(
-    body,
-    nodes,
-    emit_node_func,
-    delete_unused_value_func,
-    chunk_region_search,
-    chunk_infos
-):
-    """Emit code with nested activation checkpoint
-    When we detect some of the node.activation_checkpoint is a List, we will use
-    this function to emit the activation checkpoint codes.
-
-    Args:
-        body: forward code
-        ckpt_func: checkpoint functions code
-        nodes: graph.nodes
-        emit_node_func: function to emit node
-        delete_unused_value_func: function to remove the unused value
-    """
-    node_list = list(nodes)
-
-    chunk_regions = [i["region"] for i in chunk_infos]
-    chunk_starts = [i[0] for i in chunk_regions]
-    chunk_ends = [i[1] for i in chunk_regions]
-
-    chunk_inputs = [i["inputs"] for i in chunk_infos]
-    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
-    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]
-    chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
-        j.name for i in chunk_inputs_non_chunk for j in i
-    ]
-
-    chunk_outputs = [i["outputs"][0] for i in chunk_infos]
-    chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos]
-
-    node_list = chunk_region_search.index_tracer.reorder_node_list(node_list)
-    node_idx = 0
-    region_idx = 0
-    within_chunk_region = False
-
-    while node_idx < len(node_list):
-        node = node_list[node_idx]
-
-        if node_idx in chunk_starts:
-            within_chunk_region = True
-            region_idx = chunk_starts.index(node_idx)
-            body.append(
-                _gen_loop_start(
-                    chunk_inputs[region_idx],
-                    chunk_outputs[region_idx],
-                    chunk_outputs_dim[region_idx],
-                    chunk_infos[region_idx]["chunk_size"],
-                )
-            )
-
-        if within_chunk_region:
-            emit_node_func(node, body)
-            # replace input var with chunk var
-            for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
-                for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
-                    if idx == node_idx:
-                        chunk_slice = _gen_chunk_slice_dim(
-                            dim[0], "chunk_idx", _get_node_shape(input_node)
-                        )
-                        body[-1] = _replace_name(
-                            body[-1], input_node.name, input_node.name + chunk_slice
-                        )
-            # ones like
-            if "ones_like" in node.name:
-                meta_node = chunk_region_search.index_tracer.node_list[node_idx]
-                chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][
-                    "chunk_dim"
-                ]
-                if _get_node_shape(meta_node)[chunk_dim] != 1:
-                    source_node = meta_node.args[0].args[0]
-                    if (
-                        source_node not in chunk_infos[region_idx]["node_chunk_dim"]
-                        or chunk_infos[region_idx]["node_chunk_dim"][source_node][
-                            "chunk_dim"
-                        ]
-                        is None
-                    ):
-                        chunk_slice = _gen_chunk_slice_dim(
-                            chunk_dim, "chunk_idx", _get_node_shape(node)
-                        )
-                        body[-1] = _replace_name(
-                            body[-1], node.args[0].name, node.args[0].name + chunk_slice
-                        )
-            body[-1] = _replace_reshape_size(
-                body[-1], node.name, chunk_infos[region_idx]["reshape_size"]
-            )
-            body[-1] = "    " + body[-1]
-            delete_unused_value_func(node, body, chunk_inputs_names)
-        else:
-            emit_node_func(node, body)
-            if node_idx not in chunk_inputs:
-                delete_unused_value_func(node, body, chunk_inputs_names)
-
-        if node_idx in chunk_ends:
-            body.append(
-                _gen_loop_end(
-                    chunk_inputs[region_idx],
-                    chunk_inputs_non_chunk[region_idx],
-                    chunk_outputs[region_idx],
-                    chunk_outputs_dim[region_idx],
-                    node_list,
-                )
-            )
-            within_chunk_region = False
-
-        node_idx += 1
-
-
-if CODEGEN_AVAILABLE:
-
-    class ChunkCodeGen(CodeGen):
-        def __init__(self, meta_graph, max_memory=None):
-            super().__init__()
-            self.meta_graph = meta_graph
-            self.max_memory = max_memory
-            self.meta_node = list(meta_graph.graph.nodes)
-            # find the chunk regions
-            self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory)
-            self.chunk_infos = self.chunk_region_search.search_region()
-
-        def _gen_python_code(
-            self, nodes, root_module: str, namespace: _Namespace
-        ) -> PythonCode:
-            free_vars: List[str] = []
-            body: List[str] = []
-            globals_: Dict[str, Any] = {}
-            wrapped_fns: Dict[str, None] = {}
-
-            # Wrap string in list to pass by reference
-            maybe_return_annotation: List[str] = [""]
-
-            def add_global(name_hint: str, obj: Any):
-                """Add an obj to be tracked as a global.
-
-                We call this for names that reference objects external to the
-                Graph, like functions or types.
-
-                Returns: the global name that should be used to reference 'obj' in generated source.
-                """
-                if (
-                    _is_from_torch(obj) and obj != torch.device
-                ):  # to support registering torch.device
-                    # HACK: workaround for how torch custom ops are registered. We
-                    # can't import them like normal modules so they must retain their
-                    # fully qualified name.
-                    return _get_qualified_name(obj)
-
-                # normalize the name hint to get a proper identifier
-                global_name = namespace.create_name(name_hint, obj)
-
-                if global_name in globals_:
-                    assert globals_[global_name] is obj
-                    return global_name
-                globals_[global_name] = obj
-                return global_name
-
-            # set _custom_builtins here so that we needn't import colossalai in forward
-            _custom_builtins["colossalai"] = _CustomBuiltin(
-                "import colossalai", colossalai
-            )
-
-            # Pre-fill the globals table with registered builtins.
-            for name, (_, obj) in _custom_builtins.items():
-                add_global(name, obj)
-
-            def type_repr(o: Any):
-                if o == ():
-                    # Empty tuple is used for empty tuple type annotation Tuple[()]
-                    return "()"
-
-                typename = _type_repr(o)
-
-                if hasattr(o, "__origin__"):
-                    # This is a generic type, e.g. typing.List[torch.Tensor]
-                    origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
-                    origin_typename = add_global(_type_repr(origin_type), origin_type)
-
-                    if hasattr(o, "__args__"):
-                        # Assign global names for each of the inner type variables.
-                        args = [type_repr(arg) for arg in o.__args__]
-
-                        if len(args) == 0:
-                            # Bare type, such as `typing.Tuple` with no subscript
-                            # This code-path used in Python < 3.9
-                            return origin_typename
-
-                        return f'{origin_typename}[{",".join(args)}]'
-                    else:
-                        # Bare type, such as `typing.Tuple` with no subscript
-                        # This code-path used in Python 3.9+
-                        return origin_typename
-
-                # Common case: this is a regular module name like 'foo.bar.baz'
-                return add_global(typename, o)
-
-            def _format_args(
-                args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
-            ) -> str:
-                def _get_repr(arg):
-                    # Handle NamedTuples (if it has `_fields`) via add_global.
-                    if isinstance(arg, tuple) and hasattr(arg, "_fields"):
-                        qualified_name = _get_qualified_name(type(arg))
-                        global_name = add_global(qualified_name, type(arg))
-                        return f"{global_name}{repr(tuple(arg))}"
-                    return repr(arg)
-
-                args_s = ", ".join(_get_repr(a) for a in args)
-                kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items())
-                if args_s and kwargs_s:
-                    return f"{args_s}, {kwargs_s}"
-                return args_s or kwargs_s
-
-            # Run through reverse nodes and record the first instance of a use
-            # of a given node. This represents the *last* use of the node in the
-            # execution order of the program, which we will use to free unused
-            # values
-            node_to_last_use: Dict[Node, Node] = {}
-            user_to_last_uses: Dict[Node, List[Node]] = {}
-
-            def register_last_uses(n: Node, user: Node):
-                if n not in node_to_last_use:
-                    node_to_last_use[n] = user
-                    user_to_last_uses.setdefault(user, []).append(n)
-
-            for node in reversed(nodes):
-                map_arg(node.args, lambda n: register_last_uses(n, node))
-                map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-
-            _delete_free_var_from_last_use(user_to_last_uses)
-
-            # NOTE: we add a variable to distinguish body and ckpt_func
-            def delete_unused_values(user: Node, body, to_keep=[]):
-                """
-                Delete values after their last use. This ensures that values that are
-                not used in the remainder of the code are freed and the memory usage
-                of the code is optimal.
-                """
-                if user.op == "placeholder":
-                    return
-                if user.op == "output":
-                    body.append("\n")
-                    return
-                nodes_to_delete = user_to_last_uses.get(user, [])
-                nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
-                if len(nodes_to_delete):
-                    to_delete_str = " = ".join(
-                        [repr(n) for n in nodes_to_delete] + ["None"]
-                    )
-                    body.append(f";  {to_delete_str}\n")
-                else:
-                    body.append("\n")
-
-            # NOTE: we add a variable to distinguish body and ckpt_func
-            def emit_node(node: Node, body):
-                maybe_type_annotation = (
-                    "" if node.type is None else f" : {type_repr(node.type)}"
-                )
-                if node.op == "placeholder":
-                    assert isinstance(node.target, str)
-                    maybe_default_arg = (
-                        "" if not node.args else f" = {repr(node.args[0])}"
-                    )
-                    free_vars.append(
-                        f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
-                    )
-                    raw_name = node.target.replace("*", "")
-                    if raw_name != repr(node):
-                        body.append(f"{repr(node)} = {raw_name}\n")
-                    return
-                elif node.op == "call_method":
-                    assert isinstance(node.target, str)
-                    body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
-                        f"({_format_args(node.args[1:], node.kwargs)})"
-                    )
-                    return
-                elif node.op == "call_function":
-                    assert callable(node.target)
-                    # pretty print operators
-                    if (
-                        node.target.__module__ == "_operator"
-                        and node.target.__name__ in magic_methods
-                    ):
-                        assert isinstance(node.args, tuple)
-                        body.append(
-                            f"{repr(node)}{maybe_type_annotation} = "
-                            f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
-                        )
-                        return
-
-                    # pretty print inplace operators; required for jit.script to work properly
-                    # not currently supported in normal FX graphs, but generated by torchdynamo
-                    if (
-                        node.target.__module__ == "_operator"
-                        and node.target.__name__ in inplace_methods
-                    ):
-                        body.append(
-                            f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
-                            f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
-                        )
-                        return
-
-                    qualified_name = _get_qualified_name(node.target)
-                    global_name = add_global(qualified_name, node.target)
-                    # special case for getattr: node.args could be 2-argument or 3-argument
-                    # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
-                    if (
-                        global_name == "getattr"
-                        and isinstance(node.args, tuple)
-                        and isinstance(node.args[1], str)
-                        and node.args[1].isidentifier()
-                        and len(node.args) == 2
-                    ):
-                        body.append(
-                            f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
-                        )
-                        return
-                    body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
-                    )
-                    if node.meta.get("is_wrapped", False):
-                        wrapped_fns.setdefault(global_name)
-                    return
-                elif node.op == "call_module":
-                    assert isinstance(node.target, str)
-                    body.append(
-                        f"{repr(node)}{maybe_type_annotation} = "
-                        f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
-                    )
-                    return
-                elif node.op == "get_attr":
-                    assert isinstance(node.target, str)
-                    body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
-                    )
-                    return
-                elif node.op == "output":
-                    if node.type is not None:
-                        maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
-                    body.append(self.generate_output(node.args[0]))
-                    return
-                raise NotImplementedError(f"node: {node.op} {node.target}")
-
-            # Modified for activation checkpointing
-            ckpt_func = []
-
-            # if any node has a list of labels for activation_checkpoint, we
-            # will use nested type of activation checkpoint codegen
-            emit_code_with_chunk(
-                body,
-                nodes,
-                emit_node,
-                delete_unused_values,
-                self.chunk_region_search,
-                self.chunk_infos
-            )
-
-            if len(body) == 0:
-                # If the Graph has no non-placeholder nodes, no lines for the body
-                # have been emitted. To continue to have valid Python code, emit a
-                # single pass statement
-                body.append("pass\n")
-
-            if len(wrapped_fns) > 0:
-                wrap_name = add_global("wrap", torch.fx.wrap)
-                wrap_stmts = "\n".join(
-                    [f'{wrap_name}("{name}")' for name in wrapped_fns]
-                )
-            else:
-                wrap_stmts = ""
-
-            if self._body_transformer:
-                body = self._body_transformer(body)
-
-            for name, value in self.additional_globals():
-                add_global(name, value)
-
-            # as we need colossalai.utils.checkpoint, we need to import colossalai
-            # in forward function
-            prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
-            prologue = "".join(ckpt_func) + prologue
-            prologue = prologue
-
-            code = "".join(body)
-            code = "\n".join("    " + line for line in code.split("\n"))
-            fn_code = f"""
-{wrap_stmts}
-
-{prologue}
-{code}"""
-            # print(fn_code)
-            return PythonCode(fn_code, globals_)
diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py
new file mode 100644
index 000000000000..0d0825f2584e
--- /dev/null
+++ b/colossalai/autochunk/chunk_region_search.py
@@ -0,0 +1,211 @@
+from .index_tracer import IndexTracer
+from .memory_estiamtor import MemoryEstimator
+from .chunk_selector import ChunkSelector
+import copy
+from .utils import is_non_compute_node, is_non_compute_node_except_placeholder, get_node_shape
+
+
+class ChunkRegionSearch(object):
+    def __init__(self, gm, max_memory=None) -> None:
+        self.gm = gm
+        self.index_tracer = IndexTracer(list(gm.graph.nodes))
+        self.index_tracer.trace_index()
+        self.memory_estimator = MemoryEstimator(self.index_tracer)
+        self.chunk_selector = ChunkSelector(
+            self.index_tracer, self.memory_estimator, max_memory=max_memory
+        )
+
+    def _find_peak_node(self, mem_peak):
+        max_value = max(mem_peak)
+        max_idx = mem_peak.index(max_value)
+        return max_idx
+
+    def _get_free_var(self):
+        free_var_idx = []
+        for idx, n in enumerate(self.index_tracer.node_list):
+            if n.op == "placeholder":
+                free_var_idx.append(idx)
+        return free_var_idx
+
+    def _get_min_free_var(self, active_node_list, free_vars):
+        min_len = 999
+        for idx, n in enumerate(active_node_list):
+            if idx in free_vars:
+                continue
+            if len(n) < min_len:
+                min_len = len(n)
+        return min_len
+
+    def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
+        free_vars = self._get_free_var()
+        free_var_num = len(free_vars)
+        active_node_num = [len(i) for i in active_node]
+        min_active_node_num = min(active_node_num[free_var_num:])
+        threshold = max(free_var_num, min_active_node_num)
+
+        # from peak_node to free_var
+        inside_flag = False
+        chunk_region_start = free_var_num
+        for i in range(peak_node, -1, -1):
+            if active_node_num[i] <= threshold:
+                inside_flag = True
+            if inside_flag and active_node_num[i] > threshold:
+                chunk_region_start = i + 1
+                break
+
+        # from peak_node to the last node
+        inside_flag = False
+        chunk_region_end = len(active_node) - 1
+        for i in range(peak_node, len(active_node)):
+            if active_node_num[i] <= threshold:
+                inside_flag = True
+            if inside_flag and active_node_num[i] > threshold:
+                chunk_region_end = i
+                break
+
+        for i in chunk_regions:
+            region = i["region"]
+            if chunk_region_start >= region[0] and chunk_region_end <= region[1]:
+                return None
+            elif (
+                region[0] <= chunk_region_start <= region[1]
+                and chunk_region_end > region[1]
+            ):
+                chunk_region_start = region[1] + 1
+            elif (
+                region[0] <= chunk_region_end <= region[1]
+                and chunk_region_start < region[0]
+            ):
+                chunk_region_end = region[0] - 1
+        return chunk_region_start, chunk_region_end
+
+    def _is_not_compute(self, trace, chunk_range, dim_idx):
+        if trace["idx"][dim_idx] not in trace["compute"]:
+            return True
+        if trace["idx"][dim_idx] in trace["compute"] and all(
+            i < chunk_range[0] or i > chunk_range[1]
+            for i in trace["compute"][trace["idx"][dim_idx]]
+        ):
+            return True
+        return False
+
+    def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
+        start_traces = input_trace[start_idx]
+        end_trace = output_trace[end_idx]
+        end_node = self.index_tracer.node_list[end_idx]
+        chunk_infos = []
+        for end_dim, _ in enumerate(end_trace["idx"]):
+            if len(start_traces) > 1:
+                continue
+            for start_node, start_trace in start_traces.items():
+                for start_dim, _ in enumerate(start_trace["idx"]):
+                    # dim size cannot be 1
+                    if (
+                        get_node_shape(end_node)[end_dim] == 1
+                        or get_node_shape(start_node)[start_dim] == 1
+                    ):
+                        continue
+                    # check index source align
+                    if not self.index_tracer.check_index_source(
+                        start_dim, start_node, start_idx, end_dim, end_node
+                    ):
+                        continue
+                    # check index compute
+                    if not self.index_tracer.check_index_compute(
+                        start_idx, end_dim, end_node, end_idx
+                    ):
+                        continue
+                    # flow search
+                    chunk_info = self.index_tracer.flow_search(
+                        start_idx, start_dim, end_idx, end_dim
+                    )
+                    if chunk_info is None:
+                        continue
+                    # check index duplicate
+                    if not self.index_tracer.check_index_duplicate(chunk_info):
+                        continue
+                    chunk_infos.append(chunk_info)
+        return chunk_infos
+
+    def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
+        possible_chunk_region = []
+        output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
+        input_trace = []  # trace of a node's input nodes
+        for _, n in enumerate(self.index_tracer.node_list):
+            cur_trace = {}
+            for arg in n.args:
+                if type(arg) == type(n) and not is_non_compute_node_except_placeholder(
+                    arg
+                ):
+                    cur_trace[arg] = self.index_tracer._find_trace_from_node(arg)
+            input_trace.append(cur_trace)
+
+        for start_idx in range(max_chunk_region[0], peak_node + 1):
+            for end_idx in range(peak_node, max_chunk_region[1] + 1):
+                # skip non compute nodes
+                if is_non_compute_node(
+                    self.index_tracer.node_list[start_idx]
+                ) or is_non_compute_node(self.index_tracer.node_list[end_idx]):
+                    continue
+
+                # select free dim
+                chunk_info = self._find_free_dim(
+                    input_trace, output_trace, start_idx, end_idx
+                )
+                if len(chunk_info) > 0:
+                    possible_chunk_region.extend(chunk_info)
+        return possible_chunk_region
+
+    def _step_search(self, mem_peak, active_node, chunk_regions):
+        peak_node = self._find_peak_node(mem_peak)
+        max_chunk_region = self._search_max_chunk_region(
+            active_node, peak_node, chunk_regions
+        )
+        if max_chunk_region == None:
+            return None
+        possible_chunk_regions = self._search_possible_chunk_regions(
+            max_chunk_region, peak_node
+        )
+        best_chunk_region = self.chunk_selector._select_best_chunk_region(
+            possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak
+        )
+        best_chunk_region = self.index_tracer.reorder_all(best_chunk_region)
+        return best_chunk_region
+
+    def _stop_search(self, init_mem_peak, mem_peak):
+        sorted_init_mem_peak = sorted(init_mem_peak)
+        if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]:
+            return True
+        return False
+
+    def search_region(self):
+        chunk_infos = []
+        (
+            init_mem_peak,
+            _,
+            active_node,
+        ) = self.memory_estimator.estimate_chunk_inference_mem(
+            self.index_tracer.node_list
+        )
+        mem_peak = init_mem_peak
+
+        while True:
+            chunk_info = self._step_search(mem_peak, active_node, chunk_infos)
+            if chunk_info is None:
+                break
+            chunk_infos.append(chunk_info)
+
+            (
+                mem_peak,
+                _,
+                active_node,
+            ) = self.memory_estimator.estimate_chunk_inference_mem(
+                self.index_tracer.node_list, chunk_infos
+            )
+            if self._stop_search(init_mem_peak, mem_peak):
+                break
+        self.memory_estimator.estimate_chunk_inference_mem(
+            self.index_tracer.node_list, chunk_infos, print_mem=True
+        )
+        return chunk_infos
+
diff --git a/colossalai/autochunk/chunk_selector.py b/colossalai/autochunk/chunk_selector.py
new file mode 100644
index 000000000000..f84322082cc4
--- /dev/null
+++ b/colossalai/autochunk/chunk_selector.py
@@ -0,0 +1,221 @@
+from .index_tracer import IndexTracer
+from .memory_estiamtor import MemoryEstimator
+from .utils import is_non_compute_node
+
+
+class ChunkSelector(object):
+    def __init__(
+        self,
+        index_tracer: IndexTracer,
+        memory_estimator: MemoryEstimator,
+        max_memory=None,
+    ):
+        self.index_tracer = index_tracer
+        self.memory_estimator = memory_estimator
+        if max_memory is not None:
+            self.stratge = "fit_memory"
+            self.max_memory = max_memory  # MB
+        else:
+            self.stratge = "min_memory"
+
+    def _select_best_chunk_region(
+        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
+    ):
+        if self.stratge == "min_memory":
+            best_region = self._select_min_memory_chunk_region(
+                possible_chunk_regions,
+                chunk_infos,
+                peak_node,
+                max_chunk_region,
+                mem_peak,
+            )
+        elif self.stratge == "fit_memory":
+            best_region = self._select_fit_memory_chunk_region(
+                possible_chunk_regions,
+                chunk_infos,
+                peak_node,
+                max_chunk_region,
+                mem_peak,
+            )
+        else:
+            raise RuntimeError()
+        return best_region
+
+    def _select_fit_memory_chunk_region(
+        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
+    ):
+        # stop chunking if the peak memory is already below the memory limit
+        if max(mem_peak) < self.max_memory:
+            return None
+
+        # remove illegal regions
+        illegal_regions = []
+        for i in possible_chunk_regions:
+            if not self._is_legal_region(i, chunk_infos):
+                illegal_regions.append(i)
+        for i in illegal_regions:
+            if i in possible_chunk_regions:
+                possible_chunk_regions.remove(i)
+
+        if len(possible_chunk_regions) == 0:
+            return None
+
+        # get mem for chunk region
+        regions_dict = []
+        for region in possible_chunk_regions:
+            cur_region = region.copy()
+            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
+                self.index_tracer.node_list, cur_region
+            )
+            cur_chunk_infos = chunk_infos + [cur_region]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                cur_node_list, cur_chunk_infos
+            )[0]
+            cur_chunk_region_peak = cur_mem_peak[
+                max_chunk_region[0] : max_chunk_region[1] + 1
+            ]
+            cur_chunk_region_max_peak = max(cur_chunk_region_peak)
+            if cur_chunk_region_max_peak < self.max_memory:
+                regions_dict.append(
+                    {
+                        "chunk_info": region,
+                        "chunk_max_mem": cur_chunk_region_max_peak,
+                        "chunk_len": self._get_compute_node_num(
+                            region["region"][0], region["region"][1]
+                        ),
+                        "reorder_chunk_info": cur_region,
+                        "reorder_node_list": cur_node_list,
+                    }
+                )
+        # no region found
+        if len(regions_dict) == 0:
+            raise RuntimeError("Search failed. Try a larger memory threshold.")
+
+        # select the min chunk len
+        chunk_len = [i["chunk_len"] for i in regions_dict]
+        best_region_idx = chunk_len.index(min(chunk_len))
+        best_region = regions_dict[best_region_idx]
+
+        # get max chunk size
+        best_region = self._get_fit_chunk_size(best_region, chunk_infos)
+        return best_region
+
+    def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
+        chunk_size = 1
+        reorder_chunk_info = chunk_region_dict["reorder_chunk_info"]
+        reorder_chunk_info["chunk_size"] = chunk_size
+        cur_chunk_max_mem = 0
+        # double the chunk size until the region's peak memory reaches the limit
+        while cur_chunk_max_mem < self.max_memory:
+            chunk_size *= 2
+            reorder_chunk_info["chunk_size"] = chunk_size
+            cur_chunk_infos = chunk_infos + [reorder_chunk_info]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                chunk_region_dict["reorder_node_list"], cur_chunk_infos
+            )[0]
+            cur_chunk_max_mem = max(
+                cur_mem_peak[
+                    reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1]
+                    + 1
+                ]
+            )
+        # search exact size
+        chunk_info = chunk_region_dict["chunk_info"]
+        chunk_info["chunk_size"] = self._chunk_size_binary_search(
+            chunk_size // 2, chunk_size, chunk_region_dict, chunk_infos
+        )
+        return chunk_info
+
+    def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos):
+        if l >= 16:
+            gap = 4
+        else:
+            gap = 1
+        chunk_info = chunk_region_dict["reorder_chunk_info"]
+        while r >= l + gap:
+            mid = int((l + r) / 2 + 0.5)
+            chunk_info["chunk_size"] = mid
+            cur_chunk_infos = chunk_infos + [chunk_info]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                chunk_region_dict["reorder_node_list"], cur_chunk_infos
+            )[0]
+            cur_chunk_max_mem = max(
+                cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
+            )
+            if cur_chunk_max_mem >= self.max_memory:
+                r = mid - gap
+            else:
+                l = mid + gap
+        return l
+
+    def _get_compute_node_num(self, start, end):
+        count = 0
+        for i in self.index_tracer.node_list[start : end + 1]:
+            if not is_non_compute_node(i):
+                count += 1
+        return count
+
+    def _select_min_memory_chunk_region(
+        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
+    ):
+        # remove illegal regions
+        illegal_regions = []
+        for i in possible_chunk_regions:
+            if not self._is_legal_region(i, chunk_infos):
+                illegal_regions.append(i)
+        for i in illegal_regions:
+            if i in possible_chunk_regions:
+                possible_chunk_regions.remove(i)
+
+        if len(possible_chunk_regions) == 0:
+            return None
+
+        # get mem for chunk region
+        regions_dict = []
+        for region in possible_chunk_regions:
+            cur_region = region.copy()
+            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
+                self.index_tracer.node_list, cur_region
+            )
+            cur_chunk_infos = chunk_infos + [cur_region]
+            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+                cur_node_list, cur_chunk_infos
+            )[0]
+            cur_chunk_region_peak = cur_mem_peak[
+                max_chunk_region[0] : max_chunk_region[1] + 1
+            ]
+            cur_chunk_region_max_peak = max(cur_chunk_region_peak)
+            regions_dict.append(
+                {
+                    "chunk_info": region,
+                    "chunk_max_mem": cur_chunk_region_max_peak,
+                    "chunk_len": self._get_compute_node_num(
+                        region["region"][0], region["region"][1]
+                    ),
+                    "reorder_chunk_info": cur_region,
+                    "reorder_node_list": cur_node_list,
+                }
+            )
+
+        # select the min mem
+        chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict]
+        best_region_idx = chunk_max_mem.index(min(chunk_max_mem))
+        best_region = regions_dict[best_region_idx]["chunk_info"]
+        if best_region is not None:
+            best_region["chunk_size"] = 1
+        return best_region
+
+    def _is_legal_region(self, cur_chunk_info, chunk_infos):
+        (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
+        if cur_chunk_info in chunk_infos:
+            return False
+        if chunk_region_end < chunk_region_start:
+            return False
+        for i in chunk_infos:
+            region = i["region"]
+            if not (
+                (chunk_region_start > region[1] and chunk_region_end > region[1])
+                or (chunk_region_start < region[0] and chunk_region_end < region[0])
+            ):
+                return False
+        return True
diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py
new file mode 100644
index 000000000000..7a86f3c998fb
--- /dev/null
+++ b/colossalai/autochunk/index_tracer.py
@@ -0,0 +1,1056 @@
+import copy
+
+from .utils import (
+    find_chunk_all_input_nodes,
+    find_chunk_compute_input_and_output_nodes,
+    find_idx_by_name,
+    get_node_shape,
+    is_non_compute_node,
+    is_non_compute_node_except_placeholder,
+)
+
+
+class IndexTracer(object):
+    def __init__(self, node_list) -> None:
+        self.node_list = node_list
+        self.idx_trace_list = self._init_idx_trace_list()
+        self.idx_trace_equal = []
+        self.idx_view_list = {}
+        self.idx_count = -1
+        self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))}
+
+    def _init_idx_trace_list(self):
+        idx_trace_list = []
+        for n in self.node_list:
+            if get_node_shape(n) != None:
+                cur_trace = {
+                    "idx": [None for _ in range(len(get_node_shape(n)))],
+                    "compute": [[] for _ in range(len(get_node_shape(n)))],
+                    "source": [{} for _ in range(len(get_node_shape(n)))],
+                }
+            else:
+                cur_trace = {"idx": [], "compute": [], "source": []}
+            idx_trace_list.append(cur_trace)
+        return idx_trace_list
+
+    def _add_index(self):
+        """
+        Update the count and return it. To record the idx number.
+
+        Returns:
+            idx_count: int
+        """
+        self.idx_count += 1
+        return self.idx_count
+
+    def _del_dim(self, idx, dim_idx):
+        self.idx_trace_list[idx]["idx"].pop(dim_idx)
+        self.idx_trace_list[idx]["compute"].pop(dim_idx)
+        self.idx_trace_list[idx]["source"].pop(dim_idx)
+
+    def _add_dim(self, node_idx, dim_idx):
+        self.idx_trace_list[node_idx]["idx"].insert(dim_idx, self._add_index())
+        self.idx_trace_list[node_idx]["compute"].insert(dim_idx, [])
+        self.idx_trace_list[node_idx]["source"].insert(dim_idx, {})
+
+    def _transform_index(self, node, node_dim):
+        node_idx = self._find_idx_trace_from_node(node)
+        dims = list(range(len(node_idx)))
+        return dims[node_dim]
+
+    def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim):
+        node_from_dim = self._transform_index(node_from, node_from_dim)
+        node_to_dim = self._transform_index(node_to, node_to_dim)
+        node_from_trace = self._find_trace_from_node(node_from)
+        node_to_trace = self._find_trace_from_node(node_to)
+        node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim]
+        node_to_trace["compute"][node_to_dim] = copy.deepcopy(
+            node_from_trace["compute"][node_from_dim]
+        )
+        self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True)
+
+    def _inherit_all_computation(self, node_from, node_to):
+        node_from_compute = self._find_compute_trace_from_node(node_from)
+        node_to_compute = self._find_compute_trace_from_node(node_to)
+        assert len(node_from_compute) == len(node_to_compute)
+        for i in range(len(node_from_compute)):
+            self._add_source(node_from, i, node_to, i)
+            node_to_compute[i] = copy.deepcopy(node_from_compute[i])
+
+    def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False):
+        node_from_dim = self._transform_index(node_from, node_from_dim)
+        node_from_trace_source = self._find_source_trace_from_node(node_from)
+        node_to_dim = self._transform_index(node_to, node_to_dim)
+        node_to_trace_source = self._find_source_trace_from_node(node_to)
+        node_from_idx = find_idx_by_name(node_from.name, self.node_list)
+        if init:
+            node_to_trace_source[node_to_dim] = {}
+        # add dim to cur new source
+        if node_from_idx not in node_to_trace_source[node_to_dim]:
+            node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim]
+        else:
+            if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]:
+                node_to_trace_source[node_to_dim][node_from_idx].append(node_from_dim)
+        # update inputs source
+        for node_idx, node_dim in node_from_trace_source[node_from_dim].items():
+            if node_idx not in node_to_trace_source[node_to_dim]:
+                node_to_trace_source[node_to_dim][node_idx] = copy.deepcopy(node_dim)
+            else:
+                for d in node_dim:
+                    if d not in node_to_trace_source[node_to_dim][node_idx]:
+                        node_to_trace_source[node_to_dim][node_idx].append(d)
+
+    def _mark_computation_from_node(self, node_from, node_to, exclude=None):
+        if exclude == None:
+            exclude = []
+        else:
+            exclude = [self._transform_index(node_to, i) for i in exclude]
+        node_from_compute = self._find_compute_trace_from_node(node_from)
+        node_to_compute = self._find_compute_trace_from_node(node_to)
+        # assert len(node_from_compute) == len(node_to_compute)
+        for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1):
+            if self._transform_index(node_to, i) in exclude:
+                continue
+            self._add_source(node_from, i, node_to, i)
+            for j in node_from_compute[i]:
+                if j not in node_to_compute[i]:
+                    node_to_compute[i].append(j)
+
+    def _mark_idx_equal(self, node1, dim1, node2, dim2):
+        """
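+        Mark two indices as equal. Currently a no-op: the body is commented out.
+
+        Args:
+            node1, node2 (node): nodes that own the two dims.
+            dim1, dim2 (int): dim of node1 / node2 whose indices are equal.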
+        """
+        # node1_idx = _find_idx_by_name(node1.name, self.nodes_list)
+        # node2_idx = _find_idx_by_name(node2.name, self.nodes_list)
+        # if node1_idx > node2_idx:
+        #     self._add_source(node2, dim2, node1, dim1)
+        # else:
+        #     self._add_source(node1, dim1, node2, dim2)
+
+    def _mark_computation(self, node, idx, dim):
+        """
+        Mark some dims of node as computed.
+
+        Args:
+            node (node)
+            idx (int): node index
+            dim (list or int): dims to be marked as computed
+        """
+        if isinstance(dim, int):
+            dim = [dim]
+        dims = list(range(len(get_node_shape(node))))
+        for d in dim:
+            cur_dim = dims[d]
+            if idx not in self.idx_trace_list[idx]["compute"][cur_dim]:
+                self.idx_trace_list[idx]["compute"][cur_dim].append(idx)
+
+    def _find_trace_from_node(self, node):
+        """
+        Find the full trace dict of the node.
+
+        Args:
+            node (node)
+        Returns:
+            node_dict (dict): the node's entry in idx_trace_list,
+                holding its "idx", "compute" and "source" traces.
+        """
+        node_idx = find_idx_by_name(node.name, self.node_list)
+        node_dict = self.idx_trace_list[node_idx]
+        return node_dict
+
+    def _find_source_trace_from_node(self, node):
+        """
+        Find node source trace by the node.
+
+        Args:
+            node (node)
+        Returns:
+            source (list): source trace of the node,
+                i.e. the "source" entry of its trace dict.
+        """
+        node_idx = find_idx_by_name(node.name, self.node_list)
+        node_dict = self.idx_trace_list[node_idx]
+        return node_dict["source"]
+
+    def _find_idx_trace_from_node(self, node):
+        """
+        Find node idx trace by the node.
+
+        Args:
+            node (node)
+        Returns:
+            idx (list): idx of the node
+        """
+        node_idx = find_idx_by_name(node.name, self.node_list)
+        return self.idx_trace_list[node_idx]["idx"]
+
+    def _find_compute_trace_from_node(self, node):
+        """
+        Find node compute trace by the node.
+
+        Args:
+            node (node)
+        Returns:
+            compute (list): computed idx of the node.
+        """
+        node_idx = find_idx_by_name(node.name, self.node_list)
+        return self.idx_trace_list[node_idx]["compute"]
+
+    def _assign_index_as_input(self, node, node_idx, input_node=None):
+        """
+        Give a node the same idx trace as one of its input nodes.
+
+        The input's "idx" list is deep-copied, so later edits to this node's
+        trace do not touch the input's trace. Afterwards
+        `_inherit_all_computation(input_node, node)` is called.
+
+        Args:
+            node (node)
+            node_idx (int): position of node in self.idx_trace_list
+            input_node (node, optional): node to copy from. Defaults to node.args[0].
+        """
+        # None is a singleton: test identity instead of going through __eq__
+        if input_node is None:
+            input_node = node.args[0]
+        input_node_idx = find_idx_by_name(input_node.name, self.node_list)
+        self.idx_trace_list[node_idx]["idx"] = copy.deepcopy(
+            self.idx_trace_list[input_node_idx]["idx"]
+        )
+
+        self._inherit_all_computation(input_node, node)
+
+    def _assign_all_index(self, node, node_idx):
+        """
+        Give every dim of the node a freshly created index.
+
+        One call to self._add_index() is made per dim of
+        node.meta["tensor_meta"].shape, in dim order.
+
+        Args:
+            node (node)
+            node_idx (int): position of node in self.idx_trace_list
+        """
+        dim_count = len(node.meta["tensor_meta"].shape)
+        self.idx_trace_list[node_idx]["idx"] = [
+            self._add_index() for _ in range(dim_count)
+        ]
+
+    def _assign_transpose_index(self, node, node_idx):
+        """
+        Assign index for transpose op.
+        1. start from a copy of the input's trace
+        2. each of the two transposed output dims then inherits the index of
+           the opposite input dim
+
+        Args:
+            node (node): node.args is (input_node, dim0, dim1)
+            node_idx (int)
+        """
+        source = node.args[0]
+        dims = node.args[1:]
+
+        self._assign_index_as_input(node, node_idx, source)
+        for dst_dim, src_dim in ((dims[0], dims[1]), (dims[1], dims[0])):
+            self._inherit_index(source, src_dim, node, dst_dim)
+
+    def _assign_permute_index(self, node, node_idx):
+        """
+        Assign index for permute op.
+        1. start from a copy of the input's trace
+        2. output dim i inherits the index of the input dim given as the
+           i-th permute argument
+
+        Args:
+            node (node): node.args is (input_node, *permute_dims)
+            node_idx (int)
+        """
+        source = node.args[0]
+
+        self._assign_index_as_input(node, node_idx, source)
+        for out_dim, in_dim in enumerate(node.args[1:]):
+            self._inherit_index(source, in_dim, node, out_dim)
+
+    def _assign_linear_index(self, node, node_idx):
+        """
+        Assign index for linear op.
+        1. copy the trace of the input node (node.args[0])
+        2. the last output dim inherits dim 1 of the weight
+        3. mark the last output dim as computed
+        4. mark the input's last dim equal to dim 0 of the weight, and to
+           dim 0 of the bias when a bias is given
+
+        NOTE(review): which weight dim pairs with the input and which with the
+        output depends on the weight layout of the traced function - confirm.
+
+        Args:
+            node (node): node.args is (input, weight) or (input, weight, bias)
+            node_idx (int)
+        """
+        bias = None
+        if len(node.args) == 2:
+            input_node, weight = node.args
+        else:
+            input_node, weight, bias = node.args
+
+        self._assign_index_as_input(node, node_idx)
+        self._inherit_index(weight, 1, node, -1)
+
+        last_dim = [-1]
+        self._mark_computation(node, node_idx, last_dim)
+        self._mark_idx_equal(input_node, -1, weight, 0)
+
+        if bias:
+            self._mark_idx_equal(input_node, -1, bias, 0)
+
+    def _assign_matmul_index(self, node, node_idx):
+        """
+        Assign index for matmul op.
+        1. both operands must have the same number of dims (asserted)
+        2. copy the trace of the left operand; the last output dim inherits
+           the last dim of the right operand
+        3. pull computation from the right operand with dims [-1, -2]
+           and mark the last output dim as computed
+        4. mark left dim -1 equal to right dim -2
+
+        Args:
+            node (node): node.args is (matmul_left, matmul_right)
+            node_idx (int)
+        """
+        lhs, rhs = node.args
+
+        lhs_rank = len(get_node_shape(lhs))
+        rhs_rank = len(get_node_shape(rhs))
+        assert lhs_rank == rhs_rank
+        self._assign_index_as_input(node, node_idx, lhs)
+        self._inherit_index(rhs, -1, node, -1)
+
+        self._mark_computation_from_node(rhs, node, [-1, -2])
+        self._mark_computation(node, node_idx, [-1])
+        self._mark_idx_equal(lhs, -1, rhs, -2)
+
+    def _assign_layernorm_index(self, node, idx):
+        """
+        Assign index for layernorm op.
+        1. copy the trace of the input node (node.args[0])
+        2. mark the last dim as computed
+
+        Args:
+            node (node)
+            idx (int): position of node in self.idx_trace_list
+        """
+        normalized_dims = [-1]
+        self._assign_index_as_input(node, idx)
+        self._mark_computation(node, idx, normalized_dims)
+
+    def _assign_elementwise_index(self, node, idx):
+        """
+        Assign index for element-wise op (eg. relu sigmoid add mul).
+        1. copy the trace of the first input node
+        2. pull computation from every argument that is itself a node
+        3. with two node arguments, walk their shapes from the last dim
+           backwards and mark dims of equal size as equal
+
+        Args:
+            node (node): at most two of node.args may be nodes (asserted)
+            idx (int): position of node in self.idx_trace_list
+        """
+        self._assign_index_as_input(node, idx)
+        node_inputs = []
+        for arg in node.args:
+            if type(arg) != type(node):
+                continue
+            node_inputs.append(arg)
+            self._mark_computation_from_node(arg, node)
+        assert len(node_inputs) <= 2
+        if len(node_inputs) != 2:
+            return
+
+        lhs, rhs = node_inputs
+        lhs_shape = get_node_shape(lhs)
+        rhs_shape = get_node_shape(rhs)
+        shared_rank = min(len(lhs_shape), len(rhs_shape))
+        for offset in range(1, shared_rank + 1):
+            if lhs_shape[-offset] == rhs_shape[-offset]:
+                self._mark_idx_equal(lhs, -offset, rhs, -offset)
+
+    def _assgin_no_change_index(self, node, idx):
+        """
+        Assign index for an op that keeps its input's dims (trace_index uses
+        it for "to" and "contiguous").
+        1. copy the trace of the first input node
+        2. pull computation from every argument that is itself a node
+
+        Args:
+            node (node)
+            idx (int): position of node in self.idx_trace_list
+        """
+        self._assign_index_as_input(node, idx)
+        node_type = type(node)
+        for arg in node.args:
+            if type(arg) != node_type:
+                continue
+            self._mark_computation_from_node(arg, node)
+
+    def _assign_einsum_index(self, node, idx):
+        """
+        Assign index for einsum op.
+
+        For every subscript letter of the output, the matching output dim
+        inherits the index of that letter's first position in each input
+        operand whose subscripts contain it.
+
+        Only the explicit "inputs->output" equation form is handled: an
+        equation without exactly one "->" fails at the unpacking below.
+        Summed (contracted) subscripts are not marked as computed here.
+
+        Args:
+            node (node): node.args is (equation, *operand_nodes)
+            idx (int): unused, kept so all _assign_* handlers share a signature
+        """
+        equation = node.args[0].replace(" ", "")
+        operands = node.args[1:]
+
+        inputs_part, out_subscripts = equation.split("->")
+        in_subscripts = inputs_part.split(",")
+
+        for out_dim, letter in enumerate(out_subscripts):
+            for operand_pos, operand_subscripts in enumerate(in_subscripts):
+                if letter in operand_subscripts:
+                    self._inherit_index(
+                        operands[operand_pos],
+                        operand_subscripts.index(letter),
+                        node,
+                        out_dim,
+                    )
+
+    def _assign_softmax_index(self, node, idx):
+        """
+        Assign index for softmax op.
+        1. copy the trace of the input node (node.args[0])
+        2. mark the softmax dim, read from node.kwargs["dim"], as computed
+
+        Args:
+            node (node): must carry a "dim" keyword argument
+            idx (int): position of node in self.idx_trace_list
+        """
+        self._assign_index_as_input(node, idx)
+        softmax_dim = node.kwargs["dim"]
+        self._mark_computation(node, idx, [softmax_dim])
+
+    def _assign_unsqueeze_index(self, node, node_idx):
+        """
+        Assign index for unsqueeze op.
+        1. drop the node's last dim so its trace has the input's length
+        2. copy the trace of the input node (node.args[0])
+        3. add a dim at the unsqueeze position (node.args[1])
+
+        Args:
+            node (node): node.args is (input_node, unsqueeze_dim)
+            node_idx (int)
+        """
+        self._del_dim(node_idx, -1)
+        self._assign_index_as_input(node, node_idx)
+        new_dim = node.args[1]
+        self._add_dim(node_idx, new_dim)
+
+    def _assign_dropout_index(self, node, node_idx):
+        """
+        Assign index for dropout op.
+        1. copy the trace of the input node (node.args[0]); nothing else changes
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        return self._assign_index_as_input(node, node_idx) and None
+
+    def _assign_ones_like_index(self, node, node_idx):
+        """
+        Assign index for ones_like op.
+        1. every dim gets a new index (delegates to _assign_all_index)
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        assign_fresh = self._assign_all_index
+        assign_fresh(node, node_idx)
+
+    def _assign_view_reshape_index(self, node, node_idx):
+        """
+        Assign index for view and reshape op.
+        1. get origin shape and target shape by meta info.
+        2. compute the real value of -1 in target shape.
+        3. determine changed dim, and assign index for generated dim.
+        4. log changed dim and generated dim for restore
+        5. inherit computation.
+        6. TODO: look into view list to see whether the view is associated with other,
+           if so assign equal dim according to previous view.
+
+        Only views that merge two adjacent dims into one, or split one dim
+        into two adjacent dims, are supported (rank changes by exactly one).
+
+        Args:
+            node (node): node.args is (origin_node, *target_shape), where each
+                target size is an int or a node whose meta["fwd_out"][0] is read
+            node_idx (int)
+        Raises:
+            NotImplementedError: if the rank does not change by exactly one.
+        """
+        # get data, turn into number
+        origin_node = node.args[0]
+        origin_shape = origin_node.meta["tensor_meta"].shape
+        target_shape = []
+        for i in range(1, len(node.args)):
+            if isinstance(node.args[i], int):
+                target_shape.append(node.args[i])
+            else:
+                # size given by another node: read it from that node's meta
+                target_shape.append(node.args[i].meta["fwd_out"][0])
+
+        # compute the value of -1
+        if -1 in target_shape:
+            origin_product = 1
+            for i in origin_shape:
+                origin_product *= i
+            # starting from -1 cancels the sign of the -1 placeholder, so the
+            # result is the product of the explicitly given sizes
+            target_product = -1
+            for i in target_shape:
+                target_product *= i
+            shape_idx = target_shape.index(-1)
+            target_shape[shape_idx] = origin_product // target_product
+
+        # determine changed dim
+        len_diff = len(origin_shape) - len(target_shape)
+        if len_diff == 1:
+            # dim merge
+            # the first position where the shapes differ is the merged dim
+            # NOTE(review): .index(False) raises ValueError when every compared
+            # position matches - confirm that case cannot occur
+            dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)]
+            dim_to = [dim_equal.index(False)]
+            dim_from = [dim_equal.index(False), dim_equal.index(False) + 1]
+            self._add_dim(node_idx, -1)
+        elif len_diff == -1:
+            # dim expand
+            # the first position where the shapes differ is the split dim
+            dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])]
+            dim_from = [dim_equal.index(False)]
+            dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
+            self._del_dim(node_idx, -1)
+        else:
+            raise NotImplementedError(
+                "shape"
+                + str(origin_shape)
+                + "and"
+                + str(target_shape)
+                + "view not implemented"
+            )
+
+        # get new index
+        origin_trace = self._find_idx_trace_from_node(origin_node)
+        self._assign_index_as_input(node, node_idx, origin_node)
+        # delete from the highest dim first so the lower positions stay valid;
+        # dim_from stays reversed for the code below
+        dim_from.reverse()
+        for i in dim_from:
+            self._del_dim(node_idx, i)
+        for i in dim_to:
+            self._add_dim(node_idx, i)
+
+        # inherit computation
+        # NOTE(review): compute_log is the origin node's "compute" entry while
+        # origin_trace[i] is an entry of its "idx" list - confirm this
+        # membership test compares values of the same kind
+        compute_log = self._find_compute_trace_from_node(origin_node)
+        for i in dim_from:
+            if origin_trace[i] in compute_log:
+                for j in dim_to:
+                    self._mark_computation(node, node_idx, [j])
+                break
+
+        # log view, not used now
+        view_dict = {
+            "idx_from": [origin_trace[i] for i in dim_from],
+            "dim_from": dim_from,
+            "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in dim_to],
+            "dim_to": dim_to,
+        }
+        self.idx_view_list[node] = view_dict
+
+    def _merge_equal_idx(self):
+        """
+        Collapse indices recorded as equal in self.idx_trace_equal.
+
+        The recorded groups are processed from last to first. For each group
+        the largest index is replaced by the smallest one in the "idx" list
+        of every trace. self.idx_trace_equal itself is left untouched.
+        """
+        for equal_group in reversed(copy.deepcopy(self.idx_trace_equal)):
+            keep, drop = min(equal_group), max(equal_group)
+            for trace in self.idx_trace_list:
+                if drop not in trace["idx"]:
+                    continue
+                trace["idx"] = [keep if i == drop else i for i in trace["idx"]]
+
+    def trace_index(self):
+        """
+        Walk self.node_list in order and assign an index trace to every node.
+
+        The handler is chosen from node.op and from substrings of node.name;
+        within one op kind the rules are tried in the order listed below and
+        the first match wins. "getattr"/"getitem" functions and the "output"
+        node are skipped.
+
+        Raises:
+            NotImplementedError: for an unknown op, or a node name that
+                matches no rule of its op kind.
+        """
+        # (name fragments, handler); a None handler means "skip this node"
+        method_rules = (
+            (("transpose",), self._assign_transpose_index),
+            (("permute",), self._assign_permute_index),
+            (("view", "reshape"), self._assign_view_reshape_index),
+            (("unsqueeze",), self._assign_unsqueeze_index),
+            (("to", "contiguous"), self._assgin_no_change_index),
+        )
+        function_rules = (
+            (("linear",), self._assign_linear_index),
+            (("matmul",), self._assign_matmul_index),
+            (("softmax",), self._assign_softmax_index),
+            (("mul", "add", "sigmoid", "relu"), self._assign_elementwise_index),
+            (("ones_like",), self._assign_ones_like_index),
+            (("dropout",), self._assign_dropout_index),
+            (("einsum",), self._assign_einsum_index),
+            (("getattr",), None),  # get attr like shape
+            (("getitem",), None),  # get item in list
+        )
+        module_rules = ((("layernorm", "norm"), self._assign_layernorm_index),)
+        rules_by_op = {
+            "call_method": (method_rules, "method not implemented yet!"),
+            "call_function": (function_rules, "function not implemented yet!"),
+            "call_module": (module_rules, "module not implemented yet!"),
+        }
+
+        for idx, node in enumerate(self.node_list):
+            op = node.op
+            if op in ("placeholder", "get_attr"):
+                # graph inputs and params: every dim gets a new index
+                self._assign_all_index(node, idx)
+            elif op == "output":
+                continue
+            elif op in rules_by_op:
+                rules, message = rules_by_op[op]
+                for fragments, handler in rules:
+                    if any(fragment in node.name for fragment in fragments):
+                        if handler is not None:
+                            handler(node, idx)
+                        break
+                else:
+                    raise NotImplementedError(node.name, message)
+            else:
+                raise NotImplementedError(node.op, "op not implemented yet!")
+        # self._merge_equal_idx()
+
+    def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
+        """
+        Check that a dim of the start node is a source of a dim of the end node.
+
+        The sources recorded for end_node's end_dim are visited from the
+        highest node position downwards.
+
+        Args:
+            start_dim (int): dim of the start node
+            start_node (node): start node
+            start_idx (int): node position; the check fails as soon as a
+                source positioned before it is reached
+            end_dim (int): dim of the end node
+            end_node (node): end node
+
+        Returns:
+            bool: True if check pass
+        """
+        start_node_idx = find_idx_by_name(start_node.name, self.node_list)
+        dim_sources = self._find_trace_from_node(end_node)["source"][end_dim]
+        for node_idx in sorted(dim_sources, reverse=True):
+            if node_idx == start_node_idx and start_dim in dim_sources[node_idx]:
+                return True
+            # it means we meet a node outside the loop, and the node is not input node
+            if node_idx < start_idx:
+                break
+        return False
+
+    def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
+        """
+        Check that a dim of the end node was not computed inside a node range.
+
+        Args:
+            start_idx (int): first node position of the range (inclusive)
+            end_dim (int): dim of the end node to check
+            end_node (node): end node
+            end_idx (int): last node position of the range (inclusive)
+
+        Returns:
+            bool: True if no entry of the dim's compute trace lies within
+                [start_idx, end_idx]
+        """
+        computed_at = self._find_trace_from_node(end_node)["compute"][end_dim]
+        return not any(start_idx <= i <= end_idx for i in computed_at)
+
+    def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
+        """
+        Look up what the source trace of one dim of node_from records for node_to.
+
+        Args:
+            node_from (node): node whose source trace is read
+            node_from_dim (int): dim of node_from
+            node_to (node): node searched for among the sources of that dim
+
+        Returns:
+            the value stored for node_to's position, or None if node_to is
+            not a recorded source of that dim
+        """
+        node_to_idx = find_idx_by_name(node_to.name, self.node_list)
+        dim_source = self._find_source_trace_from_node(node_from)[node_from_dim]
+        return dim_source.get(node_to_idx)
+
+    def _find_inherit_dim(self, input_node, input_dim, node):
+        """
+        Find the first dim of node whose source trace lists input_dim[0] of
+        input_node.
+
+        Args:
+            input_node (node): candidate source node
+            input_dim (list): only its first element is used
+            node (node): node whose dims are scanned in ascending order
+
+        Returns:
+            int or None: the matching dim of node (may be 0), or None when
+                no dim matches
+        """
+        input_node_idx = find_idx_by_name(input_node.name, self.node_list)
+        source_trace = self._find_source_trace_from_node(node)
+        for node_dim in range(len(get_node_shape(node))):
+            dim_source = source_trace[node_dim]
+            if input_node_idx not in dim_source:
+                continue
+            if input_dim[0] in source_trace[node_dim][input_node_idx]:
+                return node_dim
+        return None
+
+    def check_index_duplicate(self, chunk_infos, return_dim=False):
+        """
+        Check that no node in the chunk region has more than one dim that is
+        sourced from a chunked input dim.
+
+        Args:
+            chunk_infos (dict): uses "inputs" (input nodes), "inputs_dim" (one
+                dict per input, mapping a user node position to dims) and
+                "region" ((start, end) node positions, both inclusive)
+            return_dim (bool): also return the offending dims. Defaults to False.
+
+        Returns:
+            bool, or (bool, list or None) when return_dim is True. The bool is
+            True if the check passes. On failure the list holds, per dim of
+            the failing node, the (source position, source dims) pairs that
+            were flagged; on success it is None.
+        """
+        region_start = chunk_infos["region"][0]
+        region_end = chunk_infos["region"][1]
+
+        # user node position -> dim of that user inherited from the input
+        input_dim_after_node = {}
+        for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
+            for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
+                inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k])
+                # dim 0 is a valid result; only None means "no dim found"
+                if inherit_dim is not None:
+                    input_dim_after_node[k] = inherit_dim
+
+        for node in self.node_list[region_start : region_end + 1]:
+            if is_non_compute_node_except_placeholder(node):
+                continue
+            count = 0
+            duplicate_dims = []
+            node_trace_source = self._find_source_trace_from_node(node)
+            for node_dim in range(len(get_node_shape(node))):
+                duplicate_dim = [
+                    (k, v)
+                    for k, v in node_trace_source[node_dim].items()
+                    if region_start <= k <= region_end
+                    and k in input_dim_after_node
+                    and input_dim_after_node[k] in v
+                ]
+                duplicate_dims.append(duplicate_dim)
+                if duplicate_dim:
+                    count += 1
+
+            if count > 1:
+                return (False, duplicate_dims) if return_dim else False
+        return (True, None) if return_dim else True
+
+    def _assgin_single_node_flow(
+        self,
+        arg_node,
+        start_idx,
+        end_idx,
+        cur_node_dim,
+        cur_node_compute,
+        cur_node_source,
+        cur_node_fix_dim,
+        all_node_info,
+        next_node_list,
+    ):
+        """
+        Propagate the chunk dim and fix dims of the current node to one of
+        its argument nodes.
+
+        Args:
+            arg_node (node): argument of the current node
+            start_idx (int): first node position of the chunk range (inclusive)
+            end_idx (int): end of the chunk range (exclusive for this check)
+            cur_node_dim (int or None): chunk dim of the current node
+            cur_node_compute: compute trace of the current node
+            cur_node_source: source trace of the current node
+            cur_node_fix_dim (list): fix dims of the current node
+            all_node_info (dict): node -> {"chunk_dim", "fix_dim"}, updated in place
+            next_node_list (list): arg_node is appended when it has to be visited
+
+        Returns:
+            bool: False if the flow is illegal (the chunk dim was computed by
+                arg_node, or arg_node already has a different chunk dim),
+                True otherwise
+        """
+        arg_idx = find_idx_by_name(arg_node.name, self.node_list)
+        # arg outside the chunk range: nothing to propagate
+        if arg_idx < start_idx or arg_idx >= end_idx:
+            return True
+
+        arg_dim = None
+        arg_fix_dim = []
+        if cur_node_dim is not None:
+            # dim is computed
+            if arg_idx in cur_node_compute[cur_node_dim]:
+                return False
+            # find arg dim
+            if arg_idx in cur_node_source[cur_node_dim]:
+                arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
+            # get fix dim
+            for fix_dim in cur_node_fix_dim:
+                fix_dim_source = cur_node_source[fix_dim]
+                if arg_idx in fix_dim_source:
+                    arg_fix_dim.append(fix_dim_source[arg_idx][0])
+
+        if arg_node not in all_node_info:
+            all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim}
+        else:
+            # if already in node_info, arg dim must be same
+            known = all_node_info[arg_node]
+            if known["chunk_dim"] != arg_dim:
+                return False
+            all_node_info[arg_node]["fix_dim"] = list(
+                set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
+            )
+
+        next_node_list.append(arg_node)
+        return True
+
+    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
+        """
+        Search the chunk flow of a node region backwards from its last node
+        and build the chunk info of the region.
+
+        Args:
+            start_idx (int): position of the first node of the region
+            start_dim (int): not read by this method
+            end_idx (int): position of the last node of the region
+            end_dim (int): chunk dim of the last node
+
+        Returns:
+            dict or None: None if the region has more than one output or no
+                legal flow exists. Otherwise a dict with "region", "inputs",
+                "inputs_non_chunk", "inputs_dim", "outputs", "outputs_dim",
+                "node_chunk_dim", "args" (holding "prepose_nodes") and
+                whatever _reassgin_reshape_size adds.
+
+        Raises:
+            NotImplementedError: for a node with two valid args that is not
+                an add, mul, einsum or matmul.
+        """
+        inputs, outputs = find_chunk_compute_input_and_output_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        # only single output
+        if len(outputs) > 1:
+            return None
+
+        cur_node_list = [self.node_list[end_idx]]  # start from the last node
+        all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
+
+        while len(cur_node_list) > 0:
+            next_node_list = []
+
+            for cur_node in cur_node_list:
+                # get cur node info
+                cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
+                cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
+                # 0 is a valid chunk dim: test against None explicitly, since
+                # _assgin_single_node_flow indexes both traces whenever the
+                # chunk dim is not None
+                if cur_node_chunk_dim is not None:
+                    cur_node_compute = self._find_compute_trace_from_node(cur_node)
+                    cur_node_source = self._find_source_trace_from_node(cur_node)
+                else:
+                    cur_node_compute = cur_node_source = None
+
+                # get all valid args
+                arg_list = []
+                for arg in cur_node.args:
+                    if type(arg) != type(cur_node):
+                        continue
+                    if is_non_compute_node(arg):
+                        continue
+                    arg_list.append(arg)
+                    flow_flag = self._assgin_single_node_flow(
+                        arg,
+                        start_idx,
+                        end_idx,
+                        cur_node_chunk_dim,
+                        cur_node_compute,
+                        cur_node_source,
+                        cur_node_fix_dim,
+                        all_node_info,
+                        next_node_list,
+                    )
+                    if not flow_flag:
+                        return None
+
+                if len(arg_list) == 2:
+                    if any(i in cur_node.name for i in ["add", "mul"]):
+                        for arg in arg_list:
+                            if not (
+                                start_idx
+                                <= find_idx_by_name(arg.name, self.node_list)
+                                < end_idx
+                            ):
+                                continue
+                            arg_chunk_dim = all_node_info[arg]["chunk_dim"]
+                            arg_fix_dim = all_node_info[arg]["fix_dim"]
+                            arg_shape = get_node_shape(arg)
+                            # add all dim as fix dim except chunk dim
+                            for i, shape in enumerate(arg_shape):
+                                if shape != 1 and i != cur_node_chunk_dim:
+                                    if i == arg_chunk_dim:
+                                        return None
+                                    if i not in arg_fix_dim:
+                                        arg_fix_dim.append(i)
+                    elif "einsum" in cur_node.name:
+                        pass
+                    elif "matmul" in cur_node.name:
+                        pass
+                    else:
+                        raise NotImplementedError()
+            cur_node_list = next_node_list
+
+        # for every input, record which dims each in-region user takes from it
+        inputs_dim = []
+        remove_inputs = []
+        for input_node in inputs:
+            input_dict = {}
+            input_node_idx = find_idx_by_name(input_node.name, self.node_list)
+            for user in input_node.users.keys():
+                if is_non_compute_node(user):
+                    continue
+                user_idx = find_idx_by_name(user.name, self.node_list)
+                if start_idx <= user_idx <= end_idx:
+                    chunk_dim = all_node_info[user]["chunk_dim"]
+                    if chunk_dim is not None:
+                        user_source = self._find_source_trace_from_node(user)[chunk_dim]
+                        if input_node_idx in user_source:
+                            input_dict[user_idx] = user_source[input_node_idx]
+                        else:
+                            return None
+            # an input no chunked user depends on is not a chunk input
+            if len(input_dict) == 0:
+                remove_inputs.append(input_node)
+            else:
+                inputs_dim.append(input_dict)
+        for i in remove_inputs:
+            if i in inputs:
+                inputs.remove(i)
+
+        chunk_info = {
+            "region": (start_idx, end_idx),
+            "inputs": inputs,
+            "inputs_non_chunk": [],
+            "inputs_dim": inputs_dim,
+            "outputs": outputs,
+            "outputs_dim": end_dim,
+            "node_chunk_dim": all_node_info,
+            "args": {},
+        }
+
+        # move useless nodes ahead of loop
+        # get all possible prepose nodes
+        maybe_prepose_nodes = []
+        for node, node_info in all_node_info.items():
+            if node_info["chunk_dim"] is None:
+                maybe_prepose_nodes.append(node)
+        maybe_prepose_nodes.sort(
+            key=lambda x: find_idx_by_name(x.name, self.node_list),
+            reverse=True,
+        )  # from last node to first node
+        prepose_nodes = []
+        # set every node as root, search its args, if all legal, turn root and args as prepose nodes
+        while len(maybe_prepose_nodes) > 0:
+            tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]]
+            tmp_cur_related_prepose_nodes = []
+            prepose_flag = True
+
+            # loop cur node's all arg until out of chunk
+            while len(tmp_cur_prepose_nodes) > 0:
+                if not prepose_flag:
+                    break
+                tmp_next_prepose_nodes = []
+                tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes)
+                for cur_prepose_node in tmp_cur_prepose_nodes:
+                    if not prepose_flag:
+                        break
+                    for cur_prepose_node_arg in cur_prepose_node.args:
+                        if type(cur_prepose_node_arg) != type(cur_prepose_node):
+                            continue
+                        # out of loop
+                        if not (
+                            start_idx
+                            <= find_idx_by_name(
+                                cur_prepose_node_arg.name, self.node_list
+                            )
+                            < end_idx
+                        ):
+                            continue
+                        # compute op in loop
+                        elif cur_prepose_node_arg in all_node_info:
+                            if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None:
+                                tmp_next_prepose_nodes.append(cur_prepose_node_arg)
+                            else:
+                                # depends on a chunked node: root cannot be preposed
+                                prepose_flag = False
+                                break
+                        # non compute op
+                        else:
+                            tmp_next_prepose_nodes.append(cur_prepose_node_arg)
+                tmp_cur_prepose_nodes = tmp_next_prepose_nodes
+
+            if not prepose_flag:
+                maybe_prepose_nodes.remove(maybe_prepose_nodes[0])
+                continue
+            else:
+                for n in tmp_cur_related_prepose_nodes:
+                    if n not in prepose_nodes:
+                        prepose_nodes.append(n)
+                    if n in maybe_prepose_nodes:
+                        maybe_prepose_nodes.remove(n)
+        # sort by index
+        prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list))
+        chunk_info["args"]["prepose_nodes"] = prepose_nodes
+
+        # we need to log input nodes to avoid deleting them in the loop
+        chunk_node_list = self.node_list[start_idx : end_idx + 1]
+        # also need to get some prepose node's arg out of non_chunk_inputs
+        for n in prepose_nodes:
+            chunk_node_list.remove(n)
+        non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list)
+        for i in non_chunk_inputs:
+            if i not in chunk_info["inputs"]:
+                chunk_info["inputs_non_chunk"].append(i)
+
+        # reassign reshape size, some size may have changed due to chunk
+        chunk_info = self._reassgin_reshape_size(chunk_info)
+
+        return chunk_info
+
+    def _reassgin_reshape_size(self, chunk_info):
+        """
+        Record replacement size expressions for reshape/view args in the region.
+
+        For every node in the region whose name contains "reshape" or "view",
+        the size argument at the node's chunk dim (unless that dim was
+        generated by the view itself) is mapped to a
+        "min(chunk_size, <len> - chunk_idx)" expression, where <len> is the
+        length of the region output along its chunk dim.
+
+        Args:
+            chunk_info (dict): chunk info; gains a "reshape_size" entry of the
+                form {node name: {size arg name: expression}}
+
+        Returns:
+            dict: the same chunk_info object
+        """
+        region_start, region_end = chunk_info["region"][0], chunk_info["region"][1]
+        output_shape = get_node_shape(chunk_info["outputs"][0])
+        chunk_shape = output_shape[chunk_info["outputs_dim"]]
+
+        reshape_size = {}
+        for node in self.node_list[region_start : region_end + 1]:
+            if not any(i in node.name for i in ["reshape", "view"]):
+                continue
+            size_args = node.args[1:]
+            view_log = self.idx_view_list[node]
+            chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
+            node_sizes = {}
+            reshape_size[node.name] = node_sizes
+            for arg_dim, size_arg in enumerate(size_args):
+                # dims created by the view have no input size to rescale
+                if arg_dim in view_log["dim_to"]:
+                    continue
+                if arg_dim != chunk_dim:
+                    continue
+                node_sizes[size_arg.name] = (
+                    "min(chunk_size, %d - chunk_idx)" % chunk_shape
+                )
+        chunk_info["reshape_size"] = reshape_size
+        return chunk_info
+
+    def _get_reorder_map(self, chunk_info):
+        """
+        Build the old position -> new position map that moves the prepose
+        nodes of a chunk to the front of its region.
+
+        Args:
+            chunk_info (dict): uses "region" and "args"["prepose_nodes"]
+
+        Returns:
+            dict: maps every position of self.node_list to its new position;
+                positions outside the region map to themselves
+        """
+        region_start, region_end = chunk_info["region"][0], chunk_info["region"][1]
+        prepose_nodes = chunk_info["args"]["prepose_nodes"]
+        prepose_positions = [
+            find_idx_by_name(n.name, self.node_list) for n in prepose_nodes
+        ]
+
+        reorder_map = {i: i for i in range(len(self.node_list))}
+        # put prepose nodes ahead
+        for offset, old_pos in enumerate(prepose_positions):
+            reorder_map[old_pos] = region_start + offset
+        # put other nodes after prepose nodes
+        for n in self.node_list[region_start : region_end + 1]:
+            if n in prepose_nodes:
+                continue
+            n_idx = find_idx_by_name(n.name, self.node_list)
+            # shift by the number of prepose nodes that used to come later
+            shift = sum(1 for p in prepose_positions if n_idx < p)
+            reorder_map[n_idx] = n_idx + shift
+
+        return reorder_map
+
+    def _reorder_chunk_info(self, chunk_info, reorder_map):
+        """
+        Update a chunk info after its prepose nodes were moved to the front.
+
+        The region start is advanced by the number of prepose nodes, and the
+        keys of every "inputs_dim" dict are translated through reorder_map.
+
+        Args:
+            chunk_info (dict): modified in place
+            reorder_map (dict): old position -> new position
+
+        Returns:
+            dict: the same chunk_info object
+        """
+        region_start, region_end = chunk_info["region"][0], chunk_info["region"][1]
+        prepose_count = len(chunk_info["args"]["prepose_nodes"])
+        chunk_info["region"] = (region_start + prepose_count, region_end)
+        chunk_info["inputs_dim"] = [
+            {reorder_map[k]: v for k, v in input_dim.items()}
+            for input_dim in chunk_info["inputs_dim"]
+        ]
+        return chunk_info
+
+    def _update_all_reorder_map(self, reorder_map):
+        """
+        Compose a new reorder step onto the accumulated self.all_reorder_map.
+
+        Args:
+            reorder_map (dict): old position -> new position of the new step
+        """
+        accumulated = self.all_reorder_map
+        # snapshot the keys; only values are rewritten
+        for origin_idx in list(accumulated):
+            accumulated[origin_idx] = reorder_map[accumulated[origin_idx]]
+
+    def _reorder_self_node_list(self, reorder_map):
+        """
+        Replace self.node_list by a copy permuted according to reorder_map.
+
+        Args:
+            reorder_map (dict): old position -> new position
+        """
+        permuted = [None] * len(self.node_list)
+        for old_pos in reorder_map:
+            permuted[reorder_map[old_pos]] = self.node_list[old_pos]
+        self.node_list = permuted
+
+    def _reorder_idx_trace(self, reorder_map):
+        """
+        Apply a reorder to self.idx_trace_list.
+
+        The list itself is permuted, then every node position stored in the
+        "compute" lists and in the keys of the "source" dicts is translated
+        through reorder_map.
+
+        Args:
+            reorder_map (dict): old position -> new position
+        """
+        # reorder list
+        permuted = [None] * len(self.idx_trace_list)
+        for old_pos, new_pos in reorder_map.items():
+            permuted[new_pos] = self.idx_trace_list[old_pos]
+        self.idx_trace_list = permuted
+
+        # update compute
+        for trace in permuted:
+            for dim_compute in trace["compute"]:
+                for slot in range(len(dim_compute)):
+                    dim_compute[slot] = reorder_map[dim_compute[slot]]
+        # update source
+        for trace in permuted:
+            source = trace["source"]
+            for dim_idx in range(len(source)):
+                source[dim_idx] = {
+                    reorder_map[k]: v for k, v in source[dim_idx].items()
+                }
+
+    def reorder_all(self, chunk_info):
+        """
+        Move the prepose nodes of a chunk ahead of its region and update all
+        tracer state accordingly.
+
+        Args:
+            chunk_info (dict or None)
+
+        Returns:
+            the chunk_info, updated when it has prepose nodes; returned
+            unchanged when it is None or has no prepose nodes
+        """
+        if chunk_info is None or len(chunk_info["args"]["prepose_nodes"]) == 0:
+            return chunk_info
+        reorder_map = self._get_reorder_map(chunk_info)
+        # traces are reordered before the node list itself
+        self._update_all_reorder_map(reorder_map)
+        self._reorder_idx_trace(reorder_map)
+        self._reorder_self_node_list(reorder_map)
+        return self._reorder_chunk_info(chunk_info, reorder_map)
+
+    def reorder_node_list(self, node_list):
+        """
+        Return a copy of node_list permuted by the accumulated self.all_reorder_map.
+
+        Args:
+            node_list (list): nodes in their original order
+
+        Returns:
+            list: new list; the input list is not modified
+        """
+        permuted = [None] * len(node_list)
+        for old_pos in self.all_reorder_map:
+            permuted[self.all_reorder_map[old_pos]] = node_list[old_pos]
+        return permuted
+
+    def tmp_reorder(self, node_list, chunk_info):
+        """
+        Reorder a node list for one chunk without touching the tracer state.
+
+        Args:
+            node_list (list): nodes to reorder; not modified
+            chunk_info (dict): updated in place when it has prepose nodes
+
+        Returns:
+            tuple: (node list, chunk info). Both inputs are returned as they
+                are when the chunk has no prepose nodes.
+        """
+        if not len(chunk_info["args"]["prepose_nodes"]):
+            return node_list, chunk_info
+        reorder_map = self._get_reorder_map(chunk_info)
+
+        # new tmp node list
+        permuted = [None] * len(node_list)
+        for old_pos in reorder_map:
+            permuted[reorder_map[old_pos]] = node_list[old_pos]
+
+        return permuted, self._reorder_chunk_info(chunk_info, reorder_map)
diff --git a/colossalai/autochunk/memory_estiamtor.py b/colossalai/autochunk/memory_estiamtor.py
new file mode 100644
index 000000000000..c3d8b1803ce9
--- /dev/null
+++ b/colossalai/autochunk/memory_estiamtor.py
@@ -0,0 +1,318 @@
+import copy
+from typing import Any, Callable, Dict, Iterable, List, Tuple
+
+import torch
+from torch.fx.node import Node, map_arg
+
+from colossalai.fx.profiler import activation_size, parameter_size
+
+from .index_tracer import IndexTracer
+from .utils import (
+    delete_free_var_from_last_use,
+    find_idx_by_name,
+    get_node_shape,
+    is_non_compute_node_except_placeholder,
+)
+
+
+class MemoryEstimator(object):
+    def __init__(self, index_tracer: IndexTracer) -> None:
+        pass
+
+    def _get_meta_node_size(self, x):
+        x = x.meta["tensor_meta"]
+        x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
+        return x
+
+    def _get_output_node(self, n):
+        fwd_out = {
+            x.uuid: x
+            for x in n.meta["fwd_out"]
+            if isinstance(x, torch.Tensor) and hasattr(x, "uuid")
+        }
+        out_size = activation_size(fwd_out)
+        out_node = [n.name] if out_size > 0 else []
+        # if any(i in n.name for i in ['transpose', 'permute', 'view']):
+        #     out_size = 0
+        return out_size, out_node
+
+    def _get_output_node_size(self, n):
+        return self._get_output_node(n)[0]
+
+    def _add_active_node(self, n, active_list):
+        new_active = self._get_output_node(n)[1]
+        if n.op == "placeholder":
+            new_active.append(n.name)
+        for i in new_active:
+            if i not in active_list:
+                active_list.append(i)
+
+    def _get_delete_node(self, user, user_to_last_uses, to_keep=None):
+        delete_size = 0
+        delete_node = []
+        if user.op not in ("output",):
+            nodes_to_delete = user_to_last_uses.get(user, [])
+            if to_keep is not None:
+                keep_list = []
+                for n in nodes_to_delete:
+                    if n.name in to_keep:
+                        keep_list.append(n)
+                for n in keep_list:
+                    if n in nodes_to_delete:
+                        nodes_to_delete.remove(n)
+            if len(nodes_to_delete):
+                out_node = [self._get_output_node(i) for i in nodes_to_delete]
+                delete_size = sum([i[0] for i in out_node])
+                for i in range(len(out_node)):
+                    if out_node[i][0] > 0:
+                        delete_node.append(out_node[i][1][0])
+                    elif nodes_to_delete[i].op == "placeholder":
+                        delete_node.append(nodes_to_delete[i].name)
+                    # elif any(j in nodes_to_delete[i].name for j in ['transpose', 'permute', 'view']):
+                    #     delete_node.append(nodes_to_delete[i].name)
+        return delete_size, delete_node
+
+    def _get_delete_node_size(self, user, user_to_last_uses, to_keep):
+        return self._get_delete_node(user, user_to_last_uses, to_keep)[0]
+
+    def _remove_deactive_node(self, user, user_to_last_uses, active_list):
+        delete_node = self._get_delete_node(user, user_to_last_uses)[1]
+        for i in delete_node:
+            if i in active_list:
+                active_list.remove(i)
+
+    def _get_chunk_inputs_size(
+        self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx
+    ):
+        nodes_to_delete = []
+        for chunk_input in chunk_inputs + chunk_inputs_non_chunk:
+            chunk_input_users = chunk_input.users.keys()
+            chunk_input_users_idx = [
+                find_idx_by_name(i.name, node_list) for i in chunk_input_users
+            ]
+            if all(i <= chunk_end_idx for i in chunk_input_users_idx):
+                if chunk_input not in nodes_to_delete:
+                    nodes_to_delete.append(chunk_input)
+        out_node = [self._get_output_node(i) for i in nodes_to_delete]
+        delete_size = sum([i[0] for i in out_node])
+        return delete_size
+
+    def _get_last_usr(self, nodes):
+        node_to_last_use: Dict[Node, Node] = {}
+        user_to_last_uses: Dict[Node, List[Node]] = {}
+
+        def register_last_uses(n: Node, user: Node):
+            if n not in node_to_last_use:
+                node_to_last_use[n] = user
+                user_to_last_uses.setdefault(user, []).append(n)
+
+        for node in reversed(nodes):
+            map_arg(node.args, lambda n: register_last_uses(n, node))
+            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+        return user_to_last_uses
+
+    def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
+        mem = 0
+        not_contiguous_ops = ["permute"]
+        inherit_contiguous_ops = ["transpose", "view"]
+
+        if node.op == "call_function" and any(
+            n in node.name for n in ["matmul", "reshape"]
+        ):
+            for n in node.args:
+                if n in not_contiguous_list:
+                    # matmul won't change origin tensor, but create a tmp copy
+                    mem += self._get_output_node_size(n)
+        elif node.op == "call_module":
+            for n in node.args:
+                if n in not_contiguous_list:
+                    # module will just make origin tensor to contiguous
+                    if delete:
+                        not_contiguous_list.remove(n)
+        elif node.op == "call_method" and any(
+            i in node.name for i in not_contiguous_ops
+        ):
+            if node not in not_contiguous_list:
+                not_contiguous_list.append(node)
+        return mem
+
+    def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size):
+        if node not in chunk_node_dim:
+            return 1.0
+        node_shape = get_node_shape(node)
+        chunk_dim = chunk_node_dim[node]["chunk_dim"]
+        if chunk_dim is None:
+            return 1.0
+        else:
+            return float(chunk_size) / node_shape[chunk_dim]
+
+    def _get_chunk_delete_node_size(
+        self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names
+    ):
+        # if any(j in user.name for j in ['transpose', 'permute', 'view']):
+        #     return 0
+        if user.op in ("placeholder", "output"):
+            return 0
+        nodes_to_delete = user_to_last_uses.get(user, [])
+        delete_size = 0
+        for n in nodes_to_delete:
+            if n.name in chunk_inputs_names:
+                continue
+            delete_size += self._get_output_node_size(n) * chunk_ratio
+        return delete_size
+
+    def _print_mem_log(self, log, nodes, title=None):
+        if title:
+            print(title)
+        for idx, (l, n) in enumerate(zip(log, nodes)):
+            print("%s:%.2f \t" % (n.name, l), end="")
+            if (idx + 1) % 3 == 0:
+                print("")
+        print("\n")
+
+    def _print_compute_op_mem_log(self, log, nodes, title=None):
+        if title:
+            print(title)
+        for idx, (l, n) in enumerate(zip(log, nodes)):
+            if n.op in ["placeholder", "get_attr", "output"]:
+                continue
+            if any(i in n.name for i in ["getitem", "getattr"]):
+                continue
+            print("%s:%.2f \t" % (n.name, l), end="")
+            if (idx + 1) % 3 == 0:
+                print("")
+        print("\n")
+
+    def estimate_chunk_inference_mem(
+        self,
+        node_list,
+        chunk_infos=None,
+        print_mem=False,
+    ):
+        act_memory = 0.0
+        act_memory_peak_log = []
+        act_memory_after_node_log = []
+        active_node_list = []
+        active_node_list_log = []
+        not_contiguous_list = []
+        user_to_last_uses = self._get_last_usr(node_list)
+        user_to_last_uses_no_free_var = self._get_last_usr(node_list)
+        delete_free_var_from_last_use(user_to_last_uses_no_free_var)
+
+        use_chunk = True if chunk_infos is not None else False
+        chunk_within = False
+        chunk_region_idx = None
+        chunk_ratio = 1  # use it to estimate chunk mem
+        chunk_inputs_names = []
+
+        if use_chunk:
+            chunk_regions = [i["region"] for i in chunk_infos]
+            chunk_starts = [i[0] for i in chunk_regions]
+            chunk_ends = [i[1] for i in chunk_regions]
+            chunk_inputs = [i["inputs"] for i in chunk_infos]
+            chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
+            chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
+                j.name for i in chunk_inputs_non_chunk for j in i
+            ]
+            chunk_outputs = [i["outputs"][0] for i in chunk_infos]
+            chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos]
+            chunk_sizes = [
+                i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos
+            ]
+
+        for idx, node in enumerate(node_list):
+            # if node in chunk start nodes, change chunk ratio and add chunk_tensor
+            if use_chunk and idx in chunk_starts:
+                chunk_within = True
+                chunk_region_idx = chunk_starts.index(idx)
+                act_memory += self._get_output_node_size(
+                    chunk_outputs[chunk_region_idx]
+                ) / (1024**2)
+
+            # determine chunk ratio for current node
+            if chunk_within:
+                chunk_ratio = self._get_chunk_ratio(
+                    node,
+                    chunk_node_dim[chunk_region_idx],
+                    chunk_sizes[chunk_region_idx],
+                )
+
+            # if node is placeholder, just add the size of the node
+            if node.op == "placeholder":
+                act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2)
+                act_memory_peak_log.append(act_memory)
+            # skip output
+            elif node.op == "output":
+                continue
+            # no change for non compute node
+            elif is_non_compute_node_except_placeholder(node):
+                act_memory_peak_log.append(act_memory)
+            # node is a compute op
+            # calculate tmp, output node and delete node memory
+            else:
+                # forward memory
+                # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose
+                act_memory += (
+                    self._get_contiguous_memory(node, not_contiguous_list)
+                    * chunk_ratio
+                    / (1024**2)
+                )
+                act_memory += (
+                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
+                )
+                # record max act memory
+                act_memory_peak_log.append(act_memory)
+                # delete useless memory
+                act_memory -= (
+                    self._get_contiguous_memory(node, not_contiguous_list, delete=True)
+                    * chunk_ratio
+                    / (1024**2)
+                )
+                # delete unused vars not in chunk_input_list
+                # we can't delete input nodes until chunk ends
+                if chunk_within:
+                    act_memory -= self._get_chunk_delete_node_size(
+                        node,
+                        user_to_last_uses_no_free_var,
+                        chunk_ratio,
+                        chunk_inputs_names,
+                    ) / (1024**2)
+                else:
+                    act_memory -= self._get_delete_node_size(
+                        node, user_to_last_uses_no_free_var, chunk_inputs_names
+                    ) / (1024**2)
+
+            # log active node, only effective without chunk
+            self._add_active_node(node, active_node_list)
+            self._remove_deactive_node(node, user_to_last_uses, active_node_list)
+
+            # if node in chunk end nodes, restore chunk settings
+            if use_chunk and idx in chunk_ends:
+                act_memory -= (
+                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
+                )
+                act_memory -= self._get_chunk_inputs_size(
+                    chunk_inputs[chunk_region_idx],
+                    chunk_inputs_non_chunk[chunk_region_idx],
+                    node_list,
+                    chunk_regions[chunk_region_idx][1],
+                ) / (1024**2)
+                chunk_within = False
+                chunk_ratio = 1
+                chunk_region_idx = None
+
+            act_memory_after_node_log.append(act_memory)
+            active_node_list_log.append(copy.deepcopy(active_node_list))
+
+        if print_mem:
+            print("with chunk" if use_chunk else "without chunk")
+            # self._print_mem_log(act_memory_peak_log, node_list, "peak")
+            # self._print_mem_log(act_memory_after_node_log, node_list, "after")
+            self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
+            # self._print_compute_op_mem_log(
+            #     act_memory_after_node_log, node_list, "after"
+            # )
+
+        # param_memory = parameter_size(gm)
+        # all_memory = act_memory + param_memory
+        return act_memory_peak_log, act_memory_after_node_log, active_node_list_log
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
new file mode 100644
index 000000000000..b62a6600adc8
--- /dev/null
+++ b/colossalai/autochunk/utils.py
@@ -0,0 +1,95 @@
+from typing import Any, Callable, Dict, Iterable, List, Tuple
+
+from torch.fx.node import Node
+
+
+def is_non_compute_node(node):
+    if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(
+        i in node.name for i in ["getitem", "getattr"]
+    ):
+        return True
+    return False
+
+
+def get_node_shape(node):
+    if hasattr(node.meta["tensor_meta"], "shape"):
+        return node.meta["tensor_meta"].shape
+    return None
+
+
+def is_non_compute_node_except_placeholder(node):
+    if any(i in node.op for i in ["get_attr", "output"]) or any(
+        i in node.name for i in ["getitem", "getattr"]
+    ):
+        return True
+    return False
+
+
+def is_non_compute_node_except_placeholder_output(node):
+    if any(i in node.op for i in ["get_attr"]) or any(
+        i in node.name for i in ["getitem", "getattr"]
+    ):
+        return True
+    return False
+
+
+def find_idx_by_name(name, nodes_list):
+    for idx, node in enumerate(nodes_list):
+        if node.name == name:
+            return idx
+    raise RuntimeError("name %s not found in node list" % name)
+
+
+def delete_free_var_from_last_use(user_to_last_uses):
+    for key, value in user_to_last_uses.items():
+        for n in value:
+            if n.op == "placeholder":
+                user_to_last_uses[key].remove(n)
+
+
+def find_chunk_all_input_nodes(nodes: List[Node]):
+    """
+    Find non-compute input and output node names.
+    input nodes are nodes used in the list
+    output nodes are nodes will use nodes in the list
+    """
+    input_nodes = []
+    for node in nodes:
+        for input_node in node._input_nodes.keys():
+            if input_node not in nodes and input_node not in input_nodes:
+                input_nodes.append(input_node)
+    return input_nodes
+
+
+def find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
+    """
+    Find non-compute input and output node names.
+    input nodes are nodes used in the list
+    output nodes are nodes will use nodes in the list
+    """
+    input_nodes = []
+    output_nodes = []
+
+    # if a node has an input node which is not in the node list
+    # we treat that input node as the input of the checkpoint function
+    for node in nodes:
+        for input_node in node._input_nodes.keys():
+            if (
+                input_node not in nodes
+                and input_node not in input_nodes
+                and not is_non_compute_node_except_placeholder(input_node)
+            ):
+                input_nodes.append(input_node)
+
+    # if a node has a user node which is not in the node list
+    # we treat that user node as the node receiving the current node output
+    for node in nodes:
+        for output_node in node.users.keys():
+            if (
+                output_node not in nodes
+                and node not in output_nodes
+                and not is_non_compute_node_except_placeholder_output(output_node)
+            ):
+                output_nodes.append(node)
+
+    return input_nodes, output_nodes
diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py
index 8df6d9ff4564..702eb7026bb7 100644
--- a/tests/test_autochunk/benchmark_autochunk.py
+++ b/tests/test_autochunk/benchmark_autochunk.py
@@ -3,7 +3,7 @@
 import torch
 import torch.fx
 
-from colossalai.autochunk.chunk_codegen import ChunkCodeGen
+from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.fx import ColoTracer
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
@@ -49,25 +49,29 @@ def _build_autochunk(model, max_memory, node, pair):
             "pair": pair.to(torch.device("meta")),
         },
     )
+
     gm_prop = torch.fx.symbolic_trace(model)  # must use symbolic_trace
     interp = MetaInfoProp(gm_prop)
     interp.propagate(
         MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
     )
+
     # now run it twice to get meta info in graph module, not necessary
     gm = torch.fx.GraphModule(model, graph)
     interp = MetaInfoProp(gm)
     interp.propagate(
         MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
     )
+
     # set code_gen
-    codegen = ChunkCodeGen(gm_prop, max_memory)
+    codegen = AutoChunkCodeGen(gm_prop, max_memory)
     graph.set_codegen(codegen)
     gm = ColoGraphModule(model, graph)
     gm.recompile()
+
     # print
-    code = graph.python_code("self").src
-    print(code)
+    # code = graph.python_code("self").src
+    # print(code)
     return gm
 
 
diff --git a/tests/test_autochunk/test_autochunk.py b/tests/test_autochunk/test_autochunk.py
index caa2d9a80254..85a162084cc9 100644
--- a/tests/test_autochunk/test_autochunk.py
+++ b/tests/test_autochunk/test_autochunk.py
@@ -4,7 +4,7 @@
 import torch.multiprocessing as mp
 
 import colossalai
-from colossalai.autochunk.chunk_codegen import ChunkCodeGen
+from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.core import global_context as gpc
 from colossalai.fx import ColoTracer
 from colossalai.fx.graph_module import ColoGraphModule
@@ -82,7 +82,7 @@ def _run_offload_codegen(rank):
         MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
     )
 
-    codegen = ChunkCodeGen(gm_prop)
+    codegen = AutoChunkCodeGen(gm_prop)
     graph.set_codegen(codegen)
     gm = ColoGraphModule(model, graph)
     gm.recompile()

From 8a634af2f5510954e7a992c0ee894d22cf9e26d2 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 14:19:45 +0800
Subject: [PATCH 088/503] close mem and code print

---
 colossalai/autochunk/autochunk_codegen.py   |  4 ++--
 colossalai/autochunk/chunk_region_search.py | 11 +++++++----
 tests/test_autochunk/benchmark_autochunk.py |  2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 58a8c375136e..dcc6bba9ed0a 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -214,13 +214,13 @@ def emit_code_with_chunk(
 if CODEGEN_AVAILABLE:
 
     class AutoChunkCodeGen(CodeGen):
-        def __init__(self, meta_graph, max_memory=None):
+        def __init__(self, meta_graph, max_memory=None, print_mem=False):
             super().__init__()
             self.meta_graph = meta_graph
             self.max_memory = max_memory
             self.meta_node = list(meta_graph.graph.nodes)
             # find the chunk regions
-            self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory)
+            self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory, print_mem)
             self.chunk_infos = self.chunk_region_search.search_region()
 
         def _gen_python_code(
diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py
index 0d0825f2584e..76b02cadeb3b 100644
--- a/colossalai/autochunk/chunk_region_search.py
+++ b/colossalai/autochunk/chunk_region_search.py
@@ -6,8 +6,9 @@
 
 
 class ChunkRegionSearch(object):
-    def __init__(self, gm, max_memory=None) -> None:
+    def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.gm = gm
+        self.print_mem = print_mem
         self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
         self.memory_estimator = MemoryEstimator(self.index_tracer)
@@ -204,8 +205,10 @@ def search_region(self):
             )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
-        self.memory_estimator.estimate_chunk_inference_mem(
-            self.index_tracer.node_list, chunk_infos, print_mem=True
-        )
+        if self.print_mem:
+            self.print_mem = False
+            self.memory_estimator.estimate_chunk_inference_mem(
+                self.index_tracer.node_list, chunk_infos, print_mem=True
+            )
         return chunk_infos
 
diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py
index 702eb7026bb7..9daaa364a710 100644
--- a/tests/test_autochunk/benchmark_autochunk.py
+++ b/tests/test_autochunk/benchmark_autochunk.py
@@ -64,7 +64,7 @@ def _build_autochunk(model, max_memory, node, pair):
     )
 
     # set code_gen
-    codegen = AutoChunkCodeGen(gm_prop, max_memory)
+    codegen = AutoChunkCodeGen(gm_prop, max_memory, print_mem=False)
     graph.set_codegen(codegen)
     gm = ColoGraphModule(model, graph)
     gm.recompile()

From 2bde9d2b7fd43f3160088b820d926301f6527ebf Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 14:21:49 +0800
Subject: [PATCH 089/503] code format

---
 colossalai/autochunk/autochunk_codegen.py   |  4 +++-
 colossalai/autochunk/chunk_region_search.py | 14 +++++++++-----
 colossalai/autochunk/memory_estiamtor.py    |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index dcc6bba9ed0a..fbd5d5e368dc 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -220,7 +220,9 @@ def __init__(self, meta_graph, max_memory=None, print_mem=False):
             self.max_memory = max_memory
             self.meta_node = list(meta_graph.graph.nodes)
             # find the chunk regions
-            self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory, print_mem)
+            self.chunk_region_search = ChunkRegionSearch(
+                meta_graph, max_memory, print_mem
+            )
             self.chunk_infos = self.chunk_region_search.search_region()
 
         def _gen_python_code(
diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py
index 76b02cadeb3b..7a0e8a36cd6c 100644
--- a/colossalai/autochunk/chunk_region_search.py
+++ b/colossalai/autochunk/chunk_region_search.py
@@ -1,8 +1,13 @@
+import copy
+
+from .chunk_selector import ChunkSelector
 from .index_tracer import IndexTracer
 from .memory_estiamtor import MemoryEstimator
-from .chunk_selector import ChunkSelector
-import copy
-from .utils import is_non_compute_node, is_non_compute_node_except_placeholder, get_node_shape
+from .utils import (
+    get_node_shape,
+    is_non_compute_node,
+    is_non_compute_node_except_placeholder,
+)
 
 
 class ChunkRegionSearch(object):
@@ -11,7 +16,7 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.print_mem = print_mem
         self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
-        self.memory_estimator = MemoryEstimator(self.index_tracer)
+        self.memory_estimator = MemoryEstimator()
         self.chunk_selector = ChunkSelector(
             self.index_tracer, self.memory_estimator, max_memory=max_memory
         )
@@ -211,4 +216,3 @@ def search_region(self):
                 self.index_tracer.node_list, chunk_infos, print_mem=True
             )
         return chunk_infos
-
diff --git a/colossalai/autochunk/memory_estiamtor.py b/colossalai/autochunk/memory_estiamtor.py
index c3d8b1803ce9..034f59e52858 100644
--- a/colossalai/autochunk/memory_estiamtor.py
+++ b/colossalai/autochunk/memory_estiamtor.py
@@ -16,7 +16,7 @@
 
 
 class MemoryEstimator(object):
-    def __init__(self, index_tracer: IndexTracer) -> None:
+    def __init__(self) -> None:
         pass
 
     def _get_meta_node_size(self, x):

From fd87d78a28a70fcb840c16d4084f67926ecc309c Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 14:28:04 +0800
Subject: [PATCH 090/503] rename ambiguous variable

---
 colossalai/autochunk/chunk_selector.py        | 14 +++++++-------
 tests/test_autochunk/evoformer/ops.py         |  6 +++---
 tests/test_autochunk/openfold/tensor_utils.py |  8 ++++----
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/colossalai/autochunk/chunk_selector.py b/colossalai/autochunk/chunk_selector.py
index f84322082cc4..aeab66572099 100644
--- a/colossalai/autochunk/chunk_selector.py
+++ b/colossalai/autochunk/chunk_selector.py
@@ -126,14 +126,14 @@ def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
         )
         return chunk_info
 
-    def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos):
-        if l >= 16:
+    def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos):
+        if left >= 16:
             gap = 4
         else:
             gap = 1
         chunk_info = chunk_region_dict["reorder_chunk_info"]
-        while r >= l + gap:
-            mid = int((l + r) / 2 + 0.5)
+        while right >= left + gap:
+            mid = int((left + right) / 2 + 0.5)
             chunk_info["chunk_size"] = mid
             cur_chunk_infos = chunk_infos + [chunk_info]
             cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
@@ -143,10 +143,10 @@ def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos):
                 cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
             )
             if cur_chunk_max_mem >= self.max_memory:
-                r = mid - gap
+                right = mid - gap
             else:
-                l = mid + gap
-        return l
+                left = mid + gap
+        return left
 
     def _get_compute_node_num(self, start, end):
         count = 0
diff --git a/tests/test_autochunk/evoformer/ops.py b/tests/test_autochunk/evoformer/ops.py
index 611b7b0fe777..a56057522eaa 100755
--- a/tests/test_autochunk/evoformer/ops.py
+++ b/tests/test_autochunk/evoformer/ops.py
@@ -67,10 +67,10 @@ def forward(self, M):
         left_act = self.linear_a(M)
         right_act = self.linear_b(M)
 
-        O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
+        o = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
         # O = rearrange(O, 'b i j d e -> b i j (d e)')
-        O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1)
-        Z = self.o_linear(O)
+        o = o.reshape(o.shape[0], o.shape[1], o.shape[2], -1)
+        Z = self.o_linear(o)
 
         return Z
 
diff --git a/tests/test_autochunk/openfold/tensor_utils.py b/tests/test_autochunk/openfold/tensor_utils.py
index 7e5e8e4b6b5e..384a71fb5ffd 100644
--- a/tests/test_autochunk/openfold/tensor_utils.py
+++ b/tests/test_autochunk/openfold/tensor_utils.py
@@ -157,12 +157,12 @@ def _get_minimal_slice_set(
     # start_edges and end_edges both indicate whether, starting from any given
     # dimension, the start/end index is at the top/bottom edge of the
     # corresponding tensor, modeled as a tree
-    def reduce_edge_list(l):
+    def reduce_edge_list(ll):
         tally = 1
-        for i in range(len(l)):
+        for i in range(len(ll)):
             reversed_idx = -1 * (i + 1)
-            l[reversed_idx] *= tally
-            tally = l[reversed_idx]
+            ll[reversed_idx] *= tally
+            tally = ll[reversed_idx]
 
     if(start_edges is None):
         start_edges = [s == 0 for s in start]

From ae27a8b26d7a36a3d9215fc6fd1db92982bdeef7 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 14:57:33 +0800
Subject: [PATCH 091/503] separate flow tracer

---
 colossalai/autochunk/index_tracer.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py
index 7a86f3c998fb..0323e3a7e07d 100644
--- a/colossalai/autochunk/index_tracer.py
+++ b/colossalai/autochunk/index_tracer.py
@@ -745,14 +745,7 @@ def _assgin_single_node_flow(
         next_node_list.append(arg_node)
         return True
 
-    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = find_chunk_compute_input_and_output_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        # only single ouput
-        if len(outputs) > 1:
-            return None
-
+    def _get_all_node_info(self, end_dim, start_idx, end_idx):
         cur_node_list = [self.node_list[end_idx]]  # start from the last node
         all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
 
@@ -763,7 +756,6 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
                 # get cur node info
                 cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
                 cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
-                cur_node_idx = find_idx_by_name(cur_node.name, self.node_list)
                 if cur_node_chunk_dim:
                     cur_node_compute = self._find_compute_trace_from_node(cur_node)
                     cur_node_source = self._find_source_trace_from_node(cur_node)
@@ -818,6 +810,20 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
                     else:
                         raise NotImplementedError()
             cur_node_list = next_node_list
+        return all_node_info
+
+    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
+        inputs, outputs = find_chunk_compute_input_and_output_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        # only single ouput
+        if len(outputs) > 1:
+            return None
+
+        # get every node's chunk dim and fix dim
+        all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
+        if all_node_info is None:
+            return None
 
         inputs_dim = []
         remove_inputs = []

From f4a1607e5645e3a537df6e88b67fb57a8fc6ed4f Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 15:36:17 +0800
Subject: [PATCH 092/503] separate input node dim search

---
 colossalai/autochunk/index_tracer.py | 35 +++++++++++++++++-----------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py
index 0323e3a7e07d..221217e2d101 100644
--- a/colossalai/autochunk/index_tracer.py
+++ b/colossalai/autochunk/index_tracer.py
@@ -812,19 +812,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
             cur_node_list = next_node_list
         return all_node_info
 
-    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = find_chunk_compute_input_and_output_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        # only single ouput
-        if len(outputs) > 1:
-            return None
-
-        # get every node's chunk dim and fix dim
-        all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
-        if all_node_info is None:
-            return None
-
+    def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
         inputs_dim = []
         remove_inputs = []
         for input_node in inputs:
@@ -841,7 +829,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
                         if input_node_idx in user_source:
                             input_dict[user_idx] = user_source[input_node_idx]
                         else:
-                            return None
+                            return None, None
             if len(input_dict) == 0:
                 remove_inputs.append(input_node)
             else:
@@ -849,6 +837,25 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         for i in remove_inputs:
             if i in inputs:
                 inputs.remove(i)
+        return inputs, inputs_dim
+
+    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
+        inputs, outputs = find_chunk_compute_input_and_output_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        # only single ouput
+        if len(outputs) > 1:
+            return None
+
+        # get every node's chunk dim and fix dim
+        all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
+        if all_node_info is None:
+            return None
+
+        # get input nodes' chunk dim
+        inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info)
+        if inputs is None:
+            return None
 
         chunk_info = {
             "region": (start_idx, end_idx),

From f856611d217e13c11ea382fe9d8f8af4cdeabb49 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 15:47:17 +0800
Subject: [PATCH 093/503] separate prepose_nodes

---
 colossalai/autochunk/index_tracer.py | 68 +++++++++++++++-------------
 1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py
index 221217e2d101..206d2edbd5df 100644
--- a/colossalai/autochunk/index_tracer.py
+++ b/colossalai/autochunk/index_tracer.py
@@ -839,36 +839,7 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
                 inputs.remove(i)
         return inputs, inputs_dim
 
-    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = find_chunk_compute_input_and_output_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        # only single ouput
-        if len(outputs) > 1:
-            return None
-
-        # get every node's chunk dim and fix dim
-        all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
-        if all_node_info is None:
-            return None
-
-        # get input nodes' chunk dim
-        inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info)
-        if inputs is None:
-            return None
-
-        chunk_info = {
-            "region": (start_idx, end_idx),
-            "inputs": inputs,
-            "inputs_non_chunk": [],
-            "inputs_dim": inputs_dim,
-            "outputs": outputs,
-            "outputs_dim": end_dim,
-            "node_chunk_dim": all_node_info,
-            "args": {},
-        }
-
-        # move useless nodes ahead of loop
+    def _set_prepose_nodes(self, all_node_info, start_idx, end_idx):
         # get all possible prepose nodes
         maybe_prepose_nodes = []
         for node, node_info in all_node_info.items():
@@ -929,12 +900,45 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
                         maybe_prepose_nodes.remove(n)
         # sort by index
         prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list))
-        chunk_info["args"]["prepose_nodes"] = prepose_nodes
+
+        return prepose_nodes
+    
+    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
+        inputs, outputs = find_chunk_compute_input_and_output_nodes(
+            self.node_list[start_idx : end_idx + 1]
+        )
+        # only single ouput
+        if len(outputs) > 1:
+            return None
+
+        # get every node's chunk dim and fix dim
+        all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
+        if all_node_info is None:
+            return None
+
+        # get input nodes' chunk dim
+        inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info)
+        if inputs is None:
+            return None
+
+        chunk_info = {
+            "region": (start_idx, end_idx),
+            "inputs": inputs,
+            "inputs_non_chunk": [],
+            "inputs_dim": inputs_dim,
+            "outputs": outputs,
+            "outputs_dim": end_dim,
+            "node_chunk_dim": all_node_info,
+            "args": {},
+        }
+
+        # move useless nodes ahead of loop
+        chunk_info["args"]["prepose_nodes"] = self._set_prepose_nodes(all_node_info, start_idx, end_idx)
 
         # we need to log input nodes to avoid deleteing them in the loop
         chunk_node_list = self.node_list[start_idx : end_idx + 1]
         # also need to get some prepose node's arg out of non_chunk_inputs
-        for n in prepose_nodes:
+        for n in chunk_info["args"]["prepose_nodes"]:
             chunk_node_list.remove(n)
         non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list)
         for i in non_chunk_inputs:

From 6685a9d022a912ab3d0a57486b045b92b3f681ce Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 15:53:24 +0800
Subject: [PATCH 094/503] separate non chunk input

---
 colossalai/autochunk/index_tracer.py | 35 +++++++++++++++++-----------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py
index 206d2edbd5df..202044763b0f 100644
--- a/colossalai/autochunk/index_tracer.py
+++ b/colossalai/autochunk/index_tracer.py
@@ -839,7 +839,7 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
                 inputs.remove(i)
         return inputs, inputs_dim
 
-    def _set_prepose_nodes(self, all_node_info, start_idx, end_idx):
+    def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
         # get all possible prepose nodes
         maybe_prepose_nodes = []
         for node, node_info in all_node_info.items():
@@ -902,7 +902,19 @@ def _set_prepose_nodes(self, all_node_info, start_idx, end_idx):
         prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list))
 
         return prepose_nodes
-    
+
+    def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
+        # we need to log input nodes to avoid deleteing them in the loop
+        chunk_node_list = self.node_list[start_idx : end_idx + 1]
+        # also need to get some prepose node's arg out of non_chunk_inputs
+        for n in chunk_info["args"]["prepose_nodes"]:
+            chunk_node_list.remove(n)
+        non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list)
+        for i in non_chunk_inputs:
+            if i not in chunk_info["inputs"]:
+                chunk_info["inputs_non_chunk"].append(i)
+        return chunk_info
+
     def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         inputs, outputs = find_chunk_compute_input_and_output_nodes(
             self.node_list[start_idx : end_idx + 1]
@@ -917,7 +929,9 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
             return None
 
         # get input nodes' chunk dim
-        inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info)
+        inputs, inputs_dim = self._get_input_nodes_dim(
+            inputs, start_idx, end_idx, all_node_info
+        )
         if inputs is None:
             return None
 
@@ -933,17 +947,12 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         }
 
         # move useless nodes ahead of loop
-        chunk_info["args"]["prepose_nodes"] = self._set_prepose_nodes(all_node_info, start_idx, end_idx)
+        chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes(
+            all_node_info, start_idx, end_idx
+        )
 
-        # we need to log input nodes to avoid deleteing them in the loop
-        chunk_node_list = self.node_list[start_idx : end_idx + 1]
-        # also need to get some prepose node's arg out of non_chunk_inputs
-        for n in chunk_info["args"]["prepose_nodes"]:
-            chunk_node_list.remove(n)
-        non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list)
-        for i in non_chunk_inputs:
-            if i not in chunk_info["inputs"]:
-                chunk_info["inputs_non_chunk"].append(i)
+        # find non chunk inputs
+        chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)
 
         # reassgin reshape size, some size may have changed due to chunk
         chunk_info = self._reassgin_reshape_size(chunk_info)

From c3d72f7db9e2fc28e9a3aa92749f08c7a7d51e42 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 16:53:01 +0800
Subject: [PATCH 095/503] separate reorder

---
 colossalai/autochunk/autochunk_codegen.py   |  4 +--
 colossalai/autochunk/chunk_region_search.py |  7 +++--
 colossalai/autochunk/chunk_selector.py      |  8 ++++--
 colossalai/autochunk/index_tracer.py        | 31 ++++++++++++---------
 4 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index fbd5d5e368dc..b4144196accc 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -103,7 +103,7 @@ def emit_code_with_chunk(
     nodes,
     emit_node_func,
     delete_unused_value_func,
-    chunk_region_search,
+    chunk_region_search: ChunkRegionSearch,
     chunk_infos,
 ):
     """Emit code with nested activation checkpoint
@@ -133,7 +133,7 @@ def emit_code_with_chunk(
     chunk_outputs = [i["outputs"][0] for i in chunk_infos]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos]
 
-    node_list = chunk_region_search.index_tracer.reorder_node_list(node_list)
+    node_list = chunk_region_search.reorder_graph.reorder_node_list(node_list)
     node_idx = 0
     region_idx = 0
     within_chunk_region = False
diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py
index 7a0e8a36cd6c..47e2fe13ceb5 100644
--- a/colossalai/autochunk/chunk_region_search.py
+++ b/colossalai/autochunk/chunk_region_search.py
@@ -1,7 +1,7 @@
 import copy
 
 from .chunk_selector import ChunkSelector
-from .index_tracer import IndexTracer
+from .index_tracer import IndexTracer, ReorderGraph
 from .memory_estiamtor import MemoryEstimator
 from .utils import (
     get_node_shape,
@@ -16,9 +16,10 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.print_mem = print_mem
         self.index_tracer = IndexTracer(list(gm.graph.nodes))
         self.index_tracer.trace_index()
+        self.reorder_graph = ReorderGraph(self.index_tracer)
         self.memory_estimator = MemoryEstimator()
         self.chunk_selector = ChunkSelector(
-            self.index_tracer, self.memory_estimator, max_memory=max_memory
+            self.index_tracer, self.memory_estimator, self.reorder_graph, max_memory=max_memory
         )
 
     def _find_peak_node(self, mem_peak):
@@ -175,7 +176,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
         best_chunk_region = self.chunk_selector._select_best_chunk_region(
             possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak
         )
-        best_chunk_region = self.index_tracer.reorder_all(best_chunk_region)
+        best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region)
         return best_chunk_region
 
     def _stop_search(self, init_mem_peak, mem_peak):
diff --git a/colossalai/autochunk/chunk_selector.py b/colossalai/autochunk/chunk_selector.py
index aeab66572099..119ff8aafdd0 100644
--- a/colossalai/autochunk/chunk_selector.py
+++ b/colossalai/autochunk/chunk_selector.py
@@ -1,4 +1,4 @@
-from .index_tracer import IndexTracer
+from .index_tracer import IndexTracer, ReorderGraph
 from .memory_estiamtor import MemoryEstimator
 from .utils import is_non_compute_node
 
@@ -8,10 +8,12 @@ def __init__(
         self,
         index_tracer: IndexTracer,
         memory_estimator: MemoryEstimator,
+        reorder_graph: ReorderGraph,
         max_memory=None,
     ):
         self.index_tracer = index_tracer
         self.memory_estimator = memory_estimator
+        self.reorder_graph = reorder_graph
         if max_memory is not None:
             self.stratge = "fit_memory"
             self.max_memory = max_memory  # MB
@@ -64,7 +66,7 @@ def _select_fit_memory_chunk_region(
         regions_dict = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
-            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
+            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(
                 self.index_tracer.node_list, cur_region
             )
             cur_chunk_infos = chunk_infos + [cur_region]
@@ -174,7 +176,7 @@ def _select_min_memory_chunk_region(
         regions_dict = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
-            cur_node_list, cur_region = self.index_tracer.tmp_reorder(
+            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(
                 self.index_tracer.node_list, cur_region
             )
             cur_chunk_infos = chunk_infos + [cur_region]
diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py
index 202044763b0f..8b4d3aabd13a 100644
--- a/colossalai/autochunk/index_tracer.py
+++ b/colossalai/autochunk/index_tracer.py
@@ -17,7 +17,6 @@ def __init__(self, node_list) -> None:
         self.idx_trace_equal = []
         self.idx_view_list = {}
         self.idx_count = -1
-        self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))}
 
     def _init_idx_trace_list(self):
         idx_trace_list = []
@@ -981,24 +980,30 @@ def _reassgin_reshape_size(self, chunk_info):
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
 
+
+class ReorderGraph(object):
+    def __init__(self, index_tracer: IndexTracer) -> None:
+        self.index_tracer = index_tracer
+        self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))}
+
     def _get_reorder_map(self, chunk_info):
-        reorder_map = {i: i for i in range(len(self.node_list))}
+        reorder_map = {i: i for i in range(len(self.index_tracer.node_list))}
 
         chunk_region_start = chunk_info["region"][0]
         chunk_region_end = chunk_info["region"][1]
         chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
         chunk_prepose_nodes_idx = [
-            find_idx_by_name(i.name, self.node_list) for i in chunk_prepose_nodes
+            find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes
         ]
         # put prepose nodes ahead
         for idx, n in enumerate(chunk_prepose_nodes):
             n_idx = chunk_prepose_nodes_idx[idx]
             reorder_map[n_idx] = chunk_region_start + idx
         # put other nodes after prepose nodes
-        for n in self.node_list[chunk_region_start : chunk_region_end + 1]:
+        for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]:
             if n in chunk_prepose_nodes:
                 continue
-            n_idx = find_idx_by_name(n.name, self.node_list)
+            n_idx = find_idx_by_name(n.name, self.index_tracer.node_list)
             pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
             reorder_map[n_idx] = n_idx + pos
 
@@ -1024,25 +1029,25 @@ def _update_all_reorder_map(self, reorder_map):
             self.all_reorder_map[origin_idx] = reorder_map[map_idx]
 
     def _reorder_self_node_list(self, reorder_map):
-        new_node_list = [None for _ in range(len(self.node_list))]
+        new_node_list = [None for _ in range(len(self.index_tracer.node_list))]
         for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = self.node_list[old_idx]
-        self.node_list = new_node_list
+            new_node_list[new_idx] = self.index_tracer.node_list[old_idx]
+        self.index_tracer.node_list = new_node_list
 
     def _reorder_idx_trace(self, reorder_map):
         # reorder list
-        new_idx_trace_list = [None for _ in range(len(self.idx_trace_list))]
+        new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))]
         for old_idx, new_idx in reorder_map.items():
-            new_idx_trace_list[new_idx] = self.idx_trace_list[old_idx]
-        self.idx_trace_list = new_idx_trace_list
+            new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx]
+        self.index_tracer.idx_trace_list = new_idx_trace_list
         # update compute
-        for idx_trace in self.idx_trace_list:
+        for idx_trace in self.index_tracer.idx_trace_list:
             compute = idx_trace["compute"]
             for dim_compute in compute:
                 for idx, i in enumerate(dim_compute):
                     dim_compute[idx] = reorder_map[i]
         # update source
-        for idx_trace in self.idx_trace_list:
+        for idx_trace in self.index_tracer.idx_trace_list:
             source = idx_trace["source"]
             for dim_idx, dim_source in enumerate(source):
                 new_dim_source = {}

From da4076846d693be0153c8e89ee48ce25f56d09ce Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 17:09:37 +0800
Subject: [PATCH 096/503] rename

---
 colossalai/autochunk/autochunk_codegen.py          |  6 +++---
 .../{memory_estiamtor.py => estiamte_memory.py}    |  3 +--
 .../{chunk_region_search.py => search_chunk.py}    | 14 +++++++-------
 .../{chunk_selector.py => select_chunk.py}         | 10 +++++-----
 .../autochunk/{index_tracer.py => trace_index.py}  |  4 ++--
 tests/test_autochunk/benchmark_autochunk.py        |  2 +-
 6 files changed, 19 insertions(+), 20 deletions(-)
 rename colossalai/autochunk/{memory_estiamtor.py => estiamte_memory.py} (99%)
 rename colossalai/autochunk/{chunk_region_search.py => search_chunk.py} (96%)
 rename colossalai/autochunk/{chunk_selector.py => select_chunk.py} (97%)
 rename colossalai/autochunk/{index_tracer.py => trace_index.py} (99%)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index b4144196accc..3bb2e83be242 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -17,7 +17,7 @@
 
 import colossalai
 
-from .chunk_region_search import ChunkRegionSearch
+from .search_chunk import SearchChunk
 from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape
 
 CODEGEN_AVAILABLE = True
@@ -103,7 +103,7 @@ def emit_code_with_chunk(
     nodes,
     emit_node_func,
     delete_unused_value_func,
-    chunk_region_search: ChunkRegionSearch,
+    chunk_region_search: SearchChunk,
     chunk_infos,
 ):
     """Emit code with nested activation checkpoint
@@ -220,7 +220,7 @@ def __init__(self, meta_graph, max_memory=None, print_mem=False):
             self.max_memory = max_memory
             self.meta_node = list(meta_graph.graph.nodes)
             # find the chunk regions
-            self.chunk_region_search = ChunkRegionSearch(
+            self.chunk_region_search = SearchChunk(
                 meta_graph, max_memory, print_mem
             )
             self.chunk_infos = self.chunk_region_search.search_region()
diff --git a/colossalai/autochunk/memory_estiamtor.py b/colossalai/autochunk/estiamte_memory.py
similarity index 99%
rename from colossalai/autochunk/memory_estiamtor.py
rename to colossalai/autochunk/estiamte_memory.py
index 034f59e52858..90cfd66a00d5 100644
--- a/colossalai/autochunk/memory_estiamtor.py
+++ b/colossalai/autochunk/estiamte_memory.py
@@ -6,7 +6,6 @@
 
 from colossalai.fx.profiler import activation_size, parameter_size
 
-from .index_tracer import IndexTracer
 from .utils import (
     delete_free_var_from_last_use,
     find_idx_by_name,
@@ -15,7 +14,7 @@
 )
 
 
-class MemoryEstimator(object):
+class EstimateMemory(object):
     def __init__(self) -> None:
         pass
 
diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/search_chunk.py
similarity index 96%
rename from colossalai/autochunk/chunk_region_search.py
rename to colossalai/autochunk/search_chunk.py
index 47e2fe13ceb5..5c58bda0c393 100644
--- a/colossalai/autochunk/chunk_region_search.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -1,8 +1,8 @@
 import copy
 
-from .chunk_selector import ChunkSelector
-from .index_tracer import IndexTracer, ReorderGraph
-from .memory_estiamtor import MemoryEstimator
+from .select_chunk import SelectChunk
+from .trace_index import TraceIndex, ReorderGraph
+from .estiamte_memory import EstimateMemory
 from .utils import (
     get_node_shape,
     is_non_compute_node,
@@ -10,15 +10,15 @@
 )
 
 
-class ChunkRegionSearch(object):
+class SearchChunk(object):
     def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.gm = gm
         self.print_mem = print_mem
-        self.index_tracer = IndexTracer(list(gm.graph.nodes))
+        self.index_tracer = TraceIndex(list(gm.graph.nodes))
         self.index_tracer.trace_index()
         self.reorder_graph = ReorderGraph(self.index_tracer)
-        self.memory_estimator = MemoryEstimator()
-        self.chunk_selector = ChunkSelector(
+        self.memory_estimator = EstimateMemory()
+        self.chunk_selector = SelectChunk(
             self.index_tracer, self.memory_estimator, self.reorder_graph, max_memory=max_memory
         )
 
diff --git a/colossalai/autochunk/chunk_selector.py b/colossalai/autochunk/select_chunk.py
similarity index 97%
rename from colossalai/autochunk/chunk_selector.py
rename to colossalai/autochunk/select_chunk.py
index 119ff8aafdd0..f0262f1e57eb 100644
--- a/colossalai/autochunk/chunk_selector.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -1,13 +1,13 @@
-from .index_tracer import IndexTracer, ReorderGraph
-from .memory_estiamtor import MemoryEstimator
+from .trace_index import TraceIndex, ReorderGraph
+from .estiamte_memory import EstimateMemory
 from .utils import is_non_compute_node
 
 
-class ChunkSelector(object):
+class SelectChunk(object):
     def __init__(
         self,
-        index_tracer: IndexTracer,
-        memory_estimator: MemoryEstimator,
+        index_tracer: TraceIndex,
+        memory_estimator: EstimateMemory,
         reorder_graph: ReorderGraph,
         max_memory=None,
     ):
diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/trace_index.py
similarity index 99%
rename from colossalai/autochunk/index_tracer.py
rename to colossalai/autochunk/trace_index.py
index 8b4d3aabd13a..103a05dadbf5 100644
--- a/colossalai/autochunk/index_tracer.py
+++ b/colossalai/autochunk/trace_index.py
@@ -10,7 +10,7 @@
 )
 
 
-class IndexTracer(object):
+class TraceIndex(object):
     def __init__(self, node_list) -> None:
         self.node_list = node_list
         self.idx_trace_list = self._init_idx_trace_list()
@@ -982,7 +982,7 @@ def _reassgin_reshape_size(self, chunk_info):
 
 
 class ReorderGraph(object):
-    def __init__(self, index_tracer: IndexTracer) -> None:
+    def __init__(self, index_tracer: TraceIndex) -> None:
         self.index_tracer = index_tracer
         self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))}
 
diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py
index 9daaa364a710..081f01368a42 100644
--- a/tests/test_autochunk/benchmark_autochunk.py
+++ b/tests/test_autochunk/benchmark_autochunk.py
@@ -104,7 +104,7 @@ def benchmark_evoformer():
     model = evoformer_base().cuda()
 
     # build autochunk model
-    # max_memory = 10000  # MB fit memory mode
+    # max_memory = 1000  # MB fit memory mode
     max_memory = None  # min memory mode
     autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair)
 

From 4748967fb12747043c6688b3f13190203ade769f Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 17:13:18 +0800
Subject: [PATCH 097/503] add reorder graph

---
 colossalai/autochunk/reorder_graph.py | 108 ++++++++++++++++++++++++++
 colossalai/autochunk/trace_index.py   | 106 -------------------------
 2 files changed, 108 insertions(+), 106 deletions(-)
 create mode 100644 colossalai/autochunk/reorder_graph.py

diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py
new file mode 100644
index 000000000000..7b9f4a20d6ab
--- /dev/null
+++ b/colossalai/autochunk/reorder_graph.py
@@ -0,0 +1,108 @@
+from .trace_index import TraceIndex
+from .utils import find_idx_by_name
+
+
+class ReorderGraph(object):
+    def __init__(self, index_tracer: TraceIndex) -> None:
+        self.index_tracer = index_tracer
+        self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))}
+
+    def _get_reorder_map(self, chunk_info):
+        reorder_map = {i: i for i in range(len(self.index_tracer.node_list))}
+
+        chunk_region_start = chunk_info["region"][0]
+        chunk_region_end = chunk_info["region"][1]
+        chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
+        chunk_prepose_nodes_idx = [
+            find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes
+        ]
+        # put prepose nodes ahead
+        for idx, n in enumerate(chunk_prepose_nodes):
+            n_idx = chunk_prepose_nodes_idx[idx]
+            reorder_map[n_idx] = chunk_region_start + idx
+        # put other nodes after prepose nodes
+        for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]:
+            if n in chunk_prepose_nodes:
+                continue
+            n_idx = find_idx_by_name(n.name, self.index_tracer.node_list)
+            pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
+            reorder_map[n_idx] = n_idx + pos
+
+        return reorder_map
+
+    def _reorder_chunk_info(self, chunk_info, reorder_map):
+        # update chunk info
+        chunk_info["region"] = (
+            chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]),
+            chunk_info["region"][1],
+        )
+        new_inputs_dim = []
+        for idx, input_dim in enumerate(chunk_info["inputs_dim"]):
+            new_input_dim = {}
+            for k, v in input_dim.items():
+                new_input_dim[reorder_map[k]] = v
+            new_inputs_dim.append(new_input_dim)
+        chunk_info["inputs_dim"] = new_inputs_dim
+        return chunk_info
+
+    def _update_all_reorder_map(self, reorder_map):
+        for origin_idx, map_idx in self.all_reorder_map.items():
+            self.all_reorder_map[origin_idx] = reorder_map[map_idx]
+
+    def _reorder_self_node_list(self, reorder_map):
+        new_node_list = [None for _ in range(len(self.index_tracer.node_list))]
+        for old_idx, new_idx in reorder_map.items():
+            new_node_list[new_idx] = self.index_tracer.node_list[old_idx]
+        self.index_tracer.node_list = new_node_list
+
+    def _reorder_idx_trace(self, reorder_map):
+        # reorder list
+        new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))]
+        for old_idx, new_idx in reorder_map.items():
+            new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx]
+        self.index_tracer.idx_trace_list = new_idx_trace_list
+        # update compute
+        for idx_trace in self.index_tracer.idx_trace_list:
+            compute = idx_trace["compute"]
+            for dim_compute in compute:
+                for idx, i in enumerate(dim_compute):
+                    dim_compute[idx] = reorder_map[i]
+        # update source
+        for idx_trace in self.index_tracer.idx_trace_list:
+            source = idx_trace["source"]
+            for dim_idx, dim_source in enumerate(source):
+                new_dim_source = {}
+                for k, v in dim_source.items():
+                    new_dim_source[reorder_map[k]] = v
+                source[dim_idx] = new_dim_source
+
+    def reorder_all(self, chunk_info):
+        if chunk_info is None:
+            return chunk_info
+        if len(chunk_info["args"]["prepose_nodes"]) == 0:
+            return chunk_info
+        reorder_map = self._get_reorder_map(chunk_info)
+        self._update_all_reorder_map(reorder_map)
+        self._reorder_idx_trace(reorder_map)
+        self._reorder_self_node_list(reorder_map)
+        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
+        return chunk_info
+
+    def reorder_node_list(self, node_list):
+        new_node_list = [None for _ in range(len(node_list))]
+        for old_idx, new_idx in self.all_reorder_map.items():
+            new_node_list[new_idx] = node_list[old_idx]
+        return new_node_list
+
+    def tmp_reorder(self, node_list, chunk_info):
+        if len(chunk_info["args"]["prepose_nodes"]) == 0:
+            return node_list, chunk_info
+        reorder_map = self._get_reorder_map(chunk_info)
+
+        # new tmp node list
+        new_node_list = [None for _ in range(len(node_list))]
+        for old_idx, new_idx in reorder_map.items():
+            new_node_list[new_idx] = node_list[old_idx]
+
+        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
+        return new_node_list, chunk_info
diff --git a/colossalai/autochunk/trace_index.py b/colossalai/autochunk/trace_index.py
index 103a05dadbf5..3ac0d7f84272 100644
--- a/colossalai/autochunk/trace_index.py
+++ b/colossalai/autochunk/trace_index.py
@@ -979,109 +979,3 @@ def _reassgin_reshape_size(self, chunk_info):
                         )
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
-
-
-class ReorderGraph(object):
-    def __init__(self, index_tracer: TraceIndex) -> None:
-        self.index_tracer = index_tracer
-        self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))}
-
-    def _get_reorder_map(self, chunk_info):
-        reorder_map = {i: i for i in range(len(self.index_tracer.node_list))}
-
-        chunk_region_start = chunk_info["region"][0]
-        chunk_region_end = chunk_info["region"][1]
-        chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
-        chunk_prepose_nodes_idx = [
-            find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes
-        ]
-        # put prepose nodes ahead
-        for idx, n in enumerate(chunk_prepose_nodes):
-            n_idx = chunk_prepose_nodes_idx[idx]
-            reorder_map[n_idx] = chunk_region_start + idx
-        # put other nodes after prepose nodes
-        for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]:
-            if n in chunk_prepose_nodes:
-                continue
-            n_idx = find_idx_by_name(n.name, self.index_tracer.node_list)
-            pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
-            reorder_map[n_idx] = n_idx + pos
-
-        return reorder_map
-
-    def _reorder_chunk_info(self, chunk_info, reorder_map):
-        # update chunk info
-        chunk_info["region"] = (
-            chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]),
-            chunk_info["region"][1],
-        )
-        new_inputs_dim = []
-        for idx, input_dim in enumerate(chunk_info["inputs_dim"]):
-            new_input_dim = {}
-            for k, v in input_dim.items():
-                new_input_dim[reorder_map[k]] = v
-            new_inputs_dim.append(new_input_dim)
-        chunk_info["inputs_dim"] = new_inputs_dim
-        return chunk_info
-
-    def _update_all_reorder_map(self, reorder_map):
-        for origin_idx, map_idx in self.all_reorder_map.items():
-            self.all_reorder_map[origin_idx] = reorder_map[map_idx]
-
-    def _reorder_self_node_list(self, reorder_map):
-        new_node_list = [None for _ in range(len(self.index_tracer.node_list))]
-        for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = self.index_tracer.node_list[old_idx]
-        self.index_tracer.node_list = new_node_list
-
-    def _reorder_idx_trace(self, reorder_map):
-        # reorder list
-        new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))]
-        for old_idx, new_idx in reorder_map.items():
-            new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx]
-        self.index_tracer.idx_trace_list = new_idx_trace_list
-        # update compute
-        for idx_trace in self.index_tracer.idx_trace_list:
-            compute = idx_trace["compute"]
-            for dim_compute in compute:
-                for idx, i in enumerate(dim_compute):
-                    dim_compute[idx] = reorder_map[i]
-        # update source
-        for idx_trace in self.index_tracer.idx_trace_list:
-            source = idx_trace["source"]
-            for dim_idx, dim_source in enumerate(source):
-                new_dim_source = {}
-                for k, v in dim_source.items():
-                    new_dim_source[reorder_map[k]] = v
-                source[dim_idx] = new_dim_source
-
-    def reorder_all(self, chunk_info):
-        if chunk_info is None:
-            return chunk_info
-        if len(chunk_info["args"]["prepose_nodes"]) == 0:
-            return chunk_info
-        reorder_map = self._get_reorder_map(chunk_info)
-        self._update_all_reorder_map(reorder_map)
-        self._reorder_idx_trace(reorder_map)
-        self._reorder_self_node_list(reorder_map)
-        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
-        return chunk_info
-
-    def reorder_node_list(self, node_list):
-        new_node_list = [None for _ in range(len(node_list))]
-        for old_idx, new_idx in self.all_reorder_map.items():
-            new_node_list[new_idx] = node_list[old_idx]
-        return new_node_list
-
-    def tmp_reorder(self, node_list, chunk_info):
-        if len(chunk_info["args"]["prepose_nodes"]) == 0:
-            return node_list, chunk_info
-        reorder_map = self._get_reorder_map(chunk_info)
-
-        # new tmp node list
-        new_node_list = [None for _ in range(len(node_list))]
-        for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = node_list[old_idx]
-
-        chunk_info = self._reorder_chunk_info(chunk_info, reorder_map)
-        return new_node_list, chunk_info

From a6cdbf9161afc526d3a961708c0b202ca18c3e7e Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 17:24:23 +0800
Subject: [PATCH 098/503] separate trace flow

---
 colossalai/autochunk/autochunk_codegen.py   |   2 +-
 colossalai/autochunk/search_chunk.py        |  53 +--
 colossalai/autochunk/select_chunk.py        |   3 +-
 colossalai/autochunk/trace_flow.py          | 414 ++++++++++++++++++++
 colossalai/autochunk/trace_index.py         | 395 -------------------
 tests/test_autochunk/benchmark_autochunk.py |   4 +-
 6 files changed, 447 insertions(+), 424 deletions(-)
 create mode 100644 colossalai/autochunk/trace_flow.py

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 3bb2e83be242..39728cb794f7 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -167,7 +167,7 @@ def emit_code_with_chunk(
                         )
             # ones like
             if "ones_like" in node.name:
-                meta_node = chunk_region_search.index_tracer.node_list[node_idx]
+                meta_node = chunk_region_search.trace_index.node_list[node_idx]
                 chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][
                     "chunk_dim"
                 ]
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 5c58bda0c393..030b13bdb9c4 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -1,8 +1,10 @@
 import copy
 
 from .select_chunk import SelectChunk
-from .trace_index import TraceIndex, ReorderGraph
+from .trace_index import TraceIndex
+from .reorder_graph import ReorderGraph
 from .estiamte_memory import EstimateMemory
+from .trace_flow import TraceFlow
 from .utils import (
     get_node_shape,
     is_non_compute_node,
@@ -14,12 +16,13 @@ class SearchChunk(object):
     def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.gm = gm
         self.print_mem = print_mem
-        self.index_tracer = TraceIndex(list(gm.graph.nodes))
-        self.index_tracer.trace_index()
-        self.reorder_graph = ReorderGraph(self.index_tracer)
-        self.memory_estimator = EstimateMemory()
-        self.chunk_selector = SelectChunk(
-            self.index_tracer, self.memory_estimator, self.reorder_graph, max_memory=max_memory
+        self.trace_index = TraceIndex(list(gm.graph.nodes))
+        self.trace_index.trace_index()
+        self.trace_flow = TraceFlow(self.trace_index)
+        self.reorder_graph = ReorderGraph(self.trace_index)
+        self.estimate_memory = EstimateMemory()
+        self.select_chunk = SelectChunk(
+            self.trace_index, self.estimate_memory, self.reorder_graph, max_memory=max_memory
         )
 
     def _find_peak_node(self, mem_peak):
@@ -29,7 +32,7 @@ def _find_peak_node(self, mem_peak):
 
     def _get_free_var(self):
         free_var_idx = []
-        for idx, n in enumerate(self.index_tracer.node_list):
+        for idx, n in enumerate(self.trace_index.node_list):
             if n.op == "placeholder":
                 free_var_idx.append(idx)
         return free_var_idx
@@ -99,7 +102,7 @@ def _is_not_compute(self, trace, chunk_range, dim_idx):
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
-        end_node = self.index_tracer.node_list[end_idx]
+        end_node = self.trace_index.node_list[end_idx]
         chunk_infos = []
         for end_dim, _ in enumerate(end_trace["idx"]):
             if len(start_traces) > 1:
@@ -113,46 +116,46 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                     ):
                         continue
                     # check index source align
-                    if not self.index_tracer.check_index_source(
+                    if not self.trace_flow.check_index_source(
                         start_dim, start_node, start_idx, end_dim, end_node
                     ):
                         continue
                     # check index copmute
-                    if not self.index_tracer.check_index_compute(
+                    if not self.trace_flow.check_index_compute(
                         start_idx, end_dim, end_node, end_idx
                     ):
                         continue
                     # flow search
-                    chunk_info = self.index_tracer.flow_search(
+                    chunk_info = self.trace_flow.flow_search(
                         start_idx, start_dim, end_idx, end_dim
                     )
                     if chunk_info is None:
                         continue
                     # check index copmute
-                    if not self.index_tracer.check_index_duplicate(chunk_info):
+                    if not self.trace_flow.check_index_duplicate(chunk_info):
                         continue
                     chunk_infos.append(chunk_info)
         return chunk_infos
 
     def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
         possible_chunk_region = []
-        output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
+        output_trace = copy.deepcopy(self.trace_index.idx_trace_list)
         input_trace = []  # trace of a node's input nodes
-        for _, n in enumerate(self.index_tracer.node_list):
+        for _, n in enumerate(self.trace_index.node_list):
             cur_trace = {}
             for arg in n.args:
                 if type(arg) == type(n) and not is_non_compute_node_except_placeholder(
                     arg
                 ):
-                    cur_trace[arg] = self.index_tracer._find_trace_from_node(arg)
+                    cur_trace[arg] = self.trace_index._find_trace_from_node(arg)
             input_trace.append(cur_trace)
 
         for start_idx in range(max_chunk_region[0], peak_node + 1):
             for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
                 if is_non_compute_node(
-                    self.index_tracer.node_list[start_idx]
-                ) or is_non_compute_node(self.index_tracer.node_list[end_idx]):
+                    self.trace_index.node_list[start_idx]
+                ) or is_non_compute_node(self.trace_index.node_list[end_idx]):
                     continue
 
                 # select free dim
@@ -173,7 +176,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
         possible_chunk_regions = self._search_possible_chunk_regions(
             max_chunk_region, peak_node
         )
-        best_chunk_region = self.chunk_selector._select_best_chunk_region(
+        best_chunk_region = self.select_chunk._select_best_chunk_region(
             possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak
         )
         best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region)
@@ -191,8 +194,8 @@ def search_region(self):
             init_mem_peak,
             _,
             active_node,
-        ) = self.memory_estimator.estimate_chunk_inference_mem(
-            self.index_tracer.node_list
+        ) = self.estimate_memory.estimate_chunk_inference_mem(
+            self.trace_index.node_list
         )
         mem_peak = init_mem_peak
 
@@ -206,14 +209,14 @@ def search_region(self):
                 mem_peak,
                 _,
                 active_node,
-            ) = self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, chunk_infos
+            ) = self.estimate_memory.estimate_chunk_inference_mem(
+                self.trace_index.node_list, chunk_infos
             )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
         if self.print_mem:
             self.print_mem = False
-            self.memory_estimator.estimate_chunk_inference_mem(
-                self.index_tracer.node_list, chunk_infos, print_mem=True
+            self.estimate_memory.estimate_chunk_inference_mem(
+                self.trace_index.node_list, chunk_infos, print_mem=True
             )
         return chunk_infos
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index f0262f1e57eb..30f4226f54ec 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -1,4 +1,5 @@
-from .trace_index import TraceIndex, ReorderGraph
+from .trace_index import TraceIndex
+from .reorder_graph import ReorderGraph
 from .estiamte_memory import EstimateMemory
 from .utils import is_non_compute_node
 
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
new file mode 100644
index 000000000000..f372fa91335f
--- /dev/null
+++ b/colossalai/autochunk/trace_flow.py
@@ -0,0 +1,414 @@
+from .trace_index import TraceIndex
+from .utils import (
+    find_chunk_all_input_nodes,
+    find_chunk_compute_input_and_output_nodes,
+    find_idx_by_name,
+    get_node_shape,
+    is_non_compute_node,
+    is_non_compute_node_except_placeholder,
+)
+
+
+class TraceFlow(object):
+    def __init__(self, trace_index: TraceIndex) -> None:
+        self.trace_index = trace_index
+
+    def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
+        """
+        Check 2 given index: one index should be source of the other
+        Args:
+            start_idx(int): start node chunk dim
+            start_node(node): start node
+            end_idx(int): end node chunk dim
+            end_node(node): end node
+
+        Returns:
+            bool: True if check pass
+        """
+        start_node_idx = find_idx_by_name(start_node.name, self.trace_index.node_list)
+        end_node_trace = self.trace_index._find_trace_from_node(end_node)
+        end_node_trace_source = end_node_trace["source"][end_dim]
+        sorted_source = sorted(
+            end_node_trace_source.items(), key=lambda d: d[0], reverse=True
+        )
+        for node_idx, node_dim in sorted_source:
+            if node_idx == start_node_idx and start_dim in node_dim:
+                return True
+            # it means we meet a node outside the loop, and the node is not input node
+            if node_idx < start_idx:
+                return False
+        return False
+
+    def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
+        """
+        Check 2 given index: check they haven't been computed in the source trace.
+        Args:
+            start_idx(int): start node chunk dim
+            start_node(node): start node
+            end_idx(int): end node chunk dim
+            end_node(node): end node
+
+        Returns:
+            bool: True if check pass
+        """
+        end_node_trace = self.trace_index._find_trace_from_node(end_node)
+        end_node_compute = end_node_trace["compute"][end_dim]
+        if any(start_idx <= i <= end_idx for i in end_node_compute):
+            return False
+        return True
+
+    def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
+        node_from_source = self.trace_index._find_source_trace_from_node(node_from)
+        dim_source = node_from_source[node_from_dim]
+        node_to_idx = find_idx_by_name(node_to.name, self.trace_index.node_list)
+        for k, v in dim_source.items():
+            if k == node_to_idx:
+                return v
+        return None
+
+    def _find_inherit_dim(self, input_node, input_dim, node):
+        input_node_idx = find_idx_by_name(input_node.name, self.trace_index.node_list)
+        node_trace_source = self.trace_index._find_source_trace_from_node(node)
+        for node_dim in range(len(get_node_shape(node))):
+            if (
+                input_node_idx in node_trace_source[node_dim]
+                and input_dim[0] in node_trace_source[node_dim][input_node_idx]
+            ):
+                return node_dim
+        return None
+
+    def check_index_duplicate(self, chunk_infos, return_dim=False):
+        input_dim_after_node = {}
+        for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
+            for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
+                inherit_dim = self._find_inherit_dim(input_node, v, self.trace_index.node_list[k])
+                if inherit_dim:
+                    input_dim_after_node[k] = inherit_dim
+
+        for node in self.trace_index.node_list[
+            chunk_infos["region"][0] : chunk_infos["region"][1] + 1
+        ]:
+            if is_non_compute_node_except_placeholder(node):
+                continue
+            count = 0
+            duplicate_dims = []
+            node_trace_source = self.trace_index._find_source_trace_from_node(node)
+            for node_dim in range(len(get_node_shape(node))):
+                duplicate_dim = []
+                duplicate_flag = False
+                dim_source = node_trace_source[node_dim]
+                for k, v in dim_source.items():
+                    if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]:
+                        if k in input_dim_after_node and input_dim_after_node[k] in v:
+                            duplicate_flag = True
+                            duplicate_dim.append((k, v))
+                duplicate_dims.append(duplicate_dim)
+                if duplicate_flag:
+                    count += 1
+
+            if count > 1:
+                if return_dim:
+                    return False, duplicate_dims
+                else:
+                    return False
+        if return_dim:
+            return True, None
+        else:
+            return True
+
+    def _assgin_single_node_flow(
+        self,
+        arg_node,
+        start_idx,
+        end_idx,
+        cur_node_dim,
+        cur_node_compute,
+        cur_node_source,
+        cur_node_fix_dim,
+        all_node_info,
+        next_node_list,
+    ):
+        arg_idx = find_idx_by_name(arg_node.name, self.trace_index.node_list)
+        # arg in chunk range or be inputs
+        if not (start_idx <= arg_idx < end_idx):
+            return True
+
+        # find arg dim
+        if cur_node_dim is not None:
+            # dim is computed
+            if arg_idx in cur_node_compute[cur_node_dim]:
+                return False
+            if arg_idx not in cur_node_source[cur_node_dim]:
+                arg_dim = None
+            else:
+                arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
+        else:
+            arg_dim = None
+
+        # get fix dim
+        arg_fix_dim = []
+        if cur_node_dim is not None:
+            for i in cur_node_fix_dim:
+                fix_dim_source = cur_node_source[i]
+                if arg_idx in fix_dim_source:
+                    arg_fix_dim.append(fix_dim_source[arg_idx][0])
+
+        # if already in node_info, arg dim must be same
+        if arg_node in all_node_info:
+            if all_node_info[arg_node]["chunk_dim"] != arg_dim:
+                return False
+            all_node_info[arg_node]["fix_dim"] = list(
+                set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
+            )
+        # else add it to list
+        else:
+            all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim}
+
+        next_node_list.append(arg_node)
+        return True
+
+    def _get_all_node_info(self, end_dim, start_idx, end_idx):
+        cur_node_list = [
+            self.trace_index.node_list[end_idx]
+        ]  # start from the last node
+        all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
+
+        while len(cur_node_list) > 0:
+            next_node_list = []
+
+            for cur_node in cur_node_list:
+                # get cur node info
+                cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
+                cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
+                if cur_node_chunk_dim:
+                    cur_node_compute = self.trace_index._find_compute_trace_from_node(
+                        cur_node
+                    )
+                    cur_node_source = self.trace_index._find_source_trace_from_node(
+                        cur_node
+                    )
+                else:
+                    cur_node_compute = cur_node_source = None
+
+                # get all valid args
+                arg_list = []
+                for arg in cur_node.args:
+                    if type(arg) != type(cur_node):
+                        continue
+                    if is_non_compute_node(arg):
+                        continue
+                    arg_list.append(arg)
+                    flow_flag = self._assgin_single_node_flow(
+                        arg,
+                        start_idx,
+                        end_idx,
+                        cur_node_chunk_dim,
+                        cur_node_compute,
+                        cur_node_source,
+                        cur_node_fix_dim,
+                        all_node_info,
+                        next_node_list,
+                    )
+                    if flow_flag == False:
+                        return None
+
+                if len(arg_list) == 2:
+                    if any(i in cur_node.name for i in ["add", "mul"]):
+                        for arg in arg_list:
+                            if not (
+                                start_idx
+                                <= find_idx_by_name(arg.name, self.trace_index.node_list)
+                                < end_idx
+                            ):
+                                continue
+                            arg_chunk_dim = all_node_info[arg]["chunk_dim"]
+                            arg_fix_dim = all_node_info[arg]["fix_dim"]
+                            arg_shape = get_node_shape(arg)
+                            # add all dim as fix dim except chunk dim
+                            for i, shape in enumerate(arg_shape):
+                                if shape != 1 and i != cur_node_chunk_dim:
+                                    if i == arg_chunk_dim:
+                                        return None
+                                    if i not in arg_fix_dim:
+                                        arg_fix_dim.append(i)
+                    elif "einsum" in cur_node.name:
+                        pass
+                    elif "matmul" in cur_node.name:
+                        pass
+                    else:
+                        raise NotImplementedError()
+            cur_node_list = next_node_list
+        return all_node_info
+
+    def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
+        inputs_dim = []
+        remove_inputs = []
+        for input_node in inputs:
+            input_dict = {}
+            input_node_idx = find_idx_by_name(
+                input_node.name, self.trace_index.node_list
+            )
+            for user in input_node.users.keys():
+                if is_non_compute_node(user):
+                    continue
+                user_idx = find_idx_by_name(user.name, self.trace_index.node_list)
+                if start_idx <= user_idx <= end_idx:
+                    chunk_dim = all_node_info[user]["chunk_dim"]
+                    if chunk_dim is not None:
+                        user_source = self.trace_index._find_source_trace_from_node(user)[chunk_dim]
+                        if input_node_idx in user_source:
+                            input_dict[user_idx] = user_source[input_node_idx]
+                        else:
+                            return None, None
+            if len(input_dict) == 0:
+                remove_inputs.append(input_node)
+            else:
+                inputs_dim.append(input_dict)
+        for i in remove_inputs:
+            if i in inputs:
+                inputs.remove(i)
+        return inputs, inputs_dim
+
+    def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
+        # get all possible prepose nodes
+        maybe_prepose_nodes = []
+        for node, node_info in all_node_info.items():
+            if node_info["chunk_dim"] is None:
+                maybe_prepose_nodes.append(node)
+        maybe_prepose_nodes.sort(
+            key=lambda x: find_idx_by_name(x.name, self.trace_index.node_list),
+            reverse=True,
+        )  # from last node to first node
+        prepose_nodes = []
+        # set every node as root, search its args, if all legal, turn root and args as prepose nodes
+        while len(maybe_prepose_nodes) > 0:
+            tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]]
+            tmp_cur_related_prepose_nodes = []
+            prepose_flag = True
+
+            # loop cur node's all arg until out of chunk
+            while len(tmp_cur_prepose_nodes) > 0:
+                if prepose_flag == False:
+                    break
+                tmp_next_prepose_nodes = []
+                tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes)
+                for cur_prepose_node in tmp_cur_prepose_nodes:
+                    if prepose_flag == False:
+                        break
+                    for cur_prepose_node_arg in cur_prepose_node.args:
+                        if type(cur_prepose_node_arg) != type(cur_prepose_node):
+                            continue
+                        # out of loop
+                        if not (
+                            start_idx
+                            <= find_idx_by_name(
+                                cur_prepose_node_arg.name, self.trace_index.node_list
+                            )
+                            < end_idx
+                        ):
+                            continue
+                        # compute op in loop
+                        elif cur_prepose_node_arg in all_node_info:
+                            if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None:
+                                tmp_next_prepose_nodes.append(cur_prepose_node_arg)
+                            else:
+                                prepose_flag = False
+                                break
+                        # non compute op
+                        else:
+                            tmp_next_prepose_nodes.append(cur_prepose_node_arg)
+                tmp_cur_prepose_nodes = tmp_next_prepose_nodes
+
+            if prepose_flag == False:
+                maybe_prepose_nodes.remove(maybe_prepose_nodes[0])
+                continue
+            else:
+                for n in tmp_cur_related_prepose_nodes:
+                    if n not in prepose_nodes:
+                        prepose_nodes.append(n)
+                    if n in maybe_prepose_nodes:
+                        maybe_prepose_nodes.remove(n)
+        # sort by index
+        prepose_nodes.sort(
+            key=lambda x: find_idx_by_name(x.name, self.trace_index.node_list)
+        )
+
+        return prepose_nodes
+
+    def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
+        # we need to log input nodes to avoid deleteing them in the loop
+        chunk_node_list = self.trace_index.node_list[start_idx : end_idx + 1]
+        # also need to get some prepose node's arg out of non_chunk_inputs
+        for n in chunk_info["args"]["prepose_nodes"]:
+            chunk_node_list.remove(n)
+        non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list)
+        for i in non_chunk_inputs:
+            if i not in chunk_info["inputs"]:
+                chunk_info["inputs_non_chunk"].append(i)
+        return chunk_info
+
+    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
+        inputs, outputs = find_chunk_compute_input_and_output_nodes(
+            self.trace_index.node_list[start_idx : end_idx + 1]
+        )
+        # only single ouput
+        if len(outputs) > 1:
+            return None
+
+        # get every node's chunk dim and fix dim
+        all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
+        if all_node_info is None:
+            return None
+
+        # get input nodes' chunk dim
+        inputs, inputs_dim = self._get_input_nodes_dim(
+            inputs, start_idx, end_idx, all_node_info
+        )
+        if inputs is None:
+            return None
+
+        chunk_info = {
+            "region": (start_idx, end_idx),
+            "inputs": inputs,
+            "inputs_non_chunk": [],
+            "inputs_dim": inputs_dim,
+            "outputs": outputs,
+            "outputs_dim": end_dim,
+            "node_chunk_dim": all_node_info,
+            "args": {},
+        }
+
+        # move useless nodes ahead of loop
+        chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes(
+            all_node_info, start_idx, end_idx
+        )
+
+        # find non chunk inputs
+        chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)
+
+        # reassgin reshape size, some size may have changed due to chunk
+        chunk_info = self._reassgin_reshape_size(chunk_info)
+
+        return chunk_info
+
+    def _reassgin_reshape_size(self, chunk_info):
+        chunk_region = chunk_info["region"]
+        reshape_size = {}
+        chunk_shape = get_node_shape(chunk_info["outputs"][0])[
+            chunk_info["outputs_dim"]
+        ]
+        for node in self.trace_index.node_list[chunk_region[0] : chunk_region[1] + 1]:
+            if any(i in node.name for i in ["reshape", "view"]):
+                reshape_args = node.args[1:]
+                reshape_log = self.trace_index.idx_view_list[node]
+                chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
+                reshape_size[node.name] = {}
+                for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
+                    if reshape_arg_dim in reshape_log["dim_to"]:
+                        continue
+                    if reshape_arg_dim == chunk_dim:
+                        reshape_size[node.name][reshape_arg.name] = (
+                            "min(chunk_size, %d - chunk_idx)" % chunk_shape
+                        )
+        chunk_info["reshape_size"] = reshape_size
+        return chunk_info
diff --git a/colossalai/autochunk/trace_index.py b/colossalai/autochunk/trace_index.py
index 3ac0d7f84272..1e8969d8796e 100644
--- a/colossalai/autochunk/trace_index.py
+++ b/colossalai/autochunk/trace_index.py
@@ -1,12 +1,8 @@
 import copy
 
 from .utils import (
-    find_chunk_all_input_nodes,
-    find_chunk_compute_input_and_output_nodes,
     find_idx_by_name,
     get_node_shape,
-    is_non_compute_node,
-    is_non_compute_node_except_placeholder,
 )
 
 
@@ -588,394 +584,3 @@ def trace_index(self):
                 continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
-        # self._merge_equal_idx()
-
-    def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
-        """
-        Check 2 given index: one index should be source of the other
-        Args:
-            start_idx(int): start node chunk dim
-            start_node(node): start node
-            end_idx(int): end node chunk dim
-            end_node(node): end node
-
-        Returns:
-            bool: True if check pass
-        """
-        start_node_idx = find_idx_by_name(start_node.name, self.node_list)
-        end_node_trace = self._find_trace_from_node(end_node)
-        end_node_trace_source = end_node_trace["source"][end_dim]
-        sorted_source = sorted(
-            end_node_trace_source.items(), key=lambda d: d[0], reverse=True
-        )
-        for node_idx, node_dim in sorted_source:
-            if node_idx == start_node_idx and start_dim in node_dim:
-                return True
-            # it means we meet a node outside the loop, and the node is not input node
-            if node_idx < start_idx:
-                return False
-        return False
-
-    def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
-        """
-        Check 2 given index: check they haven't been computed in the source trace.
-        Args:
-            start_idx(int): start node chunk dim
-            start_node(node): start node
-            end_idx(int): end node chunk dim
-            end_node(node): end node
-
-        Returns:
-            bool: True if check pass
-        """
-        end_node_trace = self._find_trace_from_node(end_node)
-        end_node_compute = end_node_trace["compute"][end_dim]
-        if any(start_idx <= i <= end_idx for i in end_node_compute):
-            return False
-        return True
-
-    def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
-        node_from_source = self._find_source_trace_from_node(node_from)
-        dim_source = node_from_source[node_from_dim]
-        node_to_idx = find_idx_by_name(node_to.name, self.node_list)
-        for k, v in dim_source.items():
-            if k == node_to_idx:
-                return v
-        return None
-
-    def _find_inherit_dim(self, input_node, input_dim, node):
-        input_node_idx = find_idx_by_name(input_node.name, self.node_list)
-        node_trace_source = self._find_source_trace_from_node(node)
-        for node_dim in range(len(get_node_shape(node))):
-            if (
-                input_node_idx in node_trace_source[node_dim]
-                and input_dim[0] in node_trace_source[node_dim][input_node_idx]
-            ):
-                return node_dim
-        return None
-
-    def check_index_duplicate(self, chunk_infos, return_dim=False):
-        input_dim_after_node = {}
-        for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
-            for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
-                inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k])
-                if inherit_dim:
-                    input_dim_after_node[k] = inherit_dim
-
-        for node in self.node_list[
-            chunk_infos["region"][0] : chunk_infos["region"][1] + 1
-        ]:
-            if is_non_compute_node_except_placeholder(node):
-                continue
-            count = 0
-            duplicate_dims = []
-            node_trace_source = self._find_source_trace_from_node(node)
-            for node_dim in range(len(get_node_shape(node))):
-                duplicate_dim = []
-                duplicate_flag = False
-                dim_source = node_trace_source[node_dim]
-                for k, v in dim_source.items():
-                    if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]:
-                        if k in input_dim_after_node and input_dim_after_node[k] in v:
-                            duplicate_flag = True
-                            duplicate_dim.append((k, v))
-                duplicate_dims.append(duplicate_dim)
-                if duplicate_flag:
-                    count += 1
-
-            if count > 1:
-                if return_dim:
-                    return False, duplicate_dims
-                else:
-                    return False
-        if return_dim:
-            return True, None
-        else:
-            return True
-
-    def _assgin_single_node_flow(
-        self,
-        arg_node,
-        start_idx,
-        end_idx,
-        cur_node_dim,
-        cur_node_compute,
-        cur_node_source,
-        cur_node_fix_dim,
-        all_node_info,
-        next_node_list,
-    ):
-        arg_idx = find_idx_by_name(arg_node.name, self.node_list)
-        # arg in chunk range or be inputs
-        if not (start_idx <= arg_idx < end_idx):
-            return True
-
-        # find arg dim
-        if cur_node_dim is not None:
-            # dim is computed
-            if arg_idx in cur_node_compute[cur_node_dim]:
-                return False
-            if arg_idx not in cur_node_source[cur_node_dim]:
-                arg_dim = None
-            else:
-                arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
-        else:
-            arg_dim = None
-
-        # get fix dim
-        arg_fix_dim = []
-        if cur_node_dim is not None:
-            for i in cur_node_fix_dim:
-                fix_dim_source = cur_node_source[i]
-                if arg_idx in fix_dim_source:
-                    arg_fix_dim.append(fix_dim_source[arg_idx][0])
-
-        # if already in node_info, arg dim must be same
-        if arg_node in all_node_info:
-            if all_node_info[arg_node]["chunk_dim"] != arg_dim:
-                return False
-            all_node_info[arg_node]["fix_dim"] = list(
-                set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
-            )
-        # else add it to list
-        else:
-            all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim}
-
-        next_node_list.append(arg_node)
-        return True
-
-    def _get_all_node_info(self, end_dim, start_idx, end_idx):
-        cur_node_list = [self.node_list[end_idx]]  # start from the last node
-        all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
-
-        while len(cur_node_list) > 0:
-            next_node_list = []
-
-            for cur_node in cur_node_list:
-                # get cur node info
-                cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
-                cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
-                if cur_node_chunk_dim:
-                    cur_node_compute = self._find_compute_trace_from_node(cur_node)
-                    cur_node_source = self._find_source_trace_from_node(cur_node)
-                else:
-                    cur_node_compute = cur_node_source = None
-
-                # get all valid args
-                arg_list = []
-                for arg in cur_node.args:
-                    if type(arg) != type(cur_node):
-                        continue
-                    if is_non_compute_node(arg):
-                        continue
-                    arg_list.append(arg)
-                    flow_flag = self._assgin_single_node_flow(
-                        arg,
-                        start_idx,
-                        end_idx,
-                        cur_node_chunk_dim,
-                        cur_node_compute,
-                        cur_node_source,
-                        cur_node_fix_dim,
-                        all_node_info,
-                        next_node_list,
-                    )
-                    if flow_flag == False:
-                        return None
-
-                if len(arg_list) == 2:
-                    if any(i in cur_node.name for i in ["add", "mul"]):
-                        for arg in arg_list:
-                            if not (
-                                start_idx
-                                <= find_idx_by_name(arg.name, self.node_list)
-                                < end_idx
-                            ):
-                                continue
-                            arg_chunk_dim = all_node_info[arg]["chunk_dim"]
-                            arg_fix_dim = all_node_info[arg]["fix_dim"]
-                            arg_shape = get_node_shape(arg)
-                            # add all dim as fix dim except chunk dim
-                            for i, shape in enumerate(arg_shape):
-                                if shape != 1 and i != cur_node_chunk_dim:
-                                    if i == arg_chunk_dim:
-                                        return None
-                                    if i not in arg_fix_dim:
-                                        arg_fix_dim.append(i)
-                    elif "einsum" in cur_node.name:
-                        pass
-                    elif "matmul" in cur_node.name:
-                        pass
-                    else:
-                        raise NotImplementedError()
-            cur_node_list = next_node_list
-        return all_node_info
-
-    def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
-        inputs_dim = []
-        remove_inputs = []
-        for input_node in inputs:
-            input_dict = {}
-            input_node_idx = find_idx_by_name(input_node.name, self.node_list)
-            for user in input_node.users.keys():
-                if is_non_compute_node(user):
-                    continue
-                user_idx = find_idx_by_name(user.name, self.node_list)
-                if start_idx <= user_idx <= end_idx:
-                    chunk_dim = all_node_info[user]["chunk_dim"]
-                    if chunk_dim is not None:
-                        user_source = self._find_source_trace_from_node(user)[chunk_dim]
-                        if input_node_idx in user_source:
-                            input_dict[user_idx] = user_source[input_node_idx]
-                        else:
-                            return None, None
-            if len(input_dict) == 0:
-                remove_inputs.append(input_node)
-            else:
-                inputs_dim.append(input_dict)
-        for i in remove_inputs:
-            if i in inputs:
-                inputs.remove(i)
-        return inputs, inputs_dim
-
-    def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
-        # get all possible prepose nodes
-        maybe_prepose_nodes = []
-        for node, node_info in all_node_info.items():
-            if node_info["chunk_dim"] is None:
-                maybe_prepose_nodes.append(node)
-        maybe_prepose_nodes.sort(
-            key=lambda x: find_idx_by_name(x.name, self.node_list),
-            reverse=True,
-        )  # from last node to first node
-        prepose_nodes = []
-        # set every node as root, search its args, if all legal, turn root and args as prepose nodes
-        while len(maybe_prepose_nodes) > 0:
-            tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]]
-            tmp_cur_related_prepose_nodes = []
-            prepose_flag = True
-
-            # loop cur node's all arg until out of chunk
-            while len(tmp_cur_prepose_nodes) > 0:
-                if prepose_flag == False:
-                    break
-                tmp_next_prepose_nodes = []
-                tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes)
-                for cur_prepose_node in tmp_cur_prepose_nodes:
-                    if prepose_flag == False:
-                        break
-                    for cur_prepose_node_arg in cur_prepose_node.args:
-                        if type(cur_prepose_node_arg) != type(cur_prepose_node):
-                            continue
-                        # out of loop
-                        if not (
-                            start_idx
-                            <= find_idx_by_name(
-                                cur_prepose_node_arg.name, self.node_list
-                            )
-                            < end_idx
-                        ):
-                            continue
-                        # compute op in loop
-                        elif cur_prepose_node_arg in all_node_info:
-                            if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None:
-                                tmp_next_prepose_nodes.append(cur_prepose_node_arg)
-                            else:
-                                prepose_flag = False
-                                break
-                        # non compute op
-                        else:
-                            tmp_next_prepose_nodes.append(cur_prepose_node_arg)
-                tmp_cur_prepose_nodes = tmp_next_prepose_nodes
-
-            if prepose_flag == False:
-                maybe_prepose_nodes.remove(maybe_prepose_nodes[0])
-                continue
-            else:
-                for n in tmp_cur_related_prepose_nodes:
-                    if n not in prepose_nodes:
-                        prepose_nodes.append(n)
-                    if n in maybe_prepose_nodes:
-                        maybe_prepose_nodes.remove(n)
-        # sort by index
-        prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list))
-
-        return prepose_nodes
-
-    def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
-        # we need to log input nodes to avoid deleteing them in the loop
-        chunk_node_list = self.node_list[start_idx : end_idx + 1]
-        # also need to get some prepose node's arg out of non_chunk_inputs
-        for n in chunk_info["args"]["prepose_nodes"]:
-            chunk_node_list.remove(n)
-        non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list)
-        for i in non_chunk_inputs:
-            if i not in chunk_info["inputs"]:
-                chunk_info["inputs_non_chunk"].append(i)
-        return chunk_info
-
-    def flow_search(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = find_chunk_compute_input_and_output_nodes(
-            self.node_list[start_idx : end_idx + 1]
-        )
-        # only single ouput
-        if len(outputs) > 1:
-            return None
-
-        # get every node's chunk dim and fix dim
-        all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
-        if all_node_info is None:
-            return None
-
-        # get input nodes' chunk dim
-        inputs, inputs_dim = self._get_input_nodes_dim(
-            inputs, start_idx, end_idx, all_node_info
-        )
-        if inputs is None:
-            return None
-
-        chunk_info = {
-            "region": (start_idx, end_idx),
-            "inputs": inputs,
-            "inputs_non_chunk": [],
-            "inputs_dim": inputs_dim,
-            "outputs": outputs,
-            "outputs_dim": end_dim,
-            "node_chunk_dim": all_node_info,
-            "args": {},
-        }
-
-        # move useless nodes ahead of loop
-        chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes(
-            all_node_info, start_idx, end_idx
-        )
-
-        # find non chunk inputs
-        chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)
-
-        # reassgin reshape size, some size may have changed due to chunk
-        chunk_info = self._reassgin_reshape_size(chunk_info)
-
-        return chunk_info
-
-    def _reassgin_reshape_size(self, chunk_info):
-        chunk_region = chunk_info["region"]
-        reshape_size = {}
-        chunk_shape = get_node_shape(chunk_info["outputs"][0])[
-            chunk_info["outputs_dim"]
-        ]
-        for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]:
-            if any(i in node.name for i in ["reshape", "view"]):
-                reshape_args = node.args[1:]
-                reshape_log = self.idx_view_list[node]
-                chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
-                reshape_size[node.name] = {}
-                for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
-                    if reshape_arg_dim in reshape_log["dim_to"]:
-                        continue
-                    if reshape_arg_dim == chunk_dim:
-                        reshape_size[node.name][reshape_arg.name] = (
-                            "min(chunk_size, %d - chunk_idx)" % chunk_shape
-                        )
-        chunk_info["reshape_size"] = reshape_size
-        return chunk_info
diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py
index 081f01368a42..7a9d8cdeee03 100644
--- a/tests/test_autochunk/benchmark_autochunk.py
+++ b/tests/test_autochunk/benchmark_autochunk.py
@@ -104,8 +104,8 @@ def benchmark_evoformer():
     model = evoformer_base().cuda()
 
     # build autochunk model
-    # max_memory = 1000  # MB fit memory mode
-    max_memory = None  # min memory mode
+    max_memory = 1000  # MB fit memory mode
+    # max_memory = None  # min memory mode
     autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair)
 
     # build openfold

From c3a2bf48b447a5e051bcae5d694ff5dd7beda54a Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 17:31:59 +0800
Subject: [PATCH 099/503] code style

---
 colossalai/autochunk/autochunk_codegen.py | 14 +++++-----
 colossalai/autochunk/reorder_graph.py     | 33 ++++++++++++-----------
 colossalai/autochunk/search_chunk.py      | 11 +++++---
 colossalai/autochunk/select_chunk.py      | 12 ++++-----
 colossalai/autochunk/trace_flow.py        | 12 ++++++---
 5 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 39728cb794f7..891753faae6d 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -103,7 +103,7 @@ def emit_code_with_chunk(
     nodes,
     emit_node_func,
     delete_unused_value_func,
-    chunk_region_search: SearchChunk,
+    search_chunk: SearchChunk,
     chunk_infos,
 ):
     """Emit code with nested activation checkpoint
@@ -133,7 +133,7 @@ def emit_code_with_chunk(
     chunk_outputs = [i["outputs"][0] for i in chunk_infos]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos]
 
-    node_list = chunk_region_search.reorder_graph.reorder_node_list(node_list)
+    node_list = search_chunk.reorder_graph.reorder_node_list(node_list)
     node_idx = 0
     region_idx = 0
     within_chunk_region = False
@@ -167,7 +167,7 @@ def emit_code_with_chunk(
                         )
             # ones like
             if "ones_like" in node.name:
-                meta_node = chunk_region_search.trace_index.node_list[node_idx]
+                meta_node = search_chunk.trace_index.node_list[node_idx]
                 chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][
                     "chunk_dim"
                 ]
@@ -220,10 +220,8 @@ def __init__(self, meta_graph, max_memory=None, print_mem=False):
             self.max_memory = max_memory
             self.meta_node = list(meta_graph.graph.nodes)
             # find the chunk regions
-            self.chunk_region_search = SearchChunk(
-                meta_graph, max_memory, print_mem
-            )
-            self.chunk_infos = self.chunk_region_search.search_region()
+            self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem)
+            self.chunk_infos = self.search_chunk.search_region()
 
         def _gen_python_code(
             self, nodes, root_module: str, namespace: _Namespace
@@ -458,7 +456,7 @@ def emit_node(node: Node, body):
                 nodes,
                 emit_node,
                 delete_unused_values,
-                self.chunk_region_search,
+                self.search_chunk,
                 self.chunk_infos,
             )
 
diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py
index 7b9f4a20d6ab..bf4420eac7ee 100644
--- a/colossalai/autochunk/reorder_graph.py
+++ b/colossalai/autochunk/reorder_graph.py
@@ -3,28 +3,31 @@
 
 
 class ReorderGraph(object):
-    def __init__(self, index_tracer: TraceIndex) -> None:
-        self.index_tracer = index_tracer
-        self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))}
+    def __init__(self, trace_index: TraceIndex) -> None:
+        self.trace_index = trace_index
+        self.all_reorder_map = {
+            i: i for i in range(len(self.trace_index.idx_trace_list))
+        }
 
     def _get_reorder_map(self, chunk_info):
-        reorder_map = {i: i for i in range(len(self.index_tracer.node_list))}
+        reorder_map = {i: i for i in range(len(self.trace_index.node_list))}
 
         chunk_region_start = chunk_info["region"][0]
         chunk_region_end = chunk_info["region"][1]
         chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
         chunk_prepose_nodes_idx = [
-            find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes
+            find_idx_by_name(i.name, self.trace_index.node_list)
+            for i in chunk_prepose_nodes
         ]
         # put prepose nodes ahead
         for idx, n in enumerate(chunk_prepose_nodes):
             n_idx = chunk_prepose_nodes_idx[idx]
             reorder_map[n_idx] = chunk_region_start + idx
         # put other nodes after prepose nodes
-        for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]:
+        for n in self.trace_index.node_list[chunk_region_start : chunk_region_end + 1]:
             if n in chunk_prepose_nodes:
                 continue
-            n_idx = find_idx_by_name(n.name, self.index_tracer.node_list)
+            n_idx = find_idx_by_name(n.name, self.trace_index.node_list)
             pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
             reorder_map[n_idx] = n_idx + pos
 
@@ -50,25 +53,25 @@ def _update_all_reorder_map(self, reorder_map):
             self.all_reorder_map[origin_idx] = reorder_map[map_idx]
 
     def _reorder_self_node_list(self, reorder_map):
-        new_node_list = [None for _ in range(len(self.index_tracer.node_list))]
+        new_node_list = [None for _ in range(len(self.trace_index.node_list))]
         for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = self.index_tracer.node_list[old_idx]
-        self.index_tracer.node_list = new_node_list
+            new_node_list[new_idx] = self.trace_index.node_list[old_idx]
+        self.trace_index.node_list = new_node_list
 
     def _reorder_idx_trace(self, reorder_map):
         # reorder list
-        new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))]
+        new_idx_trace_list = [None for _ in range(len(self.trace_index.idx_trace_list))]
         for old_idx, new_idx in reorder_map.items():
-            new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx]
-        self.index_tracer.idx_trace_list = new_idx_trace_list
+            new_idx_trace_list[new_idx] = self.trace_index.idx_trace_list[old_idx]
+        self.trace_index.idx_trace_list = new_idx_trace_list
         # update compute
-        for idx_trace in self.index_tracer.idx_trace_list:
+        for idx_trace in self.trace_index.idx_trace_list:
             compute = idx_trace["compute"]
             for dim_compute in compute:
                 for idx, i in enumerate(dim_compute):
                     dim_compute[idx] = reorder_map[i]
         # update source
-        for idx_trace in self.index_tracer.idx_trace_list:
+        for idx_trace in self.trace_index.idx_trace_list:
             source = idx_trace["source"]
             for dim_idx, dim_source in enumerate(source):
                 new_dim_source = {}
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 030b13bdb9c4..e2c8de74e012 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -1,10 +1,10 @@
 import copy
 
-from .select_chunk import SelectChunk
-from .trace_index import TraceIndex
-from .reorder_graph import ReorderGraph
 from .estiamte_memory import EstimateMemory
+from .reorder_graph import ReorderGraph
+from .select_chunk import SelectChunk
 from .trace_flow import TraceFlow
+from .trace_index import TraceIndex
 from .utils import (
     get_node_shape,
     is_non_compute_node,
@@ -22,7 +22,10 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.reorder_graph = ReorderGraph(self.trace_index)
         self.estimate_memory = EstimateMemory()
         self.select_chunk = SelectChunk(
-            self.trace_index, self.estimate_memory, self.reorder_graph, max_memory=max_memory
+            self.trace_index,
+            self.estimate_memory,
+            self.reorder_graph,
+            max_memory=max_memory,
         )
 
     def _find_peak_node(self, mem_peak):
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index 30f4226f54ec..bdc64528ef18 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -1,19 +1,19 @@
-from .trace_index import TraceIndex
-from .reorder_graph import ReorderGraph
 from .estiamte_memory import EstimateMemory
+from .reorder_graph import ReorderGraph
+from .trace_index import TraceIndex
 from .utils import is_non_compute_node
 
 
 class SelectChunk(object):
     def __init__(
         self,
-        index_tracer: TraceIndex,
-        memory_estimator: EstimateMemory,
+        trace_index: TraceIndex,
+        estimate_memory: EstimateMemory,
         reorder_graph: ReorderGraph,
         max_memory=None,
     ):
-        self.index_tracer = index_tracer
-        self.memory_estimator = memory_estimator
+        self.index_tracer = trace_index
+        self.memory_estimator = estimate_memory
         self.reorder_graph = reorder_graph
         if max_memory is not None:
             self.stratge = "fit_memory"
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index f372fa91335f..7139e7e047ef 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -81,7 +81,9 @@ def check_index_duplicate(self, chunk_infos, return_dim=False):
         input_dim_after_node = {}
         for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
             for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
-                inherit_dim = self._find_inherit_dim(input_node, v, self.trace_index.node_list[k])
+                inherit_dim = self._find_inherit_dim(
+                    input_node, v, self.trace_index.node_list[k]
+                )
                 if inherit_dim:
                     input_dim_after_node[k] = inherit_dim
 
@@ -217,7 +219,9 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                         for arg in arg_list:
                             if not (
                                 start_idx
-                                <= find_idx_by_name(arg.name, self.trace_index.node_list)
+                                <= find_idx_by_name(
+                                    arg.name, self.trace_index.node_list
+                                )
                                 < end_idx
                             ):
                                 continue
@@ -255,7 +259,9 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
                 if start_idx <= user_idx <= end_idx:
                     chunk_dim = all_node_info[user]["chunk_dim"]
                     if chunk_dim is not None:
-                        user_source = self.trace_index._find_source_trace_from_node(user)[chunk_dim]
+                        user_source = self.trace_index._find_source_trace_from_node(
+                            user
+                        )[chunk_dim]
                         if input_node_idx in user_source:
                             input_dict[user_idx] = user_source[input_node_idx]
                         else:

From 8a989a0d89418c308c1d97b4d692a4e753395732 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 6 Jan 2023 17:55:22 +0800
Subject: [PATCH 100/503] code style

---
 colossalai/autochunk/autochunk_codegen.py | 69 +++++++++++++----------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 891753faae6d..0db2e59080dd 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -98,6 +98,39 @@ def _replace_reshape_size(context, node_name, reshape_size_dict):
     return context
 
 
+def _replace_ones_like(search_chunk, chunk_infos, region_idx, node_idx, node, body):
+    if "ones_like" in node.name:
+        meta_node = search_chunk.trace_index.node_list[node_idx]
+        chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"]
+        if get_node_shape(meta_node)[chunk_dim] != 1:
+            source_node = meta_node.args[0].args[0]
+            if (
+                source_node not in chunk_infos[region_idx]["node_chunk_dim"]
+                or chunk_infos[region_idx]["node_chunk_dim"][source_node]["chunk_dim"]
+                is None
+            ):
+                chunk_slice = _gen_chunk_slice_dim(
+                    chunk_dim, "chunk_idx", get_node_shape(node)
+                )
+                body[-1] = _replace_name(
+                    body[-1], node.args[0].name, node.args[0].name + chunk_slice
+                )
+    return body
+
+
+def _replace_input_var(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body):
+    for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
+        for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
+            if idx == node_idx:
+                chunk_slice = _gen_chunk_slice_dim(
+                    dim[0], "chunk_idx", get_node_shape(input_node)
+                )
+                body[-1] = _replace_name(
+                    body[-1], input_node.name, input_node.name + chunk_slice
+                )
+    return body
+
+
 def emit_code_with_chunk(
     body,
     nodes,
@@ -156,36 +189,14 @@ def emit_code_with_chunk(
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
-            for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
-                for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
-                    if idx == node_idx:
-                        chunk_slice = _gen_chunk_slice_dim(
-                            dim[0], "chunk_idx", get_node_shape(input_node)
-                        )
-                        body[-1] = _replace_name(
-                            body[-1], input_node.name, input_node.name + chunk_slice
-                        )
+            body = _replace_input_var(
+                chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body
+            )
             # ones like
-            if "ones_like" in node.name:
-                meta_node = search_chunk.trace_index.node_list[node_idx]
-                chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][
-                    "chunk_dim"
-                ]
-                if get_node_shape(meta_node)[chunk_dim] != 1:
-                    source_node = meta_node.args[0].args[0]
-                    if (
-                        source_node not in chunk_infos[region_idx]["node_chunk_dim"]
-                        or chunk_infos[region_idx]["node_chunk_dim"][source_node][
-                            "chunk_dim"
-                        ]
-                        is None
-                    ):
-                        chunk_slice = _gen_chunk_slice_dim(
-                            chunk_dim, "chunk_idx", get_node_shape(node)
-                        )
-                        body[-1] = _replace_name(
-                            body[-1], node.args[0].name, node.args[0].name + chunk_slice
-                        )
+            body = _replace_ones_like(
+                search_chunk, chunk_infos, region_idx, node_idx, node, body
+            )
+            # reassign reshape size
             body[-1] = _replace_reshape_size(
                 body[-1], node.name, chunk_infos[region_idx]["reshape_size"]
             )

From 69d9180c4b8b07cffe5067434308192f43d6c796 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Sat, 7 Jan 2023 18:23:02 +0800
Subject: [PATCH 101/503] [hotfix] issue #2388

---
 colossalai/kernel/cuda_native/layer_norm.py     | 15 ++++++++-------
 colossalai/kernel/cuda_native/scaled_softmax.py | 14 ++++++++------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/colossalai/kernel/cuda_native/layer_norm.py b/colossalai/kernel/cuda_native/layer_norm.py
index f1b5efa4ec8c..4be3363882ce 100644
--- a/colossalai/kernel/cuda_native/layer_norm.py
+++ b/colossalai/kernel/cuda_native/layer_norm.py
@@ -16,17 +16,17 @@ class FusedLayerNormAffineFunction(torch.autograd.Function):
     @custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, input, weight, bias, normalized_shape, eps):
         try:
-            import colossalai._C.layer_norm
+            from colossalai._C import layer_norm
         except ImportError:
-            raise RuntimeError('FusedLayerNormAffineFunction requires cuda extensions')
+            from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
+            layer_norm = LayerNormBuilder().load()
 
         ctx.normalized_shape = normalized_shape
         ctx.eps = eps
         input_ = input.contiguous()
         weight_ = weight.contiguous()
         bias_ = bias.contiguous()
-        output, mean, invvar = colossalai._C.layer_norm.forward_affine(input_, ctx.normalized_shape, weight_, bias_,
-                                                                       ctx.eps)
+        output, mean, invvar = layer_norm.forward_affine(input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
         ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
 
         return output
@@ -35,14 +35,15 @@ def forward(ctx, input, weight, bias, normalized_shape, eps):
     @custom_bwd
     def backward(ctx, grad_output):
         try:
-            import colossalai._C.layer_norm
+            from colossalai._C import layer_norm
         except ImportError:
-            raise RuntimeError('FusedLayerNormAffineFunction requires cuda extensions')
+            from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
+            layer_norm = LayerNormBuilder().load()
 
         input_, weight_, bias_, mean, invvar = ctx.saved_tensors
         grad_input = grad_weight = grad_bias = None
         grad_input, grad_weight, grad_bias \
-            = colossalai._C.layer_norm.backward_affine(
+            = layer_norm.backward_affine(
                 grad_output.contiguous(), mean, invvar,
                 input_, ctx.normalized_shape,
                 weight_, bias_, ctx.eps)
diff --git a/colossalai/kernel/cuda_native/scaled_softmax.py b/colossalai/kernel/cuda_native/scaled_softmax.py
index 9e147b4199ec..3f0260aaed87 100644
--- a/colossalai/kernel/cuda_native/scaled_softmax.py
+++ b/colossalai/kernel/cuda_native/scaled_softmax.py
@@ -53,26 +53,28 @@ class ScaledMaskedSoftmax(torch.autograd.Function):
     @staticmethod
     def forward(ctx, inputs, mask, scale):
         try:
-            import colossalai._C.scaled_masked_softmax
+            from colossalai._C import scaled_masked_softmax
         except ImportError:
-            raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
+            from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
+            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
 
         scale_t = torch.tensor([scale])
 
-        softmax_results = colossalai._C.scaled_masked_softmax.forward(inputs, mask, scale_t[0])
+        softmax_results = scaled_masked_softmax.forward(inputs, mask, scale_t[0])
         ctx.save_for_backward(softmax_results, scale_t)
         return softmax_results
 
     @staticmethod
     def backward(ctx, output_grads):
         try:
-            import colossalai._C.scaled_masked_softmax
+            from colossalai._C import scaled_masked_softmax
         except ImportError:
-            raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
+            from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
+            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
 
         softmax_results, scale_t = ctx.saved_tensors
 
-        input_grads = colossalai._C.scaled_masked_softmax.backward(output_grads, softmax_results, scale_t[0])
+        input_grads = scaled_masked_softmax.backward(output_grads, softmax_results, scale_t[0])
         return input_grads, None, None
 
 
From b2e0d502b8b9b7d4e6263fd97dff9974eace9a60 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Sat, 7 Jan 2023 19:44:50 +0800
Subject: [PATCH 102/503] [doc] hotfix #2377

---
 .../{requirement_colossalai.txt => requirements_colossalai.txt}   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/images/dreambooth/{requirement_colossalai.txt => requirements_colossalai.txt} (100%)

diff --git a/examples/images/dreambooth/requirement_colossalai.txt b/examples/images/dreambooth/requirements_colossalai.txt
similarity index 100%
rename from examples/images/dreambooth/requirement_colossalai.txt
rename to examples/images/dreambooth/requirements_colossalai.txt

From 2add870138be6b89b26717ad3d6410a43b3fe3ad Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 9 Jan 2023 09:18:44 +0800
Subject: [PATCH 103/503] [workflow] added missing file change detection output
 (#2387)

---
 .github/workflows/build.yml | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6b3f9f9d7a21..5366f69cc7b0 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -14,18 +14,27 @@ jobs:
         contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
     outputs:
       changedFiles: ${{ steps.find-changed-files.outputs.changedFiles }}
+      anyChanged: ${{ steps.find-changed-files.outputs.any_changed }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
       - name: Find the changed files
         id: find-changed-files
-        uses: tj-actions/changed-files@v34
+        uses: tj-actions/changed-files@v35
         with:
           since_last_remote_commit: true
           files: |
             op_builder/**
             colossalai/kernel/**
             setup.py
+      - name: List changed files
+        run: |
+          for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
+            echo "$file was changed"
+          done
+
 
   build:
     name: Build and Test Colossal-AI
@@ -54,9 +63,10 @@ jobs:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
 
       - name: Restore cache
-        if: needs.detect.outputs.anyChanged == 'true'
+        if: needs.detect.outputs.anyChanged != 'true'
         run: |
-          [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
+          # -p flag is required to preserve the file timestamp to avoid ninja rebuild
+          [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
 
       - name: Install Colossal-AI
         run: |
@@ -70,7 +80,8 @@ jobs:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-      
+
       - name: Store Cache
         run: |
-          cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
+          # -p flag is required to preserve the file timestamp to avoid ninja rebuild
+          cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/

From ce08661eb14f732671cf31f7c1e81f51c838b57f Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 9 Jan 2023 11:05:27 +0800
Subject: [PATCH 104/503] [cli] updated installation check cli for aot/jit
 build (#2395)

---
 colossalai/cli/check/check_installation.py | 195 ++++++++++++++++-----
 1 file changed, 150 insertions(+), 45 deletions(-)

diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py
index a12b24402794..22c169577495 100644
--- a/colossalai/cli/check/check_installation.py
+++ b/colossalai/cli/check/check_installation.py
@@ -7,30 +7,100 @@
 import colossalai
 
 
+def to_click_output(val):
+    # convert installation check output to understandable symbols for readability
+    VAL_TO_SYMBOL = {True: u'\u2713', False: 'x', None: 'N/A'}
+
+    if val in VAL_TO_SYMBOL:
+        return VAL_TO_SYMBOL[val]
+    else:
+        return val
+
+
 def check_installation():
-    cuda_ext_installed = _check_cuda_extension_installed()
-    cuda_version, torch_version, torch_cuda_version = _check_cuda_torch()
+    """
+    This function will check the installation of colossalai, specifically, the version compatibility of
+    colossalai, pytorch and cuda.
+
+    Example:
+    ```text
+    ```
+
+    Returns: A table of installation information.
+    """
+    found_aot_cuda_ext = _check_aot_built_cuda_extension_installed()
+    cuda_version = _check_cuda_version()
+    torch_version, torch_cuda_version = _check_torch_version()
     colossalai_verison, torch_version_required, cuda_version_required = _parse_colossalai_version()
 
-    cuda_compatibility = _get_compatibility_string([cuda_version, torch_cuda_version, cuda_version_required])
-    torch_compatibility = _get_compatibility_string([torch_version, torch_version_required])
-
-    click.echo(f'#### Installation Report ####\n')
-    click.echo(f"Colossal-AI version: {colossalai_verison}")
-    click.echo(f'----------------------------')
-    click.echo(f"PyTorch Version: {torch_version}")
-    click.echo(f"PyTorch Version required by Colossal-AI: {torch_version_required}")
-    click.echo(f'PyTorch version match: {torch_compatibility}')
-    click.echo(f'----------------------------')
-    click.echo(f"System CUDA Version: {cuda_version}")
-    click.echo(f"CUDA Version required by PyTorch: {torch_cuda_version}")
-    click.echo(f"CUDA Version required by Colossal-AI: {cuda_version_required}")
-    click.echo(f"CUDA Version Match: {cuda_compatibility}")
-    click.echo(f'----------------------------')
-    click.echo(f"CUDA Extension: {cuda_ext_installed}")
+    # if cuda_version is None, that means CUDA_HOME is not found,
+    # thus the version compatibility cannot be compared
+    if not cuda_version:
+        sys_torch_cuda_compatibility = None
+    else:
+        sys_torch_cuda_compatibility = _is_compatible([cuda_version, torch_cuda_version])
 
+    # if cuda_version or cuda_version_required is None, that means either
+    # CUDA_HOME is not found or AOT compilation is not enabled
+    # thus, there is no need to compare the version compatibility at all
+    if not cuda_version or not cuda_version_required:
+        sys_colossalai_cuda_compatibility = None
+    else:
+        sys_colossalai_cuda_compatibility = _is_compatible([cuda_version, cuda_version_required])
 
-def _get_compatibility_string(versions):
+    # if torch_version_required is None, that means AOT compilation is not enabled
+    # thus there is no need to compare the versions
+    if torch_version_required is None:
+        torch_compatibility = None
+    else:
+        torch_compatibility = _is_compatible([torch_version, torch_version_required])
+
+    click.echo(f'#### Installation Report ####')
+    click.echo(f'\n------------ Environment ------------')
+    click.echo(f"Colossal-AI version: {to_click_output(colossalai_verison)}")
+    click.echo(f"PyTorch version: {to_click_output(torch_version)}")
+    click.echo(f"CUDA version: {to_click_output(cuda_version)}")
+    click.echo(f"CUDA version required by PyTorch: {to_click_output(torch_cuda_version)}")
+    click.echo("")
+    click.echo(f"Note:")
+    click.echo(f"1. The table above checks the versions of the libraries/tools in the current environment")
+    click.echo(f"2. If the CUDA version is N/A, you can set the CUDA_HOME environment variable to locate it")
+
+    click.echo(f'\n------------ CUDA Extensions AOT Compilation ------------')
+    click.echo(f"Found AOT CUDA Extension: {to_click_output(found_aot_cuda_ext)}")
+    click.echo(f"PyTorch version used for AOT compilation: {to_click_output(torch_version_required)}")
+    click.echo(f"CUDA version used for AOT compilation: {to_click_output(cuda_version_required)}")
+    click.echo("")
+    click.echo(f"Note:")
+    click.echo(
+        f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment varialbe CUDA_EXT=1 is set"
+    )
+    click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime")
+
+    click.echo(f"\n------------ Compatibility ------------")
+    click.echo(f'PyTorch version match: {to_click_output(torch_compatibility)}')
+    click.echo(f"System and PyTorch CUDA version match: {to_click_output(sys_torch_cuda_compatibility)}")
+    click.echo(f"System and Colossal-AI CUDA version match: {to_click_output(sys_colossalai_cuda_compatibility)}")
+    click.echo(f"")
+    click.echo(f"Note:")
+    click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment")
+    click.echo(
+        f"   - PyTorch version mistach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
+    )
+    click.echo(
+        f"   - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch"
+    )
+    click.echo(
+        f"   - System and Colossal-AI CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version used for AOT compilation"
+    )
+
+
+def _is_compatible(versions):
+    """
+    Compare the list of versions and return whether they are compatible.
+    """
+    if None in versions:
+        return False
 
     # split version into [major, minor, patch]
     versions = [version.split('.') for version in versions]
@@ -44,37 +114,81 @@ def _get_compatibility_string(versions):
         equal = len(set(version_values)) == 1
 
         if idx in [0, 1] and not equal:
-            # if the major/minor versions do not match
-            # return a cross
-            return 'x'
+            return False
         elif idx == 1:
-            # if the minor versions match
-            # return a tick
-            return u'\u2713'
+            return True
         else:
             continue
 
 
 def _parse_colossalai_version():
+    """
+    Get the Colossal-AI version information.
+
+    Returns:
+        colossalai_version: Colossal-AI version.
+        torch_version_for_aot_build: PyTorch version used for AOT compilation of CUDA kernels.
+        cuda_version_for_aot_build: CUDA version used for AOT compilation of CUDA kernels.
+    """
+    # colossalai version can be in two formats
+    # 1. X.X.X+torchX.XXcuXX.X (when colossalai is installed with CUDA extensions)
+    # 2. X.X.X (when colossalai is not installed with CUDA extensions)
+    # where X represents an integer.
     colossalai_verison = colossalai.__version__.split('+')[0]
-    torch_version_required = colossalai.__version__.split('torch')[1].split('cu')[0]
-    cuda_version_required = colossalai.__version__.split('cu')[1]
-    return colossalai_verison, torch_version_required, cuda_version_required
 
-
-def _check_cuda_extension_installed():
+    try:
+        torch_version_for_aot_build = colossalai.__version__.split('torch')[1].split('cu')[0]
+        cuda_version_for_aot_build = colossalai.__version__.split('cu')[1]
+    except:
+        torch_version_for_aot_build = None
+        cuda_version_for_aot_build = None
+    return colossalai_verison, torch_version_for_aot_build, cuda_version_for_aot_build
+
+
+def _check_aot_built_cuda_extension_installed():
+    """
+    According to `op_builder/README.md`, the CUDA extension can be built with either
+    AOT (ahead-of-time) or JIT (just-in-time) compilation.
+    AOT compilation will build CUDA extensions to `colossalai._C` during installation.
+    JIT (just-in-time) compilation will build CUDA extensions to `~/.cache/colossalai/torch_extensions` during runtime.
+    """
     try:
         import colossalai._C.fused_optim
-        is_cuda_extension_installed = u'\u2713'
+        found_aot_cuda_ext = True
     except ImportError:
-        is_cuda_extension_installed = 'x'
-    return is_cuda_extension_installed
+        found_aot_cuda_ext = False
+    return found_aot_cuda_ext
+
+
+def _check_torch_version():
+    """
+    Get the PyTorch version information.
 
+    Returns:
+        torch_version: PyTorch version.
+        torch_cuda_version: CUDA version required by PyTorch.
+    """
+    # get torch version
+    torch_version = torch.__version__.split('+')[0]
 
-def _check_cuda_torch():
+    # get cuda version in pytorch build
+    torch_cuda_major = torch.version.cuda.split(".")[0]
+    torch_cuda_minor = torch.version.cuda.split(".")[1]
+    torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
+
+    return torch_version, torch_cuda_version
+
+
+def _check_cuda_version():
+    """
+    Get the CUDA version information.
+
+    Returns:
+        cuda_version: CUDA version found on the system.
+    """
     # get cuda version
     if CUDA_HOME is None:
-        cuda_version = 'N/A (CUDA_HOME is not set)'
+        cuda_version = CUDA_HOME
     else:
         raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
         output = raw_output.split()
@@ -83,13 +197,4 @@ def _check_cuda_torch():
         bare_metal_major = release[0]
         bare_metal_minor = release[1][0]
         cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
-
-    # get torch version
-    torch_version = torch.__version__.split('+')[0]
-
-    # get cuda version in pytorch build
-    torch_cuda_major = torch.version.cuda.split(".")[0]
-    torch_cuda_minor = torch.version.cuda.split(".")[1]
-    torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
-
-    return cuda_version, torch_version, torch_cuda_version
+    return cuda_version

From 4d223e18a2600ca2467fb21ef4c18f0e9aa0d04c Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 13:46:17 +0800
Subject: [PATCH 105/503] fix typo

---
 colossalai/autochunk/{estiamte_memory.py => estimate_memory.py} | 0
 colossalai/autochunk/search_chunk.py                            | 2 +-
 colossalai/autochunk/select_chunk.py                            | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename colossalai/autochunk/{estiamte_memory.py => estimate_memory.py} (100%)

diff --git a/colossalai/autochunk/estiamte_memory.py b/colossalai/autochunk/estimate_memory.py
similarity index 100%
rename from colossalai/autochunk/estiamte_memory.py
rename to colossalai/autochunk/estimate_memory.py
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index e2c8de74e012..21b967497f1b 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -1,6 +1,6 @@
 import copy
 
-from .estiamte_memory import EstimateMemory
+from .estimate_memory import EstimateMemory
 from .reorder_graph import ReorderGraph
 from .select_chunk import SelectChunk
 from .trace_flow import TraceFlow
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index bdc64528ef18..7127cfd64e69 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -1,4 +1,4 @@
-from .estiamte_memory import EstimateMemory
+from .estimate_memory import EstimateMemory
 from .reorder_graph import ReorderGraph
 from .trace_index import TraceIndex
 from .utils import is_non_compute_node

From cb68ee864a21e330e8061ee13811a7045f3d65f3 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 14:20:41 +0800
Subject: [PATCH 106/503] set benchmark

---
 tests/test_autochunk/benchmark_autochunk.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py
index 7a9d8cdeee03..6632ece61376 100644
--- a/tests/test_autochunk/benchmark_autochunk.py
+++ b/tests/test_autochunk/benchmark_autochunk.py
@@ -98,14 +98,14 @@ def _build_openfold():
 def benchmark_evoformer():
     # init data and model
     msa_len = 256
-    pair_len = 256
+    pair_len = 512
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
     model = evoformer_base().cuda()
 
     # build autochunk model
-    max_memory = 1000  # MB fit memory mode
-    # max_memory = None  # min memory mode
+    # max_memory = 1000  # MB, fit memory mode
+    max_memory = None  # min memory mode
     autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair)
 
     # build openfold

From 18a51c87fe0aa3a1210d7484fc09c16714e04bb7 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 14:20:54 +0800
Subject: [PATCH 107/503] rename test

---
 .../{test_autochunk.py => test_autochunk_codegen.py}          | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
 rename tests/test_autochunk/{test_autochunk.py => test_autochunk_codegen.py} (97%)

diff --git a/tests/test_autochunk/test_autochunk.py b/tests/test_autochunk/test_autochunk_codegen.py
similarity index 97%
rename from tests/test_autochunk/test_autochunk.py
rename to tests/test_autochunk/test_autochunk_codegen.py
index 85a162084cc9..1c5dd939d710 100644
--- a/tests/test_autochunk/test_autochunk.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -18,9 +18,7 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     torch.cuda.reset_peak_memory_stats()
     now_mem = torch.cuda.memory_allocated() / 1024**2
     with torch.no_grad():
-        node1 = node.clone()
-        pair1 = pair.clone()
-        gm(node1, pair1)
+        gm(node.clone(), pair.clone())
     new_now_mem = torch.cuda.memory_allocated() / 1024**2
     new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
     print(

From 74b81395a2edbce36896f3d184c6cfae327024b5 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 14:26:22 +0800
Subject: [PATCH 108/503] update codegen test

---
 .../test_autochunk/test_autochunk_codegen.py  | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index 1c5dd939d710..8246275eb08a 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -15,16 +15,19 @@
 
 
 def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
-    torch.cuda.reset_peak_memory_stats()
-    now_mem = torch.cuda.memory_allocated() / 1024**2
-    with torch.no_grad():
-        gm(node.clone(), pair.clone())
-    new_now_mem = torch.cuda.memory_allocated() / 1024**2
-    new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    print(
-        "autochunk now mem:%.2f max mem:%.2f"
-        % (new_now_mem - now_mem, new_max_mem - now_mem)
-    )
+    # for memory test
+    # torch.cuda.reset_peak_memory_stats()
+    # now_mem = torch.cuda.memory_allocated() / 1024**2
+    # with torch.no_grad():
+    #     node1 = node.clone()
+    #     pair1 = pair.clone()
+    #     gm(node1, pair1)
+    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
+    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    # print(
+    #     "autochunk now mem:%.2f max mem:%.2f"
+    #     % (new_now_mem - now_mem, new_max_mem - now_mem)
+    # )
 
     # test forward
     with torch.no_grad():
@@ -43,7 +46,7 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     )
 
 
-def _run_offload_codegen(rank):
+def _test_autochunk_codegen(rank):
     # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
     colossalai.launch(
         config={},
@@ -56,8 +59,10 @@ def _run_offload_codegen(rank):
 
     # build model and input
     model = evoformer_base().cuda()
-    node = torch.randn(1, 100, 300, 256).cuda()
-    pair = torch.randn(1, 300, 300, 128).cuda()
+    msa_len = 32
+    pair_len = 64
+    node = torch.randn(1, msa_len, pair_len, 256).cuda()
+    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
 
     # trace the module and replace codegen
     graph = ColoTracer().trace(
@@ -85,17 +90,18 @@ def _run_offload_codegen(rank):
     gm = ColoGraphModule(model, graph)
     gm.recompile()
 
-    # assert we have all the components
-    # code = graph.python_code("self").src
+    # assert we have inserted chunk
+    code = graph.python_code("self").src
+    assert "chunk_size" in code
     # print(code)
 
     _test_fwd(model, gm, node, pair)
     gpc.destroy()
 
 
-def test_autochunk():
-    mp.spawn(_run_offload_codegen, nprocs=1)
+def test_autochunk_codegen():
+    mp.spawn(_test_autochunk_codegen, nprocs=1)
 
 
 if __name__ == "__main__":
-    _run_offload_codegen(0)
+    _test_autochunk_codegen(0)

From 9880fd2cd8b3b24c28333926338656a06dd170f3 Mon Sep 17 00:00:00 2001
From: eric8607242 <e0928021388@gmail.com>
Date: Mon, 9 Jan 2023 14:35:14 +0800
Subject: [PATCH 109/503] Fix state_dict key missing issue of the ZeroDDP
 (#2363)

* Fix state_dict output for ZeroDDP duplicated parameters

* Rewrite state_dict based on get_static_torch_model

* Modify get_static_torch_model to be compatible with the lower version (ZeroDDP)
---
 colossalai/nn/parallel/data_parallel.py | 37 +++++++++++++++++++++----
 colossalai/nn/parallel/utils.py         | 16 +++++------
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index e3bb83347d21..8fd08db957b7 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -18,6 +18,7 @@
 from colossalai.zero.utils.gemini_hook import GeminiZeROHook
 
 from .reducer import Reducer
+from .utils import get_static_torch_model
 
 try:
     from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX, _IncompatibleKeys
@@ -251,6 +252,7 @@ def __init__(self,
                                                pin_memory=pin_memory)
             self.fp32_params.append(fp32_p)
             self.grads_device[p] = self.gemini_manager.default_device
+
         self.chunk_manager.close_all_groups()
         self._cast_buffers()
 
@@ -331,12 +333,11 @@ def set_chunk_grad_device(self, chunk: Chunk, device: torch.device) -> None:
         for tensor in chunk.get_tensors():
             self.grads_device[tensor] = device
 
-    def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True):
-        r"""Returns a dictionary containing a whole state of the module.
-
-        Both parameters and persistent buffers (e.g. running averages) are
-        included. Keys are corresponding parameter and buffer names.
-        Parameters and buffers set to ``None`` are not included.
+    def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True, strict: bool = True):
+        r"""
+        Args:
+            strict (bool): whether to return the whole model state
+                as the original pytorch state_dict()
 
         Returns:
             dict:
@@ -346,7 +347,31 @@ def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0:
 
             >>> module.state_dict().keys()
             ['bias', 'weight']
+        """
+        if strict:
+            return get_static_torch_model(zero_ddp_model=self, device=get_current_device(),
+                                          only_rank_0=only_rank_0).state_dict(destination=destination,
+                                                                              prefix=prefix,
+                                                                              keep_vars=keep_vars)
+        return self._non_strict_state_dict(destination=destination,
+                                           prefix=prefix,
+                                           keep_vars=keep_vars,
+                                           only_rank_0=only_rank_0)
+
+    def _non_strict_state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True):
+        r"""Returns a dictionary containing a whole state of the module.
+
+        Both parameters and persistent buffers (e.g. running averages) are
+        included. Keys are corresponding parameter and buffer names.
+        Parameters and buffers set to ``None`` are not included.
 
+        Warning: The non strict state dict would ignore the parameters if the
+            tensors of the parameters are shared with other parameters which
+            have been included in the dictionary.
+
+        Returns:
+            dict:
+                a dictionary containing a whole state of the module
         """
         if destination is None:
             destination = OrderedDict()
diff --git a/colossalai/nn/parallel/utils.py b/colossalai/nn/parallel/utils.py
index 1205cbc3a658..988f978254a1 100644
--- a/colossalai/nn/parallel/utils.py
+++ b/colossalai/nn/parallel/utils.py
@@ -60,17 +60,17 @@ def _get_shallow_copy_model(model: nn.Module):
     return name_to_module['']
 
 
-def get_static_torch_model(gemini_ddp_model,
+def get_static_torch_model(zero_ddp_model,
                            device=torch.device("cpu"),
                            dtype=torch.float32,
                            only_rank_0=True) -> torch.nn.Module:
-    """Get a static torch.nn.Module model from the given GeminiDDP module.
-    You should notice that the original GeminiDDP model is not modified.
+    """Get a static torch.nn.Module model from the given ZeroDDP module.
+    You should notice that the original ZeroDDP model is not modified.
     Thus, you can use the original model in further training.
     But you should not use the returned torch model to train, this can cause unexpected errors.
 
     Args:
-        gemini_ddp_model (GeminiDDP): a gemini ddp model
+        zero_ddp_model (ZeroDDP): a zero ddp model
         device (torch.device): the device of the final torch model
         dtype (torch.dtype): the dtype of the final torch model
         only_rank_0 (bool): if True, only rank0 has the coverted torch model
@@ -78,11 +78,11 @@ def get_static_torch_model(gemini_ddp_model,
     Returns:
         torch.nn.Module: a static torch model used for saving checkpoints or numeric checks
     """
-    from colossalai.nn.parallel import GeminiDDP
-    assert isinstance(gemini_ddp_model, GeminiDDP)
+    from colossalai.nn.parallel import ZeroDDP
+    assert isinstance(zero_ddp_model, ZeroDDP)
 
-    state_dict = gemini_ddp_model.state_dict(only_rank_0=only_rank_0)
-    colo_model = gemini_ddp_model.module
+    state_dict = zero_ddp_model.state_dict(only_rank_0=only_rank_0, strict=False)
+    colo_model = zero_ddp_model.module
     torch_model = _get_shallow_copy_model(colo_model)
 
     if not only_rank_0 or dist.get_rank() == 0:

From 3abbaf8bc68c8a3366241a3dc2e97f6944605fb2 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 14:53:04 +0800
Subject: [PATCH 110/503] update codegen test

---
 .../test_autochunk/test_autochunk_codegen.py  | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index 8246275eb08a..c91148e11ff8 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -1,3 +1,5 @@
+from functools import partial
+
 import pytest
 import torch
 import torch.fx
@@ -46,7 +48,7 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     )
 
 
-def _test_autochunk_codegen(rank):
+def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
     # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
     colossalai.launch(
         config={},
@@ -59,8 +61,6 @@ def _test_autochunk_codegen(rank):
 
     # build model and input
     model = evoformer_base().cuda()
-    msa_len = 32
-    pair_len = 64
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
 
@@ -85,7 +85,7 @@ def _test_autochunk_codegen(rank):
         MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
     )
 
-    codegen = AutoChunkCodeGen(gm_prop)
+    codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory)
     graph.set_codegen(codegen)
     gm = ColoGraphModule(model, graph)
     gm.recompile()
@@ -99,9 +99,18 @@ def _test_autochunk_codegen(rank):
     gpc.destroy()
 
 
-def test_autochunk_codegen():
-    mp.spawn(_test_autochunk_codegen, nprocs=1)
+@pytest.mark.parametrize("max_memory", [None, 20, 24, 28, 32])
+@pytest.mark.parametrize("msa_len", [32])
+@pytest.mark.parametrize("pair_len", [64])
+def test_autochunk_codegen(msa_len, pair_len, max_memory):
+    run_func = partial(
+        _test_autochunk_codegen,
+        msa_len=msa_len,
+        pair_len=pair_len,
+        max_memory=max_memory,
+    )
+    mp.spawn(run_func, nprocs=1)
 
 
 if __name__ == "__main__":
-    _test_autochunk_codegen(0)
+    _test_autochunk_codegen(0, 32, 64, None)

From a005965d2d5f506aafe672575388501bfc5dc5d8 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 14:57:47 +0800
Subject: [PATCH 111/503] update codegen test

---
 tests/test_autochunk/test_autochunk_codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index c91148e11ff8..62763a6d5e2a 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -99,7 +99,7 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
     gpc.destroy()
 
 
-@pytest.mark.parametrize("max_memory", [None, 20, 24, 28, 32])
+@pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])
 def test_autochunk_codegen(msa_len, pair_len, max_memory):

From d106b271f8fa8968bfa7a5f7652448c41f26c260 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 15:19:08 +0800
Subject: [PATCH 112/503] add chunk search test

---
 tests/test_autochunk/test_autochunk_search.py | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 tests/test_autochunk/test_autochunk_search.py

diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py
new file mode 100644
index 000000000000..c824a43ab612
--- /dev/null
+++ b/tests/test_autochunk/test_autochunk_search.py
@@ -0,0 +1,86 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.fx
+import torch.multiprocessing as mp
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+from colossalai.core import global_context as gpc
+from colossalai.fx import ColoTracer
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.fx.profiler import MetaTensor
+from colossalai.utils import free_port
+from tests.test_autochunk.evoformer.evoformer import evoformer_base
+
+
+def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
+    found_regions = [i["region"] for i in chunk_infos]
+
+    if msa_len == 32 and pair_len == 64:
+        if max_memory is None:
+            target_regions = [(142, 154), (366, 373), (233, 283), (301, 351), (127, 134), (204, 228), (167, 191), (161, 166), (198, 203), (6, 69)]
+        elif max_memory == 20:
+            target_regions = [(142, 154), (369, 373), (233, 269), (301, 351)]
+        elif max_memory == 25:
+            target_regions = [(144, 154), (369, 370)]
+        elif max_memory == 30:
+            target_regions = [(144, 154)]
+        else:
+            raise NotImplementedError()
+    else:
+        raise NotImplementedError()
+    
+    assert len(found_regions) == len(target_regions), "len of found regions %s doesn't equal len of target regions %s" % (str(found_regions), str(target_regions))
+    for region in target_regions:
+        assert region in found_regions, "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory)
+    for region in found_regions:
+        assert region in target_regions, "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory)
+
+
+def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+
+    # build model and input
+    model = evoformer_base().cuda()
+    node = torch.randn(1, msa_len, pair_len, 256).cuda()
+    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
+
+    gm_prop = torch.fx.symbolic_trace(model)  # must use symbolic_trace
+    interp = MetaInfoProp(gm_prop)
+    interp.propagate(
+        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
+    )
+
+    codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory)
+    chunk_infos = codegen.chunk_infos
+    assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len)
+
+    gpc.destroy()
+
+
+@pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
+@pytest.mark.parametrize("msa_len", [32])
+@pytest.mark.parametrize("pair_len", [64])
+def test_autochunk_search(msa_len, pair_len, max_memory):
+    run_func = partial(
+        _test_autochunk_search,
+        msa_len=msa_len,
+        pair_len=pair_len,
+        max_memory=max_memory,
+    )
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    _test_autochunk_search(0, 32, 64, 20)

From d5c4f0bf954a5686777f652e34b5cd18df2a0d5a Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 15:22:09 +0800
Subject: [PATCH 113/503] code style

---
 tests/test_autochunk/test_autochunk_search.py | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py
index c824a43ab612..6f7214633fa3 100644
--- a/tests/test_autochunk/test_autochunk_search.py
+++ b/tests/test_autochunk/test_autochunk_search.py
@@ -8,8 +8,6 @@
 import colossalai
 from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.core import global_context as gpc
-from colossalai.fx import ColoTracer
-from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
 from colossalai.utils import free_port
@@ -32,12 +30,31 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
             raise NotImplementedError()
     else:
         raise NotImplementedError()
-    
-    assert len(found_regions) == len(target_regions), "len of found regions %s doesn't equal len of target regions %s" % (str(found_regions), str(target_regions))
+
+    assert len(found_regions) == len(
+        target_regions
+    ), "len of found regions %s doesn't equal len of target regions %s" % (
+        str(found_regions),
+        str(target_regions),
+    )
     for region in target_regions:
-        assert region in found_regions, "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory)
+        assert (
+            region in found_regions
+        ), "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (
+            str(region),
+            msa_len,
+            pair_len,
+            max_memory,
+        )
     for region in found_regions:
-        assert region in target_regions, "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory)
+        assert (
+            region in target_regions
+        ), "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (
+            str(region),
+            msa_len,
+            pair_len,
+            max_memory,
+        )
 
 
 def _test_autochunk_search(rank, msa_len, pair_len, max_memory):

From aafc3516a5c07347f58bbc1a52410f74e51b685f Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 15:32:19 +0800
Subject: [PATCH 114/503] add available

---
 tests/test_autochunk/test_autochunk_codegen.py | 2 ++
 tests/test_autochunk/test_autochunk_search.py  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index 62763a6d5e2a..c4f5cda67204 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -9,6 +9,7 @@
 from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.core import global_context as gpc
 from colossalai.fx import ColoTracer
+from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
@@ -99,6 +100,7 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
     gpc.destroy()
 
 
+@pytest.mark.skipif(not CODEGEN_AVAILABLE, reason='torch version is lower than 1.12.0')
 @pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])
diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py
index 6f7214633fa3..5026c3ad3b3d 100644
--- a/tests/test_autochunk/test_autochunk_search.py
+++ b/tests/test_autochunk/test_autochunk_search.py
@@ -8,6 +8,7 @@
 import colossalai
 from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.core import global_context as gpc
+from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
 from colossalai.utils import free_port
@@ -86,6 +87,7 @@ def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
     gpc.destroy()
 
 
+@pytest.mark.skipif(not CODEGEN_AVAILABLE, reason="torch version is lower than 1.12.0")
 @pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])

From 498b5ca993fb17eccdfbe7608f36444d5779f0c8 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Mon, 9 Jan 2023 15:52:17 +0800
Subject: [PATCH 115/503] [hotfix] fix gpt gemini example (#2404)

* [hotfix] fix gpt gemini example

* [example] add new assertions
---
 .../language/gpt/gemini/benchmark_gemini.sh   | 30 ++++++++++---------
 .../language/gpt/gemini/train_gpt_demo.py     |  2 ++
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/examples/language/gpt/gemini/benchmark_gemini.sh b/examples/language/gpt/gemini/benchmark_gemini.sh
index 13086666eefd..464ea03da7eb 100644
--- a/examples/language/gpt/gemini/benchmark_gemini.sh
+++ b/examples/language/gpt/gemini/benchmark_gemini.sh
@@ -1,18 +1,20 @@
 for MODEL_TYPE in "gpt2_medium"; do
-  for BATCH_SIZE in 16; do
-    for GPUNUM in 1 2 4 8; do
-      for TPDEGREE in 1 2 4 8; do
-        if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
-          continue
-        fi
-        for PLACEMENT in "cpu" "auto"; do
-          echo "****************** Begin ***************************"
-          echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE} PLACEMENT ${PLACEMENT}"
-          MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
-          bash ./gemini/run_gemini.sh
-          echo "****************** Finished ***************************"
-          echo ""
-          echo ""
+  for DISPAN in "colossalai"; do
+    for BATCH_SIZE in 16; do
+      for GPUNUM in 1 2 4 8; do
+        for TPDEGREE in 1 2 4 8; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          for PLACEMENT in "cpu" "auto"; do
+            echo "****************** Begin ***************************"
+            echo "+ benchmrking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
+            MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+            bash ./run_gemini.sh
+            echo "****************** Finished ***************************"
+            echo ""
+            echo ""
+          done
         done
       done
     done
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 29f8c8ef1215..891b1de15af1 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -270,6 +270,7 @@ def main():
 
         tp_pg = ProcessGroup(tp_degree=args.tp_degree)
         # Tensor Parallelism (TP)
+        # You should notice that v0.1.10 is not compatible with TP degree > 1
         tensor_parallelize(model, tp_pg)
 
         # build a Gemini model and a highly optimized cpu optimizer
@@ -278,6 +279,7 @@ def main():
 
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
+        assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples."
         model = model_builder(args.model_type)(checkpoint=True).cuda()
 
     if args.distplan.startswith("torch"):

From 19cc64b1d39529bde502f9507d20770430f6e3af Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 16:06:58 +0800
Subject: [PATCH 116/503] remove autochunk_available

---
 colossalai/autochunk/autochunk_codegen.py | 490 +++++++++++-----------
 1 file changed, 239 insertions(+), 251 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 0db2e59080dd..9ec59477b426 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -16,13 +16,9 @@
 from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
 
 import colossalai
-
 from .search_chunk import SearchChunk
 from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape
 
-CODEGEN_AVAILABLE = True
-__all__ = ["AutoChunkCodeGen"]
-
 
 def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
     new_shape = "["
@@ -222,287 +218,279 @@ def emit_code_with_chunk(
         node_idx += 1
 
 
-if CODEGEN_AVAILABLE:
-
-    class AutoChunkCodeGen(CodeGen):
-        def __init__(self, meta_graph, max_memory=None, print_mem=False):
-            super().__init__()
-            self.meta_graph = meta_graph
-            self.max_memory = max_memory
-            self.meta_node = list(meta_graph.graph.nodes)
-            # find the chunk regions
-            self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem)
-            self.chunk_infos = self.search_chunk.search_region()
+class AutoChunkCodeGen(CodeGen):
+    def __init__(self, meta_graph, max_memory=None, print_mem=False):
+        super().__init__()
+        self.meta_graph = meta_graph
+        self.max_memory = max_memory
+        self.meta_node = list(meta_graph.graph.nodes)
+        # find the chunk regions
+        self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem)
+        self.chunk_infos = self.search_chunk.search_region()
 
-        def _gen_python_code(
-            self, nodes, root_module: str, namespace: _Namespace
-        ) -> PythonCode:
-            free_vars: List[str] = []
-            body: List[str] = []
-            globals_: Dict[str, Any] = {}
-            wrapped_fns: Dict[str, None] = {}
+    def _gen_python_code(
+        self, nodes, root_module: str, namespace: _Namespace
+    ) -> PythonCode:
+        free_vars: List[str] = []
+        body: List[str] = []
+        globals_: Dict[str, Any] = {}
+        wrapped_fns: Dict[str, None] = {}
 
-            # Wrap string in list to pass by reference
-            maybe_return_annotation: List[str] = [""]
+        # Wrap string in list to pass by reference
+        maybe_return_annotation: List[str] = [""]
 
-            def add_global(name_hint: str, obj: Any):
-                """Add an obj to be tracked as a global.
+        def add_global(name_hint: str, obj: Any):
+            """Add an obj to be tracked as a global.
 
-                We call this for names that reference objects external to the
-                Graph, like functions or types.
+            We call this for names that reference objects external to the
+            Graph, like functions or types.
 
-                Returns: the global name that should be used to reference 'obj' in generated source.
-                """
-                if (
-                    _is_from_torch(obj) and obj != torch.device
-                ):  # to support registering torch.device
-                    # HACK: workaround for how torch custom ops are registered. We
-                    # can't import them like normal modules so they must retain their
-                    # fully qualified name.
-                    return _get_qualified_name(obj)
-
-                # normalize the name hint to get a proper identifier
-                global_name = namespace.create_name(name_hint, obj)
-
-                if global_name in globals_:
-                    assert globals_[global_name] is obj
-                    return global_name
-                globals_[global_name] = obj
+            Returns: the global name that should be used to reference 'obj' in generated source.
+            """
+            if (
+                _is_from_torch(obj) and obj != torch.device
+            ):  # to support registering torch.device
+                # HACK: workaround for how torch custom ops are registered. We
+                # can't import them like normal modules so they must retain their
+                # fully qualified name.
+                return _get_qualified_name(obj)
+
+            # normalize the name hint to get a proper identifier
+            global_name = namespace.create_name(name_hint, obj)
+
+            if global_name in globals_:
+                assert globals_[global_name] is obj
                 return global_name
+            globals_[global_name] = obj
+            return global_name
 
-            # set _custom_builtins here so that we needn't import colossalai in forward
-            _custom_builtins["colossalai"] = _CustomBuiltin(
-                "import colossalai", colossalai
-            )
-
-            # Pre-fill the globals table with registered builtins.
-            for name, (_, obj) in _custom_builtins.items():
-                add_global(name, obj)
+        # set _custom_builtins here so that we needn't import colossalai in forward
+        _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)
 
-            def type_repr(o: Any):
-                if o == ():
-                    # Empty tuple is used for empty tuple type annotation Tuple[()]
-                    return "()"
+        # Pre-fill the globals table with registered builtins.
+        for name, (_, obj) in _custom_builtins.items():
+            add_global(name, obj)
 
-                typename = _type_repr(o)
+        def type_repr(o: Any):
+            if o == ():
+                # Empty tuple is used for empty tuple type annotation Tuple[()]
+                return "()"
 
-                if hasattr(o, "__origin__"):
-                    # This is a generic type, e.g. typing.List[torch.Tensor]
-                    origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
-                    origin_typename = add_global(_type_repr(origin_type), origin_type)
+            typename = _type_repr(o)
 
-                    if hasattr(o, "__args__"):
-                        # Assign global names for each of the inner type variables.
-                        args = [type_repr(arg) for arg in o.__args__]
+            if hasattr(o, "__origin__"):
+                # This is a generic type, e.g. typing.List[torch.Tensor]
+                origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
+                origin_typename = add_global(_type_repr(origin_type), origin_type)
 
-                        if len(args) == 0:
-                            # Bare type, such as `typing.Tuple` with no subscript
-                            # This code-path used in Python < 3.9
-                            return origin_typename
+                if hasattr(o, "__args__"):
+                    # Assign global names for each of the inner type variables.
+                    args = [type_repr(arg) for arg in o.__args__]
 
-                        return f'{origin_typename}[{",".join(args)}]'
-                    else:
+                    if len(args) == 0:
                         # Bare type, such as `typing.Tuple` with no subscript
-                        # This code-path used in Python 3.9+
+                        # This code-path used in Python < 3.9
                         return origin_typename
 
-                # Common case: this is a regular module name like 'foo.bar.baz'
-                return add_global(typename, o)
-
-            def _format_args(
-                args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
-            ) -> str:
-                def _get_repr(arg):
-                    # Handle NamedTuples (if it has `_fields`) via add_global.
-                    if isinstance(arg, tuple) and hasattr(arg, "_fields"):
-                        qualified_name = _get_qualified_name(type(arg))
-                        global_name = add_global(qualified_name, type(arg))
-                        return f"{global_name}{repr(tuple(arg))}"
-                    return repr(arg)
-
-                args_s = ", ".join(_get_repr(a) for a in args)
-                kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items())
-                if args_s and kwargs_s:
-                    return f"{args_s}, {kwargs_s}"
-                return args_s or kwargs_s
-
-            # Run through reverse nodes and record the first instance of a use
-            # of a given node. This represents the *last* use of the node in the
-            # execution order of the program, which we will use to free unused
-            # values
-            node_to_last_use: Dict[Node, Node] = {}
-            user_to_last_uses: Dict[Node, List[Node]] = {}
-
-            def register_last_uses(n: Node, user: Node):
-                if n not in node_to_last_use:
-                    node_to_last_use[n] = user
-                    user_to_last_uses.setdefault(user, []).append(n)
-
-            for node in reversed(nodes):
-                map_arg(node.args, lambda n: register_last_uses(n, node))
-                map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-
-            delete_free_var_from_last_use(user_to_last_uses)
-
-            # NOTE: we add a variable to distinguish body and ckpt_func
-            def delete_unused_values(user: Node, body, to_keep=[]):
-                """
-                Delete values after their last use. This ensures that values that are
-                not used in the remainder of the code are freed and the memory usage
-                of the code is optimal.
-                """
-                if user.op == "placeholder":
-                    return
-                if user.op == "output":
-                    body.append("\n")
-                    return
-                nodes_to_delete = user_to_last_uses.get(user, [])
-                nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
-                if len(nodes_to_delete):
-                    to_delete_str = " = ".join(
-                        [repr(n) for n in nodes_to_delete] + ["None"]
-                    )
-                    body.append(f";  {to_delete_str}\n")
+                    return f'{origin_typename}[{",".join(args)}]'
                 else:
-                    body.append("\n")
+                    # Bare type, such as `typing.Tuple` with no subscript
+                    # This code-path used in Python 3.9+
+                    return origin_typename
+
+            # Common case: this is a regular module name like 'foo.bar.baz'
+            return add_global(typename, o)
+
+        def _format_args(
+            args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
+        ) -> str:
+            def _get_repr(arg):
+                # Handle NamedTuples (if it has `_fields`) via add_global.
+                if isinstance(arg, tuple) and hasattr(arg, "_fields"):
+                    qualified_name = _get_qualified_name(type(arg))
+                    global_name = add_global(qualified_name, type(arg))
+                    return f"{global_name}{repr(tuple(arg))}"
+                return repr(arg)
+
+            args_s = ", ".join(_get_repr(a) for a in args)
+            kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items())
+            if args_s and kwargs_s:
+                return f"{args_s}, {kwargs_s}"
+            return args_s or kwargs_s
+
+        # Run through reverse nodes and record the first instance of a use
+        # of a given node. This represents the *last* use of the node in the
+        # execution order of the program, which we will use to free unused
+        # values
+        node_to_last_use: Dict[Node, Node] = {}
+        user_to_last_uses: Dict[Node, List[Node]] = {}
+
+        def register_last_uses(n: Node, user: Node):
+            if n not in node_to_last_use:
+                node_to_last_use[n] = user
+                user_to_last_uses.setdefault(user, []).append(n)
+
+        for node in reversed(nodes):
+            map_arg(node.args, lambda n: register_last_uses(n, node))
+            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+        delete_free_var_from_last_use(user_to_last_uses)
+
+        # NOTE: we add a variable to distinguish body and ckpt_func
+        def delete_unused_values(user: Node, body, to_keep=[]):
+            """
+            Delete values after their last use. This ensures that values that are
+            not used in the remainder of the code are freed and the memory usage
+            of the code is optimal.
+            """
+            if user.op == "placeholder":
+                return
+            if user.op == "output":
+                body.append("\n")
+                return
+            nodes_to_delete = user_to_last_uses.get(user, [])
+            nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
+            if len(nodes_to_delete):
+                to_delete_str = " = ".join(
+                    [repr(n) for n in nodes_to_delete] + ["None"]
+                )
+                body.append(f";  {to_delete_str}\n")
+            else:
+                body.append("\n")
 
-            # NOTE: we add a variable to distinguish body and ckpt_func
-            def emit_node(node: Node, body):
-                maybe_type_annotation = (
-                    "" if node.type is None else f" : {type_repr(node.type)}"
+        # NOTE: we add a variable to distinguish body and ckpt_func
+        def emit_node(node: Node, body):
+            maybe_type_annotation = (
+                "" if node.type is None else f" : {type_repr(node.type)}"
+            )
+            if node.op == "placeholder":
+                assert isinstance(node.target, str)
+                maybe_default_arg = "" if not node.args else f" = {repr(node.args[0])}"
+                free_vars.append(
+                    f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
                 )
-                if node.op == "placeholder":
-                    assert isinstance(node.target, str)
-                    maybe_default_arg = (
-                        "" if not node.args else f" = {repr(node.args[0])}"
-                    )
-                    free_vars.append(
-                        f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
-                    )
-                    raw_name = node.target.replace("*", "")
-                    if raw_name != repr(node):
-                        body.append(f"{repr(node)} = {raw_name}\n")
-                    return
-                elif node.op == "call_method":
-                    assert isinstance(node.target, str)
-                    body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
-                        f"({_format_args(node.args[1:], node.kwargs)})"
-                    )
-                    return
-                elif node.op == "call_function":
-                    assert callable(node.target)
-                    # pretty print operators
-                    if (
-                        node.target.__module__ == "_operator"
-                        and node.target.__name__ in magic_methods
-                    ):
-                        assert isinstance(node.args, tuple)
-                        body.append(
-                            f"{repr(node)}{maybe_type_annotation} = "
-                            f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
-                        )
-                        return
-
-                    # pretty print inplace operators; required for jit.script to work properly
-                    # not currently supported in normal FX graphs, but generated by torchdynamo
-                    if (
-                        node.target.__module__ == "_operator"
-                        and node.target.__name__ in inplace_methods
-                    ):
-                        body.append(
-                            f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
-                            f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
-                        )
-                        return
-
-                    qualified_name = _get_qualified_name(node.target)
-                    global_name = add_global(qualified_name, node.target)
-                    # special case for getattr: node.args could be 2-argument or 3-argument
-                    # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
-                    if (
-                        global_name == "getattr"
-                        and isinstance(node.args, tuple)
-                        and isinstance(node.args[1], str)
-                        and node.args[1].isidentifier()
-                        and len(node.args) == 2
-                    ):
-                        body.append(
-                            f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
-                        )
-                        return
+                raw_name = node.target.replace("*", "")
+                if raw_name != repr(node):
+                    body.append(f"{repr(node)} = {raw_name}\n")
+                return
+            elif node.op == "call_method":
+                assert isinstance(node.target, str)
+                body.append(
+                    f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
+                    f"({_format_args(node.args[1:], node.kwargs)})"
+                )
+                return
+            elif node.op == "call_function":
+                assert callable(node.target)
+                # pretty print operators
+                if (
+                    node.target.__module__ == "_operator"
+                    and node.target.__name__ in magic_methods
+                ):
+                    assert isinstance(node.args, tuple)
                     body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
+                        f"{repr(node)}{maybe_type_annotation} = "
+                        f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
                     )
-                    if node.meta.get("is_wrapped", False):
-                        wrapped_fns.setdefault(global_name)
                     return
-                elif node.op == "call_module":
-                    assert isinstance(node.target, str)
+
+                # pretty print inplace operators; required for jit.script to work properly
+                # not currently supported in normal FX graphs, but generated by torchdynamo
+                if (
+                    node.target.__module__ == "_operator"
+                    and node.target.__name__ in inplace_methods
+                ):
                     body.append(
-                        f"{repr(node)}{maybe_type_annotation} = "
-                        f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
+                        f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
+                        f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
                     )
                     return
-                elif node.op == "get_attr":
-                    assert isinstance(node.target, str)
+
+                qualified_name = _get_qualified_name(node.target)
+                global_name = add_global(qualified_name, node.target)
+                # special case for getattr: node.args could be 2-argument or 3-argument
+                # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
+                if (
+                    global_name == "getattr"
+                    and isinstance(node.args, tuple)
+                    and isinstance(node.args[1], str)
+                    and node.args[1].isidentifier()
+                    and len(node.args) == 2
+                ):
                     body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
+                        f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
                     )
                     return
-                elif node.op == "output":
-                    if node.type is not None:
-                        maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
-                    body.append(self.generate_output(node.args[0]))
-                    return
-                raise NotImplementedError(f"node: {node.op} {node.target}")
-
-            # Modified for activation checkpointing
-            ckpt_func = []
-
-            # if any node has a list of labels for activation_checkpoint, we
-            # will use nested type of activation checkpoint codegen
-            emit_code_with_chunk(
-                body,
-                nodes,
-                emit_node,
-                delete_unused_values,
-                self.search_chunk,
-                self.chunk_infos,
-            )
-
-            if len(body) == 0:
-                # If the Graph has no non-placeholder nodes, no lines for the body
-                # have been emitted. To continue to have valid Python code, emit a
-                # single pass statement
-                body.append("pass\n")
-
-            if len(wrapped_fns) > 0:
-                wrap_name = add_global("wrap", torch.fx.wrap)
-                wrap_stmts = "\n".join(
-                    [f'{wrap_name}("{name}")' for name in wrapped_fns]
+                body.append(
+                    f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
                 )
-            else:
-                wrap_stmts = ""
+                if node.meta.get("is_wrapped", False):
+                    wrapped_fns.setdefault(global_name)
+                return
+            elif node.op == "call_module":
+                assert isinstance(node.target, str)
+                body.append(
+                    f"{repr(node)}{maybe_type_annotation} = "
+                    f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
+                )
+                return
+            elif node.op == "get_attr":
+                assert isinstance(node.target, str)
+                body.append(
+                    f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
+                )
+                return
+            elif node.op == "output":
+                if node.type is not None:
+                    maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
+                body.append(self.generate_output(node.args[0]))
+                return
+            raise NotImplementedError(f"node: {node.op} {node.target}")
+
+        # Modified for activation checkpointing
+        ckpt_func = []
+
+        # if any node has a list of labels for activation_checkpoint, we
+        # will use nested type of activation checkpoint codegen
+        emit_code_with_chunk(
+            body,
+            nodes,
+            emit_node,
+            delete_unused_values,
+            self.search_chunk,
+            self.chunk_infos,
+        )
+
+        if len(body) == 0:
+            # If the Graph has no non-placeholder nodes, no lines for the body
+            # have been emitted. To continue to have valid Python code, emit a
+            # single pass statement
+            body.append("pass\n")
+
+        if len(wrapped_fns) > 0:
+            wrap_name = add_global("wrap", torch.fx.wrap)
+            wrap_stmts = "\n".join([f'{wrap_name}("{name}")' for name in wrapped_fns])
+        else:
+            wrap_stmts = ""
 
-            if self._body_transformer:
-                body = self._body_transformer(body)
+        if self._body_transformer:
+            body = self._body_transformer(body)
 
-            for name, value in self.additional_globals():
-                add_global(name, value)
+        for name, value in self.additional_globals():
+            add_global(name, value)
 
-            # as we need colossalai.utils.checkpoint, we need to import colossalai
-            # in forward function
-            prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
-            prologue = "".join(ckpt_func) + prologue
-            prologue = prologue
+        # as we need colossalai.utils.checkpoint, we need to import colossalai
+        # in forward function
+        prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
+        prologue = "".join(ckpt_func) + prologue
+        prologue = prologue
 
-            code = "".join(body)
-            code = "\n".join("    " + line for line in code.split("\n"))
-            fn_code = f"""
+        code = "".join(body)
+        code = "\n".join("    " + line for line in code.split("\n"))
+        fn_code = f"""
 {wrap_stmts}
 
 {prologue}
 {code}"""
-            # print(fn_code)
-            return PythonCode(fn_code, globals_)
+        # print(fn_code)
+        return PythonCode(fn_code, globals_)

From d3f5ce9efb35bf9e292aa041a3e98b737cbb68ee Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 9 Jan 2023 16:21:44 +0800
Subject: [PATCH 117/503] [workflow] added nightly release to pypi (#2403)

---
 .github/workflows/release_nightly.yml | 86 +++++++--------------------
 setup.py                              | 30 ++++++++--
 2 files changed, 45 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/release_nightly.yml b/.github/workflows/release_nightly.yml
index 6bc000d1f4f6..8aa48b8ed89e 100644
--- a/.github/workflows/release_nightly.yml
+++ b/.github/workflows/release_nightly.yml
@@ -1,73 +1,29 @@
-name: Release bdist wheel for Nightly versions
+name: Publish Nightly Version to PyPI
 
 on:
-  schedule:
-    # run at 00:00 of every Sunday
-    - cron:  '0 0 * * 6'
   workflow_dispatch:
+  schedule:
+    - cron:  '0 0 * * 6' # release on every Saturday 00:00 UTC time (cron day-of-week 6 = Saturday)
 
 jobs:
-  matrix_preparation:
-    name: Prepare Container List
+  build-n-publish:
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI'
+    name: Build and publish Python 🐍 distributions 📦 to PyPI
     runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    timeout-minutes: 20
     steps:
-    - id: set-matrix
-      run: |
-        matrix="[\"hpcaitech/cuda-conda:11.3\", \"hpcaitech/cuda-conda:10.2\"]"
-        echo $matrix
-        echo "::set-output name=matrix::{\"container\":$(echo $matrix)}"
+    - uses: actions/checkout@v2
 
-  build:
-    name: Release bdist wheels
-    needs: matrix_preparation
-    if: github.repository == 'hpcaitech/ColossalAI' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor)
-    runs-on: [self-hosted, gpu]
-    strategy:
-      fail-fast: false
-      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
-    container:
-      image: ${{ matrix.container }}
-      options: --gpus all --rm
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-      # cub is for cuda 10.2
-      - name: Copy scripts and checkout
-        run: |
-          cp -r ./.github/workflows/scripts/* ./
-          ln -s /github/home/pip_wheels ./pip_wheels
-          wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
-          unzip 1.8.0.zip
-      - name: Build bdist wheel
-        run: |
-          pip install beautifulsoup4 requests packaging
-          python ./build_colossalai_wheel.py --nightly
-      - name: 🚀 Deploy
-        uses: garygrossgarten/github-action-scp@release
-        with:
-          local: all_dist
-          remote: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }}
-          host: ${{ secrets.PRIVATE_PYPI_HOST }}
-          username: ${{ secrets.PRIVATE_PYPI_USER }}
-          password: ${{ secrets.PRIVATE_PYPI_PASSWD }}
-  remove_old_build:
-    name: Remove old nightly build
-    runs-on: ubuntu-latest
-    needs: build
-    steps:
-      - name: executing remote ssh commands using password
-        uses: appleboy/ssh-action@master
-        env:
-          BUILD_DIR: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }}
-        with:
-          host: ${{ secrets.PRIVATE_PYPI_HOST }}
-          username: ${{ secrets.PRIVATE_PYPI_USER }}
-          password: ${{ secrets.PRIVATE_PYPI_PASSWD }}
-          envs: BUILD_DIR
-          script: |
-            cd $BUILD_DIR
-            find . -type f -mtime +0 -exec rm -f {} +
-          script_stop: true
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.8.14'
+
+    - run: NIGHTLY=1 python setup.py sdist build
+
+    # publish to PyPI if executed on the main branch
+    - name: Publish package to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
+        verbose: true
diff --git a/setup.py b/setup.py
index 38d5fa91cecd..5128b80e880d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 import os
 import re
+from datetime import datetime
 
 from setuptools import find_packages, setup
 
@@ -20,18 +21,22 @@
     TORCH_AVAILABLE = False
     CUDA_HOME = None
 
-
 # ninja build does not work unless include_dirs are abs path
 this_dir = os.path.dirname(os.path.abspath(__file__))
 build_cuda_ext = False
 ext_modules = []
+is_nightly = int(os.environ.get('NIGHTLY', '0')) == 1
 
 if int(os.environ.get('CUDA_EXT', '0')) == 1:
     if not TORCH_AVAILABLE:
-        raise ModuleNotFoundError("PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions")
+        raise ModuleNotFoundError(
+            "PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions"
+        )
 
     if not CUDA_HOME:
-        raise RuntimeError("CUDA_HOME is not found while CUDA_EXT=1. You need to export CUDA_HOME environment vairable or install CUDA Toolkit first in order to build CUDA extensions")
+        raise RuntimeError(
+            "CUDA_HOME is not found while CUDA_EXT=1. You need to export CUDA_HOME environment vairable or install CUDA Toolkit first in order to build CUDA extensions"
+        )
 
     build_cuda_ext = True
 
@@ -139,8 +144,16 @@ def get_version():
         print(f'===== Building Extension {name} =====')
         ext_modules.append(builder_cls().builder())
 
-setup(name='colossalai',
-      version=get_version(),
+if is_nightly:
+    # use date as the nightly version
+    version = datetime.today().strftime('%Y.%m.%d')
+    package_name = 'colossalai-nightly'
+else:
+    version = get_version()
+    package_name = 'colossalai'
+
+setup(name=package_name,
+      version=version,
       packages=find_packages(exclude=(
           'benchmark',
           'docker',
@@ -179,4 +192,9 @@ def get_version():
           'Topic :: Scientific/Engineering :: Artificial Intelligence',
           'Topic :: System :: Distributed Computing',
       ],
-      package_data={'colossalai': ['_C/*.pyi', 'kernel/cuda_native/csrc/*', 'kernel/cuda_native/csrc/kernel/*', 'kernel/cuda_native/csrc/kernels/include/*']})
+      package_data={
+          'colossalai': [
+              '_C/*.pyi', 'kernel/cuda_native/csrc/*', 'kernel/cuda_native/csrc/kernel/*',
+              'kernel/cuda_native/csrc/kernels/include/*'
+          ]
+      })

From 212b5b1b5f4f3debf983d8c47c58af507a554be4 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 16:29:33 +0800
Subject: [PATCH 118/503] add comments

---
 colossalai/autochunk/autochunk_codegen.py     | 35 +++++++++++--------
 .../test_autochunk/test_autochunk_codegen.py  |  2 +-
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 9ec59477b426..5ef560ac209a 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, Iterable, List, Tuple
+from typing import Any, Dict, Iterable, List, Tuple
 
 import torch
 from torch.fx.graph import (
@@ -128,37 +128,42 @@ def _replace_input_var(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, bod
 
 
 def emit_code_with_chunk(
-    body,
-    nodes,
+    body: List[str],
+    nodes: Iterable[Node],
     emit_node_func,
     delete_unused_value_func,
     search_chunk: SearchChunk,
-    chunk_infos,
+    chunk_infos: List,
 ):
-    """Emit code with nested activation checkpoint
-    When we detect some of the node.activation_checkpoint is a List, we will use
-    this function to emit the activation checkpoint codes.
+    """
+    Emit code with chunk according to chunk_infos.
+    
+    It will generate a for loop in chunk regions, and replace inputs 
+        and outputs of regions with chunked variables.
 
     Args:
         body: forward code
-        ckpt_func: checkpoint functions code
         nodes: graph.nodes
         emit_node_func: function to emit node
         delete_unused_value_func: function to remove the unused value
+        search_chunk: the class to search all chunks
+        chunk_infos: store all information about all chunks.
     """
     node_list = list(nodes)
 
-    chunk_regions = [i["region"] for i in chunk_infos]
-    chunk_starts = [i[0] for i in chunk_regions]
-    chunk_ends = [i[1] for i in chunk_regions]
+    # chunk region
+    chunk_starts = [i["region"][0] for i in chunk_infos]
+    chunk_ends = [i["region"][1] for i in chunk_infos]
 
-    chunk_inputs = [i["inputs"] for i in chunk_infos]
-    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
-    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]
+    # chunk inputs
+    chunk_inputs = [i["inputs"] for i in chunk_infos] # input with chunk
+    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] # input without chunk
+    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] # input chunk dim
     chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
         j.name for i in chunk_inputs_non_chunk for j in i
     ]
 
+    # chunk outputs
     chunk_outputs = [i["outputs"][0] for i in chunk_infos]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos]
 
@@ -170,6 +175,7 @@ def emit_code_with_chunk(
     while node_idx < len(node_list):
         node = node_list[node_idx]
 
+        # if is chunk start, generate for loop start
         if node_idx in chunk_starts:
             within_chunk_region = True
             region_idx = chunk_starts.index(node_idx)
@@ -203,6 +209,7 @@ def emit_code_with_chunk(
             if node_idx not in chunk_inputs:
                 delete_unused_value_func(node, body, chunk_inputs_names)
 
+        # generate chunk region end
         if node_idx in chunk_ends:
             body.append(
                 _gen_loop_end(
diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index c4f5cda67204..53f62077c07a 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -115,4 +115,4 @@ def test_autochunk_codegen(msa_len, pair_len, max_memory):
 
 
 if __name__ == "__main__":
-    _test_autochunk_codegen(0, 32, 64, None)
+    _test_autochunk_codegen(0, 32, 64, 25)

From 1951f7fa87725b6cc719226d26e5734958adffac Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 16:30:16 +0800
Subject: [PATCH 119/503] code style

---
 colossalai/autochunk/autochunk_codegen.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 5ef560ac209a..cc39e391e4be 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -137,9 +137,9 @@ def emit_code_with_chunk(
 ):
     """
     Emit code with chunk according to chunk_infos.
-    
-    It will generate a for loop in chunk regions, and replace inputs 
-        and outputs of regions with chunked variables.
+
+    It will generate a for loop in chunk regions, and 
+    replace inputs and outputs of regions with chunked variables.
 
     Args:
         body: forward code
@@ -156,9 +156,11 @@ def emit_code_with_chunk(
     chunk_ends = [i["region"][1] for i in chunk_infos]
 
     # chunk inputs
-    chunk_inputs = [i["inputs"] for i in chunk_infos] # input with chunk
-    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] # input without chunk
-    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] # input chunk dim
+    chunk_inputs = [i["inputs"] for i in chunk_infos]  # input with chunk
+    chunk_inputs_non_chunk = [
+        i["inputs_non_chunk"] for i in chunk_infos
+    ]  # input without chunk
+    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]  # input chunk dim
     chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
         j.name for i in chunk_inputs_non_chunk for j in i
     ]

From a68d240ed56dcd62a0726621c50233f733e79367 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 16:54:08 +0800
Subject: [PATCH 120/503] add doc for search chunk

---
 colossalai/autochunk/search_chunk.py | 76 ++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 21b967497f1b..613c28454df3 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -1,4 +1,7 @@
 import copy
+from typing import Any, Dict, Iterable, List, Tuple
+
+from torch.fx.node import Node
 
 from .estimate_memory import EstimateMemory
 from .reorder_graph import ReorderGraph
@@ -13,6 +16,34 @@
 
 
 class SearchChunk(object):
+    """
+    This is the core class for AutoChunk.
+
+    It defines the framework of the strategy of AutoChunk.
+    Chunks will be selected one by one until search stops.
+
+    The chunk search is as follows:
+    1. find the peak memory node
+    2. find the max chunk region according to the peak memory node
+    3. find all possible chunk regions in the max chunk region
+    4. find the best chunk region for current status
+    5. goto 1
+
+    Attributes:
+        gm: graph model
+        print_mem (bool): print estimated memory
+        trace_index: trace the flow of every dim of every node to find all free dims
+        trace_flow: determine the region chunk strategy
+        reorder_graph: reorder nodes to improve chunk efficiency
+        estimate_memory: estimate memory with chunk
+        select_chunk: select the best chunk region
+
+    Args:
+        gm: graph model
+        max_memory (int): max memory in MB
+        print_mem (bool): print estimated memory
+    """
+
     def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.gm = gm
         self.print_mem = print_mem
@@ -33,24 +64,37 @@ def _find_peak_node(self, mem_peak):
         max_idx = mem_peak.index(max_value)
         return max_idx
 
-    def _get_free_var(self):
+    def _get_free_var_idx(self) -> List:
+        """
+        Get free var index
+
+        Returns:
+            free_var_idx (List): all indices of free vars
+        """
         free_var_idx = []
         for idx, n in enumerate(self.trace_index.node_list):
             if n.op == "placeholder":
                 free_var_idx.append(idx)
         return free_var_idx
 
-    def _get_min_free_var(self, active_node_list, free_vars):
-        min_len = 999
-        for idx, n in enumerate(active_node_list):
-            if idx in free_vars:
-                continue
-            if len(n) < min_len:
-                min_len = len(n)
-        return min_len
+    def _search_max_chunk_region(
+        self, active_node: List, peak_node: Node, chunk_regions: List
+    ) -> Tuple:
+        """
+        Search max chunk region according to peak memory node
+
+        Chunk region starts extending from the peak node, stops where free var num is min
 
-    def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
-        free_vars = self._get_free_var()
+        Args:
+            active_node (List): active node status for every node
+            peak_node (Node): peak memory node
+            chunk_regions (List): chunk region info
+
+        Returns:
+            chunk_region_start (int)
+            chunk_region_end (int)
+        """
+        free_vars = self._get_free_var_idx()
         free_var_num = len(free_vars)
         active_node_num = [len(i) for i in active_node]
         min_active_node_num = min(active_node_num[free_var_num:])
@@ -92,16 +136,6 @@ def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
                 chunk_region_end = region[0] - 1
         return chunk_region_start, chunk_region_end
 
-    def _is_not_compute(self, trace, chunk_range, dim_idx):
-        if trace["idx"][dim_idx] not in trace["compute"]:
-            return True
-        if trace["idx"][dim_idx] in trace["compute"] and all(
-            i < chunk_range[0] or i > chunk_range[1]
-            for i in trace["compute"][trace["idx"][dim_idx]]
-        ):
-            return True
-        return False
-
     def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]

From 85e045b063a70cd36ccc0405acc245d86f2a1621 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 9 Jan 2023 17:08:55 +0800
Subject: [PATCH 121/503] [doc] updated readme regarding pypi installation
 (#2406)

---
 README-zh-Hans.md | 46 ++++++++++++++++++++++++++++++++++------------
 README.md         | 28 ++++++++++++++++++++++++----
 2 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 8edcff28bf04..b97b02f5ab84 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -5,10 +5,10 @@
 
    Colossal-AI: 一个面向大模型时代的通用深度学习系统
 
-   <h3> <a href="https://arxiv.org/abs/2110.14883"> 论文 </a> | 
-   <a href="https://www.colossalai.org/"> 文档 </a> | 
-   <a href="https://github.com/hpcaitech/ColossalAI-Examples"> 例程 </a> |   
-   <a href="https://github.com/hpcaitech/ColossalAI/discussions"> 论坛 </a> | 
+   <h3> <a href="https://arxiv.org/abs/2110.14883"> 论文 </a> |
+   <a href="https://www.colossalai.org/"> 文档 </a> |
+   <a href="https://github.com/hpcaitech/ColossalAI-Examples"> 例程 </a> |
+   <a href="https://github.com/hpcaitech/ColossalAI/discussions"> 论坛 </a> |
    <a href="https://medium.com/@hpcaitech"> 博客 </a></h3>
 
    [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml)
@@ -35,7 +35,7 @@
  <li><a href="#为何选择-Colossal-AI">为何选择 Colossal-AI</a> </li>
  <li><a href="#特点">特点</a> </li>
  <li>
-   <a href="#并行训练样例展示">并行训练样例展示</a> 
+   <a href="#并行训练样例展示">并行训练样例展示</a>
    <ul>
      <li><a href="#GPT-3">GPT-3</a></li>
      <li><a href="#GPT-2">GPT-2</a></li>
@@ -47,14 +47,14 @@
    </ul>
  </li>
 <li>
-   <a href="#单GPU训练样例展示">单GPU训练样例展示</a> 
+   <a href="#单GPU训练样例展示">单GPU训练样例展示</a>
    <ul>
      <li><a href="#GPT-2-Single">GPT-2</a></li>
      <li><a href="#PaLM-Single">PaLM</a></li>
    </ul>
  </li>
 <li>
-   <a href="#推理-Energon-AI-样例展示">推理 (Energon-AI) 样例展示</a> 
+   <a href="#推理-Energon-AI-样例展示">推理 (Energon-AI) 样例展示</a>
    <ul>
      <li><a href="#GPT-3-Inference">GPT-3</a></li>
      <li><a href="#OPT-Serving">1750亿参数OPT在线推理服务</a></li>
@@ -62,7 +62,7 @@
    </ul>
  </li>
 <li>
-   <a href="#Colossal-AI-in-the-Real-World">Colossal-AI 成功案例</a> 
+   <a href="#Colossal-AI-in-the-Real-World">Colossal-AI 成功案例</a>
    <ul>
      <li><a href="#AIGC">AIGC: 加速 Stable Diffusion</a></li>
      <li><a href="#生物医药">生物医药: 加速AlphaFold蛋白质结构预测</a></li>
@@ -131,7 +131,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/(updated)GPT-2.png" width=800>
 
 - 用相同的硬件训练24倍大的模型
-- 超3倍的吞吐量 
+- 超3倍的吞吐量
 
 ### BERT
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BERT.png" width=800/>
@@ -145,7 +145,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_update.png" width=800/>
 
 - [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), 由Meta发布的1750亿语言模型，由于完全公开了预训练参数权重，因此促进了下游任务和应用部署的发展。
-- 加速45%，仅用几行代码以低成本微调OPT。[[样例]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[在线推理]](https://service.colossalai.org/opt) 
+- 加速45%，仅用几行代码以低成本微调OPT。[[样例]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[在线推理]](https://service.colossalai.org/opt)
 
 请访问我们的 [文档](https://www.colossalai.org/) 和 [例程](https://github.com/hpcaitech/ColossalAI-Examples) 以了解详情。
 
@@ -255,6 +255,28 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 
 ## 安装
 
+### 从PyPI安装
+
+您可以用下面的命令直接从PyPI上下载并安装Colossal-AI。我们默认不会安装PyTorch扩展包
+
+```bash
+pip install colossalai
+```
+
+但是，如果你想在安装时就直接构建PyTorch扩展，您可以设置环境变量`CUDA_EXT=1`.
+
+```bash
+CUDA_EXT=1 pip install colossalai
+```
+
+**否则，PyTorch扩展只会在你实际需要使用他们时在运行时里被构建。**
+
+与此同时，我们也每周定时发布Nightly版本，这能让你提前体验到新的feature和bug fix。你可以通过以下命令安装Nightly版本。
+
+```bash
+pip install colossalai-nightly
+```
+
 ### 从官方安装
 
 您可以访问我们[下载](https://www.colossalai.org/download)页面来安装Colossal-AI，在这个页面上发布的版本都预编译了CUDA扩展。
@@ -274,10 +296,10 @@ pip install -r requirements/requirements.txt
 pip install .
 ```
 
-如果您不想安装和启用 CUDA 内核融合（使用融合优化器时强制安装）：
+我们默认在`pip install`时不安装PyTorch扩展，而是在运行时临时编译，如果你想要提前安装这些扩展的话（在使用融合优化器时会用到），可以使用以下命令。
 
 ```shell
-NO_CUDA_EXT=1 pip install .
+CUDA_EXT=1 pip install .
 ```
 
 <p align="right">(<a href="#top">返回顶端</a>)</p>
diff --git a/README.md b/README.md
index 1b0ca7e973e0..7aba907e0a64 100644
--- a/README.md
+++ b/README.md
@@ -257,9 +257,32 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 
 ## Installation
 
+### Install from PyPI
+
+You can easily install Colossal-AI with the following command. **By default, we do not build PyTorch extensions during installation.**
+
+```bash
+pip install colossalai
+```
+
+However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
+
+```bash
+CUDA_EXT=1 pip install colossalai
+```
+
+**Otherwise, CUDA kernels will be built during runtime when you actually need them.**
+
+We also release a nightly version to PyPI on a weekly basis. This allows you to access the unreleased features and bug fixes in the main branch.
+Installation can be made via
+
+```bash
+pip install colossalai-nightly
+```
+
 ### Download From Official Releases
 
-You can visit the [Download](https://www.colossalai.org/download) page to download Colossal-AI with pre-built CUDA extensions.
+You can visit the [Download](https://www.colossalai.org/download) page to download Colossal-AI with pre-built PyTorch extensions.
 
 
 ### Download From Source
@@ -270,9 +293,6 @@ You can visit the [Download](https://www.colossalai.org/download) page to downlo
 git clone https://github.com/hpcaitech/ColossalAI.git
 cd ColossalAI
 
-# install dependency
-pip install -r requirements/requirements.txt
-
 # install colossalai
 pip install .
 ```

From 065f0b4c27316e8bc022dbb16d929194b5bb3445 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:11:51 +0800
Subject: [PATCH 122/503] add doc for search

---
 colossalai/autochunk/search_chunk.py | 76 +++++++++++++++++++++++++---
 1 file changed, 68 insertions(+), 8 deletions(-)

diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 613c28454df3..ff4c1587849e 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -1,5 +1,5 @@
 import copy
-from typing import Any, Dict, Iterable, List, Tuple
+from typing import Dict, List, Tuple
 
 from torch.fx.node import Node
 
@@ -136,7 +136,24 @@ def _search_max_chunk_region(
                 chunk_region_end = region[0] - 1
         return chunk_region_start, chunk_region_end
 
-    def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
+    def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> List:
+        """
+        Find chunk info for a region.
+
+        We are given the region start and region end, and need to find out all chunk info for it.
+        We first loop every dim of start node and end node, to see if we can find dim pair,
+        which is linked in a flow and not computed.
+        If found, we then search flow in the whole region to find out all chunk infos.
+
+        Args:
+            input_trace (List): node's input trace in region
+            output_trace (List): node's output trace in region
+            start_idx (int): region start node index
+            end_idx (int): region end node index
+
+        Returns:
+            chunk_infos: possible regions found
+        """
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
         end_node = self.trace_index.node_list[end_idx]
@@ -174,7 +191,19 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
                     chunk_infos.append(chunk_info)
         return chunk_infos
 
-    def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
+    def _search_possible_chunk_regions(
+        self, max_chunk_region: Tuple, peak_node: Node
+    ) -> List:
+        """
+        Search every possible region within the max chunk region.
+
+        Args:
+            max_chunk_region (Tuple)
+            peak_node (Node): peak memory node
+
+        Returns:
+            possible_chunk_region (List)
+        """
         possible_chunk_region = []
         output_trace = copy.deepcopy(self.trace_index.idx_trace_list)
         input_trace = []  # trace of a node's input nodes
@@ -196,17 +225,39 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
                     continue
 
                 # select free dim
-                chunk_info = self._find_free_dim(
+                chunk_info = self._find_chunk_info(
                     input_trace, output_trace, start_idx, end_idx
                 )
                 if len(chunk_info) > 0:
                     possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
 
-    def _step_search(self, mem_peak, active_node, chunk_regions):
+    def _step_search(
+        self,
+        mem_peak: List[float],
+        active_node: List[List[Node]],
+        chunk_infos: List[Dict],
+    ) -> Dict:
+        """
+        Find one chunk region
+
+        The chunk search is as follows:
+        1. find the peak memory node
+        2. find the max chunk region according to the peak memory node
+        3. find all possible chunk regions in the max chunk region
+        4. find the best chunk region for current status
+
+        Args:
+            mem_peak (List): peak memory for every node
+            active_node (List[List[Node]]): active node for every node
+            chunk_infos (List[Dict]): all chunk info
+
+        Returns:
+            best_chunk_region (Dict)
+        """
         peak_node = self._find_peak_node(mem_peak)
         max_chunk_region = self._search_max_chunk_region(
-            active_node, peak_node, chunk_regions
+            active_node, peak_node, chunk_infos
         )
         if max_chunk_region == None:
             return None
@@ -214,7 +265,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions):
             max_chunk_region, peak_node
         )
         best_chunk_region = self.select_chunk._select_best_chunk_region(
-            possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak
+            possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
         )
         best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region)
         return best_chunk_region
@@ -225,7 +276,16 @@ def _stop_search(self, init_mem_peak, mem_peak):
             return True
         return False
 
-    def search_region(self):
+    def search_region(self) -> Dict:
+        """
+        Search all chunk regions:
+        1. Estimate current memory
+        2. Find best chunk for current memory
+        3. goto 1
+
+        Returns:
+            chunk_infos (Dict)
+        """
         chunk_infos = []
         (
             init_mem_peak,

From 551cafec14477f17da38d671106341cdc8fed5ff Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 9 Jan 2023 17:13:53 +0800
Subject: [PATCH 123/503] [doc] updated kernel-related optimisers' docstring
 (#2385)

* [doc] updated kernel-related optimisers' docstring

* polish doc
---
 colossalai/nn/optimizer/cpu_adam.py    | 2 +-
 colossalai/nn/optimizer/fused_adam.py  | 3 +--
 colossalai/nn/optimizer/fused_lamb.py  | 3 +--
 colossalai/nn/optimizer/fused_sgd.py   | 3 +--
 colossalai/nn/optimizer/hybrid_adam.py | 2 +-
 5 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index a8c3522793d8..54036973e1e3 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -19,7 +19,7 @@ class CPUAdam(NVMeOptimizer):
       * Parameters on GPU and gradients on GPU is allowed.
       * Parameters on GPU and gradients on CPU is **not** allowed.
 
-    Requires ColossalAI to be installed via ``pip install .``.
+    `CPUAdam` requires CUDA extensions which can be built during installation or runtime.
 
     This version of CPU Adam accelates parameters updating on CPU with SIMD.
     Support of AVX2 or AVX512 is required.
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index 2f6bde5ca1ab..941866d557ff 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -9,8 +9,7 @@
 class FusedAdam(torch.optim.Optimizer):
     """Implements Adam algorithm.
 
-    Currently GPU-only.  Requires ColossalAI to be installed via
-    ``pip install .``.
+    `FusedAdam` requires CUDA extensions which can be built during installation or runtime.
 
     This version of fused Adam implements 2 fusions.
 
diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py
index 891a76da73dd..72520064e98b 100644
--- a/colossalai/nn/optimizer/fused_lamb.py
+++ b/colossalai/nn/optimizer/fused_lamb.py
@@ -9,8 +9,7 @@
 class FusedLAMB(torch.optim.Optimizer):
     """Implements LAMB algorithm.
 
-    Currently GPU-only.  Requires ColossalAI to be installed via
-    ``pip install .``.
+    `FusedLAMB` requires CUDA extensions which can be built during installation or runtime.
 
     This version of fused LAMB implements 2 fusions.
 
diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py
index 41e6d524895a..468713b223c1 100644
--- a/colossalai/nn/optimizer/fused_sgd.py
+++ b/colossalai/nn/optimizer/fused_sgd.py
@@ -10,8 +10,7 @@
 class FusedSGD(Optimizer):
     r"""Implements stochastic gradient descent (optionally with momentum).
 
-    Currently GPU-only.  Requires ColossalAI to be installed via
-    ``pip install .``.
+    `FusedSGD` requires CUDA extensions which can be built during installation or runtime.
 
     This version of fused SGD implements 2 fusions.
 
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 5196d4338441..1d0fb92de499 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -19,7 +19,7 @@ class HybridAdam(NVMeOptimizer):
       * Parameters on GPU and gradients on GPU is allowed.
       * Parameters on GPU and gradients on CPU is **not** allowed.
 
-    Requires ColossalAI to be installed via ``pip install .``
+    `HybridAdam` requires CUDA extensions which can be built during installation or runtime.
 
     This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.
 

From 0ea903b94edb59df8e24ed86764197292f6345c5 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:25:13 +0800
Subject: [PATCH 124/503] rename trace_index to trace_indice

---
 colossalai/autochunk/autochunk_codegen.py     |  4 +-
 colossalai/autochunk/reorder_graph.py         | 32 +++++------
 colossalai/autochunk/search_chunk.py          | 32 +++++------
 colossalai/autochunk/select_chunk.py          | 22 ++++----
 colossalai/autochunk/trace_flow.py            | 56 +++++++++----------
 .../{trace_index.py => trace_indice.py}       |  2 +-
 6 files changed, 74 insertions(+), 74 deletions(-)
 rename colossalai/autochunk/{trace_index.py => trace_indice.py} (99%)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index cc39e391e4be..6e0cfb9cb2e7 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -94,9 +94,9 @@ def _replace_reshape_size(context, node_name, reshape_size_dict):
     return context
 
 
-def _replace_ones_like(search_chunk, chunk_infos, region_idx, node_idx, node, body):
+def _replace_ones_like(search_chunk: SearchChunk, chunk_infos, region_idx, node_idx, node, body):
     if "ones_like" in node.name:
-        meta_node = search_chunk.trace_index.node_list[node_idx]
+        meta_node = search_chunk.trace_indice.node_list[node_idx]
         chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"]
         if get_node_shape(meta_node)[chunk_dim] != 1:
             source_node = meta_node.args[0].args[0]
diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py
index bf4420eac7ee..6baa0d2a7d13 100644
--- a/colossalai/autochunk/reorder_graph.py
+++ b/colossalai/autochunk/reorder_graph.py
@@ -1,22 +1,22 @@
-from .trace_index import TraceIndex
+from .trace_indice import TraceIndice
 from .utils import find_idx_by_name
 
 
 class ReorderGraph(object):
-    def __init__(self, trace_index: TraceIndex) -> None:
-        self.trace_index = trace_index
+    def __init__(self, trace_indice: TraceIndice) -> None:
+        self.trace_indice = trace_indice
         self.all_reorder_map = {
-            i: i for i in range(len(self.trace_index.idx_trace_list))
+            i: i for i in range(len(self.trace_indice.idx_trace_list))
         }
 
     def _get_reorder_map(self, chunk_info):
-        reorder_map = {i: i for i in range(len(self.trace_index.node_list))}
+        reorder_map = {i: i for i in range(len(self.trace_indice.node_list))}
 
         chunk_region_start = chunk_info["region"][0]
         chunk_region_end = chunk_info["region"][1]
         chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
         chunk_prepose_nodes_idx = [
-            find_idx_by_name(i.name, self.trace_index.node_list)
+            find_idx_by_name(i.name, self.trace_indice.node_list)
             for i in chunk_prepose_nodes
         ]
         # put prepose nodes ahead
@@ -24,10 +24,10 @@ def _get_reorder_map(self, chunk_info):
             n_idx = chunk_prepose_nodes_idx[idx]
             reorder_map[n_idx] = chunk_region_start + idx
         # put other nodes after prepose nodes
-        for n in self.trace_index.node_list[chunk_region_start : chunk_region_end + 1]:
+        for n in self.trace_indice.node_list[chunk_region_start : chunk_region_end + 1]:
             if n in chunk_prepose_nodes:
                 continue
-            n_idx = find_idx_by_name(n.name, self.trace_index.node_list)
+            n_idx = find_idx_by_name(n.name, self.trace_indice.node_list)
             pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
             reorder_map[n_idx] = n_idx + pos
 
@@ -53,25 +53,25 @@ def _update_all_reorder_map(self, reorder_map):
             self.all_reorder_map[origin_idx] = reorder_map[map_idx]
 
     def _reorder_self_node_list(self, reorder_map):
-        new_node_list = [None for _ in range(len(self.trace_index.node_list))]
+        new_node_list = [None for _ in range(len(self.trace_indice.node_list))]
         for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = self.trace_index.node_list[old_idx]
-        self.trace_index.node_list = new_node_list
+            new_node_list[new_idx] = self.trace_indice.node_list[old_idx]
+        self.trace_indice.node_list = new_node_list
 
     def _reorder_idx_trace(self, reorder_map):
         # reorder list
-        new_idx_trace_list = [None for _ in range(len(self.trace_index.idx_trace_list))]
+        new_idx_trace_list = [None for _ in range(len(self.trace_indice.idx_trace_list))]
         for old_idx, new_idx in reorder_map.items():
-            new_idx_trace_list[new_idx] = self.trace_index.idx_trace_list[old_idx]
-        self.trace_index.idx_trace_list = new_idx_trace_list
+            new_idx_trace_list[new_idx] = self.trace_indice.idx_trace_list[old_idx]
+        self.trace_indice.idx_trace_list = new_idx_trace_list
         # update compute
-        for idx_trace in self.trace_index.idx_trace_list:
+        for idx_trace in self.trace_indice.idx_trace_list:
             compute = idx_trace["compute"]
             for dim_compute in compute:
                 for idx, i in enumerate(dim_compute):
                     dim_compute[idx] = reorder_map[i]
         # update source
-        for idx_trace in self.trace_index.idx_trace_list:
+        for idx_trace in self.trace_indice.idx_trace_list:
             source = idx_trace["source"]
             for dim_idx, dim_source in enumerate(source):
                 new_dim_source = {}
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index ff4c1587849e..d90e50927110 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -7,7 +7,7 @@
 from .reorder_graph import ReorderGraph
 from .select_chunk import SelectChunk
 from .trace_flow import TraceFlow
-from .trace_index import TraceIndex
+from .trace_indice import TraceIndice
 from .utils import (
     get_node_shape,
     is_non_compute_node,
@@ -47,13 +47,13 @@ class SearchChunk(object):
     def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.gm = gm
         self.print_mem = print_mem
-        self.trace_index = TraceIndex(list(gm.graph.nodes))
-        self.trace_index.trace_index()
-        self.trace_flow = TraceFlow(self.trace_index)
-        self.reorder_graph = ReorderGraph(self.trace_index)
+        self.trace_indice = TraceIndice(list(gm.graph.nodes))
+        self.trace_indice.trace_index()
+        self.trace_flow = TraceFlow(self.trace_indice)
+        self.reorder_graph = ReorderGraph(self.trace_indice)
         self.estimate_memory = EstimateMemory()
         self.select_chunk = SelectChunk(
-            self.trace_index,
+            self.trace_indice,
             self.estimate_memory,
             self.reorder_graph,
             max_memory=max_memory,
@@ -72,7 +72,7 @@ def _get_free_var_idx(self) -> List:
             free_var_idx (List): all indexs of free vars
         """
         free_var_idx = []
-        for idx, n in enumerate(self.trace_index.node_list):
+        for idx, n in enumerate(self.trace_indice.node_list):
             if n.op == "placeholder":
                 free_var_idx.append(idx)
         return free_var_idx
@@ -156,7 +156,7 @@ def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> Lis
         """
         start_traces = input_trace[start_idx]
         end_trace = output_trace[end_idx]
-        end_node = self.trace_index.node_list[end_idx]
+        end_node = self.trace_indice.node_list[end_idx]
         chunk_infos = []
         for end_dim, _ in enumerate(end_trace["idx"]):
             if len(start_traces) > 1:
@@ -205,23 +205,23 @@ def _search_possible_chunk_regions(
             possible_chunk_region (List)
         """
         possible_chunk_region = []
-        output_trace = copy.deepcopy(self.trace_index.idx_trace_list)
+        output_trace = copy.deepcopy(self.trace_indice.idx_trace_list)
         input_trace = []  # trace of a node's input nodes
-        for _, n in enumerate(self.trace_index.node_list):
+        for _, n in enumerate(self.trace_indice.node_list):
             cur_trace = {}
             for arg in n.args:
                 if type(arg) == type(n) and not is_non_compute_node_except_placeholder(
                     arg
                 ):
-                    cur_trace[arg] = self.trace_index._find_trace_from_node(arg)
+                    cur_trace[arg] = self.trace_indice._find_trace_from_node(arg)
             input_trace.append(cur_trace)
 
         for start_idx in range(max_chunk_region[0], peak_node + 1):
             for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
                 if is_non_compute_node(
-                    self.trace_index.node_list[start_idx]
-                ) or is_non_compute_node(self.trace_index.node_list[end_idx]):
+                    self.trace_indice.node_list[start_idx]
+                ) or is_non_compute_node(self.trace_indice.node_list[end_idx]):
                     continue
 
                 # select free dim
@@ -292,7 +292,7 @@ def search_region(self) -> Dict:
             _,
             active_node,
         ) = self.estimate_memory.estimate_chunk_inference_mem(
-            self.trace_index.node_list
+            self.trace_indice.node_list
         )
         mem_peak = init_mem_peak
 
@@ -307,13 +307,13 @@ def search_region(self) -> Dict:
                 _,
                 active_node,
             ) = self.estimate_memory.estimate_chunk_inference_mem(
-                self.trace_index.node_list, chunk_infos
+                self.trace_indice.node_list, chunk_infos
             )
             if self._stop_search(init_mem_peak, mem_peak):
                 break
         if self.print_mem:
             self.print_mem = False
             self.estimate_memory.estimate_chunk_inference_mem(
-                self.trace_index.node_list, chunk_infos, print_mem=True
+                self.trace_indice.node_list, chunk_infos, print_mem=True
             )
         return chunk_infos
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index 7127cfd64e69..f0612e45a8e6 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -1,19 +1,19 @@
 from .estimate_memory import EstimateMemory
 from .reorder_graph import ReorderGraph
-from .trace_index import TraceIndex
+from .trace_indice import TraceIndice
 from .utils import is_non_compute_node
 
 
 class SelectChunk(object):
     def __init__(
         self,
-        trace_index: TraceIndex,
+        trace_indice: TraceIndice,
         estimate_memory: EstimateMemory,
         reorder_graph: ReorderGraph,
         max_memory=None,
     ):
-        self.index_tracer = trace_index
-        self.memory_estimator = estimate_memory
+        self.trace_indice = trace_indice
+        self.estimate_memory = estimate_memory
         self.reorder_graph = reorder_graph
         if max_memory is not None:
             self.stratge = "fit_memory"
@@ -68,10 +68,10 @@ def _select_fit_memory_chunk_region(
         for region in possible_chunk_regions:
             cur_region = region.copy()
             cur_node_list, cur_region = self.reorder_graph.tmp_reorder(
-                self.index_tracer.node_list, cur_region
+                self.trace_indice.node_list, cur_region
             )
             cur_chunk_infos = chunk_infos + [cur_region]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
                 cur_node_list, cur_chunk_infos
             )[0]
             cur_chunk_region_peak = cur_mem_peak[
@@ -113,7 +113,7 @@ def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
             chunk_size *= 2
             reorder_chunk_info["chunk_size"] = chunk_size
             cur_chunk_infos = chunk_infos + [reorder_chunk_info]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
                 chunk_region_dict["reorder_node_list"], cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
@@ -139,7 +139,7 @@ def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos)
             mid = int((left + right) / 2 + 0.5)
             chunk_info["chunk_size"] = mid
             cur_chunk_infos = chunk_infos + [chunk_info]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
                 chunk_region_dict["reorder_node_list"], cur_chunk_infos
             )[0]
             cur_chunk_max_mem = max(
@@ -153,7 +153,7 @@ def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos)
 
     def _get_compute_node_num(self, start, end):
         count = 0
-        for i in self.index_tracer.node_list[start : end + 1]:
+        for i in self.trace_indice.node_list[start : end + 1]:
             if not is_non_compute_node(i):
                 count += 1
         return count
@@ -178,10 +178,10 @@ def _select_min_memory_chunk_region(
         for region in possible_chunk_regions:
             cur_region = region.copy()
             cur_node_list, cur_region = self.reorder_graph.tmp_reorder(
-                self.index_tracer.node_list, cur_region
+                self.trace_indice.node_list, cur_region
             )
             cur_chunk_infos = chunk_infos + [cur_region]
-            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
                 cur_node_list, cur_chunk_infos
             )[0]
             cur_chunk_region_peak = cur_mem_peak[
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 7139e7e047ef..33fade1a5463 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -1,4 +1,4 @@
-from .trace_index import TraceIndex
+from .trace_indice import TraceIndice
 from .utils import (
     find_chunk_all_input_nodes,
     find_chunk_compute_input_and_output_nodes,
@@ -10,8 +10,8 @@
 
 
 class TraceFlow(object):
-    def __init__(self, trace_index: TraceIndex) -> None:
-        self.trace_index = trace_index
+    def __init__(self, trace_indice: TraceIndice) -> None:
+        self.trace_indice = trace_indice
 
     def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
         """
@@ -25,8 +25,8 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node
         Returns:
             bool: True if check pass
         """
-        start_node_idx = find_idx_by_name(start_node.name, self.trace_index.node_list)
-        end_node_trace = self.trace_index._find_trace_from_node(end_node)
+        start_node_idx = find_idx_by_name(start_node.name, self.trace_indice.node_list)
+        end_node_trace = self.trace_indice._find_trace_from_node(end_node)
         end_node_trace_source = end_node_trace["source"][end_dim]
         sorted_source = sorted(
             end_node_trace_source.items(), key=lambda d: d[0], reverse=True
@@ -51,24 +51,24 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
         Returns:
             bool: True if check pass
         """
-        end_node_trace = self.trace_index._find_trace_from_node(end_node)
+        end_node_trace = self.trace_indice._find_trace_from_node(end_node)
         end_node_compute = end_node_trace["compute"][end_dim]
         if any(start_idx <= i <= end_idx for i in end_node_compute):
             return False
         return True
 
     def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
-        node_from_source = self.trace_index._find_source_trace_from_node(node_from)
+        node_from_source = self.trace_indice._find_source_trace_from_node(node_from)
         dim_source = node_from_source[node_from_dim]
-        node_to_idx = find_idx_by_name(node_to.name, self.trace_index.node_list)
+        node_to_idx = find_idx_by_name(node_to.name, self.trace_indice.node_list)
         for k, v in dim_source.items():
             if k == node_to_idx:
                 return v
         return None
 
     def _find_inherit_dim(self, input_node, input_dim, node):
-        input_node_idx = find_idx_by_name(input_node.name, self.trace_index.node_list)
-        node_trace_source = self.trace_index._find_source_trace_from_node(node)
+        input_node_idx = find_idx_by_name(input_node.name, self.trace_indice.node_list)
+        node_trace_source = self.trace_indice._find_source_trace_from_node(node)
         for node_dim in range(len(get_node_shape(node))):
             if (
                 input_node_idx in node_trace_source[node_dim]
@@ -82,19 +82,19 @@ def check_index_duplicate(self, chunk_infos, return_dim=False):
         for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
             for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
                 inherit_dim = self._find_inherit_dim(
-                    input_node, v, self.trace_index.node_list[k]
+                    input_node, v, self.trace_indice.node_list[k]
                 )
                 if inherit_dim:
                     input_dim_after_node[k] = inherit_dim
 
-        for node in self.trace_index.node_list[
+        for node in self.trace_indice.node_list[
             chunk_infos["region"][0] : chunk_infos["region"][1] + 1
         ]:
             if is_non_compute_node_except_placeholder(node):
                 continue
             count = 0
             duplicate_dims = []
-            node_trace_source = self.trace_index._find_source_trace_from_node(node)
+            node_trace_source = self.trace_indice._find_source_trace_from_node(node)
             for node_dim in range(len(get_node_shape(node))):
                 duplicate_dim = []
                 duplicate_flag = False
@@ -130,7 +130,7 @@ def _assgin_single_node_flow(
         all_node_info,
         next_node_list,
     ):
-        arg_idx = find_idx_by_name(arg_node.name, self.trace_index.node_list)
+        arg_idx = find_idx_by_name(arg_node.name, self.trace_indice.node_list)
         # arg in chunk range or be inputs
         if not (start_idx <= arg_idx < end_idx):
             return True
@@ -171,7 +171,7 @@ def _assgin_single_node_flow(
 
     def _get_all_node_info(self, end_dim, start_idx, end_idx):
         cur_node_list = [
-            self.trace_index.node_list[end_idx]
+            self.trace_indice.node_list[end_idx]
         ]  # start from the last node
         all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
 
@@ -183,10 +183,10 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                 cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
                 cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
                 if cur_node_chunk_dim:
-                    cur_node_compute = self.trace_index._find_compute_trace_from_node(
+                    cur_node_compute = self.trace_indice._find_compute_trace_from_node(
                         cur_node
                     )
-                    cur_node_source = self.trace_index._find_source_trace_from_node(
+                    cur_node_source = self.trace_indice._find_source_trace_from_node(
                         cur_node
                     )
                 else:
@@ -220,7 +220,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                             if not (
                                 start_idx
                                 <= find_idx_by_name(
-                                    arg.name, self.trace_index.node_list
+                                    arg.name, self.trace_indice.node_list
                                 )
                                 < end_idx
                             ):
@@ -250,16 +250,16 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
         for input_node in inputs:
             input_dict = {}
             input_node_idx = find_idx_by_name(
-                input_node.name, self.trace_index.node_list
+                input_node.name, self.trace_indice.node_list
             )
             for user in input_node.users.keys():
                 if is_non_compute_node(user):
                     continue
-                user_idx = find_idx_by_name(user.name, self.trace_index.node_list)
+                user_idx = find_idx_by_name(user.name, self.trace_indice.node_list)
                 if start_idx <= user_idx <= end_idx:
                     chunk_dim = all_node_info[user]["chunk_dim"]
                     if chunk_dim is not None:
-                        user_source = self.trace_index._find_source_trace_from_node(
+                        user_source = self.trace_indice._find_source_trace_from_node(
                             user
                         )[chunk_dim]
                         if input_node_idx in user_source:
@@ -282,7 +282,7 @@ def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
             if node_info["chunk_dim"] is None:
                 maybe_prepose_nodes.append(node)
         maybe_prepose_nodes.sort(
-            key=lambda x: find_idx_by_name(x.name, self.trace_index.node_list),
+            key=lambda x: find_idx_by_name(x.name, self.trace_indice.node_list),
             reverse=True,
         )  # from last node to first node
         prepose_nodes = []
@@ -308,7 +308,7 @@ def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
                         if not (
                             start_idx
                             <= find_idx_by_name(
-                                cur_prepose_node_arg.name, self.trace_index.node_list
+                                cur_prepose_node_arg.name, self.trace_indice.node_list
                             )
                             < end_idx
                         ):
@@ -336,14 +336,14 @@ def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
                         maybe_prepose_nodes.remove(n)
         # sort by index
         prepose_nodes.sort(
-            key=lambda x: find_idx_by_name(x.name, self.trace_index.node_list)
+            key=lambda x: find_idx_by_name(x.name, self.trace_indice.node_list)
         )
 
         return prepose_nodes
 
     def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
         # we need to log input nodes to avoid deleteing them in the loop
-        chunk_node_list = self.trace_index.node_list[start_idx : end_idx + 1]
+        chunk_node_list = self.trace_indice.node_list[start_idx : end_idx + 1]
         # also need to get some prepose node's arg out of non_chunk_inputs
         for n in chunk_info["args"]["prepose_nodes"]:
             chunk_node_list.remove(n)
@@ -355,7 +355,7 @@ def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
 
     def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         inputs, outputs = find_chunk_compute_input_and_output_nodes(
-            self.trace_index.node_list[start_idx : end_idx + 1]
+            self.trace_indice.node_list[start_idx : end_idx + 1]
         )
         # only single ouput
         if len(outputs) > 1:
@@ -403,10 +403,10 @@ def _reassgin_reshape_size(self, chunk_info):
         chunk_shape = get_node_shape(chunk_info["outputs"][0])[
             chunk_info["outputs_dim"]
         ]
-        for node in self.trace_index.node_list[chunk_region[0] : chunk_region[1] + 1]:
+        for node in self.trace_indice.node_list[chunk_region[0] : chunk_region[1] + 1]:
             if any(i in node.name for i in ["reshape", "view"]):
                 reshape_args = node.args[1:]
-                reshape_log = self.trace_index.idx_view_list[node]
+                reshape_log = self.trace_indice.idx_view_list[node]
                 chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
                 reshape_size[node.name] = {}
                 for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
diff --git a/colossalai/autochunk/trace_index.py b/colossalai/autochunk/trace_indice.py
similarity index 99%
rename from colossalai/autochunk/trace_index.py
rename to colossalai/autochunk/trace_indice.py
index 1e8969d8796e..9a04c2a0d71d 100644
--- a/colossalai/autochunk/trace_index.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -6,7 +6,7 @@
 )
 
 
-class TraceIndex(object):
+class TraceIndice(object):
     def __init__(self, node_list) -> None:
         self.node_list = node_list
         self.idx_trace_list = self._init_idx_trace_list()

From cb9817f75df7bb3569088e3f97cabb442373f256 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:34:30 +0800
Subject: [PATCH 125/503] rename function from index to indice

---
 colossalai/autochunk/reorder_graph.py |  12 +-
 colossalai/autochunk/search_chunk.py  |   2 +-
 colossalai/autochunk/trace_flow.py    |   2 +-
 colossalai/autochunk/trace_indice.py  | 166 +++++++++++++-------------
 4 files changed, 91 insertions(+), 91 deletions(-)

diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py
index 6baa0d2a7d13..2ece0126e91b 100644
--- a/colossalai/autochunk/reorder_graph.py
+++ b/colossalai/autochunk/reorder_graph.py
@@ -6,7 +6,7 @@ class ReorderGraph(object):
     def __init__(self, trace_indice: TraceIndice) -> None:
         self.trace_indice = trace_indice
         self.all_reorder_map = {
-            i: i for i in range(len(self.trace_indice.idx_trace_list))
+            i: i for i in range(len(self.trace_indice.indice_trace_list))
         }
 
     def _get_reorder_map(self, chunk_info):
@@ -60,18 +60,18 @@ def _reorder_self_node_list(self, reorder_map):
 
     def _reorder_idx_trace(self, reorder_map):
         # reorder list
-        new_idx_trace_list = [None for _ in range(len(self.trace_indice.idx_trace_list))]
+        new_idx_trace_list = [None for _ in range(len(self.trace_indice.indice_trace_list))]
         for old_idx, new_idx in reorder_map.items():
-            new_idx_trace_list[new_idx] = self.trace_indice.idx_trace_list[old_idx]
-        self.trace_indice.idx_trace_list = new_idx_trace_list
+            new_idx_trace_list[new_idx] = self.trace_indice.indice_trace_list[old_idx]
+        self.trace_indice.indice_trace_list = new_idx_trace_list
         # update compute
-        for idx_trace in self.trace_indice.idx_trace_list:
+        for idx_trace in self.trace_indice.indice_trace_list:
             compute = idx_trace["compute"]
             for dim_compute in compute:
                 for idx, i in enumerate(dim_compute):
                     dim_compute[idx] = reorder_map[i]
         # update source
-        for idx_trace in self.trace_indice.idx_trace_list:
+        for idx_trace in self.trace_indice.indice_trace_list:
             source = idx_trace["source"]
             for dim_idx, dim_source in enumerate(source):
                 new_dim_source = {}
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index d90e50927110..67f764a31cc5 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -205,7 +205,7 @@ def _search_possible_chunk_regions(
             possible_chunk_region (List)
         """
         possible_chunk_region = []
-        output_trace = copy.deepcopy(self.trace_indice.idx_trace_list)
+        output_trace = copy.deepcopy(self.trace_indice.indice_trace_list)
         input_trace = []  # trace of a node's input nodes
         for _, n in enumerate(self.trace_indice.node_list):
             cur_trace = {}
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 33fade1a5463..1e2e6dc1258b 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -406,7 +406,7 @@ def _reassgin_reshape_size(self, chunk_info):
         for node in self.trace_indice.node_list[chunk_region[0] : chunk_region[1] + 1]:
             if any(i in node.name for i in ["reshape", "view"]):
                 reshape_args = node.args[1:]
-                reshape_log = self.trace_indice.idx_view_list[node]
+                reshape_log = self.trace_indice.indice_view_list[node]
                 chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
                 reshape_size[node.name] = {}
                 for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 9a04c2a0d71d..669bfb30a412 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -9,13 +9,13 @@
 class TraceIndice(object):
     def __init__(self, node_list) -> None:
         self.node_list = node_list
-        self.idx_trace_list = self._init_idx_trace_list()
-        self.idx_trace_equal = []
-        self.idx_view_list = {}
-        self.idx_count = -1
+        self.indice_trace_list = self._init_indice_trace_list()
+        self.indice_trace_equal = []
+        self.indice_view_list = {}
+        self.indice_count = -1
 
-    def _init_idx_trace_list(self):
-        idx_trace_list = []
+    def _init_indice_trace_list(self):
+        indice_trace_list = []
         for n in self.node_list:
             if get_node_shape(n) != None:
                 cur_trace = {
@@ -25,37 +25,37 @@ def _init_idx_trace_list(self):
                 }
             else:
                 cur_trace = {"idx": [], "compute": [], "source": []}
-            idx_trace_list.append(cur_trace)
-        return idx_trace_list
+            indice_trace_list.append(cur_trace)
+        return indice_trace_list
 
-    def _add_index(self):
+    def _add_indice(self):
         """
         Update the count and return it. To record the idx number.
 
         Returns:
             idx_count: int
         """
-        self.idx_count += 1
-        return self.idx_count
+        self.indice_count += 1
+        return self.indice_count
 
     def _del_dim(self, idx, dim_idx):
-        self.idx_trace_list[idx]["idx"].pop(dim_idx)
-        self.idx_trace_list[idx]["compute"].pop(dim_idx)
-        self.idx_trace_list[idx]["source"].pop(dim_idx)
+        self.indice_trace_list[idx]["idx"].pop(dim_idx)
+        self.indice_trace_list[idx]["compute"].pop(dim_idx)
+        self.indice_trace_list[idx]["source"].pop(dim_idx)
 
     def _add_dim(self, node_idx, dim_idx):
-        self.idx_trace_list[node_idx]["idx"].insert(dim_idx, self._add_index())
-        self.idx_trace_list[node_idx]["compute"].insert(dim_idx, [])
-        self.idx_trace_list[node_idx]["source"].insert(dim_idx, {})
+        self.indice_trace_list[node_idx]["idx"].insert(dim_idx, self._add_indice())
+        self.indice_trace_list[node_idx]["compute"].insert(dim_idx, [])
+        self.indice_trace_list[node_idx]["source"].insert(dim_idx, {})
 
-    def _transform_index(self, node, node_dim):
-        node_idx = self._find_idx_trace_from_node(node)
+    def _transform_indice(self, node, node_dim):
+        node_idx = self._find_indice_trace_from_node(node)
         dims = list(range(len(node_idx)))
         return dims[node_dim]
 
-    def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim):
-        node_from_dim = self._transform_index(node_from, node_from_dim)
-        node_to_dim = self._transform_index(node_to, node_to_dim)
+    def _inherit_indice(self, node_from, node_from_dim, node_to, node_to_dim):
+        node_from_dim = self._transform_indice(node_from, node_from_dim)
+        node_to_dim = self._transform_indice(node_to, node_to_dim)
         node_from_trace = self._find_trace_from_node(node_from)
         node_to_trace = self._find_trace_from_node(node_to)
         node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim]
@@ -73,9 +73,9 @@ def _inherit_all_computation(self, node_from, node_to):
             node_to_compute[i] = copy.deepcopy(node_from_compute[i])
 
     def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False):
-        node_from_dim = self._transform_index(node_from, node_from_dim)
+        node_from_dim = self._transform_indice(node_from, node_from_dim)
         node_from_trace_source = self._find_source_trace_from_node(node_from)
-        node_to_dim = self._transform_index(node_to, node_to_dim)
+        node_to_dim = self._transform_indice(node_to, node_to_dim)
         node_to_trace_source = self._find_source_trace_from_node(node_to)
         node_from_idx = find_idx_by_name(node_from.name, self.node_list)
         if init:
@@ -99,19 +99,19 @@ def _mark_computation_from_node(self, node_from, node_to, exclude=None):
         if exclude == None:
             exclude = []
         else:
-            exclude = [self._transform_index(node_to, i) for i in exclude]
+            exclude = [self._transform_indice(node_to, i) for i in exclude]
         node_from_compute = self._find_compute_trace_from_node(node_from)
         node_to_compute = self._find_compute_trace_from_node(node_to)
         # assert len(node_from_compute) == len(node_to_compute)
         for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1):
-            if self._transform_index(node_to, i) in exclude:
+            if self._transform_indice(node_to, i) in exclude:
                 continue
             self._add_source(node_from, i, node_to, i)
             for j in node_from_compute[i]:
                 if j not in node_to_compute[i]:
                     node_to_compute[i].append(j)
 
-    def _mark_idx_equal(self, node1, dim1, node2, dim2):
+    def _mark_indice_equal(self, node1, dim1, node2, dim2):
         """
         Mark 2 index to be equal.
 
@@ -140,8 +140,8 @@ def _mark_computation(self, node, idx, dim):
         dims = list(range(len(get_node_shape(node))))
         for d in dim:
             cur_dim = dims[d]
-            if idx not in self.idx_trace_list[idx]["compute"][cur_dim]:
-                self.idx_trace_list[idx]["compute"][cur_dim].append(idx)
+            if idx not in self.indice_trace_list[idx]["compute"][cur_dim]:
+                self.indice_trace_list[idx]["compute"][cur_dim].append(idx)
 
     def _find_trace_from_node(self, node):
         """
@@ -154,7 +154,7 @@ def _find_trace_from_node(self, node):
             compute (list): computed idx of the node.
         """
         node_idx = find_idx_by_name(node.name, self.node_list)
-        node_dict = self.idx_trace_list[node_idx]
+        node_dict = self.indice_trace_list[node_idx]
         return node_dict
 
     def _find_source_trace_from_node(self, node):
@@ -168,10 +168,10 @@ def _find_source_trace_from_node(self, node):
             compute (list): computed idx of the node.
         """
         node_idx = find_idx_by_name(node.name, self.node_list)
-        node_dict = self.idx_trace_list[node_idx]
+        node_dict = self.indice_trace_list[node_idx]
         return node_dict["source"]
 
-    def _find_idx_trace_from_node(self, node):
+    def _find_indice_trace_from_node(self, node):
         """
         Find node idx trace by the node.
 
@@ -181,7 +181,7 @@ def _find_idx_trace_from_node(self, node):
             idx (list): idx of the node
         """
         node_idx = find_idx_by_name(node.name, self.node_list)
-        return self.idx_trace_list[node_idx]["idx"]
+        return self.indice_trace_list[node_idx]["idx"]
 
     def _find_compute_trace_from_node(self, node):
         """
@@ -193,7 +193,7 @@ def _find_compute_trace_from_node(self, node):
             compute (list): computed idx of the node.
         """
         node_idx = find_idx_by_name(node.name, self.node_list)
-        return self.idx_trace_list[node_idx]["compute"]
+        return self.indice_trace_list[node_idx]["compute"]
 
     def _assign_index_as_input(self, node, node_idx, input_node=None):
         """
@@ -206,14 +206,14 @@ def _assign_index_as_input(self, node, node_idx, input_node=None):
         if input_node == None:
             input_node = node.args[0]
         input_node_idx = find_idx_by_name(input_node.name, self.node_list)
-        input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"]
+        input_node_idx_trace = self.indice_trace_list[input_node_idx]["idx"]
 
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
-        self.idx_trace_list[node_idx]["idx"] = new_idx_trace
+        self.indice_trace_list[node_idx]["idx"] = new_idx_trace
 
         self._inherit_all_computation(input_node, node)
 
-    def _assign_all_index(self, node, node_idx):
+    def _assign_all_indice(self, node, node_idx):
         """
         Add new index for all node's dims.
 
@@ -224,10 +224,10 @@ def _assign_all_index(self, node, node_idx):
         shape = node.meta["tensor_meta"].shape
         new_trace = []
         for _ in shape:
-            new_trace.append(self._add_index())
-        self.idx_trace_list[node_idx]["idx"] = new_trace
+            new_trace.append(self._add_indice())
+        self.indice_trace_list[node_idx]["idx"] = new_trace
 
-    def _assign_transpose_index(self, node, node_idx):
+    def _assign_transpose_indice(self, node, node_idx):
         """
         Assign index for transpose op.
         1. swap input's dim according to transpose args
@@ -241,10 +241,10 @@ def _assign_transpose_index(self, node, node_idx):
         tranpose_dim = node.args[1:]
 
         self._assign_index_as_input(node, node_idx, input_node)
-        self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0])
-        self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1])
+        self._inherit_indice(input_node, tranpose_dim[1], node, tranpose_dim[0])
+        self._inherit_indice(input_node, tranpose_dim[0], node, tranpose_dim[1])
 
-    def _assign_permute_index(self, node, node_idx):
+    def _assign_permute_indice(self, node, node_idx):
         """
         Assign index for permute op.
         1. swap input's dim according to permute args
@@ -259,9 +259,9 @@ def _assign_permute_index(self, node, node_idx):
 
         self._assign_index_as_input(node, node_idx, input_node)
         for idx, d in enumerate(permute_dim):
-            self._inherit_index(input_node, d, node, idx)
+            self._inherit_indice(input_node, d, node, idx)
 
-    def _assign_linear_index(self, node, node_idx):
+    def _assign_linear_indice(self, node, node_idx):
         """
         Assign index for linear op.
         1. copy trace from input node and change last index accroding to weight
@@ -279,15 +279,15 @@ def _assign_linear_index(self, node, node_idx):
             input_node, weight, bias = node.args
 
         self._assign_index_as_input(node, node_idx)
-        self._inherit_index(weight, 1, node, -1)
+        self._inherit_indice(weight, 1, node, -1)
 
         self._mark_computation(node, node_idx, [-1])
-        self._mark_idx_equal(input_node, -1, weight, 0)
+        self._mark_indice_equal(input_node, -1, weight, 0)
 
         if bias:
-            self._mark_idx_equal(input_node, -1, bias, 0)
+            self._mark_indice_equal(input_node, -1, bias, 0)
 
-    def _assign_matmul_index(self, node, node_idx):
+    def _assign_matmul_indice(self, node, node_idx):
         """
         Assign index for matmul op.
         1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length)
@@ -302,13 +302,13 @@ def _assign_matmul_index(self, node, node_idx):
 
         assert len(get_node_shape(matmul_left)) == len(get_node_shape(matmul_right))
         self._assign_index_as_input(node, node_idx, matmul_left)
-        self._inherit_index(matmul_right, -1, node, -1)
+        self._inherit_indice(matmul_right, -1, node, -1)
 
         self._mark_computation_from_node(matmul_right, node, [-1, -2])
         self._mark_computation(node, node_idx, [-1])
-        self._mark_idx_equal(matmul_left, -1, matmul_right, -2)
+        self._mark_indice_equal(matmul_left, -1, matmul_right, -2)
 
-    def _assign_layernorm_index(self, node, idx):
+    def _assign_layernorm_indice(self, node, idx):
         """
         Assign index for layernorm op.
         1. assign index as input node
@@ -321,7 +321,7 @@ def _assign_layernorm_index(self, node, idx):
         self._assign_index_as_input(node, idx)
         self._mark_computation(node, idx, [-1])
 
-    def _assign_elementwise_index(self, node, idx):
+    def _assign_elementwise_indice(self, node, idx):
         """
         Assign index for element-wise op (eg. relu sigmoid add mul).
         1. assign index as input node
@@ -343,15 +343,15 @@ def _assign_elementwise_index(self, node, idx):
             node_in1_shape = get_node_shape(nodes_in[1])
             for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1):
                 if node_in0_shape[i] == node_in1_shape[i]:
-                    self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i)
+                    self._mark_indice_equal(nodes_in[0], i, nodes_in[1], i)
 
-    def _assgin_no_change_index(self, node, idx):
+    def _assgin_no_change_indice(self, node, idx):
         self._assign_index_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) == type(node):
                 self._mark_computation_from_node(node_in, node)
 
-    def _assign_einsum_index(self, node, idx):
+    def _assign_einsum_indice(self, node, idx):
         """
         Assign index for einsum op.
 
@@ -378,7 +378,7 @@ def _assign_einsum_index(self, node, idx):
             for left_idx, left_str in enumerate(left):
                 if right_indice in left_str:
                     source_idx = left_str.index(right_indice)
-                    self._inherit_index(
+                    self._inherit_indice(
                         input_nodes[left_idx], source_idx, node, right_idx
                     )
 
@@ -388,7 +388,7 @@ def _assign_einsum_index(self, node, idx):
         #             self._mark_computation(node, idx, left_str.index(i))
         #             break
 
-    def _assign_softmax_index(self, node, idx):
+    def _assign_softmax_indice(self, node, idx):
         """
         Assign index for softmax op.
         1. assign index as input node
@@ -401,7 +401,7 @@ def _assign_softmax_index(self, node, idx):
         self._assign_index_as_input(node, idx)
         self._mark_computation(node, idx, [node.kwargs["dim"]])
 
-    def _assign_unsqueeze_index(self, node, node_idx):
+    def _assign_unsqueeze_indice(self, node, node_idx):
         """
         Assign index for unsqueeze op.
         1. assign new index for unsqueeze dim
@@ -414,7 +414,7 @@ def _assign_unsqueeze_index(self, node, node_idx):
         self._assign_index_as_input(node, node_idx)
         self._add_dim(node_idx, node.args[1])
 
-    def _assign_dropout_index(self, node, node_idx):
+    def _assign_dropout_indice(self, node, node_idx):
         """
         Assign index for unsqueeze op.
         1. assign new index for unsqueeze dim
@@ -425,7 +425,7 @@ def _assign_dropout_index(self, node, node_idx):
         """
         self._assign_index_as_input(node, node_idx)
 
-    def _assign_ones_like_index(self, node, node_idx):
+    def _assign_ones_like_indice(self, node, node_idx):
         """
         Assign index for oneslike op.
         1. assign new index for all dim
@@ -434,9 +434,9 @@ def _assign_ones_like_index(self, node, node_idx):
             node (node)
             node_idx (int)
         """
-        self._assign_all_index(node, node_idx)
+        self._assign_all_indice(node, node_idx)
 
-    def _assign_view_reshape_index(self, node, node_idx):
+    def _assign_view_reshape_indice(self, node, node_idx):
         """
         Assign index for view and reshape op.
         1. get origin shape and target shape by meta info.
@@ -496,7 +496,7 @@ def _assign_view_reshape_index(self, node, node_idx):
             )
 
         # get new index
-        origin_trace = self._find_idx_trace_from_node(origin_node)
+        origin_trace = self._find_indice_trace_from_node(origin_node)
         self._assign_index_as_input(node, node_idx, origin_node)
         dim_from.reverse()
         for i in dim_from:
@@ -516,18 +516,18 @@ def _assign_view_reshape_index(self, node, node_idx):
         view_dict = {
             "idx_from": [origin_trace[i] for i in dim_from],
             "dim_from": dim_from,
-            "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in dim_to],
+            "idx_to": [self.indice_trace_list[node_idx]["idx"][i] for i in dim_to],
             "dim_to": dim_to,
         }
-        self.idx_view_list[node] = view_dict
+        self.indice_view_list[node] = view_dict
 
     def _merge_equal_idx(self):
-        idx_equal = copy.deepcopy(self.idx_trace_equal)
+        idx_equal = copy.deepcopy(self.indice_trace_equal)
         idx_equal.reverse()
         for idx in idx_equal:
             merge_to = min(idx)
             merge_from = max(idx)
-            for trace in self.idx_trace_list:
+            for trace in self.indice_trace_list:
                 if merge_from in trace["idx"]:
                     trace["idx"] = [
                         merge_to if i == merge_from else i for i in trace["idx"]
@@ -536,35 +536,35 @@ def _merge_equal_idx(self):
     def trace_index(self):
         for idx, node in enumerate(self.node_list):
             if node.op == "placeholder":
-                self._assign_all_index(node, idx)
+                self._assign_all_indice(node, idx)
             elif node.op == "call_method":
                 if "transpose" in node.name:
-                    self._assign_transpose_index(node, idx)
+                    self._assign_transpose_indice(node, idx)
                 elif "permute" in node.name:
-                    self._assign_permute_index(node, idx)
+                    self._assign_permute_indice(node, idx)
                 elif "view" in node.name or "reshape" in node.name:
-                    self._assign_view_reshape_index(node, idx)
+                    self._assign_view_reshape_indice(node, idx)
                 elif "unsqueeze" in node.name:
-                    self._assign_unsqueeze_index(node, idx)
+                    self._assign_unsqueeze_indice(node, idx)
                 elif any(i in node.name for i in ["to", "contiguous"]):
-                    self._assgin_no_change_index(node, idx)
+                    self._assgin_no_change_indice(node, idx)
                 else:
                     raise NotImplementedError(node.name, "method not implemented yet!")
             elif node.op == "call_function":
                 if "linear" in node.name:
-                    self._assign_linear_index(node, idx)
+                    self._assign_linear_indice(node, idx)
                 elif "matmul" in node.name:
-                    self._assign_matmul_index(node, idx)
+                    self._assign_matmul_indice(node, idx)
                 elif "softmax" in node.name:
-                    self._assign_softmax_index(node, idx)
+                    self._assign_softmax_indice(node, idx)
                 elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu"]):
-                    self._assign_elementwise_index(node, idx)
+                    self._assign_elementwise_indice(node, idx)
                 elif "ones_like" in node.name:
-                    self._assign_ones_like_index(node, idx)
+                    self._assign_ones_like_indice(node, idx)
                 elif "dropout" in node.name:
-                    self._assign_dropout_index(node, idx)
+                    self._assign_dropout_indice(node, idx)
                 elif "einsum" in node.name:
-                    self._assign_einsum_index(node, idx)
+                    self._assign_einsum_indice(node, idx)
                 elif "getattr" in node.name:
                     continue  # get attr like shape
                 elif "getitem" in node.name:
@@ -575,11 +575,11 @@ def trace_index(self):
                     )
             elif node.op == "call_module":
                 if any(n in node.name for n in ["layernorm", "norm"]):
-                    self._assign_layernorm_index(node, idx)
+                    self._assign_layernorm_indice(node, idx)
                 else:
                     raise NotImplementedError(node.name, "module not implemented yet!")
             elif node.op == "get_attr":
-                self._assign_all_index(node, idx)  # get param
+                self._assign_all_indice(node, idx)  # get param
             elif node.op == "output":
                 continue
             else:

From 1bb1f2ad8989bf2b0abc923aeff39c0c1b446e1b Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:38:16 +0800
Subject: [PATCH 126/503] rename

---
 colossalai/autochunk/search_chunk.py |  4 +--
 colossalai/autochunk/trace_indice.py | 50 ++++++++++++++--------------
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 67f764a31cc5..eee357073f2f 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -158,11 +158,11 @@ def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> Lis
         end_trace = output_trace[end_idx]
         end_node = self.trace_indice.node_list[end_idx]
         chunk_infos = []
-        for end_dim, _ in enumerate(end_trace["idx"]):
+        for end_dim, _ in enumerate(end_trace["indice"]):
             if len(start_traces) > 1:
                 continue
             for start_node, start_trace in start_traces.items():
-                for start_dim, _ in enumerate(start_trace["idx"]):
+                for start_dim, _ in enumerate(start_trace["indice"]):
                     # dim size cannot be 1
                     if (
                         get_node_shape(end_node)[end_dim] == 1
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 669bfb30a412..791e5a36e357 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -19,12 +19,12 @@ def _init_indice_trace_list(self):
         for n in self.node_list:
             if get_node_shape(n) != None:
                 cur_trace = {
-                    "idx": [None for _ in range(len(get_node_shape(n)))],
+                    "indice": [None for _ in range(len(get_node_shape(n)))],
                     "compute": [[] for _ in range(len(get_node_shape(n)))],
                     "source": [{} for _ in range(len(get_node_shape(n)))],
                 }
             else:
-                cur_trace = {"idx": [], "compute": [], "source": []}
+                cur_trace = {"indice": [], "compute": [], "source": []}
             indice_trace_list.append(cur_trace)
         return indice_trace_list
 
@@ -39,12 +39,12 @@ def _add_indice(self):
         return self.indice_count
 
     def _del_dim(self, idx, dim_idx):
-        self.indice_trace_list[idx]["idx"].pop(dim_idx)
+        self.indice_trace_list[idx]["indice"].pop(dim_idx)
         self.indice_trace_list[idx]["compute"].pop(dim_idx)
         self.indice_trace_list[idx]["source"].pop(dim_idx)
 
     def _add_dim(self, node_idx, dim_idx):
-        self.indice_trace_list[node_idx]["idx"].insert(dim_idx, self._add_indice())
+        self.indice_trace_list[node_idx]["indice"].insert(dim_idx, self._add_indice())
         self.indice_trace_list[node_idx]["compute"].insert(dim_idx, [])
         self.indice_trace_list[node_idx]["source"].insert(dim_idx, {})
 
@@ -58,7 +58,7 @@ def _inherit_indice(self, node_from, node_from_dim, node_to, node_to_dim):
         node_to_dim = self._transform_indice(node_to, node_to_dim)
         node_from_trace = self._find_trace_from_node(node_from)
         node_to_trace = self._find_trace_from_node(node_to)
-        node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim]
+        node_to_trace["indice"][node_to_dim] = node_from_trace["indice"][node_from_dim]
         node_to_trace["compute"][node_to_dim] = copy.deepcopy(
             node_from_trace["compute"][node_from_dim]
         )
@@ -181,7 +181,7 @@ def _find_indice_trace_from_node(self, node):
             idx (list): idx of the node
         """
         node_idx = find_idx_by_name(node.name, self.node_list)
-        return self.indice_trace_list[node_idx]["idx"]
+        return self.indice_trace_list[node_idx]["indice"]
 
     def _find_compute_trace_from_node(self, node):
         """
@@ -195,7 +195,7 @@ def _find_compute_trace_from_node(self, node):
         node_idx = find_idx_by_name(node.name, self.node_list)
         return self.indice_trace_list[node_idx]["compute"]
 
-    def _assign_index_as_input(self, node, node_idx, input_node=None):
+    def _assign_indice_as_input(self, node, node_idx, input_node=None):
         """
         Assign node's trace as its input node.
 
@@ -206,10 +206,10 @@ def _assign_index_as_input(self, node, node_idx, input_node=None):
         if input_node == None:
             input_node = node.args[0]
         input_node_idx = find_idx_by_name(input_node.name, self.node_list)
-        input_node_idx_trace = self.indice_trace_list[input_node_idx]["idx"]
+        input_node_idx_trace = self.indice_trace_list[input_node_idx]["indice"]
 
         new_idx_trace = copy.deepcopy(input_node_idx_trace)
-        self.indice_trace_list[node_idx]["idx"] = new_idx_trace
+        self.indice_trace_list[node_idx]["indice"] = new_idx_trace
 
         self._inherit_all_computation(input_node, node)
 
@@ -225,7 +225,7 @@ def _assign_all_indice(self, node, node_idx):
         new_trace = []
         for _ in shape:
             new_trace.append(self._add_indice())
-        self.indice_trace_list[node_idx]["idx"] = new_trace
+        self.indice_trace_list[node_idx]["indice"] = new_trace
 
     def _assign_transpose_indice(self, node, node_idx):
         """
@@ -240,7 +240,7 @@ def _assign_transpose_indice(self, node, node_idx):
         input_node = node.args[0]
         tranpose_dim = node.args[1:]
 
-        self._assign_index_as_input(node, node_idx, input_node)
+        self._assign_indice_as_input(node, node_idx, input_node)
         self._inherit_indice(input_node, tranpose_dim[1], node, tranpose_dim[0])
         self._inherit_indice(input_node, tranpose_dim[0], node, tranpose_dim[1])
 
@@ -257,7 +257,7 @@ def _assign_permute_indice(self, node, node_idx):
         permute_dim = node.args[1:]
         input_node = node.args[0]
 
-        self._assign_index_as_input(node, node_idx, input_node)
+        self._assign_indice_as_input(node, node_idx, input_node)
         for idx, d in enumerate(permute_dim):
             self._inherit_indice(input_node, d, node, idx)
 
@@ -278,7 +278,7 @@ def _assign_linear_indice(self, node, node_idx):
         else:
             input_node, weight, bias = node.args
 
-        self._assign_index_as_input(node, node_idx)
+        self._assign_indice_as_input(node, node_idx)
         self._inherit_indice(weight, 1, node, -1)
 
         self._mark_computation(node, node_idx, [-1])
@@ -301,7 +301,7 @@ def _assign_matmul_indice(self, node, node_idx):
         matmul_left, matmul_right = node.args
 
         assert len(get_node_shape(matmul_left)) == len(get_node_shape(matmul_right))
-        self._assign_index_as_input(node, node_idx, matmul_left)
+        self._assign_indice_as_input(node, node_idx, matmul_left)
         self._inherit_indice(matmul_right, -1, node, -1)
 
         self._mark_computation_from_node(matmul_right, node, [-1, -2])
@@ -318,7 +318,7 @@ def _assign_layernorm_indice(self, node, idx):
             node (node)
             node_idx (int)
         """
-        self._assign_index_as_input(node, idx)
+        self._assign_indice_as_input(node, idx)
         self._mark_computation(node, idx, [-1])
 
     def _assign_elementwise_indice(self, node, idx):
@@ -331,7 +331,7 @@ def _assign_elementwise_indice(self, node, idx):
             node (node)
             node_idx (int)
         """
-        self._assign_index_as_input(node, idx)
+        self._assign_indice_as_input(node, idx)
         nodes_in = []
         for node_in in node.args:
             if type(node_in) == type(node):
@@ -346,7 +346,7 @@ def _assign_elementwise_indice(self, node, idx):
                     self._mark_indice_equal(nodes_in[0], i, nodes_in[1], i)
 
     def _assgin_no_change_indice(self, node, idx):
-        self._assign_index_as_input(node, idx)
+        self._assign_indice_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) == type(node):
                 self._mark_computation_from_node(node_in, node)
@@ -398,7 +398,7 @@ def _assign_softmax_indice(self, node, idx):
             node (node)
             node_idx (int)
         """
-        self._assign_index_as_input(node, idx)
+        self._assign_indice_as_input(node, idx)
         self._mark_computation(node, idx, [node.kwargs["dim"]])
 
     def _assign_unsqueeze_indice(self, node, node_idx):
@@ -411,7 +411,7 @@ def _assign_unsqueeze_indice(self, node, node_idx):
             node_idx (int)
         """
         self._del_dim(node_idx, -1)
-        self._assign_index_as_input(node, node_idx)
+        self._assign_indice_as_input(node, node_idx)
         self._add_dim(node_idx, node.args[1])
 
     def _assign_dropout_indice(self, node, node_idx):
@@ -423,7 +423,7 @@ def _assign_dropout_indice(self, node, node_idx):
             node (node)
             node_idx (int)
         """
-        self._assign_index_as_input(node, node_idx)
+        self._assign_indice_as_input(node, node_idx)
 
     def _assign_ones_like_indice(self, node, node_idx):
         """
@@ -497,7 +497,7 @@ def _assign_view_reshape_indice(self, node, node_idx):
 
         # get new index
         origin_trace = self._find_indice_trace_from_node(origin_node)
-        self._assign_index_as_input(node, node_idx, origin_node)
+        self._assign_indice_as_input(node, node_idx, origin_node)
         dim_from.reverse()
         for i in dim_from:
             self._del_dim(node_idx, i)
@@ -516,7 +516,7 @@ def _assign_view_reshape_indice(self, node, node_idx):
         view_dict = {
             "idx_from": [origin_trace[i] for i in dim_from],
             "dim_from": dim_from,
-            "idx_to": [self.indice_trace_list[node_idx]["idx"][i] for i in dim_to],
+            "idx_to": [self.indice_trace_list[node_idx]["indice"][i] for i in dim_to],
             "dim_to": dim_to,
         }
         self.indice_view_list[node] = view_dict
@@ -528,9 +528,9 @@ def _merge_equal_idx(self):
             merge_to = min(idx)
             merge_from = max(idx)
             for trace in self.indice_trace_list:
-                if merge_from in trace["idx"]:
-                    trace["idx"] = [
-                        merge_to if i == merge_from else i for i in trace["idx"]
+                if merge_from in trace["indice"]:
+                    trace["indice"] = [
+                        merge_to if i == merge_from else i for i in trace["indice"]
                     ]
 
     def trace_index(self):

From a4ed5b0d0d926f9e3f84711799e21db795a339e9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:41:26 +0800
Subject: [PATCH 127/503] rename in doc

---
 colossalai/autochunk/trace_indice.py | 66 ++++++++++++----------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 791e5a36e357..71b5c16dc04d 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -33,7 +33,7 @@ def _add_indice(self):
         Update the count and return it. To record the idx number.
 
         Returns:
-            idx_count: int
+            indice_count: int
         """
         self.indice_count += 1
         return self.indice_count
@@ -113,11 +113,11 @@ def _mark_computation_from_node(self, node_from, node_to, exclude=None):
 
     def _mark_indice_equal(self, node1, dim1, node2, dim2):
         """
-        Mark 2 index to be equal.
+        Mark 2 indice to be equal.
 
         Args:
-            idx1 (int): index count.
-            idx2 (int): index count.
+            idx1 (int): indice count.
+            idx2 (int): indice count.
         """
         # node1_idx = _find_idx_by_name(node1.name, self.nodes_list)
         # node2_idx = _find_idx_by_name(node2.name, self.nodes_list)
@@ -215,7 +215,7 @@ def _assign_indice_as_input(self, node, node_idx, input_node=None):
 
     def _assign_all_indice(self, node, node_idx):
         """
-        Add new index for all node's dims.
+        Add new indice for all node's dims.
 
         Args:
             node (node)
@@ -229,7 +229,7 @@ def _assign_all_indice(self, node, node_idx):
 
     def _assign_transpose_indice(self, node, node_idx):
         """
-        Assign index for transpose op.
+        Assign indice for transpose op.
         1. swap input's dim according to transpose args
         2. inherit input's computation
 
@@ -246,7 +246,7 @@ def _assign_transpose_indice(self, node, node_idx):
 
     def _assign_permute_indice(self, node, node_idx):
         """
-        Assign index for permute op.
+        Assign indice for permute op.
         1. swap input's dim according to permute args
         2. inherit input's computation
 
@@ -263,9 +263,9 @@ def _assign_permute_indice(self, node, node_idx):
 
     def _assign_linear_indice(self, node, node_idx):
         """
-        Assign index for linear op.
-        1. copy trace from input node and change last index accroding to weight
-        2. mark equal for input node last index, weight first dim and bias dim.
+        Assign indice for linear op.
+        1. copy trace from input node and change last indice according to weight
+        2. mark equal for input node last indice, weight first dim and bias dim.
         3. inherit input's computation, mark computation for last dim.
 
         Args:
@@ -289,9 +289,9 @@ def _assign_linear_indice(self, node, node_idx):
 
     def _assign_matmul_indice(self, node, node_idx):
         """
-        Assign index for matmul op.
-        1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length)
-        2. mark equal for input matmul_left -1 index and matmul_right -2 dim.
+        Assign indice for matmul op.
+        1. copy trace from matmul_left and change last indice according to matmul_right. (assert they have same length)
+        2. mark equal for input matmul_left -1 indice and matmul_right -2 dim.
         3. inherit matmul_left and matmul_right computation, mark computation for last dim.
 
         Args:
@@ -310,8 +310,8 @@ def _assign_matmul_indice(self, node, node_idx):
 
     def _assign_layernorm_indice(self, node, idx):
         """
-        Assign index for layernorm op.
-        1. assign index as input node
+        Assign indice for layernorm op.
+        1. assign indice as input node
         2. inherit computation and mark last 2 dims as computed.
 
         Args:
@@ -323,8 +323,8 @@ def _assign_layernorm_indice(self, node, idx):
 
     def _assign_elementwise_indice(self, node, idx):
         """
-        Assign index for element-wise op (eg. relu sigmoid add mul).
-        1. assign index as input node
+        Assign indice for element-wise op (eg. relu sigmoid add mul).
+        1. assign indice as input node
         2. inherit computation from all input nodes.
 
         Args:
@@ -353,7 +353,7 @@ def _assgin_no_change_indice(self, node, idx):
 
     def _assign_einsum_indice(self, node, idx):
         """
-        Assign index for einsum op.
+        Assign indice for einsum op.
 
         Args:
             node (node)
@@ -371,8 +371,6 @@ def _assign_einsum_indice(self, node, idx):
             for c in i:
                 all_index.append(c)
         all_index = set(all_index)
-        free_index = set([i for i in right])
-        sum_index = all_index - free_index
 
         for right_idx, right_indice in enumerate(right):
             for left_idx, left_str in enumerate(left):
@@ -382,16 +380,10 @@ def _assign_einsum_indice(self, node, idx):
                         input_nodes[left_idx], source_idx, node, right_idx
                     )
 
-        # for i in sum_index:
-        #     for left_idx, left_str in enumerate(left):
-        #         if i in left_str:
-        #             self._mark_computation(node, idx, left_str.index(i))
-        #             break
-
     def _assign_softmax_indice(self, node, idx):
         """
-        Assign index for softmax op.
-        1. assign index as input node
+        Assign indice for softmax op.
+        1. assign indice as input node
         2. inherit computation and mark softmax dim as computed.
 
         Args:
@@ -403,8 +395,8 @@ def _assign_softmax_indice(self, node, idx):
 
     def _assign_unsqueeze_indice(self, node, node_idx):
         """
-        Assign index for unsqueeze op.
-        1. assign new index for unsqueeze dim
+        Assign indice for unsqueeze op.
+        1. assign new indice for unsqueeze dim
 
         Args:
             node (node)
@@ -416,8 +408,8 @@ def _assign_unsqueeze_indice(self, node, node_idx):
 
     def _assign_dropout_indice(self, node, node_idx):
         """
-        Assign index for unsqueeze op.
-        1. assign new index for unsqueeze dim
+        Assign indice for dropout op.
+        1. assign indice as input node
 
         Args:
             node (node)
@@ -427,8 +419,8 @@ def _assign_dropout_indice(self, node, node_idx):
 
     def _assign_ones_like_indice(self, node, node_idx):
         """
-        Assign index for oneslike op.
-        1. assign new index for all dim
+        Assign indice for oneslike op.
+        1. assign new indice for all dim
 
         Args:
             node (node)
@@ -438,10 +430,10 @@ def _assign_ones_like_indice(self, node, node_idx):
 
     def _assign_view_reshape_indice(self, node, node_idx):
         """
-        Assign index for view and reshape op.
+        Assign indice for view and reshape op.
         1. get origin shape and target shape by meta info.
         2. compute the real value of -1 in target shape.
-        3. determine changed dim, and assgin index for generated dim.
+        3. determine changed dim, and assign indice for generated dim.
         4. log changed dim and generated dim for restore
         5. inherit computation.
         6. TODO: look into view list to see whether the view is associated with other,
@@ -495,7 +487,7 @@ def _assign_view_reshape_indice(self, node, node_idx):
                 + "view not implemented"
             )
 
-        # get new index
+        # get new indice
         origin_trace = self._find_indice_trace_from_node(origin_node)
         self._assign_indice_as_input(node, node_idx, origin_node)
         dim_from.reverse()

From ea13a201bbd7eb6022069c8379f3626f9788b0f9 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Mon, 9 Jan 2023 17:41:38 +0800
Subject: [PATCH 128/503] [polish] polish code for get_static_torch_model
 (#2405)

* [gemini] polish code

* [testing] remove code

* [gemini] make more robust
---
 colossalai/nn/parallel/data_parallel.py       | 24 +++++++++----------
 colossalai/nn/parallel/utils.py               |  9 ++++---
 tests/test_gemini/update/test_grad_clip.py    |  2 --
 tests/test_gemini/update/test_optim.py        |  2 --
 .../update/test_zeroddp_state_dict.py         |  4 ----
 tests/test_tensor/test_tp_with_zero.py        |  2 --
 6 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index 8fd08db957b7..a7d79be160d0 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -334,10 +334,9 @@ def set_chunk_grad_device(self, chunk: Chunk, device: torch.device) -> None:
             self.grads_device[tensor] = device
 
     def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True, strict: bool = True):
-        r"""
+        """
         Args:
-            strict (bool): whether to reture the whole model state
-                as the original pytorch state_dict()
+            strict (bool): whether to return the whole model state as the pytorch `Module.state_dict()`
 
         Returns:
             dict:
@@ -349,25 +348,24 @@ def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0:
             ['bias', 'weight']
         """
         if strict:
-            return get_static_torch_model(zero_ddp_model=self, device=get_current_device(),
-                                          only_rank_0=only_rank_0).state_dict(destination=destination,
-                                                                              prefix=prefix,
-                                                                              keep_vars=keep_vars)
+            assert keep_vars is False, "`state_dict` with parameter, `keep_vars=True`, is not supported now."
+            torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_rank_0)
+            return torch_model.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
         return self._non_strict_state_dict(destination=destination,
                                            prefix=prefix,
                                            keep_vars=keep_vars,
                                            only_rank_0=only_rank_0)
 
     def _non_strict_state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True):
-        r"""Returns a dictionary containing a whole state of the module.
+        """Returns a dictionary containing a whole state of the module.
 
-        Both parameters and persistent buffers (e.g. running averages) are
-        included. Keys are corresponding parameter and buffer names.
+        Both parameters and persistent buffers (e.g. running averages) are included.
+        Keys are corresponding parameter and buffer names.
         Parameters and buffers set to ``None`` are not included.
 
-        Warning: The non strict state dict would ignore the parameters if the
-            tensors of the parameters are shared with other parameters which
-            have been included in the dictionary.
+        Warning: The non strict state dict would ignore the parameters if the tensors of the parameters
+            are shared with other parameters which have been included in the dictionary.
+            When you need to load the state dict, you should set the argument `strict` to False.
 
         Returns:
             dict:
diff --git a/colossalai/nn/parallel/utils.py b/colossalai/nn/parallel/utils.py
index 988f978254a1..d323556d5f72 100644
--- a/colossalai/nn/parallel/utils.py
+++ b/colossalai/nn/parallel/utils.py
@@ -47,17 +47,16 @@ def _get_shallow_copy_model(model: nn.Module):
     """Get a shallow copy of the given model. Each submodule is different from the original submodule.
     But the new submodule and the old submodule share all attributes.
     """
-    name_to_module = dict()
+    old_to_new = dict()
     for name, module in _get_dfs_module_list(model):
         new_module = copy(module)
         new_module._modules = OrderedDict()
         for subname, submodule in module._modules.items():
             if submodule is None:
                 continue
-            full_name = name + ('.' if name else '') + subname
-            setattr(new_module, subname, name_to_module[full_name])
-        name_to_module[name] = new_module
-    return name_to_module['']
+            setattr(new_module, subname, old_to_new[submodule])
+        old_to_new[module] = new_module
+    return old_to_new[model]
 
 
 def get_static_torch_model(zero_ddp_model,
diff --git a/tests/test_gemini/update/test_grad_clip.py b/tests/test_gemini/update/test_grad_clip.py
index 185521edb357..fda1cf8cfd14 100644
--- a/tests/test_gemini/update/test_grad_clip.py
+++ b/tests/test_gemini/update/test_grad_clip.py
@@ -31,8 +31,6 @@ def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
     for key, value in torch_dict.items():
         # key is 'module.model.PARAMETER', so we truncate it
         key = key[7:]
-        if key == 'model.lm_head.weight':
-            continue
         assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
         temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
         # debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value)))
diff --git a/tests/test_gemini/update/test_optim.py b/tests/test_gemini/update/test_optim.py
index 34509cc0cf00..07e6e65f2cd4 100644
--- a/tests/test_gemini/update/test_optim.py
+++ b/tests/test_gemini/update/test_optim.py
@@ -36,8 +36,6 @@ def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
     for key, value in torch_dict.items():
         # key is 'module.model.PARAMETER', so we truncate it
         key = key[7:]
-        if key == 'model.lm_head.weight':
-            continue
         assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
         temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
         # debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value)))
diff --git a/tests/test_gemini/update/test_zeroddp_state_dict.py b/tests/test_gemini/update/test_zeroddp_state_dict.py
index 7b0c6e37a7e8..b902bb0f010e 100644
--- a/tests/test_gemini/update/test_zeroddp_state_dict.py
+++ b/tests/test_gemini/update/test_zeroddp_state_dict.py
@@ -45,8 +45,6 @@ def exam_state_dict(placement_policy, keep_gathered, model_name: str):
     torch_dict = torch_model.state_dict()
 
     for key, value in torch_dict.items():
-        if key == 'model.lm_head.weight':
-            continue
         assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
         temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
         assert torch.equal(value, temp_zero_value), "parameter '{}' has problem.".format(key)
@@ -84,8 +82,6 @@ def exam_load_state_dict(placement_policy, keep_gathered, model_name: str):
     zero_dict = model.state_dict(only_rank_0=False)
 
     for key, value in torch_dict.items():
-        if key == 'model.lm_head.weight':
-            continue
         assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
         temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
         assert torch.equal(value, temp_zero_value), "parameter '{}' has problem.".format(key)
diff --git a/tests/test_tensor/test_tp_with_zero.py b/tests/test_tensor/test_tp_with_zero.py
index 33db676cb85f..7e611e8a14f9 100644
--- a/tests/test_tensor/test_tp_with_zero.py
+++ b/tests/test_tensor/test_tp_with_zero.py
@@ -27,8 +27,6 @@ def check_param(model: ZeroDDP, torch_model: torch.nn.Module, pg: ProcessGroup):
     for key, value in torch_dict.items():
         # key is 'module.model.PARAMETER', so we truncate it
         key = key[7:]
-        if key == 'model.lm_head.weight':
-            continue
         assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
         temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
         # debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value)))

From 865f2e01965dc3381e16b908c0ce4e544d2fcda9 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:42:25 +0800
Subject: [PATCH 129/503] rename

---
 colossalai/autochunk/search_chunk.py | 2 +-
 colossalai/autochunk/trace_indice.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index eee357073f2f..590567045507 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -48,7 +48,7 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None:
         self.gm = gm
         self.print_mem = print_mem
         self.trace_indice = TraceIndice(list(gm.graph.nodes))
-        self.trace_indice.trace_index()
+        self.trace_indice.trace_indice()
         self.trace_flow = TraceFlow(self.trace_indice)
         self.reorder_graph = ReorderGraph(self.trace_indice)
         self.estimate_memory = EstimateMemory()
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 71b5c16dc04d..9ad2649e7cdd 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -525,7 +525,7 @@ def _merge_equal_idx(self):
                         merge_to if i == merge_from else i for i in trace["indice"]
                     ]
 
-    def trace_index(self):
+    def trace_indice(self):
         for idx, node in enumerate(self.node_list):
             if node.op == "placeholder":
                 self._assign_all_indice(node, idx)

From d914a21d6405956b954c1cb47735356e3207635e Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:45:36 +0800
Subject: [PATCH 130/503] rename

---
 colossalai/autochunk/trace_indice.py | 31 ++--------------------------
 1 file changed, 2 insertions(+), 29 deletions(-)

diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 9ad2649e7cdd..a72fd775b9ed 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -111,21 +111,6 @@ def _mark_computation_from_node(self, node_from, node_to, exclude=None):
                 if j not in node_to_compute[i]:
                     node_to_compute[i].append(j)
 
-    def _mark_indice_equal(self, node1, dim1, node2, dim2):
-        """
-        Mark 2 indice to be equal.
-
-        Args:
-            idx1 (int): indice count.
-            idx2 (int): indice count.
-        """
-        # node1_idx = _find_idx_by_name(node1.name, self.nodes_list)
-        # node2_idx = _find_idx_by_name(node2.name, self.nodes_list)
-        # if node1_idx > node2_idx:
-        #     self._add_source(node2, dim2, node1, dim1)
-        # else:
-        #     self._add_source(node1, dim1, node2, dim2)
-
     def _mark_computation(self, node, idx, dim):
         """
         Mark some dims of node as computed.
@@ -273,19 +258,14 @@ def _assign_linear_indice(self, node, node_idx):
             node_idx (int)
         """
         if len(node.args) == 2:
-            input_node, weight = node.args
-            bias = None
+            _, weight = node.args
         else:
-            input_node, weight, bias = node.args
+            _, weight, _ = node.args
 
         self._assign_indice_as_input(node, node_idx)
         self._inherit_indice(weight, 1, node, -1)
 
         self._mark_computation(node, node_idx, [-1])
-        self._mark_indice_equal(input_node, -1, weight, 0)
-
-        if bias:
-            self._mark_indice_equal(input_node, -1, bias, 0)
 
     def _assign_matmul_indice(self, node, node_idx):
         """
@@ -306,7 +286,6 @@ def _assign_matmul_indice(self, node, node_idx):
 
         self._mark_computation_from_node(matmul_right, node, [-1, -2])
         self._mark_computation(node, node_idx, [-1])
-        self._mark_indice_equal(matmul_left, -1, matmul_right, -2)
 
     def _assign_layernorm_indice(self, node, idx):
         """
@@ -338,12 +317,6 @@ def _assign_elementwise_indice(self, node, idx):
                 nodes_in.append(node_in)
                 self._mark_computation_from_node(node_in, node)
         assert len(nodes_in) <= 2
-        if len(nodes_in) == 2:
-            node_in0_shape = get_node_shape(nodes_in[0])
-            node_in1_shape = get_node_shape(nodes_in[1])
-            for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1):
-                if node_in0_shape[i] == node_in1_shape[i]:
-                    self._mark_indice_equal(nodes_in[0], i, nodes_in[1], i)
 
     def _assgin_no_change_indice(self, node, idx):
         self._assign_indice_as_input(node, idx)

From 0b6af554df09743cfd97245d0b4e9f7819b1764f Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:46:43 +0800
Subject: [PATCH 131/503] remove useless function

---
 colossalai/autochunk/trace_indice.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index a72fd775b9ed..0d09ed9f0e21 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -10,7 +10,6 @@ class TraceIndice(object):
     def __init__(self, node_list) -> None:
         self.node_list = node_list
         self.indice_trace_list = self._init_indice_trace_list()
-        self.indice_trace_equal = []
         self.indice_view_list = {}
         self.indice_count = -1
 
@@ -486,18 +485,6 @@ def _assign_view_reshape_indice(self, node, node_idx):
         }
         self.indice_view_list[node] = view_dict
 
-    def _merge_equal_idx(self):
-        idx_equal = copy.deepcopy(self.indice_trace_equal)
-        idx_equal.reverse()
-        for idx in idx_equal:
-            merge_to = min(idx)
-            merge_from = max(idx)
-            for trace in self.indice_trace_list:
-                if merge_from in trace["indice"]:
-                    trace["indice"] = [
-                        merge_to if i == merge_from else i for i in trace["indice"]
-                    ]
-
     def trace_indice(self):
         for idx, node in enumerate(self.node_list):
             if node.op == "placeholder":

From 53bb8682a2e5a0bfe3e3925d943f13ebc9df879d Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 9 Jan 2023 17:57:57 +0800
Subject: [PATCH 132/503] [worfklow] added coverage test (#2399)

* [worfklow] added coverage test

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 .github/workflows/build.yml        | 3 ++-
 .gitignore                         | 3 +++
 requirements/requirements-test.txt | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 5366f69cc7b0..62d6350d6511 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -20,6 +20,7 @@ jobs:
       - uses: actions/checkout@v2
         with:
           fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
       - name: Find the changed files
         id: find-changed-files
         uses: tj-actions/changed-files@v35
@@ -75,7 +76,7 @@ jobs:
 
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest --cov=. --cov-report lcov tests
         env:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
diff --git a/.gitignore b/.gitignore
index 6b6f980e3392..8e345eeb8388 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,3 +151,6 @@ colossalai/version.py
 
 # ignore python interface defition file
 .pyi
+
+# ignore coverage test file
+converage.lcov
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index f9e8960d2eaf..9ef0a682b6b8 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -1,5 +1,6 @@
 fbgemm-gpu==0.2.0
 pytest
+pytest-cov
 torchvision
 transformers
 timm

From 1be0ac3cbf3bd393c116b78ff64a9b7fea0c3fb8 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 9 Jan 2023 17:59:52 +0800
Subject: [PATCH 133/503] add doc for trace indice

---
 colossalai/autochunk/trace_indice.py | 31 +++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 0d09ed9f0e21..1e16ab9bdf35 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -1,13 +1,34 @@
 import copy
+from typing import Dict, List, Tuple
 
-from .utils import (
-    find_idx_by_name,
-    get_node_shape,
-)
+from torch.fx.node import Node
+
+from .utils import find_idx_by_name, get_node_shape
 
 
 class TraceIndice(object):
-    def __init__(self, node_list) -> None:
+    """
+    Trace all indice information for every node.
+
+    Indice is a logical concept. Equal dims can be treated as one indice.
+    eg. dim(x1) = [a, b, c]
+        dim(x2) = [d, e, f]
+        and we have x3 = x1 * x2.
+        then a=d, b=e, c=f, due to the broadcast property,
+        dim(x1)=dim(x2)=dim(x3)=[a, b, c]
+    This class will record every node's dims' indice, compute and source.
+
+    Attributes:
+        node_list (List)
+        indice_trace_list (List): [{"indice": [...], "compute": [...], "source": [...]}, {...}]
+        indice_view_list (Dict): not used for now
+        indice_count (int): record indice number
+
+    Args:
+        node_list (List)
+    """
+
+    def __init__(self, node_list: List) -> None:
         self.node_list = node_list
         self.indice_trace_list = self._init_indice_trace_list()
         self.indice_view_list = {}

From 8de8de9fa3076e8da8e5a946d4b74f6985364bbb Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 09:26:14 +0800
Subject: [PATCH 134/503] [docker] updated Dockerfile and release workflow
 (#2410)

---
 .github/workflows/release_docker.yml | 20 +++++++-------------
 docker/Dockerfile                    |  5 +++--
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/release_docker.yml b/.github/workflows/release_docker.yml
index 328d232a8356..c72d3fb33edd 100644
--- a/.github/workflows/release_docker.yml
+++ b/.github/workflows/release_docker.yml
@@ -18,23 +18,17 @@ jobs:
         with:
           fetch-depth: 0
       - name: Build Docker
+        id: build
         run: |
           version=$(cat version.txt)
-          docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 -t hpcaitech/colossalai:$version ./docker
+          tag=hpcaitech/colossalai:$version
+          docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 -t $tag ./docker
+          echo "tag=${tag}" >> $GITHUB_OUTPUT
       - name: Log in to Docker Hub
         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      - name: Extract metadata (tags, labels) for Docker
-        id: meta
-        uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
-        with:
-          images: hpcaitech/colossalai
-      - name: Build and push Docker image
-        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
-        with:
-          context: .
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
+      - name: Push Docker image
+        run: |
+          docker push ${{ steps.build.outputs.tag }}
diff --git a/docker/Dockerfile b/docker/Dockerfile
index bcb7c0fffbb3..0faba17b9cee 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,17 +1,18 @@
 FROM hpcaitech/cuda-conda:11.3
 
 # install torch
-RUN conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
+RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
 
 # install apex
 RUN git clone https://github.com/NVIDIA/apex && \
     cd apex && \
+    pip install packaging && \
     pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./
 
 # install colossalai
 RUN git clone https://github.com/hpcaitech/ColossalAI.git \
     && cd ./ColossalAI \
-    && pip install -v --no-cache-dir .
+    && CUDA_EXT=1 pip install -v --no-cache-dir .
 
 # install titans
 RUN pip install --no-cache-dir titans

From 7d4abaa5257758011f0f4ba1c5943f492e650a55 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 09:59:47 +0800
Subject: [PATCH 135/503] add doc

---
 colossalai/autochunk/autochunk_codegen.py | 99 ++++++++++++++++++++---
 colossalai/autochunk/estimate_memory.py   | 22 ++++-
 colossalai/autochunk/reorder_graph.py     |  8 +-
 3 files changed, 113 insertions(+), 16 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 6e0cfb9cb2e7..73b6bf52460b 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -20,11 +20,22 @@
 from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape
 
 
-def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
+def _gen_chunk_slice_dim(chunk_dim: int, chunk_indice_name: str, shape: List) -> str:
+    """
+    Generate chunk slice string, eg. [:, :, chunk_indice_name:chunk_indice_name + chunk_size, :]
+
+    Args:
+        chunk_dim (int)
+        chunk_indice_name (str): chunk indice name
+        shape (List): node shape
+
+    Returns:
+        new_shape (str): return slice
+    """
     new_shape = "["
-    for idx, i in enumerate(shape):
+    for idx, _ in enumerate(shape):
         if idx == chunk_dim:
-            new_shape += "%s:%s + chunk_size" % (chunk_idx_name, chunk_idx_name)
+            new_shape += "%s:%s + chunk_size" % (chunk_indice_name, chunk_indice_name)
         else:
             new_shape += ":"
         new_shape += ", "
@@ -32,7 +43,26 @@ def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):
     return new_shape
 
 
-def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2):
+def _gen_loop_start(
+    chunk_input: List[Node], chunk_output: Node, chunk_ouput_dim: int, chunk_size=2
+) -> str:
+    """
+    Generate chunk loop start
+
+    eg. chunk_result = torch.empty([100, 100], dtype=input_node.dtype, device=input_node.device)
+        chunk_size = 32
+        for chunk_idx in range(0, 100, 32):
+            ......
+
+    Args:
+        chunk_input (List[Node]): chunk input node
+        chunk_output (Node): chunk output node
+        chunk_ouput_dim (int): chunk output node chunk dim
+        chunk_size (int): chunk size. Defaults to 2.
+
+    Returns:
+        context (str): generated str
+    """
     input_node = chunk_input[0]
     out_shape = get_node_shape(chunk_output)
     out_str = str(list(out_shape))
@@ -45,8 +75,28 @@ def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2):
 
 
 def _gen_loop_end(
-    chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list
-):
+    chunk_inputs: List[Node],
+    chunk_non_compute_inputs: List[Node],
+    chunk_outputs: Node,
+    chunk_outputs_dim: int,
+    node_list: List[Node],
+) -> str:
+    """
+    Generate chunk loop end
+
+    eg.     chunk_result[chunk_idx:chunk_idx + chunk_size] = output_node
+        output_node = chunk_result; xx = None; xx = None
+
+    Args:
+        chunk_inputs (List[Node]): chunk input node
+        chunk_non_compute_inputs (List[Node]): input node without chunk
+        chunk_outputs (Node): chunk output node
+        chunk_outputs_dim (int): chunk output node chunk dim
+        node_list (List)
+
+    Returns:
+        context (str): generated str
+    """    
     chunk_outputs_name = chunk_outputs.name
     chunk_outputs_idx = find_idx_by_name(chunk_outputs_name, node_list)
     chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
@@ -76,7 +126,10 @@ def _gen_loop_end(
     return context
 
 
-def _replace_name(context, name_from, name_to):
+def _replace_name(context: str, name_from: str, name_to: str) -> str:
+    """
+    replace node name
+    """
     patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")]
     for p in patterns:
         source = p[0] + name_from + p[1]
@@ -86,7 +139,10 @@ def _replace_name(context, name_from, name_to):
     return context
 
 
-def _replace_reshape_size(context, node_name, reshape_size_dict):
+def _replace_reshape_size(context: str, node_name: str, reshape_size_dict: Dict) -> str:
+    """
+    replace reshape size, some may have changed due to chunk
+    """
     if node_name not in reshape_size_dict:
         return context
     for size_name, size_value in reshape_size_dict[node_name].items():
@@ -94,7 +150,17 @@ def _replace_reshape_size(context, node_name, reshape_size_dict):
     return context
 
 
-def _replace_ones_like(search_chunk: SearchChunk, chunk_infos, region_idx, node_idx, node, body):
+def _replace_ones_like(
+    search_chunk: SearchChunk,
+    chunk_infos: List[Dict],
+    region_idx: int,
+    node_idx: int,
+    node: Node,
+    body: List[str],
+) -> List[str]:
+    """
+    add chunk slice for new tensor op such as ones like
+    """
     if "ones_like" in node.name:
         meta_node = search_chunk.trace_indice.node_list[node_idx]
         chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"]
@@ -114,7 +180,16 @@ def _replace_ones_like(search_chunk: SearchChunk, chunk_infos, region_idx, node_
     return body
 
 
-def _replace_input_var(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body):
+def _replace_input_node(
+    chunk_inputs: List[Node],
+    region_idx: int,
+    chunk_inputs_dim: Dict,
+    node_idx: int,
+    body: List[str],
+) -> List[str]:
+    """
+    add chunk slice for input nodes
+    """
     for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
         for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
             if idx == node_idx:
@@ -138,7 +213,7 @@ def emit_code_with_chunk(
     """
     Emit code with chunk according to chunk_infos.
 
-    It will generate a for loop in chunk regions, and 
+    It will generate a for loop in chunk regions, and
     replace inputs and outputs of regions with chunked variables.
 
     Args:
@@ -193,7 +268,7 @@ def emit_code_with_chunk(
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
-            body = _replace_input_var(
+            body = _replace_input_node(
                 chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body
             )
             # ones like
diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py
index 90cfd66a00d5..62b23cf9fc93 100644
--- a/colossalai/autochunk/estimate_memory.py
+++ b/colossalai/autochunk/estimate_memory.py
@@ -15,6 +15,10 @@
 
 
 class EstimateMemory(object):
+    """
+    Estimate memory with chunk
+    """
+
     def __init__(self) -> None:
         pass
 
@@ -31,8 +35,6 @@ def _get_output_node(self, n):
         }
         out_size = activation_size(fwd_out)
         out_node = [n.name] if out_size > 0 else []
-        # if any(i in n.name for i in ['transpose', 'permute', 'view']):
-        #     out_size = 0
         return out_size, out_node
 
     def _get_output_node_size(self, n):
@@ -184,10 +186,24 @@ def _print_compute_op_mem_log(self, log, nodes, title=None):
 
     def estimate_chunk_inference_mem(
         self,
-        node_list,
+        node_list: List,
         chunk_infos=None,
         print_mem=False,
     ):
+        """
+        Estimate inference memory with chunk
+
+        Args:
+            node_list (List)
+            chunk_infos (Dict): Chunk information. Defaults to None.
+            print_mem (bool): Whether to print peak memory of every node. Defaults to False.
+
+        Returns:
+            act_memory_peak_log (List): peak memory of every node
+            act_memory_after_node_log (List): memory after executing every node
+            active_node_list_log (List): active nodes of every node. active nodes refer to 
+                nodes generated but not deleted.
+        """
         act_memory = 0.0
         act_memory_peak_log = []
         act_memory_after_node_log = []
diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py
index 2ece0126e91b..0343e52eedd6 100644
--- a/colossalai/autochunk/reorder_graph.py
+++ b/colossalai/autochunk/reorder_graph.py
@@ -3,6 +3,10 @@
 
 
 class ReorderGraph(object):
+    """
+    Reorder node list and indice trace list
+    """
+
     def __init__(self, trace_indice: TraceIndice) -> None:
         self.trace_indice = trace_indice
         self.all_reorder_map = {
@@ -60,7 +64,9 @@ def _reorder_self_node_list(self, reorder_map):
 
     def _reorder_idx_trace(self, reorder_map):
         # reorder list
-        new_idx_trace_list = [None for _ in range(len(self.trace_indice.indice_trace_list))]
+        new_idx_trace_list = [
+            None for _ in range(len(self.trace_indice.indice_trace_list))
+        ]
         for old_idx, new_idx in reorder_map.items():
             new_idx_trace_list[new_idx] = self.trace_indice.indice_trace_list[old_idx]
         self.trace_indice.indice_trace_list = new_idx_trace_list

From 615e7e68d9bc00e1b29879e73df4cca8afdd907d Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 10:44:07 +0800
Subject: [PATCH 136/503] update doc

---
 colossalai/autochunk/search_chunk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 590567045507..c9e5e5172274 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -88,7 +88,7 @@ def _search_max_chunk_region(
         Args:
             active_node (List): active node status for every node
             peak_node (Node): peak memory node
-            chunk_regions (List): chunk region info
+            chunk_regions (List): chunk region infos
 
         Returns:
             chunk_region_start (int)

From a591d45b2994b02399dda171bd2e20723361b991 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 10:56:39 +0800
Subject: [PATCH 137/503] add available

---
 colossalai/autochunk/autochunk_codegen.py | 524 +++++++++++-----------
 1 file changed, 268 insertions(+), 256 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 73b6bf52460b..1ee1d818a253 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -1,21 +1,25 @@
 from typing import Any, Dict, Iterable, List, Tuple
 
 import torch
-from torch.fx.graph import (
-    CodeGen,
-    PythonCode,
-    _custom_builtins,
-    _CustomBuiltin,
-    _format_target,
-    _is_from_torch,
-    _Namespace,
-    _origin_type_map,
-    inplace_methods,
-    magic_methods,
-)
-from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
 
 import colossalai
+from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
+
+if CODEGEN_AVAILABLE:
+    from torch.fx.graph import (
+        CodeGen,
+        PythonCode,
+        _custom_builtins,
+        _CustomBuiltin,
+        _format_target,
+        _is_from_torch,
+        _Namespace,
+        _origin_type_map,
+        inplace_methods,
+        magic_methods,
+    )
+    from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
+
 from .search_chunk import SearchChunk
 from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape
 
@@ -96,7 +100,7 @@ def _gen_loop_end(
 
     Returns:
         context (str): generated str
-    """    
+    """
     chunk_outputs_name = chunk_outputs.name
     chunk_outputs_idx = find_idx_by_name(chunk_outputs_name, node_list)
     chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
@@ -302,279 +306,287 @@ def emit_code_with_chunk(
         node_idx += 1
 
 
-class AutoChunkCodeGen(CodeGen):
-    def __init__(self, meta_graph, max_memory=None, print_mem=False):
-        super().__init__()
-        self.meta_graph = meta_graph
-        self.max_memory = max_memory
-        self.meta_node = list(meta_graph.graph.nodes)
-        # find the chunk regions
-        self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem)
-        self.chunk_infos = self.search_chunk.search_region()
+if CODEGEN_AVAILABLE:
 
-    def _gen_python_code(
-        self, nodes, root_module: str, namespace: _Namespace
-    ) -> PythonCode:
-        free_vars: List[str] = []
-        body: List[str] = []
-        globals_: Dict[str, Any] = {}
-        wrapped_fns: Dict[str, None] = {}
+    class AutoChunkCodeGen(CodeGen):
+        def __init__(self, meta_graph, max_memory=None, print_mem=False):
+            super().__init__()
+            self.meta_graph = meta_graph
+            self.max_memory = max_memory
+            self.meta_node = list(meta_graph.graph.nodes)
+            # find the chunk regions
+            self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem)
+            self.chunk_infos = self.search_chunk.search_region()
 
-        # Wrap string in list to pass by reference
-        maybe_return_annotation: List[str] = [""]
+        def _gen_python_code(
+            self, nodes, root_module: str, namespace: _Namespace
+        ) -> PythonCode:
+            free_vars: List[str] = []
+            body: List[str] = []
+            globals_: Dict[str, Any] = {}
+            wrapped_fns: Dict[str, None] = {}
 
-        def add_global(name_hint: str, obj: Any):
-            """Add an obj to be tracked as a global.
+            # Wrap string in list to pass by reference
+            maybe_return_annotation: List[str] = [""]
 
-            We call this for names that reference objects external to the
-            Graph, like functions or types.
+            def add_global(name_hint: str, obj: Any):
+                """Add an obj to be tracked as a global.
 
-            Returns: the global name that should be used to reference 'obj' in generated source.
-            """
-            if (
-                _is_from_torch(obj) and obj != torch.device
-            ):  # to support registering torch.device
-                # HACK: workaround for how torch custom ops are registered. We
-                # can't import them like normal modules so they must retain their
-                # fully qualified name.
-                return _get_qualified_name(obj)
-
-            # normalize the name hint to get a proper identifier
-            global_name = namespace.create_name(name_hint, obj)
-
-            if global_name in globals_:
-                assert globals_[global_name] is obj
+                We call this for names that reference objects external to the
+                Graph, like functions or types.
+
+                Returns: the global name that should be used to reference 'obj' in generated source.
+                """
+                if (
+                    _is_from_torch(obj) and obj != torch.device
+                ):  # to support registering torch.device
+                    # HACK: workaround for how torch custom ops are registered. We
+                    # can't import them like normal modules so they must retain their
+                    # fully qualified name.
+                    return _get_qualified_name(obj)
+
+                # normalize the name hint to get a proper identifier
+                global_name = namespace.create_name(name_hint, obj)
+
+                if global_name in globals_:
+                    assert globals_[global_name] is obj
+                    return global_name
+                globals_[global_name] = obj
                 return global_name
-            globals_[global_name] = obj
-            return global_name
 
-        # set _custom_builtins here so that we needn't import colossalai in forward
-        _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)
+            # set _custom_builtins here so that we needn't import colossalai in forward
+            _custom_builtins["colossalai"] = _CustomBuiltin(
+                "import colossalai", colossalai
+            )
+
+            # Pre-fill the globals table with registered builtins.
+            for name, (_, obj) in _custom_builtins.items():
+                add_global(name, obj)
 
-        # Pre-fill the globals table with registered builtins.
-        for name, (_, obj) in _custom_builtins.items():
-            add_global(name, obj)
+            def type_repr(o: Any):
+                if o == ():
+                    # Empty tuple is used for empty tuple type annotation Tuple[()]
+                    return "()"
 
-        def type_repr(o: Any):
-            if o == ():
-                # Empty tuple is used for empty tuple type annotation Tuple[()]
-                return "()"
+                typename = _type_repr(o)
 
-            typename = _type_repr(o)
+                if hasattr(o, "__origin__"):
+                    # This is a generic type, e.g. typing.List[torch.Tensor]
+                    origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
+                    origin_typename = add_global(_type_repr(origin_type), origin_type)
 
-            if hasattr(o, "__origin__"):
-                # This is a generic type, e.g. typing.List[torch.Tensor]
-                origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
-                origin_typename = add_global(_type_repr(origin_type), origin_type)
+                    if hasattr(o, "__args__"):
+                        # Assign global names for each of the inner type variables.
+                        args = [type_repr(arg) for arg in o.__args__]
 
-                if hasattr(o, "__args__"):
-                    # Assign global names for each of the inner type variables.
-                    args = [type_repr(arg) for arg in o.__args__]
+                        if len(args) == 0:
+                            # Bare type, such as `typing.Tuple` with no subscript
+                            # This code-path used in Python < 3.9
+                            return origin_typename
 
-                    if len(args) == 0:
+                        return f'{origin_typename}[{",".join(args)}]'
+                    else:
                         # Bare type, such as `typing.Tuple` with no subscript
-                        # This code-path used in Python < 3.9
+                        # This code-path used in Python 3.9+
                         return origin_typename
 
-                    return f'{origin_typename}[{",".join(args)}]'
+                # Common case: this is a regular module name like 'foo.bar.baz'
+                return add_global(typename, o)
+
+            def _format_args(
+                args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
+            ) -> str:
+                def _get_repr(arg):
+                    # Handle NamedTuples (if it has `_fields`) via add_global.
+                    if isinstance(arg, tuple) and hasattr(arg, "_fields"):
+                        qualified_name = _get_qualified_name(type(arg))
+                        global_name = add_global(qualified_name, type(arg))
+                        return f"{global_name}{repr(tuple(arg))}"
+                    return repr(arg)
+
+                args_s = ", ".join(_get_repr(a) for a in args)
+                kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items())
+                if args_s and kwargs_s:
+                    return f"{args_s}, {kwargs_s}"
+                return args_s or kwargs_s
+
+            # Run through reverse nodes and record the first instance of a use
+            # of a given node. This represents the *last* use of the node in the
+            # execution order of the program, which we will use to free unused
+            # values
+            node_to_last_use: Dict[Node, Node] = {}
+            user_to_last_uses: Dict[Node, List[Node]] = {}
+
+            def register_last_uses(n: Node, user: Node):
+                if n not in node_to_last_use:
+                    node_to_last_use[n] = user
+                    user_to_last_uses.setdefault(user, []).append(n)
+
+            for node in reversed(nodes):
+                map_arg(node.args, lambda n: register_last_uses(n, node))
+                map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+            delete_free_var_from_last_use(user_to_last_uses)
+
+            # NOTE: we add a variable to distinguish body and ckpt_func
+            def delete_unused_values(user: Node, body, to_keep=[]):
+                """
+                Delete values after their last use. This ensures that values that are
+                not used in the remainder of the code are freed and the memory usage
+                of the code is optimal.
+                """
+                if user.op == "placeholder":
+                    return
+                if user.op == "output":
+                    body.append("\n")
+                    return
+                nodes_to_delete = user_to_last_uses.get(user, [])
+                nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
+                if len(nodes_to_delete):
+                    to_delete_str = " = ".join(
+                        [repr(n) for n in nodes_to_delete] + ["None"]
+                    )
+                    body.append(f";  {to_delete_str}\n")
                 else:
-                    # Bare type, such as `typing.Tuple` with no subscript
-                    # This code-path used in Python 3.9+
-                    return origin_typename
-
-            # Common case: this is a regular module name like 'foo.bar.baz'
-            return add_global(typename, o)
-
-        def _format_args(
-            args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
-        ) -> str:
-            def _get_repr(arg):
-                # Handle NamedTuples (if it has `_fields`) via add_global.
-                if isinstance(arg, tuple) and hasattr(arg, "_fields"):
-                    qualified_name = _get_qualified_name(type(arg))
-                    global_name = add_global(qualified_name, type(arg))
-                    return f"{global_name}{repr(tuple(arg))}"
-                return repr(arg)
-
-            args_s = ", ".join(_get_repr(a) for a in args)
-            kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items())
-            if args_s and kwargs_s:
-                return f"{args_s}, {kwargs_s}"
-            return args_s or kwargs_s
-
-        # Run through reverse nodes and record the first instance of a use
-        # of a given node. This represents the *last* use of the node in the
-        # execution order of the program, which we will use to free unused
-        # values
-        node_to_last_use: Dict[Node, Node] = {}
-        user_to_last_uses: Dict[Node, List[Node]] = {}
-
-        def register_last_uses(n: Node, user: Node):
-            if n not in node_to_last_use:
-                node_to_last_use[n] = user
-                user_to_last_uses.setdefault(user, []).append(n)
-
-        for node in reversed(nodes):
-            map_arg(node.args, lambda n: register_last_uses(n, node))
-            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-
-        delete_free_var_from_last_use(user_to_last_uses)
-
-        # NOTE: we add a variable to distinguish body and ckpt_func
-        def delete_unused_values(user: Node, body, to_keep=[]):
-            """
-            Delete values after their last use. This ensures that values that are
-            not used in the remainder of the code are freed and the memory usage
-            of the code is optimal.
-            """
-            if user.op == "placeholder":
-                return
-            if user.op == "output":
-                body.append("\n")
-                return
-            nodes_to_delete = user_to_last_uses.get(user, [])
-            nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
-            if len(nodes_to_delete):
-                to_delete_str = " = ".join(
-                    [repr(n) for n in nodes_to_delete] + ["None"]
-                )
-                body.append(f";  {to_delete_str}\n")
-            else:
-                body.append("\n")
+                    body.append("\n")
 
-        # NOTE: we add a variable to distinguish body and ckpt_func
-        def emit_node(node: Node, body):
-            maybe_type_annotation = (
-                "" if node.type is None else f" : {type_repr(node.type)}"
-            )
-            if node.op == "placeholder":
-                assert isinstance(node.target, str)
-                maybe_default_arg = "" if not node.args else f" = {repr(node.args[0])}"
-                free_vars.append(
-                    f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
+            # NOTE: we add a variable to distinguish body and ckpt_func
+            def emit_node(node: Node, body):
+                maybe_type_annotation = (
+                    "" if node.type is None else f" : {type_repr(node.type)}"
                 )
-                raw_name = node.target.replace("*", "")
-                if raw_name != repr(node):
-                    body.append(f"{repr(node)} = {raw_name}\n")
-                return
-            elif node.op == "call_method":
-                assert isinstance(node.target, str)
-                body.append(
-                    f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
-                    f"({_format_args(node.args[1:], node.kwargs)})"
-                )
-                return
-            elif node.op == "call_function":
-                assert callable(node.target)
-                # pretty print operators
-                if (
-                    node.target.__module__ == "_operator"
-                    and node.target.__name__ in magic_methods
-                ):
-                    assert isinstance(node.args, tuple)
+                if node.op == "placeholder":
+                    assert isinstance(node.target, str)
+                    maybe_default_arg = (
+                        "" if not node.args else f" = {repr(node.args[0])}"
+                    )
+                    free_vars.append(
+                        f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
+                    )
+                    raw_name = node.target.replace("*", "")
+                    if raw_name != repr(node):
+                        body.append(f"{repr(node)} = {raw_name}\n")
+                    return
+                elif node.op == "call_method":
+                    assert isinstance(node.target, str)
                     body.append(
-                        f"{repr(node)}{maybe_type_annotation} = "
-                        f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
+                        f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
+                        f"({_format_args(node.args[1:], node.kwargs)})"
                     )
                     return
-
-                # pretty print inplace operators; required for jit.script to work properly
-                # not currently supported in normal FX graphs, but generated by torchdynamo
-                if (
-                    node.target.__module__ == "_operator"
-                    and node.target.__name__ in inplace_methods
-                ):
+                elif node.op == "call_function":
+                    assert callable(node.target)
+                    # pretty print operators
+                    if (
+                        node.target.__module__ == "_operator"
+                        and node.target.__name__ in magic_methods
+                    ):
+                        assert isinstance(node.args, tuple)
+                        body.append(
+                            f"{repr(node)}{maybe_type_annotation} = "
+                            f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
+                        )
+                        return
+
+                    # pretty print inplace operators; required for jit.script to work properly
+                    # not currently supported in normal FX graphs, but generated by torchdynamo
+                    if (
+                        node.target.__module__ == "_operator"
+                        and node.target.__name__ in inplace_methods
+                    ):
+                        body.append(
+                            f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
+                            f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
+                        )
+                        return
+
+                    qualified_name = _get_qualified_name(node.target)
+                    global_name = add_global(qualified_name, node.target)
+                    # special case for getattr: node.args could be 2-argument or 3-argument
+                    # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
+                    if (
+                        global_name == "getattr"
+                        and isinstance(node.args, tuple)
+                        and isinstance(node.args[1], str)
+                        and node.args[1].isidentifier()
+                        and len(node.args) == 2
+                    ):
+                        body.append(
+                            f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
+                        )
+                        return
                     body.append(
-                        f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
-                        f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
+                        f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
                     )
+                    if node.meta.get("is_wrapped", False):
+                        wrapped_fns.setdefault(global_name)
                     return
-
-                qualified_name = _get_qualified_name(node.target)
-                global_name = add_global(qualified_name, node.target)
-                # special case for getattr: node.args could be 2-argument or 3-argument
-                # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
-                if (
-                    global_name == "getattr"
-                    and isinstance(node.args, tuple)
-                    and isinstance(node.args[1], str)
-                    and node.args[1].isidentifier()
-                    and len(node.args) == 2
-                ):
+                elif node.op == "call_module":
+                    assert isinstance(node.target, str)
                     body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
+                        f"{repr(node)}{maybe_type_annotation} = "
+                        f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
                     )
                     return
-                body.append(
-                    f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
-                )
-                if node.meta.get("is_wrapped", False):
-                    wrapped_fns.setdefault(global_name)
-                return
-            elif node.op == "call_module":
-                assert isinstance(node.target, str)
-                body.append(
-                    f"{repr(node)}{maybe_type_annotation} = "
-                    f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
-                )
-                return
-            elif node.op == "get_attr":
-                assert isinstance(node.target, str)
-                body.append(
-                    f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
+                elif node.op == "get_attr":
+                    assert isinstance(node.target, str)
+                    body.append(
+                        f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
+                    )
+                    return
+                elif node.op == "output":
+                    if node.type is not None:
+                        maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
+                    body.append(self.generate_output(node.args[0]))
+                    return
+                raise NotImplementedError(f"node: {node.op} {node.target}")
+
+            # Modified for activation checkpointing
+            ckpt_func = []
+
+            # if any node has a list of labels for activation_checkpoint, we
+            # will use nested type of activation checkpoint codegen
+            emit_code_with_chunk(
+                body,
+                nodes,
+                emit_node,
+                delete_unused_values,
+                self.search_chunk,
+                self.chunk_infos,
+            )
+
+            if len(body) == 0:
+                # If the Graph has no non-placeholder nodes, no lines for the body
+                # have been emitted. To continue to have valid Python code, emit a
+                # single pass statement
+                body.append("pass\n")
+
+            if len(wrapped_fns) > 0:
+                wrap_name = add_global("wrap", torch.fx.wrap)
+                wrap_stmts = "\n".join(
+                    [f'{wrap_name}("{name}")' for name in wrapped_fns]
                 )
-                return
-            elif node.op == "output":
-                if node.type is not None:
-                    maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
-                body.append(self.generate_output(node.args[0]))
-                return
-            raise NotImplementedError(f"node: {node.op} {node.target}")
-
-        # Modified for activation checkpointing
-        ckpt_func = []
-
-        # if any node has a list of labels for activation_checkpoint, we
-        # will use nested type of activation checkpoint codegen
-        emit_code_with_chunk(
-            body,
-            nodes,
-            emit_node,
-            delete_unused_values,
-            self.search_chunk,
-            self.chunk_infos,
-        )
-
-        if len(body) == 0:
-            # If the Graph has no non-placeholder nodes, no lines for the body
-            # have been emitted. To continue to have valid Python code, emit a
-            # single pass statement
-            body.append("pass\n")
-
-        if len(wrapped_fns) > 0:
-            wrap_name = add_global("wrap", torch.fx.wrap)
-            wrap_stmts = "\n".join([f'{wrap_name}("{name}")' for name in wrapped_fns])
-        else:
-            wrap_stmts = ""
+            else:
+                wrap_stmts = ""
 
-        if self._body_transformer:
-            body = self._body_transformer(body)
+            if self._body_transformer:
+                body = self._body_transformer(body)
 
-        for name, value in self.additional_globals():
-            add_global(name, value)
+            for name, value in self.additional_globals():
+                add_global(name, value)
 
-        # as we need colossalai.utils.checkpoint, we need to import colossalai
-        # in forward function
-        prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
-        prologue = "".join(ckpt_func) + prologue
-        prologue = prologue
+            # as we need colossalai.utils.checkpoint, we need to import colossalai
+            # in forward function
+            prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
+            prologue = "".join(ckpt_func) + prologue
+            prologue = prologue
 
-        code = "".join(body)
-        code = "\n".join("    " + line for line in code.split("\n"))
-        fn_code = f"""
-{wrap_stmts}
+            code = "".join(body)
+            code = "\n".join("    " + line for line in code.split("\n"))
+            fn_code = f"""
+{wrap_stmts}
 
-{prologue}
-{code}"""
-        # print(fn_code)
-        return PythonCode(fn_code, globals_)
+{prologue}
+{code}"""
+            # print(fn_code)
+            return PythonCode(fn_code, globals_)

From fd818cf14423489714cd3fb19c703a5b40271e17 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 11:10:45 +0800
Subject: [PATCH 138/503] change imports

---
 colossalai/autochunk/autochunk_codegen.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 1ee1d818a253..14f17b1d37ba 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -18,7 +18,8 @@
         inplace_methods,
         magic_methods,
     )
-    from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
+
+from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
 
 from .search_chunk import SearchChunk
 from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape

From c1492e5013709e49093e497c3b7a6ec4bb10b9d4 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 11:20:28 +0800
Subject: [PATCH 139/503] add test in import

---
 tests/test_autochunk/test_autochunk_codegen.py | 4 +++-
 tests/test_autochunk/test_autochunk_search.py  | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index 53f62077c07a..28999706b20a 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -6,7 +6,6 @@
 import torch.multiprocessing as mp
 
 import colossalai
-from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.core import global_context as gpc
 from colossalai.fx import ColoTracer
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
@@ -16,6 +15,9 @@
 from colossalai.utils import free_port
 from tests.test_autochunk.evoformer.evoformer import evoformer_base
 
+if CODEGEN_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+
 
 def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
     # for memory test
diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py
index 5026c3ad3b3d..eb2bf4560e2c 100644
--- a/tests/test_autochunk/test_autochunk_search.py
+++ b/tests/test_autochunk/test_autochunk_search.py
@@ -6,7 +6,6 @@
 import torch.multiprocessing as mp
 
 import colossalai
-from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.core import global_context as gpc
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
@@ -14,6 +13,9 @@
 from colossalai.utils import free_port
 from tests.test_autochunk.evoformer.evoformer import evoformer_base
 
+if CODEGEN_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+
 
 def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
     found_regions = [i["region"] for i in chunk_infos]

From 8327932d2c2e2169422c8e9428983f780c55983d Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 11:26:19 +0800
Subject: [PATCH 140/503] [workflow] refactored the example check workflow
 (#2411)

* [workflow] refactored the example check workflow

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 ...eekly_check.yml => auto_example_check.yml} | 75 +++++++++++--------
 ...example.yml => dispatch_example_check.yml} | 44 +++++------
 .../example_checks/check_dispatch_inputs.py   | 27 +++++++
 .../check_example_weekly.py}                  |  9 +--
 .../detect_changed_example.py}                | 11 ++-
 .../workflows/scripts/input_check_example.py  | 23 ------
 examples/tutorial/hybrid_parallel/config.py   |  4 +-
 .../tutorial/hybrid_parallel/requirements.txt |  1 +
 examples/tutorial/hybrid_parallel/test_ci.sh  |  5 ++
 examples/tutorial/hybrid_parallel/train.py    |  6 +-
 10 files changed, 113 insertions(+), 92 deletions(-)
 rename .github/workflows/{changed_file_trigger_examples_check_and_weekly_check.yml => auto_example_check.yml} (62%)
 rename .github/workflows/{workflow_dispatch_example.yml => dispatch_example_check.yml} (57%)
 create mode 100644 .github/workflows/scripts/example_checks/check_dispatch_inputs.py
 rename .github/workflows/scripts/{weekly_check_example.py => example_checks/check_example_weekly.py} (76%)
 rename .github/workflows/scripts/{changed_example.py => example_checks/detect_changed_example.py} (52%)
 delete mode 100644 .github/workflows/scripts/input_check_example.py
 create mode 100644 examples/tutorial/hybrid_parallel/test_ci.sh

diff --git a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml b/.github/workflows/auto_example_check.yml
similarity index 62%
rename from .github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml
rename to .github/workflows/auto_example_check.yml
index 2b7ec31252e4..7f1e357e33e8 100644
--- a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml
+++ b/.github/workflows/auto_example_check.yml
@@ -1,7 +1,7 @@
 name: Test Example
 on:
   pull_request:
-    # So only the changes in examples folder will trigger jobs below.
+    # any change in the examples folder will trigger check for the corresponding example.
     paths:
       - 'examples/**'
   # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
@@ -17,12 +17,14 @@ jobs:
         github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
     runs-on: ubuntu-latest
     outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-    name: Check out all files
+      matrix: ${{ steps.setup-matrix.outputs.matrix }}
+      anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
+    name: Detect changed example files
     steps:
       - uses: actions/checkout@v3
         with:
-          fetch-depth: 2
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
       - name: Get all changed example files
         id: changed-files
         uses: tj-actions/changed-files@v35
@@ -30,46 +32,53 @@ jobs:
         with:
           since_last_remote_commit: true
       - name: setup matrix
-        id: set-matrix
+        id: setup-matrix
         run: |
           changedFileName=""
           for file in ${{ steps.changed-files.outputs.all_changed_files  }}; do
             changedFileName="${file}:${changedFileName}"
           done
           echo "$changedFileName was changed"
-          res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName`
-          echo "All changed files are $res"
-          loc=$( IFS=',' ; echo "${res[*]}" )
-          echo "$loc"
-          echo "::set-output name=matrix::{\"loc\":$(echo "$loc")}"
+          res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
+          echo "All changed examples are $res"
+
+          if [ "$res" = "[]" ]; then
+            echo "anyChanged=false" >> $GITHUB_OUTPUT
+            echo "matrix=null" >> $GITHUB_OUTPUT
+          else
+            dirs=$( IFS=',' ; echo "${res[*]}" )
+            echo "anyChanged=true" >> $GITHUB_OUTPUT
+            echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
+          fi
 
   # If no file is changed, it will prompt an error and shows the matrix do not have value.
-  check-all-changed-files:
+  check-changed-example:
     # Add this condition to avoid executing this job if the trigger event is workflow_dispatch.
     if: |
         github.event.pull_request.draft == false &&
         github.base_ref == 'main' &&
         github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
-    name: Test each changed example files
+    name: Test the changed example
     needs: detect-changed-example
     runs-on: [self-hosted, gpu]
     strategy:
       matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/examples-data:/data/
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: Install dependancies
+      - name: Install Colossal-AI
         run: |
-          pip install -r ./requirements/requirements.txt
-          pip install colossalai
-      - name: List all changed example files
+          pip install -v .
+      - name: Test the example
         run: |
-          res=${{ matrix.loc }}
-          cd "${PWD}/examples/${res}"
+          example_dir=${{ matrix.directory }}
+          cd "${PWD}/examples/${example_dir}"
           bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
 
   # This is for all files' weekly check. Specifically, this job is to find all the directories.
   matrix_preparation:
@@ -77,20 +86,20 @@ jobs:
         github.event.pull_request.draft == false &&
         github.base_ref == 'main' &&
         github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule'
-    name: Prepare Directory List for All files
+    name: Prepare matrix for weekly check
     runs-on: ubuntu-latest
     outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      matrix: ${{ steps.setup-matrix.outputs.matrix }}
     steps:
     - name: 📚 Checkout
       uses: actions/checkout@v3
     - name: setup matrix
-      id: set-matrix
+      id: setup-matrix
       run: |
-        res=`python .github/workflows/scripts/weekly_check_example.py`
+        res=`python .github/workflows/scripts/example_checks/check_example_weekly.py`
         all_loc=$( IFS=',' ; echo "${res[*]}" )
-        echo "$all_loc"
-        echo "::set-output name=matrix::{\"all_loc\":$(echo "$all_loc")}"
+        echo "Found the examples: $all_loc"
+        echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT
 
   weekly_check:
     if: |
@@ -104,16 +113,18 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+    timeout-minutes: 10
     steps:
       - name: 📚 Checkout
         uses: actions/checkout@v3
-      - name: Install the requirements
+      - name: Install Colossal-AI
         run: |
-          pip install -r ./requirements/requirements.txt
-          pip install colossalai
+          pip install -v .
       - name: Traverse all files
         run: |
-          dir=${{ matrix.all_loc }}
-          echo "${dir} is current directory"
-          cd "${PWD}/examples/${dir}"
+          example_dir=${{ matrix.directory }}
+          echo "Testing ${example_dir} now"
+          cd "${PWD}/examples/${example_dir}"
           bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/workflow_dispatch_example.yml b/.github/workflows/dispatch_example_check.yml
similarity index 57%
rename from .github/workflows/workflow_dispatch_example.yml
rename to .github/workflows/dispatch_example_check.yml
index d9d5769109a3..e0333422f50d 100644
--- a/.github/workflows/workflow_dispatch_example.yml
+++ b/.github/workflows/dispatch_example_check.yml
@@ -8,7 +8,7 @@ on:
         required: true
 
 jobs:
-  manual_check_matrix_preparation:
+  matrix_preparation:
     if: |
         github.event.pull_request.draft == false &&
         github.base_ref == 'main' &&
@@ -16,31 +16,24 @@ jobs:
     name: Check the examples user want
     runs-on: ubuntu-latest
     outputs:
-      matrix: ${{ steps.set-matrix-1.outputs.matrix }}
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
     - name: 📚 Checkout
       uses: actions/checkout@v3
-    - name: Get manual directories
-      id: set-matrix-1
+    - name: Set up matrix
+      id: set-matrix
       env:
         check_dir: ${{ inputs.example_directory }}
       run: |
-        all_mannual_check_dir=()
-        for cdi in $check_dir
-        do
-          all_mannual_check_dir+=("\"${cdi}\"")
-        done
-        man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" )
-        res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc`
-        echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist."
-        if [ res == -1 ];then
-           exit(1)
+        res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir`
+        if [ "$res" == "failure" ];then
+          exit -1
         fi
-        man_loc="[${man_loc}]"
-        echo "$man_loc"
-        echo "::set-output name=matrix::{\"man_loc\":$(echo "$man_loc")}"
+        dirs="[${check_dir}]"
+        echo "Testing examples in $dirs"
+        echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
 
-  manual_check:
+  test_example:
     if: |
         github.event.pull_request.draft == false &&
         github.base_ref == 'main' &&
@@ -52,16 +45,19 @@ jobs:
       matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/examples-data:/data/
+    timeout-minutes: 10
     steps:
       - name: 📚 Checkout
         uses: actions/checkout@v3
-      - name: Install the requirements
+      - name: Install Colossal-AI
         run: |
-          pip install -r ./requirements/requirements.txt
-          pip install colossalai
-      - name: Traverse all files
+          pip install -v .
+      - name: Test the example
         run: |
-          dir=${{ matrix.man_loc }}
-          echo "${dir} is current directory"
+          dir=${{ matrix.directory }}
+          echo "Testing ${dir} now"
           cd "${PWD}/examples/${dir}"
           bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/scripts/example_checks/check_dispatch_inputs.py b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py
new file mode 100644
index 000000000000..04d2063ec5fc
--- /dev/null
+++ b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py
@@ -0,0 +1,27 @@
+import argparse
+import os
+
+
+def check_inputs(input_list):
+    for path in input_list:
+        real_path = os.path.join('examples', path)
+        if not os.path.exists(real_path):
+            return False
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--fileNameList', type=str, help="List of file names")
+    args = parser.parse_args()
+    name_list = args.fileNameList.split(",")
+    is_correct = check_inputs(name_list)
+
+    if is_correct:
+        print('success')
+    else:
+        print('failure')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/.github/workflows/scripts/weekly_check_example.py b/.github/workflows/scripts/example_checks/check_example_weekly.py
similarity index 76%
rename from .github/workflows/scripts/weekly_check_example.py
rename to .github/workflows/scripts/example_checks/check_example_weekly.py
index dfedc46287f2..941e90901f3d 100644
--- a/.github/workflows/scripts/weekly_check_example.py
+++ b/.github/workflows/scripts/example_checks/check_example_weekly.py
@@ -5,9 +5,9 @@ def show_files(path, all_files):
     # Traverse all the folder/file in current directory
     file_list = os.listdir(path)
     # Determine the element is folder or file. If file, pass it into list, if folder, recurse.
-    for file in file_list:
+    for file_name in file_list:
         # Get the abs directory using os.path.join() and store into cur_path.
-        cur_path = os.path.join(path, file)
+        cur_path = os.path.join(path, file_name)
         # Determine whether folder
         if os.path.isdir(cur_path):
             show_files(cur_path, all_files)
@@ -26,9 +26,8 @@ def main():
     for file_loc in contents:
         split_loc = file_loc.split('/')
         # must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not.
-        if len(split_loc) - split_loc.index('examples') >= 3:
-            tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)]
-            re_loc = join(tmp_loc, '/')
+        if len(split_loc) >= 4:
+            re_loc = '/'.join(split_loc[1:3])
             if re_loc not in all_loc:
                 all_loc.append(re_loc)
     print(all_loc)
diff --git a/.github/workflows/scripts/changed_example.py b/.github/workflows/scripts/example_checks/detect_changed_example.py
similarity index 52%
rename from .github/workflows/scripts/changed_example.py
rename to .github/workflows/scripts/example_checks/detect_changed_example.py
index ac2f0864eb72..df4fd67368fc 100644
--- a/.github/workflows/scripts/changed_example.py
+++ b/.github/workflows/scripts/example_checks/detect_changed_example.py
@@ -3,14 +3,19 @@
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--fileNameList', type=str)
+    parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files")
     args = parser.parse_args()
     name_list = args.fileNameList.split(":")
     folder_need_check = set()
     for loc in name_list:
-        # Find only the sub-folder of 'example' folder
+        # Find only the sub-sub-folder of the 'examples' folder
+        # the examples folder structure is like
+        # - examples
+        #   - area
+        #     - application
+        #       - file
         if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4:
-            folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2])
+            folder_need_check.add('/'.join(loc.split("/")[1:3]))
     # Output the result using print. Then the shell can get the values.
     print(list(folder_need_check))
 
diff --git a/.github/workflows/scripts/input_check_example.py b/.github/workflows/scripts/input_check_example.py
deleted file mode 100644
index 5602d8f0904a..000000000000
--- a/.github/workflows/scripts/input_check_example.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import argparse
-import os
-
-
-def detect_correct(loc_li):
-    for loc in loc_li:
-        real_loc = 'examples/' + eval(loc)
-        if not os.path.exists(real_loc):
-            return -1
-    return 1
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--fileNameList', type=str)
-    args = parser.parse_args()
-    name_list = args.fileNameList.split(",")
-    result = detect_correct(name_list)
-    print(result)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/tutorial/hybrid_parallel/config.py b/examples/tutorial/hybrid_parallel/config.py
index 2450ab1c7a72..ac273c305006 100644
--- a/examples/tutorial/hybrid_parallel/config.py
+++ b/examples/tutorial/hybrid_parallel/config.py
@@ -6,8 +6,8 @@
 BATCH_SIZE = 256
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 10
-WARMUP_EPOCHS = 3
+NUM_EPOCHS = 2
+WARMUP_EPOCHS = 1
 
 # model config
 IMG_SIZE = 224
diff --git a/examples/tutorial/hybrid_parallel/requirements.txt b/examples/tutorial/hybrid_parallel/requirements.txt
index 137a69e80498..dbf6aaf3e4e2 100644
--- a/examples/tutorial/hybrid_parallel/requirements.txt
+++ b/examples/tutorial/hybrid_parallel/requirements.txt
@@ -1,2 +1,3 @@
 colossalai >= 0.1.12
 torch >= 1.8.1
+titans
\ No newline at end of file
diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh
new file mode 100644
index 000000000000..8860b72a2fb3
--- /dev/null
+++ b/examples/tutorial/hybrid_parallel/test_ci.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -euxo pipefail
+
+pip install -r requirements.txt
+torchrun --standalone --nproc_per_node 4 train.py --config config.py -s
diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py
index 0f2a207cb172..2a8576db747b 100644
--- a/examples/tutorial/hybrid_parallel/train.py
+++ b/examples/tutorial/hybrid_parallel/train.py
@@ -98,9 +98,9 @@ def main():
     root = os.environ.get('DATA', '../data')
     if args.synthetic:
         # if we use synthetic dataset
-        # we train for 30 steps and eval for 10 steps per epoch
-        train_dataloader = DummyDataloader(length=30, batch_size=gpc.config.BATCH_SIZE)
-        test_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
+        # we train for 10 steps and eval for 5 steps per epoch
+        train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
+        test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
     else:
         train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)
 

From 7d5640b9db01b501e95b66e91be9fe27b58d2e58 Mon Sep 17 00:00:00 2001
From: Haofan Wang <haofanwang.ai@gmail.com>
Date: Tue, 10 Jan 2023 11:27:23 +0800
Subject: [PATCH 141/503] Update parallel_context.py (#2408)

---
 colossalai/context/parallel_context.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py
index dd12dad6d347..b7338b53ddde 100644
--- a/colossalai/context/parallel_context.py
+++ b/colossalai/context/parallel_context.py
@@ -375,7 +375,7 @@ def init_global_dist(self, rank: int, world_size: int, backend: str, host: str,
 
         # None will give the default global process group for pytorch dist operations
         ranks = list(range(world_size))
-        cpu_group = dist.new_group(ranks, backend='gloo') if dist.get_backend() != 'gloo' else None
+        cpu_group = dist.new_group(ranks, backend='gloo') if dist.get_backend() == 'gloo' else None
         self._register_dist(rank, world_size, dist.GroupMember.WORLD, cpu_group, ranks, ParallelMode.GLOBAL)
         self.add_global_rank(ParallelMode.GLOBAL, rank)
 

From d84e7479750f820040ca53ca8bbf4589ae6f645c Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Tue, 10 Jan 2023 11:39:25 +0800
Subject: [PATCH 142/503] [hotfix] add DISTPLAN argument for benchmark (#2412)

* change the benchmark config file

* change config

* revert config file

* rename distpan to distplan
---
 examples/language/gpt/gemini/benchmark_gemini.sh | 6 +++---
 examples/language/gpt/gemini/run_gemini.sh       | 8 ++++----
 examples/language/gpt/gemini/train_gpt_demo.py   | 2 ++
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/examples/language/gpt/gemini/benchmark_gemini.sh b/examples/language/gpt/gemini/benchmark_gemini.sh
index 464ea03da7eb..9a630b2ffe23 100644
--- a/examples/language/gpt/gemini/benchmark_gemini.sh
+++ b/examples/language/gpt/gemini/benchmark_gemini.sh
@@ -1,5 +1,5 @@
 for MODEL_TYPE in "gpt2_medium"; do
-  for DISPAN in "colossalai"; do
+  for DISTPLAN in "colossalai"; do
     for BATCH_SIZE in 16; do
       for GPUNUM in 1 2 4 8; do
         for TPDEGREE in 1 2 4 8; do
@@ -8,8 +8,8 @@ for MODEL_TYPE in "gpt2_medium"; do
           fi
           for PLACEMENT in "cpu" "auto"; do
             echo "****************** Begin ***************************"
-            echo "+ benchmrking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
-            MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+            echo "+ benchmrking MODEL ${MODEL_TYPE} DISTPLAN ${DISTPLAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
+            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
             bash ./run_gemini.sh
             echo "****************** Finished ***************************"
             echo ""
diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh
index ad577c350d39..0c2ea660f1e0 100644
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
@@ -1,8 +1,8 @@
 set -x
 # distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPAN=${DISTPAN:-"colossalai"}
+export DISTPLAN=${DISTPLAN:-"colossalai"}
 
-# The following options only valid when DISTPAN="colossalai"
+# The following options only valid when DISTPLAN="colossalai"
 export GPUNUM=${GPUNUM:-1}
 export TPDEGREE=${TPDEGREE:-1}
 export PLACEMENT=${PLACEMENT:-"cpu"}
@@ -20,5 +20,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --batch_size=${BATCH_SIZE} \
 --placement=${PLACEMENT} \
 --shardinit=${USE_SHARD_INIT} \
---distplan=${DISTPAN} \
-2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
+--distplan=${DISTPLAN} \
+2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 891b1de15af1..92cb7393c37b 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -290,9 +290,11 @@ def main():
             from torch.distributed.optim import ZeroRedundancyOptimizer
             optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
     elif args.distplan.startswith("zero"):
+        model = model.half()
         partition_flag = args.distplan == "zero2"
         optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
         optimizer = LowLevelZeroOptimizer(optimizer,
+                                          reduce_bucket_size=12 * 1024 * 1024,
                                           overlap_communication=True,
                                           partition_grad=partition_flag,
                                           verbose=True)

From 4befaabace567589251a0c5ba2916a7fc891bcfa Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 11:40:04 +0800
Subject: [PATCH 143/503] [workflow] added precommit check for code consistency
 (#2401)

* [workflow] added precommit check for code consistency

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 .github/workflows/pre_commit.yml | 46 ++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 .github/workflows/pre_commit.yml

diff --git a/.github/workflows/pre_commit.yml b/.github/workflows/pre_commit.yml
new file mode 100644
index 000000000000..128802629ce6
--- /dev/null
+++ b/.github/workflows/pre_commit.yml
@@ -0,0 +1,46 @@
+name: pre-commit
+
+on:
+  pull_request:
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+      with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
+
+    - name: Find the changed files
+      id: find-changed-files
+      uses: tj-actions/changed-files@v35
+      with:
+        since_last_remote_commit: true
+
+    - name: List all changed files
+      run: |
+        for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
+          echo "$file was changed"
+        done
+
+    - uses: actions/setup-python@v3
+
+    - name: Cache pre-commit hooks
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pre-commit
+        key: ${{ runner.os }}-pre-commit-hooks
+
+    - name: Set up pre-commit
+      run: |
+        pip install pre-commit
+        pre-commit install
+
+    - name: Run pre-commit on Changed Files
+      id: precommit
+      run: |
+        for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
+          echo "======= running pre-commit on ${file} ======="
+          pre-commit run --files $file
+        done

From 7ab2db206f0342b3f69ea9f6cc25813363c00f56 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 11:56:00 +0800
Subject: [PATCH 144/503] adapt new fx

---
 colossalai/autochunk/autochunk_codegen.py      | 6 +++---
 colossalai/autochunk/estimate_memory.py        | 7 +------
 tests/test_autochunk/test_autochunk_codegen.py | 6 ++++--
 tests/test_autochunk/test_autochunk_search.py  | 7 ++++---
 4 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 14f17b1d37ba..e8af9bde86d8 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -585,9 +585,9 @@ def emit_node(node: Node, body):
             code = "".join(body)
             code = "\n".join("    " + line for line in code.split("\n"))
             fn_code = f"""
-    {wrap_stmts}
+{wrap_stmts}
 
-    {prologue}
-    {code}"""
+{prologue}
+{code}"""
             # print(fn_code)
             return PythonCode(fn_code, globals_)
diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py
index 62b23cf9fc93..e001423f1fbb 100644
--- a/colossalai/autochunk/estimate_memory.py
+++ b/colossalai/autochunk/estimate_memory.py
@@ -28,12 +28,7 @@ def _get_meta_node_size(self, x):
         return x
 
     def _get_output_node(self, n):
-        fwd_out = {
-            x.uuid: x
-            for x in n.meta["fwd_out"]
-            if isinstance(x, torch.Tensor) and hasattr(x, "uuid")
-        }
-        out_size = activation_size(fwd_out)
+        out_size = activation_size(n.meta["fwd_out"])
         out_node = [n.name] if out_size > 0 else []
         return out_size, out_node
 
diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index 28999706b20a..fe19168842ad 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -8,6 +8,7 @@
 import colossalai
 from colossalai.core import global_context as gpc
 from colossalai.fx import ColoTracer
+from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
@@ -15,8 +16,9 @@
 from colossalai.utils import free_port
 from tests.test_autochunk.evoformer.evoformer import evoformer_base
 
-if CODEGEN_AVAILABLE:
+if CODEGEN_AVAILABLE and is_compatible_with_meta():
     from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
 
 
 def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
@@ -102,7 +104,7 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
     gpc.destroy()
 
 
-@pytest.mark.skipif(not CODEGEN_AVAILABLE, reason='torch version is lower than 1.12.0')
+@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta()), reason='torch version is lower than 1.12.0')
 @pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])
diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py
index eb2bf4560e2c..537bf4f4170d 100644
--- a/tests/test_autochunk/test_autochunk_search.py
+++ b/tests/test_autochunk/test_autochunk_search.py
@@ -7,14 +7,15 @@
 
 import colossalai
 from colossalai.core import global_context as gpc
+from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.fx.profiler import MetaTensor
 from colossalai.utils import free_port
 from tests.test_autochunk.evoformer.evoformer import evoformer_base
 
-if CODEGEN_AVAILABLE:
+if CODEGEN_AVAILABLE and is_compatible_with_meta():
     from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
 
 
 def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
@@ -89,7 +90,7 @@ def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
     gpc.destroy()
 
 
-@pytest.mark.skipif(not CODEGEN_AVAILABLE, reason="torch version is lower than 1.12.0")
+@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta()), reason="torch version is lower than 1.12.0")
 @pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])

From 9d432230ba0c006efeaf0d448e2cbac409f88f60 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 12:06:01 +0800
Subject: [PATCH 145/503] [workflow] added translation for non-english comments
 (#2414)

---
 .github/workflows/translate_comment.yml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 .github/workflows/translate_comment.yml

diff --git a/.github/workflows/translate_comment.yml b/.github/workflows/translate_comment.yml
new file mode 100644
index 000000000000..83c127b3caa4
--- /dev/null
+++ b/.github/workflows/translate_comment.yml
@@ -0,0 +1,18 @@
+name: 'issue-translator'
+on:
+  issue_comment:
+    types: [created]
+  issues:
+    types: [opened]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: usthe/issues-translate-action@v2.7
+        with:
+          IS_MODIFY_TITLE: false
+          # Not required; defaults to false. Decides whether to modify the issue title.
+          # if true, the robot account @Issues-translate-bot must have modification permissions, invite @Issues-translate-bot to your project or use your custom bot.
+          CUSTOM_BOT_NOTE: Bot detected the issue body's language is not English, translate it automatically. 👯👭🏻🧑‍🤝‍🧑👫🧑🏿‍🤝‍🧑🏻👩🏾‍🤝‍👨🏿👬🏿
+          # Not required. Customizes the translation robot's prefix message.

From 2445279a084163dfd4ff009a43df7dc185319bd5 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 12:10:13 +0800
Subject: [PATCH 146/503] [setup] refactored setup.py for dependency graph
 (#2413)

---
 setup.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 5128b80e880d..b9cd9e5e4714 100644
--- a/setup.py
+++ b/setup.py
@@ -144,13 +144,16 @@ def get_version():
         print(f'===== Building Extension {name} =====')
         ext_modules.append(builder_cls().builder())
 
-if is_nightly:
+# always put not nightly branch as the if branch
+# otherwise github will treat colossalai-nightly as the project name
+# and it will mess up with the dependency graph insights
+if not is_nightly:
+    version = get_version()
+    package_name = 'colossalai'
+else:
     # use date as the nightly version
     version = datetime.today().strftime('%Y.%m.%d')
     package_name = 'colossalai-nightly'
-else:
-    version = get_version()
-    package_name = 'colossalai'
 
 setup(name=package_name,
       version=version,

From 36ab2cb783fdee9899f734f99633fc0d63d4e980 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 12:20:40 +0800
Subject: [PATCH 147/503] change import

---
 tests/test_autochunk/test_autochunk_codegen.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index fe19168842ad..a061e0ce10d3 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -12,7 +12,6 @@
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.fx.profiler import MetaTensor
 from colossalai.utils import free_port
 from tests.test_autochunk.evoformer.evoformer import evoformer_base
 

From 61fdd3464af8225c7f674386ad1d358ed26ebbaa Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Tue, 10 Jan 2023 12:29:09 +0800
Subject: [PATCH 148/503] update doc

---
 .../test_autochunk/test_autochunk_codegen.py  | 28 +++++++------------
 tests/test_autochunk/test_autochunk_search.py | 28 ++++++++-----------
 2 files changed, 21 insertions(+), 35 deletions(-)

diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py
index a061e0ce10d3..02fa07e2ca00 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_autochunk_codegen.py
@@ -40,20 +40,16 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
         non_fx_out = model(node, pair)
         fx_out = gm(node, pair)
 
-    assert torch.allclose(
-        non_fx_out[0], fx_out[0], atol=1e-4
-    ), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-        torch.abs(non_fx_out[0] - fx_out[0])
-    )
-    assert torch.allclose(
-        non_fx_out[1], fx_out[1], atol=1e-4
-    ), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-        torch.abs(non_fx_out[1] - fx_out[1])
-    )
+    assert torch.allclose(non_fx_out[0], fx_out[0],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[0] - fx_out[0]))
+    assert torch.allclose(non_fx_out[1], fx_out[1],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[1] - fx_out[1]))
 
 
 def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai
     colossalai.launch(
         config={},
         rank=rank,
@@ -76,18 +72,14 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
             "pair": pair.to(torch.device("meta")),
         },
     )
-    gm_prop = torch.fx.symbolic_trace(model)  # must use symbolic_trace
+    gm_prop = torch.fx.symbolic_trace(model)    # must use symbolic_trace
     interp = MetaInfoProp(gm_prop)
-    interp.propagate(
-        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
-    )
+    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
 
     # now run it twice to get meta info in graph module, not necessary
     gm = torch.fx.GraphModule(model, graph)
     interp = MetaInfoProp(gm)
-    interp.propagate(
-        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
-    )
+    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
 
     codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory)
     graph.set_codegen(codegen)
diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py
index 537bf4f4170d..371fce64fdf7 100644
--- a/tests/test_autochunk/test_autochunk_search.py
+++ b/tests/test_autochunk/test_autochunk_search.py
@@ -23,7 +23,8 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
 
     if msa_len == 32 and pair_len == 64:
         if max_memory is None:
-            target_regions = [(142, 154), (366, 373), (233, 283), (301, 351), (127, 134), (204, 228), (167, 191), (161, 166), (198, 203), (6, 69)]
+            target_regions = [(142, 154), (366, 373), (233, 283), (301, 351), (127, 134), (204, 228), (167, 191),
+                              (161, 166), (198, 203), (6, 69)]
         elif max_memory == 20:
             target_regions = [(142, 154), (369, 373), (233, 269), (301, 351)]
         elif max_memory == 25:
@@ -36,24 +37,19 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
         raise NotImplementedError()
 
     assert len(found_regions) == len(
-        target_regions
-    ), "len of found regions %s doesn't equal len of target regions %s" % (
-        str(found_regions),
-        str(target_regions),
-    )
+        target_regions), "len of found regions %s doesn't equal len of target regions %s" % (
+            str(found_regions),
+            str(target_regions),
+        )
     for region in target_regions:
-        assert (
-            region in found_regions
-        ), "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (
+        assert (region in found_regions), "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (
             str(region),
             msa_len,
             pair_len,
             max_memory,
         )
     for region in found_regions:
-        assert (
-            region in target_regions
-        ), "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (
+        assert (region in target_regions), "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (
             str(region),
             msa_len,
             pair_len,
@@ -62,7 +58,7 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
 
 
 def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai
     colossalai.launch(
         config={},
         rank=rank,
@@ -77,11 +73,9 @@ def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
 
-    gm_prop = torch.fx.symbolic_trace(model)  # must use symbolic_trace
+    gm_prop = torch.fx.symbolic_trace(model)    # must use symbolic_trace
     interp = MetaInfoProp(gm_prop)
-    interp.propagate(
-        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
-    )
+    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
 
     codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory)
     chunk_infos = codegen.chunk_infos

From 57b6157b6ca25dbda89f5f67d84cf363b911eecf Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 15:06:27 +0800
Subject: [PATCH 149/503] [workflow] auto comment if precommit check fails
 (#2417)

---
 .github/workflows/auto_example_check.yml |  3 --
 .github/workflows/build.yml              |  1 -
 .github/workflows/comment.yml            | 67 ++++++++++++++++++++++++
 .github/workflows/pre_commit.yml         | 15 +++++-
 4 files changed, 80 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/comment.yml

diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml
index 7f1e357e33e8..d9063bad9f33 100644
--- a/.github/workflows/auto_example_check.yml
+++ b/.github/workflows/auto_example_check.yml
@@ -28,9 +28,6 @@ jobs:
       - name: Get all changed example files
         id: changed-files
         uses: tj-actions/changed-files@v35
-        # Using this can trigger action each time a PR is submitted.
-        with:
-          since_last_remote_commit: true
       - name: setup matrix
         id: setup-matrix
         run: |
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 62d6350d6511..25c8a395734a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -25,7 +25,6 @@ jobs:
         id: find-changed-files
         uses: tj-actions/changed-files@v35
         with:
-          since_last_remote_commit: true
           files: |
             op_builder/**
             colossalai/kernel/**
diff --git a/.github/workflows/comment.yml b/.github/workflows/comment.yml
new file mode 100644
index 000000000000..9f873bad7ae0
--- /dev/null
+++ b/.github/workflows/comment.yml
@@ -0,0 +1,67 @@
+name: Auto Workflow Comment
+
+on:
+  workflow_run:
+    workflows: [pre-commit]
+    types:
+      - completed
+
+jobs:
+  # comment with a message on how to do pre-commit
+  # if the pre-commit check was not passed
+  report-precommit-failure:
+    runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.name }} == "pre-commit" && ${{ github.event.workflow_run.conclusion == 'failure' }}
+    steps:
+      - name: 'Download artifact'
+        uses: actions/github-script@v6
+        with:
+          script: |
+            let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
+               owner: context.repo.owner,
+               repo: context.repo.repo,
+               run_id: context.payload.workflow_run.id,
+            });
+            let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => {
+              return artifact.name == "pr_number"
+            })[0];
+            let download = await github.rest.actions.downloadArtifact({
+               owner: context.repo.owner,
+               repo: context.repo.repo,
+               artifact_id: matchArtifact.id,
+               archive_format: 'zip',
+            });
+            let fs = require('fs');
+            fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/pr_number.zip`, Buffer.from(download.data));
+
+      - name: 'Unzip artifact'
+        run: unzip pr_number.zip
+
+      - name: 'Comment on PR'
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            let fs = require('fs');
+            let issue_number = Number(fs.readFileSync('./pr_number'));
+            let owner = context.repo.owner;
+            let repo = context.repo.repo;
+            let run_id = context.payload.workflow_run.id;
+            let run_url = `https://github.com/${owner}/${repo}/actions/runs/${run_id}`
+            let body = `
+            Your pre-commit check failed, follow the steps to run pre-commit on your file for code style consistency.
+
+            1. install pre-commit via "pip install pre-commit"
+            2. install pre-commit hooks via "pre-commit install"
+            3. run pre-commit on file with format error via "pre-commit run --files path" by replacing "path" with the actual file path
+            4. commit and push to your branch
+
+            View your job at ${run_url}.
+            Read our "CONTRIBUTING.md" for more reference to the code style.
+            `;
+            await github.rest.issues.createComment({
+              owner: owner,
+              repo: repo,
+              issue_number: issue_number,
+              body: body
+            });
diff --git a/.github/workflows/pre_commit.yml b/.github/workflows/pre_commit.yml
index 128802629ce6..113f50ee0569 100644
--- a/.github/workflows/pre_commit.yml
+++ b/.github/workflows/pre_commit.yml
@@ -15,8 +15,6 @@ jobs:
     - name: Find the changed files
       id: find-changed-files
       uses: tj-actions/changed-files@v35
-      with:
-        since_last_remote_commit: true
 
     - name: List all changed files
       run: |
@@ -44,3 +42,16 @@ jobs:
           echo "======= running pre-commit on ${file} ======="
           pre-commit run --files $file
         done
+
+    - name: Save PR number
+      if: always()
+      env:
+        PR_NUMBER: ${{ github.event.number }}
+      run: |
+        mkdir -p ./pr
+        echo $PR_NUMBER > ./pr/pr_number
+    - uses: actions/upload-artifact@v3
+      if: always()
+      with:
+        name: pr_number
+        path: pr/

From dddacd2d2c4d2416563fa4160d715d11a9a2a691 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Tue, 10 Jan 2023 15:43:06 +0800
Subject: [PATCH 150/503] [hotfix] add norm clearing for the overflow step
 (#2416)

---
 colossalai/nn/optimizer/zero_optimizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/colossalai/nn/optimizer/zero_optimizer.py b/colossalai/nn/optimizer/zero_optimizer.py
index 2786d4496a8e..7f9d2fe8fc97 100644
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@@ -140,6 +140,10 @@ def _check_overflow(self):
 
         return self._found_overflow.item() > 0
 
+    def _clear_global_norm(self) -> None:
+        for c16 in self.chunk16_set:
+            c16.l2_norm = None
+
     def _calc_global_norm(self) -> float:
         norm_sqr: float = 0.0
         group_to_norm = dict()
@@ -201,6 +205,7 @@ def step(self, *args, **kwargs):
             self.optim_state = OptimState.UNSCALED    # no need to unscale grad
             self.grad_scaler.update(found_inf)    # update gradient scaler
             self._logger.info(f'Found overflow. Skip step')
+            self._clear_global_norm()    # clear recorded norm
             self.zero_grad()    # reset all gradients
             self._update_fp16_params()
             return

From fe0f7970a21cce04c8e014b72ff7df8c91742643 Mon Sep 17 00:00:00 2001
From: ZijianYY <119492445+ZijianYY@users.noreply.github.com>
Date: Tue, 10 Jan 2023 16:18:56 +0800
Subject: [PATCH 151/503] [examples] adding tflops to PaLM (#2365)

---
 examples/language/palm/train.py | 49 +++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py
index 7c080b7f321d..6725c07dfac7 100644
--- a/examples/language/palm/train.py
+++ b/examples/language/palm/train.py
@@ -1,9 +1,11 @@
 import gzip
 import random
-
+from time import time
+from functools import partial
 import numpy as np
 import torch
 import torch.optim as optim
+import torch.nn as nn
 import tqdm
 from packaging import version
 from palm_pytorch import PaLM
@@ -21,7 +23,8 @@
 
 # constants
 
-NUM_BATCHES = int(1000)
+NUM_BATCHES = int(100)
+WARMUP_BATCHES = 1
 GRADIENT_ACCUMULATE_EVERY = 1
 LEARNING_RATE = 2e-4
 VALIDATE_EVERY = 100
@@ -76,10 +79,18 @@ def cycle(loader):
 def decode_token(token):
     return str(chr(max(32, token)))
 
+def get_tflops(model_numel, batch_size, seq_len, step_time):
+    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
 
 def decode_tokens(tokens):
     return "".join(list(map(decode_token, tokens)))
 
+def get_model_size(model: nn.Module):
+    total_numel = 0
+    for module in model.modules():
+        for p in module.parameters(recurse=False):
+            total_numel += p.numel()
+    return total_numel
 
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
@@ -143,7 +154,6 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
                 split_param_row_tp1d(param, pg)    # row slice
             else:
                 param.set_dist_spec(ReplicaSpec())
-
             param.visited = True
 
 
@@ -152,6 +162,7 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
         raise TypeError(f"{args.distplan} is error")
 disable_existing_loggers()
 colossalai.launch_from_torch(config={})
+logger = get_dist_logger()
 
 with gzip.open("./data/enwik8.gz") as file:
     X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
@@ -188,7 +199,7 @@ def __len__(self):
     ctx = ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg)
 
     with ctx:
-        model = PaLM(num_tokens=256, dim=512, depth=8)
+        model = PaLM(num_tokens=50304, dim=4096, depth=64)
         model = AutoregressiveWrapper(model, max_seq_len=SEQ_LEN)
 
     pg = default_pg
@@ -205,25 +216,42 @@ def __len__(self):
     model.cuda()
     optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
 
-
+ # model is shared after TP
+numel = get_model_size(model)
+get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
 
 # training
 model.train()
-
+tflops_list = []
 for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
 
     if args.distplan == "colossalai":
         optimizer.zero_grad()
-
+        start = time()
         loss = model(next(train_loader))
+        fwd_end = time()
+        fwd_time = fwd_end - start
         # loss.backward()
         optimizer.backward(loss)
+        bwd_end = time()
+        bwd_time = bwd_end - fwd_end
 
-        print(f"training loss: {loss.item()}")
+        # print(f"training loss: {loss.item()}")
         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
         # optim.step()
         # optim.zero_grad()
         optimizer.step()
+        optim_time = time() - bwd_end
+        step_time = time() - start
+
+        step_tflops = get_tflops_func(step_time)
+        logger.info(
+            f"[{i + 1}/{NUM_BATCHES}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}, FWD time: {fwd_time:.3f}s, BWD time: {bwd_time:.3f}s, OPTIM time: {optim_time:.3f}s",
+            ranks=[0],
+        )
+        if i >= WARMUP_BATCHES:
+            tflops_list.append(step_tflops)
+    
     else:
         for __ in range(GRADIENT_ACCUMULATE_EVERY):
             loss = model(next(train_loader))
@@ -233,6 +261,11 @@ def __len__(self):
         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
         optim.step()
         optim.zero_grad()
+    
+tflops_list.sort()
+median_index = ((NUM_BATCHES - WARMUP_BATCHES) >> 1) + WARMUP_BATCHES
+logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
+
 
     # TODO
     # if i % VALIDATE_EVERY == 0:

From b3472d32e03ced200f1591aca5a50201ab44d274 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 22:30:16 +0800
Subject: [PATCH 152/503] [workflow] auto comment with test coverage report
 (#2419)

* [workflow] auto comment with test coverage report

* polish code

* polish yaml
---
 .github/workflows/build.yml                   | 16 ++++-
 ...mment.yml => report_precommit_failure.yml} |  4 +-
 .github/workflows/report_test_coverage.yml    | 69 +++++++++++++++++++
 .gitignore                                    |  3 +-
 4 files changed, 88 insertions(+), 4 deletions(-)
 rename .github/workflows/{comment.yml => report_precommit_failure.yml} (94%)
 create mode 100644 .github/workflows/report_test_coverage.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 25c8a395734a..30b932729019 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -75,12 +75,26 @@ jobs:
 
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest --cov=. --cov-report lcov tests
+          PYTHONPATH=$PWD pytest --cov=. --cov-report xml tests
         env:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 
+      - name: Collate artifact
+        env:
+          PR_NUMBER: ${{ github.event.number }}
+        run: |
+          mkdir report
+          echo $PR_NUMBER > ./report/pr_number
+          mv coverage.xml ./report
+
+      - name: Upload test coverage artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: report
+          path: report/
+
       - name: Store Cache
         run: |
           # -p flag is required to preserve the file timestamp to avoid ninja rebuild
diff --git a/.github/workflows/comment.yml b/.github/workflows/report_precommit_failure.yml
similarity index 94%
rename from .github/workflows/comment.yml
rename to .github/workflows/report_precommit_failure.yml
index 9f873bad7ae0..e6ca7b01bcc1 100644
--- a/.github/workflows/comment.yml
+++ b/.github/workflows/report_precommit_failure.yml
@@ -1,4 +1,4 @@
-name: Auto Workflow Comment
+name: Report Precommit Failure
 
 on:
   workflow_run:
@@ -11,7 +11,7 @@ jobs:
   # if the pre-commit check was not passed
   report-precommit-failure:
     runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.name }} == "pre-commit" && ${{ github.event.workflow_run.conclusion == 'failure' }}
+    if: ${{ github.event.workflow_run.conclusion == 'failure' }}
     steps:
       - name: 'Download artifact'
         uses: actions/github-script@v6
diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
new file mode 100644
index 000000000000..167aa28b6b62
--- /dev/null
+++ b/.github/workflows/report_test_coverage.yml
@@ -0,0 +1,69 @@
+name: Report Test Coverage
+
+on:
+  workflow_run:
+    workflows: [Build]
+    types:
+      - completed
+
+jobs:
+  report-test-coverage:
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Download artifact'
+        uses: actions/github-script@v6
+        with:
+          script: |
+            let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
+               owner: context.repo.owner,
+               repo: context.repo.repo,
+               run_id: context.payload.workflow_run.id,
+            });
+            let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => {
+              return artifact.name == "report"
+            })[0];
+            let download = await github.rest.actions.downloadArtifact({
+               owner: context.repo.owner,
+               repo: context.repo.repo,
+               artifact_id: matchArtifact.id,
+               archive_format: 'zip',
+            });
+            let fs = require('fs');
+            fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/report.zip`, Buffer.from(download.data));
+
+      - name: 'Unzip artifact'
+        run: |
+          unzip report.zip
+
+      - name: Code Coverage Report
+        uses: irongut/CodeCoverageSummary@v1.3.0
+        with:
+          filename: coverage.xml
+          badge: true
+          fail_below_min: true
+          format: markdown
+          hide_branch_rate: false
+          hide_complexity: false
+          indicators: true
+          output: both
+          thresholds: '80 90'
+
+      - name: 'Comment on PR'
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            let fs = require('fs');
+            let issue_number = Number(fs.readFileSync('./pr_number'));
+            let owner = context.repo.owner;
+            let repo = context.repo.repo;
+            let run_id = context.payload.workflow_run.id;
+            let run_url = `https://github.com/${owner}/${repo}/actions/runs/${run_id}`
+            let body = fs.readFileSync('./code-coverage-results.md', {encoding:'utf8', flag:'r'})
+
+            await github.rest.issues.createComment({
+              owner: owner,
+              repo: repo,
+              issue_number: issue_number,
+              body: body
+            });
diff --git a/.gitignore b/.gitignore
index 8e345eeb8388..bf74a753894f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,4 +153,5 @@ colossalai/version.py
 .pyi
 
 # ignore coverage test file
-converage.lcov
+coverage.lcov
+coverage.xml

From cd38167c1a4742ac3810b98ff13be0ab64fbeb9c Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 10 Jan 2023 22:30:32 +0800
Subject: [PATCH 153/503] [doc] added documentation for CI/CD (#2420)

* [doc] added documentation for CI/CD

* polish markdown

* polish markdown

* polish markdown
---
 .github/workflows/README.md | 121 ++++++++++++++++++++++++++++++++++++
 README-zh-Hans.md           |   5 ++
 README.md                   |   5 ++
 3 files changed, 131 insertions(+)
 create mode 100644 .github/workflows/README.md

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
new file mode 100644
index 000000000000..65017a397c81
--- /dev/null
+++ b/.github/workflows/README.md
@@ -0,0 +1,121 @@
+# CI/CD
+
+## Table of Contents
+
+- [CI/CD](#cicd)
+  - [Table of Contents](#table-of-contents)
+  - [Overview](#overview)
+  - [Workflows](#workflows)
+    - [Checks on Pull Requests](#checks-on-pull-requests)
+    - [Regular Checks](#regular-checks)
+    - [Release](#release)
+    - [Manual Dispatch](#manual-dispatch)
+      - [Release bdist wheel](#release-bdist-wheel)
+      - [Dispatch Example Test](#dispatch-example-test)
+      - [Compatibility Test](#compatibility-test)
+    - [User Friendliness](#user-friendliness)
+  - [Progress Log](#progress-log)
+
+## Overview
+
+Automation makes our development more efficient as the machine automatically runs the pre-defined tasks for the contributors.
+This saves a lot of manual work and allows the developer to fully focus on the features and bug fixes.
+In Colossal-AI, we use [GitHub Actions](https://github.com/features/actions) to automate a wide range of workflows to ensure the robustness of the software.
+In the section below, we will dive into the details of different workflows available.
+
+## Workflows
+
+### Checks on Pull Requests
+
+| Workflow Name               | File name                      | Description                                                                                                                                       |
+| --------------------------- | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Build`                     | `build.yml`                    | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
+| `Pre-commit`                | `pre_commit.yml`               | This workflow runs pre-commit checks for code style consistency.                                                                                  |
+| `Report pre-commit failure` | `report_precommit_failure.yml` | This workflow will put up a comment in the PR to explain the precommit failure and remedy. This is executed when `Pre-commit` is done.            |
+| `Report test coverage`      | `report_test_coverage.yml`     | This workflow will put up a comment to report the test coverage results. This is executed when `Build` is completed.                              |
+| `Test example`              | `auto_example_check.yml`       | The example will be automatically tested if its files are changed in the PR                                                                       |
+
+### Regular Checks
+
+| Workflow Name           | File name                | Description                                                                                                            |
+| ----------------------- | ------------------------ | ---------------------------------------------------------------------------------------------------------------------- |
+| `Test example`          | `auto_example_check.yml` | This workflow will test all examples every Sunday                                                                      |
+| `Build on 8 GPUs`       | `build_gpu_8.yml`        | This workflow will run the unit tests every day with 8 GPUs.                                                           |
+| `Synchronize submodule` | `submodule.yml`          | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. |
+| `Close inactive issues` | `close_inactive.yml`     | This workflow will close issues which are stale for 14 days.                                                           |
+
+### Release
+
+| Workflow Name               | File name                       | Description                                                                                                       |
+| --------------------------- | ------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
+| `Draft GitHub Release Post` | `draft_github_release_post.yml` | Compose a GitHub release post draft based on the commit history. Triggered when `version.txt` is updated.         |
+| `Release to PyPI`           | `release_pypi.yml`              | Build and release the wheel to PyPI. Triggered when `version.txt` is updated.                                     |
+| `Release Nightly to PyPI`   | `release_nightly.yml`           | Build and release the nightly wheel to PyPI as `colossalai-nightly`. Automatically executed every Sunday.         |
+| `Release Docker`            | `release_docker.yml`            | Build and release the Docker image to DockerHub. Triggered when `version.txt` is updated.                         |
+| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section. |
+
+### Manual Dispatch
+
+| Workflow Name           | File name                    | Description                                            |
+| ----------------------- | ---------------------------- | ------------------------------------------------------ |
+| `Release bdist wheel`   | `release_bdist.yml`          | Build binary wheels with pre-built PyTorch extensions. |
+| `Dispatch Example Test` | `dispatch_example_check.yml` | Manually test a specified example.                     |
+| `Compatiblity Test`     | `compatiblity_test.yml`      | Test PyTorch and Python Compatibility.                 |
+
+Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow.
+I will provide the details of each workflow below.
+
+#### Release bdist wheel
+
+Parameters:
+- `torch version`: torch versions to test against, multiple versions are supported but must be separated by commas. The default value is `all`, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels) which is regularly updated.
+- `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
+- `ref`: input the branch or tag name to build the wheel for this ref.
+
+#### Dispatch Example Test
+
+Parameters:
+- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by commas. For example, `language/gpt, images/vit`. Simply inputting `language` or `gpt` alone does not work.
+
+
+#### Compatibility Test
+
+Parameters:
+- `torch version`: torch versions to test against, multiple versions are supported but must be separated by commas. The default value is `all`, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels).
+- `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
+
+> It only tests the compatibility of the main branch.
+
+
+### User Friendliness
+
+| Workflow Name     | File name               | Description                                                                                                                            |
+| ----------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
+| `issue-translate` | `translate_comment.yml` | This workflow is triggered when a new issue comment is created. The comment will be translated into English if not written in English. |
+
+## Progress Log
+
+- [x] unit testing
+  - [x] test on PR
+  - [x] report test coverage
+  - [x] regular test
+- [x] release
+  - [x] official release
+  - [x] nightly build
+  - [x] binary build
+  - [x] docker build
+  - [x] draft release post
+- [x] pre-commit
+  - [x] check on PR
+  - [x] report failure
+- [x] example check
+  - [x] check on PR
+  - [x] regular check
+  - [x] manual dispatch
+- [ ] compatibility check
+  - [x] manual dispatch
+  - [ ] auto test when release
+- [x] helpers
+  - [x] comment translation
+  - [x] submodule update
+  - [x] close inactive issue
diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index b97b02f5ab84..6b1848c4bdd7 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -349,6 +349,11 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 
 
+## CI/CD
+
+我们使用[GitHub Actions](https://github.com/features/actions)来自动化大部分开发以及部署流程。如果想了解这些工作流是如何运行的，请查看这个[文档](.github/workflows/README.md).
+
+
 ## 引用我们
 
 ```
diff --git a/README.md b/README.md
index 7aba907e0a64..396260e97399 100644
--- a/README.md
+++ b/README.md
@@ -353,6 +353,11 @@ Thanks so much to all of our amazing contributors!
 <p align="right">(<a href="#top">back to top</a>)</p>
 
 
+## CI/CD
+
+We leverage the power of [GitHub Actions](https://github.com/features/actions) to automate our development, release and deployment workflows. Please check out this [documentation](.github/workflows/README.md) on how the automated workflows are operated.
+
+
 ## Cite Us
 
 ```

From 63be79d5057843049d287ec29d92c96fab6f3437 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 10:07:18 +0800
Subject: [PATCH 154/503] [example] removed duplicated stable diffusion example
 (#2424)

---
 examples/tutorial/README.md                   |   26 -
 examples/tutorial/stable_diffusion/LICENSE    |   82 -
 examples/tutorial/stable_diffusion/README.md  |  149 --
 .../configs/train_colossalai.yaml             |  116 --
 .../configs/train_colossalai_cifar10.yaml     |  123 --
 .../stable_diffusion/configs/train_ddp.yaml   |  113 --
 .../configs/train_pokemon.yaml                |  121 --
 .../stable_diffusion/environment.yaml         |   34 -
 .../stable_diffusion/ldm/data/__init__.py     |    0
 .../stable_diffusion/ldm/data/base.py         |   75 -
 .../stable_diffusion/ldm/data/cifar10.py      |  184 --
 .../stable_diffusion/ldm/data/imagenet.py     |  394 -----
 .../stable_diffusion/ldm/data/lsun.py         |   92 -
 .../stable_diffusion/ldm/lr_scheduler.py      |   98 --
 .../ldm/models/autoencoder.py                 |  544 ------
 .../ldm/models/diffusion/__init__.py          |    0
 .../ldm/models/diffusion/classifier.py        |  267 ---
 .../ldm/models/diffusion/ddim.py              |  240 ---
 .../ldm/models/diffusion/ddpm.py              | 1554 -----------------
 .../ldm/models/diffusion/plms.py              |  236 ---
 .../stable_diffusion/ldm/modules/attention.py |  314 ----
 .../ldm/modules/diffusionmodules/__init__.py  |    0
 .../ldm/modules/diffusionmodules/model.py     |  862 ---------
 .../modules/diffusionmodules/openaimodel.py   | 1152 ------------
 .../ldm/modules/diffusionmodules/util.py      |  276 ---
 .../ldm/modules/distributions/__init__.py     |    0
 .../modules/distributions/distributions.py    |   92 -
 .../stable_diffusion/ldm/modules/ema.py       |   76 -
 .../ldm/modules/encoders/__init__.py          |    0
 .../ldm/modules/encoders/modules.py           |  264 ---
 .../ldm/modules/flash_attention.py            |   50 -
 .../ldm/modules/image_degradation/__init__.py |    2 -
 .../ldm/modules/image_degradation/bsrgan.py   |  730 --------
 .../modules/image_degradation/bsrgan_light.py |  650 -------
 .../modules/image_degradation/utils/test.png  |  Bin 441072 -> 0 bytes
 .../modules/image_degradation/utils_image.py  |  916 ----------
 .../ldm/modules/losses/__init__.py            |    1 -
 .../ldm/modules/losses/contperceptual.py      |  111 --
 .../ldm/modules/losses/vqperceptual.py        |  167 --
 .../ldm/modules/x_transformer.py              |  641 -------
 .../tutorial/stable_diffusion/ldm/util.py     |  203 ---
 examples/tutorial/stable_diffusion/main.py    |  830 ---------
 .../stable_diffusion/requirements.txt         |   22 -
 .../scripts/download_first_stages.sh          |   41 -
 .../scripts/download_models.sh                |   49 -
 .../stable_diffusion/scripts/img2img.py       |  293 ----
 .../stable_diffusion/scripts/inpaint.py       |   98 --
 .../stable_diffusion/scripts/knn2img.py       |  398 -----
 .../scripts/sample_diffusion.py               |  313 ----
 .../scripts/tests/test_checkpoint.py          |   37 -
 .../scripts/tests/test_watermark.py           |   18 -
 .../scripts/train_searcher.py                 |  147 --
 .../stable_diffusion/scripts/txt2img.py       |  344 ----
 examples/tutorial/stable_diffusion/setup.py   |   13 -
 examples/tutorial/stable_diffusion/train.sh   |    4 -
 55 files changed, 13562 deletions(-)
 delete mode 100644 examples/tutorial/stable_diffusion/LICENSE
 delete mode 100644 examples/tutorial/stable_diffusion/README.md
 delete mode 100644 examples/tutorial/stable_diffusion/configs/train_colossalai.yaml
 delete mode 100644 examples/tutorial/stable_diffusion/configs/train_colossalai_cifar10.yaml
 delete mode 100644 examples/tutorial/stable_diffusion/configs/train_ddp.yaml
 delete mode 100644 examples/tutorial/stable_diffusion/configs/train_pokemon.yaml
 delete mode 100644 examples/tutorial/stable_diffusion/environment.yaml
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/data/__init__.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/data/base.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/data/cifar10.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/data/imagenet.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/data/lsun.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/lr_scheduler.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/models/autoencoder.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/models/diffusion/__init__.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/models/diffusion/classifier.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/models/diffusion/ddim.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/models/diffusion/ddpm.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/models/diffusion/plms.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/attention.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/__init__.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/model.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/openaimodel.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/util.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/distributions/__init__.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/distributions/distributions.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/ema.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/encoders/__init__.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/encoders/modules.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/flash_attention.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/image_degradation/__init__.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan_light.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/image_degradation/utils/test.png
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/image_degradation/utils_image.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/losses/__init__.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/losses/contperceptual.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/losses/vqperceptual.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/modules/x_transformer.py
 delete mode 100644 examples/tutorial/stable_diffusion/ldm/util.py
 delete mode 100644 examples/tutorial/stable_diffusion/main.py
 delete mode 100644 examples/tutorial/stable_diffusion/requirements.txt
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/download_first_stages.sh
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/download_models.sh
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/img2img.py
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/inpaint.py
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/knn2img.py
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/sample_diffusion.py
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/tests/test_checkpoint.py
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/tests/test_watermark.py
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/train_searcher.py
 delete mode 100644 examples/tutorial/stable_diffusion/scripts/txt2img.py
 delete mode 100644 examples/tutorial/stable_diffusion/setup.py
 delete mode 100644 examples/tutorial/stable_diffusion/train.sh

diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
index bef7c8905033..9c61e41cd146 100644
--- a/examples/tutorial/README.md
+++ b/examples/tutorial/README.md
@@ -39,9 +39,6 @@ quickly deploy large AI model training and inference, reducing large AI model tr
    - Try pre-trained OPT model weights with Colossal-AI
    - Fine-tuning OPT with limited hardware using ZeRO, Gemini and parallelism
    - Deploy the fine-tuned model to inference service
- - Acceleration of Stable Diffusion
-   - Stable Diffusion with Lightning
-   - Try Lightning Colossal-AI strategy to optimize memory and accelerate speed
 
 
 ## Discussion
@@ -168,26 +165,3 @@ docker run -it --rm --gpus all --ipc host -p 7070:7070 hpcaitech/tutorial:opt-in
 ```bash
 python opt_fastapi.py opt-125m --tp 2 --checkpoint /data/opt-125m
 ```
-
-## 🖼️ Accelerate Stable Diffusion with Colossal-AI
-1. Create a new environment for diffusion
-```bash
-conda env create -f environment.yaml
-conda activate ldm
-```
-2. Install Colossal-AI from our official page
-```bash
-pip install colossalai==0.1.10+torch1.11cu11.3 -f https://release.colossalai.org
-```
-3. Install PyTorch Lightning compatible commit
-```bash
-git clone https://github.com/Lightning-AI/lightning && cd lightning && git reset --hard b04a7aa
-pip install -r requirements.txt && pip install .
-cd ..
-```
-
-4. Comment out the `from_pretrained` field in the `train_colossalai_cifar10.yaml`.
-5. Run training with CIFAR10.
-```bash
-python main.py -logdir /tmp -t true -postfix test -b configs/train_colossalai_cifar10.yaml
-```
diff --git a/examples/tutorial/stable_diffusion/LICENSE b/examples/tutorial/stable_diffusion/LICENSE
deleted file mode 100644
index 0e609df0d8cd..000000000000
--- a/examples/tutorial/stable_diffusion/LICENSE
+++ /dev/null
@@ -1,82 +0,0 @@
-Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
-
-CreativeML Open RAIL-M
-dated August 22, 2022
-
-Section I: PREAMBLE
-
-Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.
-
-Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
-
-In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.
-
-Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.
-
-This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
-
-NOW THEREFORE, You and Licensor agree as follows:
-
-1. Definitions
-
-- "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
-- "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
-- "Output" means the results of operating a Model as embodied in informational content resulting therefrom.
-- "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
-- "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
-- "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
-- "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
-- "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
-- "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
-- "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.
-- "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
-- "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
-
-Section II: INTELLECTUAL PROPERTY RIGHTS
-
-Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
-
-2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
-3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.
-
-Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
-
-4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
-Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
-You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
-You must cause any modified files to carry prominent notices stating that You changed the files;
-You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
-You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
-5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
-6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
-
-Section IV: OTHER PROVISIONS
-
-7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.
-8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
-9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
-10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
-11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
-12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
-
-END OF TERMS AND CONDITIONS
-
-
-
-
-Attachment A
-
-Use Restrictions
-
-You agree not to use the Model or Derivatives of the Model:
-- In any way that violates any applicable national, federal, state, local or international law or regulation;
-- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
-- To generate or disseminate verifiably false information and/or content with the purpose of harming others;
-- To generate or disseminate personal identifiable information that can be used to harm an individual;
-- To defame, disparage or otherwise harass others;
-- For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
-- For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
-- To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
-- For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;
-- To provide medical advice and medical results interpretation;
-- To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).
diff --git a/examples/tutorial/stable_diffusion/README.md b/examples/tutorial/stable_diffusion/README.md
deleted file mode 100644
index a0ece4485d27..000000000000
--- a/examples/tutorial/stable_diffusion/README.md
+++ /dev/null
@@ -1,149 +0,0 @@
-# Stable Diffusion with Colossal-AI
-*[Colosssal-AI](https://github.com/hpcaitech/ColossalAI) provides a faster and lower cost solution for pretraining and
-fine-tuning for AIGC (AI-Generated Content) applications such as the model [stable-diffusion](https://github.com/CompVis/stable-diffusion) from [Stability AI](https://stability.ai/).*
-
-We take advantage of [Colosssal-AI](https://github.com/hpcaitech/ColossalAI) to exploit multiple optimization strategies
-, e.g. data parallelism, tensor parallelism, mixed precision & ZeRO, to scale the training to multiple GPUs.
-
-## 🚀Quick Start
-1. Create a new environment for diffusion
-```bash
-conda env create -f environment.yaml
-conda activate ldm
-```
-2. Install Colossal-AI from our official page
-```bash
-pip install colossalai==0.1.10+torch1.11cu11.3 -f https://release.colossalai.org
-```
-3. Install PyTorch Lightning compatible commit
-```bash
-git clone https://github.com/Lightning-AI/lightning && cd lightning && git reset --hard b04a7aa
-pip install -r requirements.txt && pip install .
-cd ..
-```
-
-4. Comment out the `from_pretrained` field in the `train_colossalai_cifar10.yaml`.
-5. Run training with CIFAR10.
-```bash
-python main.py -logdir /tmp -t true -postfix test -b configs/train_colossalai_cifar10.yaml
-```
-
-## Stable Diffusion
-[Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) is a latent text-to-image diffusion
-model.
-Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
-Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
-this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
-
-<p id="diffusion_train" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/diffusion_train.png" width=800/>
-</p>
-
-[Stable Diffusion with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion) provides **6.5x faster training and pretraining cost saving, the hardware cost of fine-tuning can be almost 7X cheaper** (from RTX3090/4090 24GB to RTX3050/2070 8GB).
-
-<p id="diffusion_demo" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/diffusion_demo.png" width=800/>
-</p>
-
-## Requirements
-A suitable [conda](https://conda.io/) environment named `ldm` can be created
-and activated with:
-
-```
-conda env create -f environment.yaml
-conda activate ldm
-```
-
-You can also update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running
-
-```
-conda install pytorch torchvision -c pytorch
-pip install transformers==4.19.2 diffusers invisible-watermark
-pip install -e .
-```
-
-### Install [Colossal-AI v0.1.10](https://colossalai.org/download/) From Our Official Website
-```
-pip install colossalai==0.1.10+torch1.11cu11.3 -f https://release.colossalai.org
-```
-
-### Install [Lightning](https://github.com/Lightning-AI/lightning)
-We use the Sep. 2022 version with commit id as `b04a7aa`.
-```
-git clone https://github.com/Lightning-AI/lightning && cd lightning && git reset --hard b04a7aa
-pip install -r requirements.txt && pip install .
-```
-
-> The specified version is due to the interface incompatibility caused by the latest update of [Lightning](https://github.com/Lightning-AI/lightning), which will be fixed in the near future.
-
-## Dataset
-The dataSet is from [LAION-5B](https://laion.ai/blog/laion-5b/), the subset of [LAION](https://laion.ai/),
-you should the change the `data.file_path` in the `config/train_colossalai.yaml`
-
-## Training
-
-We provide the script `train.sh` to run the training task , and two Stategy in `configs`:`train_colossalai.yaml`
-
-For example, you can run the training from colossalai by
-```
-python main.py --logdir /tmp -t --postfix test -b configs/train_colossalai.yaml
-```
-
-- you can change the `--logdir` the save the log information and the last checkpoint
-
-### Training config
-You can change the trainging config in the yaml file
-
-- accelerator: acceleratortype, default 'gpu'
-- devices: device number used for training, default 4
-- max_epochs: max training epochs
-- precision: usefp16 for training or not, default 16, you must use fp16 if you want to apply colossalai
-
-## Example
-
-### Training on cifar10
-
-We provide the finetuning example on CIFAR10 dataset
-
-You can run by config `train_colossalai_cifar10.yaml`
-```
-python main.py --logdir /tmp -t --postfix test -b configs/train_colossalai_cifar10.yaml
-```
-
-
-
-## Comments
-
-- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
-, [lucidrains](https://github.com/lucidrains/denoising-diffusion-pytorch),
-[Stable Diffusion](https://github.com/CompVis/stable-diffusion), [Lightning](https://github.com/Lightning-AI/lightning) and [Hugging Face](https://huggingface.co/CompVis/stable-diffusion).
-Thanks for open-sourcing!
-
-- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).
-
-- The implementation of [flash attention](https://github.com/HazyResearch/flash-attention) is from [HazyResearch](https://github.com/HazyResearch).
-
-## BibTeX
-
-```
-@article{bian2021colossal,
-  title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
-  author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
-  journal={arXiv preprint arXiv:2110.14883},
-  year={2021}
-}
-@misc{rombach2021highresolution,
-  title={High-Resolution Image Synthesis with Latent Diffusion Models},
-  author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
-  year={2021},
-  eprint={2112.10752},
-  archivePrefix={arXiv},
-  primaryClass={cs.CV}
-}
-@article{dao2022flashattention,
-  title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
-  author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
-  journal={arXiv preprint arXiv:2205.14135},
-  year={2022}
-}
-```
diff --git a/examples/tutorial/stable_diffusion/configs/train_colossalai.yaml b/examples/tutorial/stable_diffusion/configs/train_colossalai.yaml
deleted file mode 100644
index c457787dd881..000000000000
--- a/examples/tutorial/stable_diffusion/configs/train_colossalai.yaml
+++ /dev/null
@@ -1,116 +0,0 @@
-model:
-  base_learning_rate: 1.0e-04
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: caption
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false   # Note: different from the one we trained before
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1.e-4 ]
-        f_min: [ 1.e-10 ]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32 # unused
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: False
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
-      params:
-        use_fp16: True
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    wrap: False
-    train:
-      target: ldm.data.base.Txt2ImgIterableBaseDataset
-      params:
-        file_path: "/data/scratch/diffuser/laion_part0/"
-        world_size: 1
-        rank: 0
-
-lightning:
-  trainer:
-    accelerator: 'gpu' 
-    devices: 4
-    log_gpu_memory: all
-    max_epochs: 2
-    precision: 16
-    auto_select_gpus: False
-    strategy:
-      target: pytorch_lightning.strategies.ColossalAIStrategy
-      params:
-        use_chunk: False
-        enable_distributed_storage: True,
-        placement_policy: cuda
-        force_outputs_fp32: False
-
-    log_every_n_steps: 2
-    logger: True
-    default_root_dir: "/tmp/diff_log/"
-    profiler: pytorch
-
-  logger_config:
-    wandb:
-      target: pytorch_lightning.loggers.WandbLogger
-      params:
-          name: nowname
-          save_dir: "/tmp/diff_log/"
-          offline: opt.debug
-          id: nowname
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/configs/train_colossalai_cifar10.yaml b/examples/tutorial/stable_diffusion/configs/train_colossalai_cifar10.yaml
deleted file mode 100644
index 63b9d1c0179c..000000000000
--- a/examples/tutorial/stable_diffusion/configs/train_colossalai_cifar10.yaml
+++ /dev/null
@@ -1,123 +0,0 @@
-model:
-  base_learning_rate: 1.0e-04
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: txt
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false   # Note: different from the one we trained before
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1.e-4 ]
-        f_min: [ 1.e-10 ]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32 # unused
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: False
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
-      params:
-        use_fp16: True
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 4
-    num_workers: 4
-    train:
-      target: ldm.data.cifar10.hf_dataset
-      params:
-        name: cifar10
-        image_transforms:
-        - target: torchvision.transforms.Resize
-          params:
-            size: 512
-            interpolation: 3
-        - target: torchvision.transforms.RandomCrop
-          params:
-            size: 512
-        - target: torchvision.transforms.RandomHorizontalFlip
-
-lightning:
-  trainer:
-    accelerator: 'gpu' 
-    devices: 2
-    log_gpu_memory: all
-    max_epochs: 2
-    precision: 16
-    auto_select_gpus: False
-    strategy:
-      target: pytorch_lightning.strategies.ColossalAIStrategy
-      params:
-        use_chunk: False
-        enable_distributed_storage: True,
-        placement_policy: cuda
-        force_outputs_fp32: False
-
-    log_every_n_steps: 2
-    logger: True
-    default_root_dir: "/tmp/diff_log/"
-    profiler: pytorch
-
-  logger_config:
-    wandb:
-      target: pytorch_lightning.loggers.WandbLogger
-      params:
-          name: nowname
-          save_dir: "/tmp/diff_log/"
-          offline: opt.debug
-          id: nowname
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/configs/train_ddp.yaml b/examples/tutorial/stable_diffusion/configs/train_ddp.yaml
deleted file mode 100644
index 90d41258fada..000000000000
--- a/examples/tutorial/stable_diffusion/configs/train_ddp.yaml
+++ /dev/null
@@ -1,113 +0,0 @@
-model:
-  base_learning_rate: 1.0e-04
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: caption
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: false   # Note: different from the one we trained before
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 100 ]
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1.e-4 ]
-        f_min: [ 1.e-10  ]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32 # unused
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: False
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
-      params:
-        use_fp16: True
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    wrap: False
-    train:
-      target: ldm.data.base.Txt2ImgIterableBaseDataset
-      params:
-        file_path: "/data/scratch/diffuser/laion_part0/"
-        world_size: 1
-        rank: 0
-
-lightning:
-  trainer:
-    accelerator: 'gpu' 
-    devices: 4
-    log_gpu_memory: all
-    max_epochs: 2
-    precision: 16
-    auto_select_gpus: False
-    strategy:
-      target: pytorch_lightning.strategies.DDPStrategy
-      params:
-        find_unused_parameters: False
-    log_every_n_steps: 2
-#    max_steps: 6o
-    logger: True
-    default_root_dir: "/tmp/diff_log/"
-    # profiler: pytorch
-
-  logger_config:
-    wandb:
-      target: pytorch_lightning.loggers.WandbLogger
-      params:
-          name: nowname
-          save_dir: "/tmp/diff_log/"
-          offline: opt.debug
-          id: nowname
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/configs/train_pokemon.yaml b/examples/tutorial/stable_diffusion/configs/train_pokemon.yaml
deleted file mode 100644
index 8b5d2adfaf17..000000000000
--- a/examples/tutorial/stable_diffusion/configs/train_pokemon.yaml
+++ /dev/null
@@ -1,121 +0,0 @@
-model:
-  base_learning_rate: 1.0e-04
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: caption
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: false   # Note: different from the one we trained before
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-    check_nan_inf: False
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 10000 ]
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1.e-4 ]
-        f_min: [ 1.e-10  ]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32 # unused
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: False
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
-      params:
-        use_fp16: True
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 32
-    wrap: False
-    train:
-      target: ldm.data.pokemon.PokemonDataset
-      # params:
-        # file_path: "/data/scratch/diffuser/laion_part0/"
-        # world_size: 1
-        # rank: 0
-
-lightning:
-  trainer:
-    accelerator: 'gpu' 
-    devices: 4
-    log_gpu_memory: all
-    max_epochs: 2
-    precision: 16
-    auto_select_gpus: False
-    strategy:
-      target: pytorch_lightning.strategies.ColossalAIStrategy
-      params:
-        use_chunk: False
-        enable_distributed_storage: True,
-        placement_policy: cuda
-        force_outputs_fp32: False
-        initial_scale: 65536
-        min_scale: 1
-        max_scale: 65536
-        # max_scale: 4294967296
-
-    log_every_n_steps: 2
-    logger: True
-    default_root_dir: "/tmp/diff_log/"
-    profiler: pytorch
-
-  logger_config:
-    wandb:
-      target: pytorch_lightning.loggers.WandbLogger
-      params:
-          name: nowname
-          save_dir: "/tmp/diff_log/"
-          offline: opt.debug
-          id: nowname
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/environment.yaml b/examples/tutorial/stable_diffusion/environment.yaml
deleted file mode 100644
index 7d8aec86f288..000000000000
--- a/examples/tutorial/stable_diffusion/environment.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: ldm
-channels:
-  - pytorch
-  - defaults
-dependencies:
-  - python=3.9.12
-  - pip=20.3
-  - cudatoolkit=11.3
-  - pytorch=1.11.0
-  - torchvision=0.12.0
-  - numpy=1.19.2
-  - pip:
-    - albumentations==0.4.3
-    - datasets
-    - diffusers
-    - opencv-python==4.6.0.66
-    - pudb==2019.2
-    - invisible-watermark
-    - imageio==2.9.0
-    - imageio-ffmpeg==0.4.2
-    - pytorch-lightning==1.8.0
-    - omegaconf==2.1.1
-    - test-tube>=0.7.5
-    - streamlit>=0.73.1
-    - einops==0.3.0
-    - torch-fidelity==0.3.0
-    - transformers==4.19.2
-    - torchmetrics==0.7.0
-    - kornia==0.6
-    - prefetch_generator
-    - colossalai
-    - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
-    - -e git+https://github.com/openai/CLIP.git@main#egg=clip
-    - -e .
diff --git a/examples/tutorial/stable_diffusion/ldm/data/__init__.py b/examples/tutorial/stable_diffusion/ldm/data/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/examples/tutorial/stable_diffusion/ldm/data/base.py b/examples/tutorial/stable_diffusion/ldm/data/base.py
deleted file mode 100644
index 4f3cd35714a0..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/data/base.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import math
-from abc import abstractmethod
-
-import torch
-from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
-import os
-import numpy as np
-import cv2
-
-class Txt2ImgIterableBaseDataset(IterableDataset):
-    '''
-    Define an interface to make the IterableDatasets for text2img data chainable
-    '''
-    def __init__(self, file_path: str, rank, world_size):
-        super().__init__()
-        self.file_path = file_path
-        self.folder_list = []
-        self.file_list = []
-        self.txt_list = []
-        self.info = self._get_file_info(file_path)
-        self.start = self.info['start']
-        self.end = self.info['end']
-        self.rank = rank
-
-        self.world_size = world_size
-        # self.per_worker = int(math.floor((self.end - self.start) / float(self.world_size)))
-        # self.iter_start = self.start + self.rank * self.per_worker
-        # self.iter_end = min(self.iter_start + self.per_worker, self.end)
-        # self.num_records = self.iter_end - self.iter_start
-        # self.valid_ids = [i for i in range(self.iter_end)]
-        self.num_records = self.end - self.start
-        self.valid_ids = [i for i in range(self.end)]
-
-        print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')
-
-    def __len__(self):
-        # return self.iter_end - self.iter_start
-        return self.end - self.start
-
-    def __iter__(self):
-        sample_iterator = self._sample_generator(self.start, self.end)
-        # sample_iterator = self._sample_generator(self.iter_start, self.iter_end)
-        return sample_iterator
-
-    def _sample_generator(self, start, end):
-        for idx in range(start, end):
-            file_name = self.file_list[idx]
-            txt_name = self.txt_list[idx]
-            f_ = open(txt_name, 'r')
-            txt_ = f_.read()
-            f_.close()
-            image = cv2.imdecode(np.fromfile(file_name, dtype=np.uint8), 1)
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            image = torch.from_numpy(image) / 255
-            yield {"caption": txt_, "image":image}
-
-
-    def _get_file_info(self, file_path):
-        info = \
-        {
-            "start": 1,
-            "end": 0,
-        }
-        self.folder_list = [file_path + i for i in os.listdir(file_path) if '.' not in i]
-        for folder in self.folder_list:
-            files = [folder + '/' + i for i in os.listdir(folder) if 'jpg' in i]
-            txts = [k.replace('jpg', 'txt') for k in files]
-            self.file_list.extend(files)
-            self.txt_list.extend(txts)
-        info['end'] = len(self.file_list)
-        # with open(file_path, 'r') as fin:
-        #     for _ in enumerate(fin):
-        #         info['end'] += 1
-        # self.txt_list = [k.replace('jpg', 'txt') for k in self.file_list]
-        return info
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/data/cifar10.py b/examples/tutorial/stable_diffusion/ldm/data/cifar10.py
deleted file mode 100644
index 53cd61263b47..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/data/cifar10.py
+++ /dev/null
@@ -1,184 +0,0 @@
-from typing import Dict
-import numpy as np
-from omegaconf import DictConfig, ListConfig
-import torch
-from torch.utils.data import Dataset
-from pathlib import Path
-import json
-from PIL import Image
-from torchvision import transforms
-from einops import rearrange
-from ldm.util import instantiate_from_config
-from datasets import load_dataset
-
-def make_multi_folder_data(paths, caption_files=None, **kwargs):
-    """Make a concat dataset from multiple folders
-    Don't suport captions yet
-    If paths is a list, that's ok, if it's a Dict interpret it as:
-    k=folder v=n_times to repeat that
-    """
-    list_of_paths = []
-    if isinstance(paths, (Dict, DictConfig)):
-        assert caption_files is None, \
-            "Caption files not yet supported for repeats"
-        for folder_path, repeats in paths.items():
-            list_of_paths.extend([folder_path]*repeats)
-        paths = list_of_paths
-
-    if caption_files is not None:
-        datasets = [FolderData(p, caption_file=c, **kwargs) for (p, c) in zip(paths, caption_files)]
-    else:
-        datasets = [FolderData(p, **kwargs) for p in paths]
-    return torch.utils.data.ConcatDataset(datasets)
-
-class FolderData(Dataset):
-    def __init__(self,
-        root_dir,
-        caption_file=None,
-        image_transforms=[],
-        ext="jpg",
-        default_caption="",
-        postprocess=None,
-        return_paths=False,
-        ) -> None:
-        """Create a dataset from a folder of images.
-        If you pass in a root directory it will be searched for images
-        ending in ext (ext can be a list)
-        """
-        self.root_dir = Path(root_dir)
-        self.default_caption = default_caption
-        self.return_paths = return_paths
-        if isinstance(postprocess, DictConfig):
-            postprocess = instantiate_from_config(postprocess)
-        self.postprocess = postprocess
-        if caption_file is not None:
-            with open(caption_file, "rt") as f:
-                ext = Path(caption_file).suffix.lower()
-                if ext == ".json":
-                    captions = json.load(f)
-                elif ext == ".jsonl":
-                    lines = f.readlines()
-                    lines = [json.loads(x) for x in lines]
-                    captions = {x["file_name"]: x["text"].strip("\n") for x in lines}
-                else:
-                    raise ValueError(f"Unrecognised format: {ext}")
-            self.captions = captions
-        else:
-            self.captions = None
-
-        if not isinstance(ext, (tuple, list, ListConfig)):
-            ext = [ext]
-
-        # Only used if there is no caption file
-        self.paths = []
-        for e in ext:
-            self.paths.extend(list(self.root_dir.rglob(f"*.{e}")))
-        if isinstance(image_transforms, ListConfig):
-            image_transforms = [instantiate_from_config(tt) for tt in image_transforms]
-        image_transforms.extend([transforms.ToTensor(),
-                                 transforms.Lambda(lambda x: rearrange(x * 2. - 1., 'c h w -> h w c'))])
-        image_transforms = transforms.Compose(image_transforms)
-        self.tform = image_transforms
-
-
-    def __len__(self):
-        if self.captions is not None:
-            return len(self.captions.keys())
-        else:
-            return len(self.paths)
-
-    def __getitem__(self, index):
-        data = {}
-        if self.captions is not None:
-            chosen = list(self.captions.keys())[index]
-            caption = self.captions.get(chosen, None)
-            if caption is None:
-                caption = self.default_caption
-            filename = self.root_dir/chosen
-        else:
-            filename = self.paths[index]
-
-        if self.return_paths:
-            data["path"] = str(filename)
-
-        im = Image.open(filename)
-        im = self.process_im(im)
-        data["image"] = im
-
-        if self.captions is not None:
-            data["txt"] = caption
-        else:
-            data["txt"] = self.default_caption
-
-        if self.postprocess is not None:
-            data = self.postprocess(data)
-
-        return data
-
-    def process_im(self, im):
-        im = im.convert("RGB")
-        return self.tform(im)
-
-def hf_dataset(
-    name,
-    image_transforms=[],
-    image_column="img",
-    label_column="label",
-    text_column="txt",
-    split='train',
-    image_key='image',
-    caption_key='txt',
-    ):
-    """Make huggingface dataset with appropriate list of transforms applied
-    """
-    ds = load_dataset(name, split=split)
-    image_transforms = [instantiate_from_config(tt) for tt in image_transforms]
-    image_transforms.extend([transforms.ToTensor(),
-                                transforms.Lambda(lambda x: rearrange(x * 2. - 1., 'c h w -> h w c'))])
-    tform = transforms.Compose(image_transforms)
-
-    assert image_column in ds.column_names, f"Didn't find column {image_column} in {ds.column_names}"
-    assert label_column in ds.column_names, f"Didn't find column {label_column} in {ds.column_names}"
-
-    def pre_process(examples):
-        processed = {}
-        processed[image_key] = [tform(im) for im in examples[image_column]]
-
-        label_to_text_dict = {0: "airplane", 1: "automobile", 2: "bird", 3: "cat", 4: "deer", 5: "dog", 6: "frog", 7: "horse", 8: "ship", 9: "truck"}
-
-        processed[caption_key] = [label_to_text_dict[label] for label in examples[label_column]]
-
-        return processed
-
-    ds.set_transform(pre_process)
-    return ds
-
-class TextOnly(Dataset):
-    def __init__(self, captions, output_size, image_key="image", caption_key="txt", n_gpus=1):
-        """Returns only captions with dummy images"""
-        self.output_size = output_size
-        self.image_key = image_key
-        self.caption_key = caption_key
-        if isinstance(captions, Path):
-            self.captions = self._load_caption_file(captions)
-        else:
-            self.captions = captions
-
-        if n_gpus > 1:
-            # hack to make sure that all the captions appear on each gpu
-            repeated = [n_gpus*[x] for x in self.captions]
-            self.captions = []
-            [self.captions.extend(x) for x in repeated]
-
-    def __len__(self):
-        return len(self.captions)
-
-    def __getitem__(self, index):
-        dummy_im = torch.zeros(3, self.output_size, self.output_size)
-        dummy_im = rearrange(dummy_im * 2. - 1., 'c h w -> h w c')
-        return {self.image_key: dummy_im, self.caption_key: self.captions[index]}
-
-    def _load_caption_file(self, filename):
-        with open(filename, 'rt') as f:
-            captions = f.readlines()
-        return [x.strip('\n') for x in captions]
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/data/imagenet.py b/examples/tutorial/stable_diffusion/ldm/data/imagenet.py
deleted file mode 100644
index 1c473f9c6965..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/data/imagenet.py
+++ /dev/null
@@ -1,394 +0,0 @@
-import os, yaml, pickle, shutil, tarfile, glob
-import cv2
-import albumentations
-import PIL
-import numpy as np
-import torchvision.transforms.functional as TF
-from omegaconf import OmegaConf
-from functools import partial
-from PIL import Image
-from tqdm import tqdm
-from torch.utils.data import Dataset, Subset
-
-import taming.data.utils as tdu
-from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
-from taming.data.imagenet import ImagePaths
-
-from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
-
-
-def synset2idx(path_to_yaml="data/index_synset.yaml"):
-    with open(path_to_yaml) as f:
-        di2s = yaml.load(f)
-    return dict((v,k) for k,v in di2s.items())
-
-
-class ImageNetBase(Dataset):
-    def __init__(self, config=None):
-        self.config = config or OmegaConf.create()
-        if not type(self.config)==dict:
-            self.config = OmegaConf.to_container(self.config)
-        self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
-        self.process_images = True  # if False we skip loading & processing images and self.data contains filepaths
-        self._prepare()
-        self._prepare_synset_to_human()
-        self._prepare_idx_to_synset()
-        self._prepare_human_to_integer_label()
-        self._load()
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, i):
-        return self.data[i]
-
-    def _prepare(self):
-        raise NotImplementedError()
-
-    def _filter_relpaths(self, relpaths):
-        ignore = set([
-            "n06596364_9591.JPEG",
-        ])
-        relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
-        if "sub_indices" in self.config:
-            indices = str_to_indices(self.config["sub_indices"])
-            synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn)  # returns a list of strings
-            self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
-            files = []
-            for rpath in relpaths:
-                syn = rpath.split("/")[0]
-                if syn in synsets:
-                    files.append(rpath)
-            return files
-        else:
-            return relpaths
-
-    def _prepare_synset_to_human(self):
-        SIZE = 2655750
-        URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
-        self.human_dict = os.path.join(self.root, "synset_human.txt")
-        if (not os.path.exists(self.human_dict) or
-                not os.path.getsize(self.human_dict)==SIZE):
-            download(URL, self.human_dict)
-
-    def _prepare_idx_to_synset(self):
-        URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
-        self.idx2syn = os.path.join(self.root, "index_synset.yaml")
-        if (not os.path.exists(self.idx2syn)):
-            download(URL, self.idx2syn)
-
-    def _prepare_human_to_integer_label(self):
-        URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
-        self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
-        if (not os.path.exists(self.human2integer)):
-            download(URL, self.human2integer)
-        with open(self.human2integer, "r") as f:
-            lines = f.read().splitlines()
-            assert len(lines) == 1000
-            self.human2integer_dict = dict()
-            for line in lines:
-                value, key = line.split(":")
-                self.human2integer_dict[key] = int(value)
-
-    def _load(self):
-        with open(self.txt_filelist, "r") as f:
-            self.relpaths = f.read().splitlines()
-            l1 = len(self.relpaths)
-            self.relpaths = self._filter_relpaths(self.relpaths)
-            print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
-
-        self.synsets = [p.split("/")[0] for p in self.relpaths]
-        self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
-
-        unique_synsets = np.unique(self.synsets)
-        class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
-        if not self.keep_orig_class_label:
-            self.class_labels = [class_dict[s] for s in self.synsets]
-        else:
-            self.class_labels = [self.synset2idx[s] for s in self.synsets]
-
-        with open(self.human_dict, "r") as f:
-            human_dict = f.read().splitlines()
-            human_dict = dict(line.split(maxsplit=1) for line in human_dict)
-
-        self.human_labels = [human_dict[s] for s in self.synsets]
-
-        labels = {
-            "relpath": np.array(self.relpaths),
-            "synsets": np.array(self.synsets),
-            "class_label": np.array(self.class_labels),
-            "human_label": np.array(self.human_labels),
-        }
-
-        if self.process_images:
-            self.size = retrieve(self.config, "size", default=256)
-            self.data = ImagePaths(self.abspaths,
-                                   labels=labels,
-                                   size=self.size,
-                                   random_crop=self.random_crop,
-                                   )
-        else:
-            self.data = self.abspaths
-
-
-class ImageNetTrain(ImageNetBase):
-    NAME = "ILSVRC2012_train"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
-    FILES = [
-        "ILSVRC2012_img_train.tar",
-    ]
-    SIZES = [
-        147897477120,
-    ]
-
-    def __init__(self, process_images=True, data_root=None, **kwargs):
-        self.process_images = process_images
-        self.data_root = data_root
-        super().__init__(**kwargs)
-
-    def _prepare(self):
-        if self.data_root:
-            self.root = os.path.join(self.data_root, self.NAME)
-        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
-
-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
-        self.expected_length = 1281167
-        self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
-                                    default=True)
-        if not tdu.is_prepared(self.root):
-            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
-
-            datadir = self.datadir
-            if not os.path.exists(datadir):
-                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
-                    import academictorrents as at
-                    atpath = at.get(self.AT_HASH, datastore=self.root)
-                    assert atpath == path
-
-                print("Extracting {} to {}".format(path, datadir))
-                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
-                    tar.extractall(path=datadir)
-
-                print("Extracting sub-tars.")
-                subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
-                for subpath in tqdm(subpaths):
-                    subdir = subpath[:-len(".tar")]
-                    os.makedirs(subdir, exist_ok=True)
-                    with tarfile.open(subpath, "r:") as tar:
-                        tar.extractall(path=subdir)
-
-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
-            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
-            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
-                f.write(filelist)
-
-            tdu.mark_prepared(self.root)
-
-
-class ImageNetValidation(ImageNetBase):
-    NAME = "ILSVRC2012_validation"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
-    VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
-    FILES = [
-        "ILSVRC2012_img_val.tar",
-        "validation_synset.txt",
-    ]
-    SIZES = [
-        6744924160,
-        1950000,
-    ]
-
-    def __init__(self, process_images=True, data_root=None, **kwargs):
-        self.data_root = data_root
-        self.process_images = process_images
-        super().__init__(**kwargs)
-
-    def _prepare(self):
-        if self.data_root:
-            self.root = os.path.join(self.data_root, self.NAME)
-        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
-        self.expected_length = 50000
-        self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
-                                    default=False)
-        if not tdu.is_prepared(self.root):
-            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
-
-            datadir = self.datadir
-            if not os.path.exists(datadir):
-                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
-                    import academictorrents as at
-                    atpath = at.get(self.AT_HASH, datastore=self.root)
-                    assert atpath == path
-
-                print("Extracting {} to {}".format(path, datadir))
-                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
-                    tar.extractall(path=datadir)
-
-                vspath = os.path.join(self.root, self.FILES[1])
-                if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
-                    download(self.VS_URL, vspath)
-
-                with open(vspath, "r") as f:
-                    synset_dict = f.read().splitlines()
-                    synset_dict = dict(line.split() for line in synset_dict)
-
-                print("Reorganizing into synset folders")
-                synsets = np.unique(list(synset_dict.values()))
-                for s in synsets:
-                    os.makedirs(os.path.join(datadir, s), exist_ok=True)
-                for k, v in synset_dict.items():
-                    src = os.path.join(datadir, k)
-                    dst = os.path.join(datadir, v)
-                    shutil.move(src, dst)
-
-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
-            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
-            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
-                f.write(filelist)
-
-            tdu.mark_prepared(self.root)
-
-
-
-class ImageNetSR(Dataset):
-    def __init__(self, size=None,
-                 degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
-                 random_crop=True):
-        """
-        Imagenet Superresolution Dataloader
-        Performs following ops in order:
-        1.  crops a crop of size s from image either as random or center crop
-        2.  resizes crop to size with cv2.area_interpolation
-        3.  degrades resized crop with degradation_fn
-
-        :param size: resizing to size after cropping
-        :param degradation: degradation_fn, e.g. cv_bicubic or bsrgan_light
-        :param downscale_f: Low Resolution Downsample factor
-        :param min_crop_f: determines crop size s,
-          where s = c * min_img_side_len with c sampled from interval (min_crop_f, max_crop_f)
-        :param max_crop_f: ""
-        :param data_root:
-        :param random_crop:
-        """
-        self.base = self.get_base()
-        assert size
-        assert (size / downscale_f).is_integer()
-        self.size = size
-        self.LR_size = int(size / downscale_f)
-        self.min_crop_f = min_crop_f
-        self.max_crop_f = max_crop_f
-        assert(max_crop_f <= 1.)
-        self.center_crop = not random_crop
-
-        self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
-
-        self.pil_interpolation = False # gets reset later if incase interp_op is from pillow
-
-        if degradation == "bsrgan":
-            self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
-
-        elif degradation == "bsrgan_light":
-            self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
-
-        else:
-            interpolation_fn = {
-            "cv_nearest": cv2.INTER_NEAREST,
-            "cv_bilinear": cv2.INTER_LINEAR,
-            "cv_bicubic": cv2.INTER_CUBIC,
-            "cv_area": cv2.INTER_AREA,
-            "cv_lanczos": cv2.INTER_LANCZOS4,
-            "pil_nearest": PIL.Image.NEAREST,
-            "pil_bilinear": PIL.Image.BILINEAR,
-            "pil_bicubic": PIL.Image.BICUBIC,
-            "pil_box": PIL.Image.BOX,
-            "pil_hamming": PIL.Image.HAMMING,
-            "pil_lanczos": PIL.Image.LANCZOS,
-            }[degradation]
-
-            self.pil_interpolation = degradation.startswith("pil_")
-
-            if self.pil_interpolation:
-                self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
-
-            else:
-                self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
-                                                                          interpolation=interpolation_fn)
-
-    def __len__(self):
-        return len(self.base)
-
-    def __getitem__(self, i):
-        example = self.base[i]
-        image = Image.open(example["file_path_"])
-
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-
-        image = np.array(image).astype(np.uint8)
-
-        min_side_len = min(image.shape[:2])
-        crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
-        crop_side_len = int(crop_side_len)
-
-        if self.center_crop:
-            self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
-
-        else:
-            self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
-
-        image = self.cropper(image=image)["image"]
-        image = self.image_rescaler(image=image)["image"]
-
-        if self.pil_interpolation:
-            image_pil = PIL.Image.fromarray(image)
-            LR_image = self.degradation_process(image_pil)
-            LR_image = np.array(LR_image).astype(np.uint8)
-
-        else:
-            LR_image = self.degradation_process(image=image)["image"]
-
-        example["image"] = (image/127.5 - 1.0).astype(np.float32)
-        example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
-
-        return example
-
-
-class ImageNetSRTrain(ImageNetSR):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def get_base(self):
-        with open("data/imagenet_train_hr_indices.p", "rb") as f:
-            indices = pickle.load(f)
-        dset = ImageNetTrain(process_images=False,)
-        return Subset(dset, indices)
-
-
-class ImageNetSRValidation(ImageNetSR):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def get_base(self):
-        with open("data/imagenet_val_hr_indices.p", "rb") as f:
-            indices = pickle.load(f)
-        dset = ImageNetValidation(process_images=False,)
-        return Subset(dset, indices)
diff --git a/examples/tutorial/stable_diffusion/ldm/data/lsun.py b/examples/tutorial/stable_diffusion/ldm/data/lsun.py
deleted file mode 100644
index 6256e45715ff..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/data/lsun.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import os
-import numpy as np
-import PIL
-from PIL import Image
-from torch.utils.data import Dataset
-from torchvision import transforms
-
-
-class LSUNBase(Dataset):
-    def __init__(self,
-                 txt_file,
-                 data_root,
-                 size=None,
-                 interpolation="bicubic",
-                 flip_p=0.5
-                 ):
-        self.data_paths = txt_file
-        self.data_root = data_root
-        with open(self.data_paths, "r") as f:
-            self.image_paths = f.read().splitlines()
-        self._length = len(self.image_paths)
-        self.labels = {
-            "relative_file_path_": [l for l in self.image_paths],
-            "file_path_": [os.path.join(self.data_root, l)
-                           for l in self.image_paths],
-        }
-
-        self.size = size
-        self.interpolation = {"linear": PIL.Image.LINEAR,
-                              "bilinear": PIL.Image.BILINEAR,
-                              "bicubic": PIL.Image.BICUBIC,
-                              "lanczos": PIL.Image.LANCZOS,
-                              }[interpolation]
-        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, i):
-        example = dict((k, self.labels[k][i]) for k in self.labels)
-        image = Image.open(example["file_path_"])
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-
-        # default to score-sde preprocessing
-        img = np.array(image).astype(np.uint8)
-        crop = min(img.shape[0], img.shape[1])
-        h, w, = img.shape[0], img.shape[1]
-        img = img[(h - crop) // 2:(h + crop) // 2,
-              (w - crop) // 2:(w + crop) // 2]
-
-        image = Image.fromarray(img)
-        if self.size is not None:
-            image = image.resize((self.size, self.size), resample=self.interpolation)
-
-        image = self.flip(image)
-        image = np.array(image).astype(np.uint8)
-        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
-        return example
-
-
-class LSUNChurchesTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs)
-
-
-class LSUNChurchesValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches",
-                         flip_p=flip_p, **kwargs)
-
-
-class LSUNBedroomsTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs)
-
-
-class LSUNBedroomsValidation(LSUNBase):
-    def __init__(self, flip_p=0.0, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms",
-                         flip_p=flip_p, **kwargs)
-
-
-class LSUNCatsTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs)
-
-
-class LSUNCatsValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats",
-                         flip_p=flip_p, **kwargs)
diff --git a/examples/tutorial/stable_diffusion/ldm/lr_scheduler.py b/examples/tutorial/stable_diffusion/ldm/lr_scheduler.py
deleted file mode 100644
index be39da9ca6da..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/lr_scheduler.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import numpy as np
-
-
-class LambdaWarmUpCosineScheduler:
-    """
-    note: use with a base_lr of 1.0
-    """
-    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
-        self.lr_warm_up_steps = warm_up_steps
-        self.lr_start = lr_start
-        self.lr_min = lr_min
-        self.lr_max = lr_max
-        self.lr_max_decay_steps = max_decay_steps
-        self.last_lr = 0.
-        self.verbosity_interval = verbosity_interval
-
-    def schedule(self, n, **kwargs):
-        if self.verbosity_interval > 0:
-            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
-        if n < self.lr_warm_up_steps:
-            lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
-            self.last_lr = lr
-            return lr
-        else:
-            t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
-            t = min(t, 1.0)
-            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
-                    1 + np.cos(t * np.pi))
-            self.last_lr = lr
-            return lr
-
-    def __call__(self, n, **kwargs):
-        return self.schedule(n,**kwargs)
-
-
-class LambdaWarmUpCosineScheduler2:
-    """
-    supports repeated iterations, configurable via lists
-    note: use with a base_lr of 1.0.
-    """
-    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
-        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
-        self.lr_warm_up_steps = warm_up_steps
-        self.f_start = f_start
-        self.f_min = f_min
-        self.f_max = f_max
-        self.cycle_lengths = cycle_lengths
-        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
-        self.last_f = 0.
-        self.verbosity_interval = verbosity_interval
-
-    def find_in_interval(self, n):
-        interval = 0
-        for cl in self.cum_cycles[1:]:
-            if n <= cl:
-                return interval
-            interval += 1
-
-    def schedule(self, n, **kwargs):
-        cycle = self.find_in_interval(n)
-        n = n - self.cum_cycles[cycle]
-        if self.verbosity_interval > 0:
-            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
-                                                       f"current cycle {cycle}")
-        if n < self.lr_warm_up_steps[cycle]:
-            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
-            self.last_f = f
-            return f
-        else:
-            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
-            t = min(t, 1.0)
-            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
-                    1 + np.cos(t * np.pi))
-            self.last_f = f
-            return f
-
-    def __call__(self, n, **kwargs):
-        return self.schedule(n, **kwargs)
-
-
-class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
-
-    def schedule(self, n, **kwargs):
-        cycle = self.find_in_interval(n)
-        n = n - self.cum_cycles[cycle]
-        if self.verbosity_interval > 0:
-            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
-                                                       f"current cycle {cycle}")
-
-        if n < self.lr_warm_up_steps[cycle]:
-            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
-            self.last_f = f
-            return f
-        else:
-            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])
-            self.last_f = f
-            return f
-
diff --git a/examples/tutorial/stable_diffusion/ldm/models/autoencoder.py b/examples/tutorial/stable_diffusion/ldm/models/autoencoder.py
deleted file mode 100644
index 873d8b69bd22..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/models/autoencoder.py
+++ /dev/null
@@ -1,544 +0,0 @@
-import torch
-import pytorch_lightning as pl
-import torch.nn.functional as F
-from contextlib import contextmanager
-
-from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
-
-from ldm.modules.diffusionmodules.model import Encoder, Decoder
-from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
-
-from ldm.util import instantiate_from_config
-
-
-class VQModel(pl.LightningModule):
-    def __init__(self,
-                 ddconfig,
-                 lossconfig,
-                 n_embed,
-                 embed_dim,
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 image_key="image",
-                 colorize_nlabels=None,
-                 monitor=None,
-                 batch_resize_range=None,
-                 scheduler_config=None,
-                 lr_g_factor=1.0,
-                 remap=None,
-                 sane_index_shape=False, # tell vector quantizer to return indices as bhw
-                 use_ema=False
-                 ):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.n_embed = n_embed
-        self.image_key = image_key
-        self.encoder = Encoder(**ddconfig)
-        self.decoder = Decoder(**ddconfig)
-        self.loss = instantiate_from_config(lossconfig)
-        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
-                                        remap=remap,
-                                        sane_index_shape=sane_index_shape)
-        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
-        if colorize_nlabels is not None:
-            assert type(colorize_nlabels)==int
-            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
-        if monitor is not None:
-            self.monitor = monitor
-        self.batch_resize_range = batch_resize_range
-        if self.batch_resize_range is not None:
-            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
-
-        self.use_ema = use_ema
-        if self.use_ema:
-            self.model_ema = LitEma(self)
-            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
-
-        if ckpt_path is not None:
-            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
-        self.scheduler_config = scheduler_config
-        self.lr_g_factor = lr_g_factor
-
-    @contextmanager
-    def ema_scope(self, context=None):
-        if self.use_ema:
-            self.model_ema.store(self.parameters())
-            self.model_ema.copy_to(self)
-            if context is not None:
-                print(f"{context}: Switched to EMA weights")
-        try:
-            yield None
-        finally:
-            if self.use_ema:
-                self.model_ema.restore(self.parameters())
-                if context is not None:
-                    print(f"{context}: Restored training weights")
-
-    def init_from_ckpt(self, path, ignore_keys=list()):
-        sd = torch.load(path, map_location="cpu")["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
-                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
-        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
-            print(f"Unexpected Keys: {unexpected}")
-
-    def on_train_batch_end(self, *args, **kwargs):
-        if self.use_ema:
-            self.model_ema(self)
-
-    def encode(self, x):
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        quant, emb_loss, info = self.quantize(h)
-        return quant, emb_loss, info
-
-    def encode_to_prequant(self, x):
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        return h
-
-    def decode(self, quant):
-        quant = self.post_quant_conv(quant)
-        dec = self.decoder(quant)
-        return dec
-
-    def decode_code(self, code_b):
-        quant_b = self.quantize.embed_code(code_b)
-        dec = self.decode(quant_b)
-        return dec
-
-    def forward(self, input, return_pred_indices=False):
-        quant, diff, (_,_,ind) = self.encode(input)
-        dec = self.decode(quant)
-        if return_pred_indices:
-            return dec, diff, ind
-        return dec, diff
-
-    def get_input(self, batch, k):
-        x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
-        if self.batch_resize_range is not None:
-            lower_size = self.batch_resize_range[0]
-            upper_size = self.batch_resize_range[1]
-            if self.global_step <= 4:
-                # do the first few batches with max size to avoid later oom
-                new_resize = upper_size
-            else:
-                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
-            if new_resize != x.shape[2]:
-                x = F.interpolate(x, size=new_resize, mode="bicubic")
-            x = x.detach()
-        return x
-
-    def training_step(self, batch, batch_idx, optimizer_idx):
-        # https://github.com/pytorch/pytorch/issues/37142
-        # try not to fool the heuristics
-        x = self.get_input(batch, self.image_key)
-        xrec, qloss, ind = self(x, return_pred_indices=True)
-
-        if optimizer_idx == 0:
-            # autoencode
-            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train",
-                                            predicted_indices=ind)
-
-            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
-            return aeloss
-
-        if optimizer_idx == 1:
-            # discriminator
-            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train")
-            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
-            return discloss
-
-    def validation_step(self, batch, batch_idx):
-        log_dict = self._validation_step(batch, batch_idx)
-        with self.ema_scope():
-            log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
-        return log_dict
-
-    def _validation_step(self, batch, batch_idx, suffix=""):
-        x = self.get_input(batch, self.image_key)
-        xrec, qloss, ind = self(x, return_pred_indices=True)
-        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
-                                        self.global_step,
-                                        last_layer=self.get_last_layer(),
-                                        split="val"+suffix,
-                                        predicted_indices=ind
-                                        )
-
-        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
-                                            self.global_step,
-                                            last_layer=self.get_last_layer(),
-                                            split="val"+suffix,
-                                            predicted_indices=ind
-                                            )
-        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
-        self.log(f"val{suffix}/rec_loss", rec_loss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
-        self.log(f"val{suffix}/aeloss", aeloss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
-        if version.parse(pl.__version__) >= version.parse('1.4.0'):
-            del log_dict_ae[f"val{suffix}/rec_loss"]
-        self.log_dict(log_dict_ae)
-        self.log_dict(log_dict_disc)
-        return self.log_dict
-
-    def configure_optimizers(self):
-        lr_d = self.learning_rate
-        lr_g = self.lr_g_factor*self.learning_rate
-        print("lr_d", lr_d)
-        print("lr_g", lr_g)
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quantize.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
-                                  lr=lr_g, betas=(0.5, 0.9))
-        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
-                                    lr=lr_d, betas=(0.5, 0.9))
-
-        if self.scheduler_config is not None:
-            scheduler = instantiate_from_config(self.scheduler_config)
-
-            print("Setting up LambdaLR scheduler...")
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                },
-                {
-                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                },
-            ]
-            return [opt_ae, opt_disc], scheduler
-        return [opt_ae, opt_disc], []
-
-    def get_last_layer(self):
-        return self.decoder.conv_out.weight
-
-    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
-        log = dict()
-        x = self.get_input(batch, self.image_key)
-        x = x.to(self.device)
-        if only_inputs:
-            log["inputs"] = x
-            return log
-        xrec, _ = self(x)
-        if x.shape[1] > 3:
-            # colorize with random projection
-            assert xrec.shape[1] > 3
-            x = self.to_rgb(x)
-            xrec = self.to_rgb(xrec)
-        log["inputs"] = x
-        log["reconstructions"] = xrec
-        if plot_ema:
-            with self.ema_scope():
-                xrec_ema, _ = self(x)
-                if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
-                log["reconstructions_ema"] = xrec_ema
-        return log
-
-    def to_rgb(self, x):
-        assert self.image_key == "segmentation"
-        if not hasattr(self, "colorize"):
-            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
-        x = F.conv2d(x, weight=self.colorize)
-        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
-        return x
-
-
-class VQModelInterface(VQModel):
-    def __init__(self, embed_dim, *args, **kwargs):
-        super().__init__(embed_dim=embed_dim, *args, **kwargs)
-        self.embed_dim = embed_dim
-
-    def encode(self, x):
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        return h
-
-    def decode(self, h, force_not_quantize=False):
-        # also go through quantization layer
-        if not force_not_quantize:
-            quant, emb_loss, info = self.quantize(h)
-        else:
-            quant = h
-        quant = self.post_quant_conv(quant)
-        dec = self.decoder(quant)
-        return dec
-
-
-class AutoencoderKL(pl.LightningModule):
-    def __init__(self,
-                 ddconfig,
-                 lossconfig,
-                 embed_dim,
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 image_key="image",
-                 colorize_nlabels=None,
-                 monitor=None,
-                 from_pretrained: str=None
-                 ):
-        super().__init__()
-        self.image_key = image_key
-        self.encoder = Encoder(**ddconfig)
-        self.decoder = Decoder(**ddconfig)
-        self.loss = instantiate_from_config(lossconfig)
-        assert ddconfig["double_z"]
-        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
-        self.embed_dim = embed_dim
-        if colorize_nlabels is not None:
-            assert type(colorize_nlabels)==int
-            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
-        if monitor is not None:
-            self.monitor = monitor
-        if ckpt_path is not None:
-            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
-        from diffusers.modeling_utils import load_state_dict
-        if from_pretrained is not None:
-            state_dict = load_state_dict(from_pretrained)
-            self._load_pretrained_model(state_dict)
-
-    def _state_key_mapping(self, state_dict: dict):
-        import re
-        res_dict = {}
-        key_list = state_dict.keys()
-        key_str = " ".join(key_list)
-        up_block_pattern = re.compile('upsamplers')
-        p1 = re.compile('mid.block_[0-9]')
-        p2 = re.compile('decoder.up.[0-9]')
-        up_blocks_count = int(len(re.findall(up_block_pattern, key_str)) / 2 + 1)
-        for key_, val_ in state_dict.items():
-            key_ = key_.replace("up_blocks", "up").replace("down_blocks", "down").replace('resnets', 'block')\
-                .replace('mid_block', 'mid').replace("mid.block.", "mid.block_")\
-                .replace('mid.attentions.0.key', 'mid.attn_1.k')\
-                .replace('mid.attentions.0.query', 'mid.attn_1.q') \
-                .replace('mid.attentions.0.value', 'mid.attn_1.v') \
-                .replace('mid.attentions.0.group_norm', 'mid.attn_1.norm') \
-                .replace('mid.attentions.0.proj_attn', 'mid.attn_1.proj_out')\
-                .replace('upsamplers.0', 'upsample')\
-                .replace('downsamplers.0', 'downsample')\
-                .replace('conv_shortcut', 'nin_shortcut')\
-                .replace('conv_norm_out', 'norm_out')
-
-            mid_list = re.findall(p1, key_)
-            if len(mid_list) != 0:
-                mid_str = mid_list[0]
-                mid_id = int(mid_str[-1]) + 1
-                key_ = key_.replace(mid_str, mid_str[:-1] + str(mid_id))
-
-            up_list = re.findall(p2, key_)
-            if len(up_list) != 0:
-                up_str = up_list[0]
-                up_id = up_blocks_count - 1 -int(up_str[-1])
-                key_ = key_.replace(up_str, up_str[:-1] + str(up_id))
-            res_dict[key_] = val_
-        return res_dict
-
-    def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False):
-        state_dict = self._state_key_mapping(state_dict)
-        model_state_dict = self.state_dict()
-        loaded_keys = [k for k in state_dict.keys()]
-        expected_keys = list(model_state_dict.keys())
-        original_loaded_keys = loaded_keys
-        missing_keys = list(set(expected_keys) - set(loaded_keys))
-        unexpected_keys = list(set(loaded_keys) - set(expected_keys))
-
-        def _find_mismatched_keys(
-            state_dict,
-            model_state_dict,
-            loaded_keys,
-            ignore_mismatched_sizes,
-        ):
-            mismatched_keys = []
-            if ignore_mismatched_sizes:
-                for checkpoint_key in loaded_keys:
-                    model_key = checkpoint_key
-
-                    if (
-                        model_key in model_state_dict
-                        and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
-                    ):
-                        mismatched_keys.append(
-                            (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
-                        )
-                        del state_dict[checkpoint_key]
-            return mismatched_keys
-        if state_dict is not None:
-            # Whole checkpoint
-            mismatched_keys = _find_mismatched_keys(
-                state_dict,
-                model_state_dict,
-                original_loaded_keys,
-                ignore_mismatched_sizes,
-            )
-            error_msgs = self._load_state_dict_into_model(state_dict)
-        return missing_keys, unexpected_keys, mismatched_keys, error_msgs
-
-    def _load_state_dict_into_model(self, state_dict):
-        # Convert old format to new format if needed from a PyTorch state_dict
-        # copy state_dict so _load_from_state_dict can modify it
-        state_dict = state_dict.copy()
-        error_msgs = []
-
-        # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-        # so we need to apply the function recursively.
-        def load(module: torch.nn.Module, prefix=""):
-            args = (state_dict, prefix, {}, True, [], [], error_msgs)
-            module._load_from_state_dict(*args)
-
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + ".")
-
-        load(self)
-
-        return error_msgs
-
-    def init_from_ckpt(self, path, ignore_keys=list()):
-        sd = torch.load(path, map_location="cpu")["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
-                    del sd[k]
-        self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path}")
-
-    def encode(self, x):
-        h = self.encoder(x)
-        moments = self.quant_conv(h)
-        posterior = DiagonalGaussianDistribution(moments)
-        return posterior
-
-    def decode(self, z):
-        z = self.post_quant_conv(z)
-        dec = self.decoder(z)
-        return dec
-
-    def forward(self, input, sample_posterior=True):
-        posterior = self.encode(input)
-        if sample_posterior:
-            z = posterior.sample()
-        else:
-            z = posterior.mode()
-        dec = self.decode(z)
-        return dec, posterior
-
-    def get_input(self, batch, k):
-        x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
-        return x
-
-    def training_step(self, batch, batch_idx, optimizer_idx):
-        inputs = self.get_input(batch, self.image_key)
-        reconstructions, posterior = self(inputs)
-
-        if optimizer_idx == 0:
-            # train encoder+decoder+logvar
-            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train")
-            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
-            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
-            return aeloss
-
-        if optimizer_idx == 1:
-            # train the discriminator
-            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
-                                                last_layer=self.get_last_layer(), split="train")
-
-            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
-            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
-            return discloss
-
-    def validation_step(self, batch, batch_idx):
-        inputs = self.get_input(batch, self.image_key)
-        reconstructions, posterior = self(inputs)
-        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
-                                        last_layer=self.get_last_layer(), split="val")
-
-        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
-                                            last_layer=self.get_last_layer(), split="val")
-
-        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
-        self.log_dict(log_dict_ae)
-        self.log_dict(log_dict_disc)
-        return self.log_dict
-
-    def configure_optimizers(self):
-        lr = self.learning_rate
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
-                                  lr=lr, betas=(0.5, 0.9))
-        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
-                                    lr=lr, betas=(0.5, 0.9))
-        return [opt_ae, opt_disc], []
-
-    def get_last_layer(self):
-        return self.decoder.conv_out.weight
-
-    @torch.no_grad()
-    def log_images(self, batch, only_inputs=False, **kwargs):
-        log = dict()
-        x = self.get_input(batch, self.image_key)
-        x = x.to(self.device)
-        if not only_inputs:
-            xrec, posterior = self(x)
-            if x.shape[1] > 3:
-                # colorize with random projection
-                assert xrec.shape[1] > 3
-                x = self.to_rgb(x)
-                xrec = self.to_rgb(xrec)
-            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
-            log["reconstructions"] = xrec
-        log["inputs"] = x
-        return log
-
-    def to_rgb(self, x):
-        assert self.image_key == "segmentation"
-        if not hasattr(self, "colorize"):
-            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
-        x = F.conv2d(x, weight=self.colorize)
-        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
-        return x
-
-
-class IdentityFirstStage(torch.nn.Module):
-    def __init__(self, *args, vq_interface=False, **kwargs):
-        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff
-        super().__init__()
-
-    def encode(self, x, *args, **kwargs):
-        return x
-
-    def decode(self, x, *args, **kwargs):
-        return x
-
-    def quantize(self, x, *args, **kwargs):
-        if self.vq_interface:
-            return x, None, [None, None, None]
-        return x
-
-    def forward(self, x, *args, **kwargs):
-        return x
diff --git a/examples/tutorial/stable_diffusion/ldm/models/diffusion/__init__.py b/examples/tutorial/stable_diffusion/ldm/models/diffusion/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/examples/tutorial/stable_diffusion/ldm/models/diffusion/classifier.py b/examples/tutorial/stable_diffusion/ldm/models/diffusion/classifier.py
deleted file mode 100644
index 67e98b9d8ffb..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/models/diffusion/classifier.py
+++ /dev/null
@@ -1,267 +0,0 @@
-import os
-import torch
-import pytorch_lightning as pl
-from omegaconf import OmegaConf
-from torch.nn import functional as F
-from torch.optim import AdamW
-from torch.optim.lr_scheduler import LambdaLR
-from copy import deepcopy
-from einops import rearrange
-from glob import glob
-from natsort import natsorted
-
-from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
-from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
-
-__models__ = {
-    'class_label': EncoderUNetModel,
-    'segmentation': UNetModel
-}
-
-
-def disabled_train(self, mode=True):
-    """Overwrite model.train with this function to make sure train/eval mode
-    does not change anymore."""
-    return self
-
-
-class NoisyLatentImageClassifier(pl.LightningModule):
-
-    def __init__(self,
-                 diffusion_path,
-                 num_classes,
-                 ckpt_path=None,
-                 pool='attention',
-                 label_key=None,
-                 diffusion_ckpt_path=None,
-                 scheduler_config=None,
-                 weight_decay=1.e-2,
-                 log_steps=10,
-                 monitor='val/loss',
-                 *args,
-                 **kwargs):
-        super().__init__(*args, **kwargs)
-        self.num_classes = num_classes
-        # get latest config of diffusion model
-        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
-        self.diffusion_config = OmegaConf.load(diffusion_config).model
-        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
-        self.load_diffusion()
-
-        self.monitor = monitor
-        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
-        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
-        self.log_steps = log_steps
-
-        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
-            else self.diffusion_model.cond_stage_key
-
-        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'
-
-        if self.label_key not in __models__:
-            raise NotImplementedError()
-
-        self.load_classifier(ckpt_path, pool)
-
-        self.scheduler_config = scheduler_config
-        self.use_scheduler = self.scheduler_config is not None
-        self.weight_decay = weight_decay
-
-    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
-        sd = torch.load(path, map_location="cpu")
-        if "state_dict" in list(sd.keys()):
-            sd = sd["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
-                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
-            sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
-        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
-        if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
-
-    def load_diffusion(self):
-        model = instantiate_from_config(self.diffusion_config)
-        self.diffusion_model = model.eval()
-        self.diffusion_model.train = disabled_train
-        for param in self.diffusion_model.parameters():
-            param.requires_grad = False
-
-    def load_classifier(self, ckpt_path, pool):
-        model_config = deepcopy(self.diffusion_config.params.unet_config.params)
-        model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
-        model_config.out_channels = self.num_classes
-        if self.label_key == 'class_label':
-            model_config.pool = pool
-
-        self.model = __models__[self.label_key](**model_config)
-        if ckpt_path is not None:
-            print('#####################################################################')
-            print(f'load from ckpt "{ckpt_path}"')
-            print('#####################################################################')
-            self.init_from_ckpt(ckpt_path)
-
-    @torch.no_grad()
-    def get_x_noisy(self, x, t, noise=None):
-        noise = default(noise, lambda: torch.randn_like(x))
-        continuous_sqrt_alpha_cumprod = None
-        if self.diffusion_model.use_continuous_noise:
-            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
-            # todo: make sure t+1 is correct here
-
-        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
-                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
-
-    def forward(self, x_noisy, t, *args, **kwargs):
-        return self.model(x_noisy, t)
-
-    @torch.no_grad()
-    def get_input(self, batch, k):
-        x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = rearrange(x, 'b h w c -> b c h w')
-        x = x.to(memory_format=torch.contiguous_format).float()
-        return x
-
-    @torch.no_grad()
-    def get_conditioning(self, batch, k=None):
-        if k is None:
-            k = self.label_key
-        assert k is not None, 'Needs to provide label key'
-
-        targets = batch[k].to(self.device)
-
-        if self.label_key == 'segmentation':
-            targets = rearrange(targets, 'b h w c -> b c h w')
-            for down in range(self.numd):
-                h, w = targets.shape[-2:]
-                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
-
-            # targets = rearrange(targets,'b c h w -> b h w c')
-
-        return targets
-
-    def compute_top_k(self, logits, labels, k, reduction="mean"):
-        _, top_ks = torch.topk(logits, k, dim=1)
-        if reduction == "mean":
-            return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
-        elif reduction == "none":
-            return (top_ks == labels[:, None]).float().sum(dim=-1)
-
-    def on_train_epoch_start(self):
-        # save some memory
-        self.diffusion_model.model.to('cpu')
-
-    @torch.no_grad()
-    def write_logs(self, loss, logits, targets):
-        log_prefix = 'train' if self.training else 'val'
-        log = {}
-        log[f"{log_prefix}/loss"] = loss.mean()
-        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
-            logits, targets, k=1, reduction="mean"
-        )
-        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
-            logits, targets, k=5, reduction="mean"
-        )
-
-        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
-        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
-        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
-        lr = self.optimizers().param_groups[0]['lr']
-        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
-
-    def shared_step(self, batch, t=None):
-        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
-        targets = self.get_conditioning(batch)
-        if targets.dim() == 4:
-            targets = targets.argmax(dim=1)
-        if t is None:
-            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
-        else:
-            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
-        x_noisy = self.get_x_noisy(x, t)
-        logits = self(x_noisy, t)
-
-        loss = F.cross_entropy(logits, targets, reduction='none')
-
-        self.write_logs(loss.detach(), logits.detach(), targets.detach())
-
-        loss = loss.mean()
-        return loss, logits, x_noisy, targets
-
-    def training_step(self, batch, batch_idx):
-        loss, *_ = self.shared_step(batch)
-        return loss
-
-    def reset_noise_accs(self):
-        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
-                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
-
-    def on_validation_start(self):
-        self.reset_noise_accs()
-
-    @torch.no_grad()
-    def validation_step(self, batch, batch_idx):
-        loss, *_ = self.shared_step(batch)
-
-        for t in self.noisy_acc:
-            _, logits, _, targets = self.shared_step(batch, t)
-            self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
-            self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
-
-        return loss
-
-    def configure_optimizers(self):
-        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
-
-        if self.use_scheduler:
-            scheduler = instantiate_from_config(self.scheduler_config)
-
-            print("Setting up LambdaLR scheduler...")
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                }]
-            return [optimizer], scheduler
-
-        return optimizer
-
-    @torch.no_grad()
-    def log_images(self, batch, N=8, *args, **kwargs):
-        log = dict()
-        x = self.get_input(batch, self.diffusion_model.first_stage_key)
-        log['inputs'] = x
-
-        y = self.get_conditioning(batch)
-
-        if self.label_key == 'class_label':
-            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
-            log['labels'] = y
-
-        if ismap(y):
-            log['labels'] = self.diffusion_model.to_rgb(y)
-
-            for step in range(self.log_steps):
-                current_time = step * self.log_time_interval
-
-                _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)
-
-                log[f'inputs@t{current_time}'] = x_noisy
-
-                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
-                pred = rearrange(pred, 'b h w c -> b c h w')
-
-                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
-
-        for key in log:
-            log[key] = log[key][:N]
-
-        return log
diff --git a/examples/tutorial/stable_diffusion/ldm/models/diffusion/ddim.py b/examples/tutorial/stable_diffusion/ldm/models/diffusion/ddim.py
deleted file mode 100644
index 91335d6372df..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/models/diffusion/ddim.py
+++ /dev/null
@@ -1,240 +0,0 @@
-"""SAMPLING ONLY."""
-
-import torch
-import numpy as np
-from tqdm import tqdm
-from functools import partial
-
-from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
-    extract_into_tensor
-
-
-class DDIMSampler(object):
-    def __init__(self, model, schedule="linear", **kwargs):
-        super().__init__()
-        self.model = model
-        self.ddpm_num_timesteps = model.num_timesteps
-        self.schedule = schedule
-
-    def register_buffer(self, name, attr):
-        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
-        setattr(self, name, attr)
-
-    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
-        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
-        alphas_cumprod = self.model.alphas_cumprod
-        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
-        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
-
-        self.register_buffer('betas', to_torch(self.model.betas))
-        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
-        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
-
-        # calculations for diffusion q(x_t | x_{t-1}) and others
-        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
-        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
-
-        # ddim sampling parameters
-        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
-                                                                                   ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
-        self.register_buffer('ddim_sigmas', ddim_sigmas)
-        self.register_buffer('ddim_alphas', ddim_alphas)
-        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
-        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
-        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
-            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
-        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
-
-    @torch.no_grad()
-    def sample(self,
-               S,
-               batch_size,
-               shape,
-               conditioning=None,
-               callback=None,
-               normals_sequence=None,
-               img_callback=None,
-               quantize_x0=False,
-               eta=0.,
-               mask=None,
-               x0=None,
-               temperature=1.,
-               noise_dropout=0.,
-               score_corrector=None,
-               corrector_kwargs=None,
-               verbose=True,
-               x_T=None,
-               log_every_t=100,
-               unconditional_guidance_scale=1.,
-               unconditional_conditioning=None,
-               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
-               **kwargs
-               ):
-        if conditioning is not None:
-            if isinstance(conditioning, dict):
-                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
-                if cbs != batch_size:
-                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
-            else:
-                if conditioning.shape[0] != batch_size:
-                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
-
-        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
-        # sampling
-        C, H, W = shape
-        size = (batch_size, C, H, W)
-        print(f'Data shape for DDIM sampling is {size}, eta {eta}')
-
-        samples, intermediates = self.ddim_sampling(conditioning, size,
-                                                    callback=callback,
-                                                    img_callback=img_callback,
-                                                    quantize_denoised=quantize_x0,
-                                                    mask=mask, x0=x0,
-                                                    ddim_use_original_steps=False,
-                                                    noise_dropout=noise_dropout,
-                                                    temperature=temperature,
-                                                    score_corrector=score_corrector,
-                                                    corrector_kwargs=corrector_kwargs,
-                                                    x_T=x_T,
-                                                    log_every_t=log_every_t,
-                                                    unconditional_guidance_scale=unconditional_guidance_scale,
-                                                    unconditional_conditioning=unconditional_conditioning,
-                                                    )
-        return samples, intermediates
-
-    @torch.no_grad()
-    def ddim_sampling(self, cond, shape,
-                      x_T=None, ddim_use_original_steps=False,
-                      callback=None, timesteps=None, quantize_denoised=False,
-                      mask=None, x0=None, img_callback=None, log_every_t=100,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
-        device = self.model.betas.device
-        b = shape[0]
-        if x_T is None:
-            img = torch.randn(shape, device=device)
-        else:
-            img = x_T
-
-        if timesteps is None:
-            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
-        elif timesteps is not None and not ddim_use_original_steps:
-            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
-            timesteps = self.ddim_timesteps[:subset_end]
-
-        intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
-        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
-        print(f"Running DDIM Sampling with {total_steps} timesteps")
-
-        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
-
-        for i, step in enumerate(iterator):
-            index = total_steps - i - 1
-            ts = torch.full((b,), step, device=device, dtype=torch.long)
-
-            if mask is not None:
-                assert x0 is not None
-                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
-                img = img_orig * mask + (1. - mask) * img
-            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
-                                      quantize_denoised=quantize_denoised, temperature=temperature,
-                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
-                                      corrector_kwargs=corrector_kwargs,
-                                      unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning)
-            img, pred_x0 = outs
-            if callback: callback(i)
-            if img_callback: img_callback(pred_x0, i)
-
-            if index % log_every_t == 0 or index == total_steps - 1:
-                intermediates['x_inter'].append(img)
-                intermediates['pred_x0'].append(pred_x0)
-
-        return img, intermediates
-
-    @torch.no_grad()
-    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None):
-        b, *_, device = *x.shape, x.device
-
-        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-            e_t = self.model.apply_model(x, t, c)
-        else:
-            x_in = torch.cat([x] * 2)
-            t_in = torch.cat([t] * 2)
-            c_in = torch.cat([unconditional_conditioning, c])
-            e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-            e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
-
-        if score_corrector is not None:
-            assert self.model.parameterization == "eps"
-            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
-
-        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
-        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
-        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
-        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
-        # select parameters corresponding to the currently considered timestep
-        a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
-        a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
-        sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
-
-        # current prediction for x_0
-        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
-        if quantize_denoised:
-            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
-        # direction pointing to x_t
-        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
-        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
-        if noise_dropout > 0.:
-            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
-        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
-        return x_prev, pred_x0
-
-    @torch.no_grad()
-    def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
-        # fast, but does not allow for exact reconstruction
-        # t serves as an index to gather the correct alphas
-        if use_original_steps:
-            sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
-            sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
-        else:
-            sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
-            sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
-
-        if noise is None:
-            noise = torch.randn_like(x0)
-        return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
-                extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
-
-    @torch.no_grad()
-    def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
-               use_original_steps=False):
-
-        timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
-        timesteps = timesteps[:t_start]
-
-        time_range = np.flip(timesteps)
-        total_steps = timesteps.shape[0]
-        print(f"Running DDIM Sampling with {total_steps} timesteps")
-
-        iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
-        x_dec = x_latent
-        for i, step in enumerate(iterator):
-            index = total_steps - i - 1
-            ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
-            x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
-                                          unconditional_guidance_scale=unconditional_guidance_scale,
-                                          unconditional_conditioning=unconditional_conditioning)
-        return x_dec
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/models/diffusion/ddpm.py b/examples/tutorial/stable_diffusion/ldm/models/diffusion/ddpm.py
deleted file mode 100644
index 9633ec3d843a..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/models/diffusion/ddpm.py
+++ /dev/null
@@ -1,1554 +0,0 @@
-import torch
-import torch.nn as nn
-import numpy as np
-import pytorch_lightning as pl
-from torch.optim.lr_scheduler import LambdaLR
-from einops import rearrange, repeat
-from contextlib import contextmanager
-from functools import partial
-from tqdm import tqdm
-from torchvision.utils import make_grid
-
-from pytorch_lightning.utilities.rank_zero import rank_zero_only
-from pytorch_lightning.utilities import rank_zero_info
-
-from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
-from ldm.modules.ema import LitEma
-from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
-from ldm.models.autoencoder import VQModelInterface, IdentityFirstStage, AutoencoderKL
-from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
-from ldm.models.diffusion.ddim import DDIMSampler
-from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d
-from ldm.modules.x_transformer import *
-from ldm.modules.encoders.modules import *
-
-from ldm.modules.ema import LitEma
-from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
-from ldm.models.autoencoder import *
-from ldm.models.diffusion.ddim import *
-from ldm.modules.diffusionmodules.openaimodel import *
-from ldm.modules.diffusionmodules.model import *
-
-
-from ldm.modules.diffusionmodules.model import Model, Encoder, Decoder
-
-from ldm.util import instantiate_from_config
-
-from einops import rearrange, repeat
-
-
-
-
-__conditioning_keys__ = {'concat': 'c_concat',
-                         'crossattn': 'c_crossattn',
-                         'adm': 'y'}
-
-
-def disabled_train(self, mode=True):
-    """Overwrite model.train with this function to make sure train/eval mode
-    does not change anymore."""
-    return self
-
-
-def uniform_on_device(r1, r2, shape, device):
-    return (r1 - r2) * torch.rand(*shape, device=device) + r2
-
-
-class DDPM(pl.LightningModule):
-    # classic DDPM with Gaussian diffusion, in image space
-    def __init__(self,
-                 unet_config,
-                 timesteps=1000,
-                 beta_schedule="linear",
-                 loss_type="l2",
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 load_only_unet=False,
-                 monitor="val/loss",
-                 use_ema=True,
-                 first_stage_key="image",
-                 image_size=256,
-                 channels=3,
-                 log_every_t=100,
-                 clip_denoised=True,
-                 linear_start=1e-4,
-                 linear_end=2e-2,
-                 cosine_s=8e-3,
-                 given_betas=None,
-                 original_elbo_weight=0.,
-                 v_posterior=0.,  # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
-                 l_simple_weight=1.,
-                 conditioning_key=None,
-                 parameterization="eps",  # all assuming fixed variance schedules
-                 scheduler_config=None,
-                 use_positional_encodings=False,
-                 learn_logvar=False,
-                 logvar_init=0.,
-                 use_fp16 = True,
-                 ):
-        super().__init__()
-        assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
-        self.parameterization = parameterization
-        rank_zero_info(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
-        self.cond_stage_model = None
-        self.clip_denoised = clip_denoised
-        self.log_every_t = log_every_t
-        self.first_stage_key = first_stage_key
-        self.image_size = image_size  # try conv?
-        self.channels = channels
-        self.use_positional_encodings = use_positional_encodings
-        self.unet_config = unet_config
-        self.conditioning_key = conditioning_key
-        # self.model = DiffusionWrapper(unet_config, conditioning_key)
-        # count_params(self.model, verbose=True)
-        self.use_ema = use_ema
-        # if self.use_ema:
-        #     self.model_ema = LitEma(self.model)
-        #     print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
-
-        self.use_scheduler = scheduler_config is not None
-        if self.use_scheduler:
-            self.scheduler_config = scheduler_config
-
-        self.v_posterior = v_posterior
-        self.original_elbo_weight = original_elbo_weight
-        self.l_simple_weight = l_simple_weight
-
-        if monitor is not None:
-            self.monitor = monitor
-        self.ckpt_path = ckpt_path
-        self.ignore_keys = ignore_keys
-        self.load_only_unet = load_only_unet
-        self.given_betas = given_betas
-        self.beta_schedule = beta_schedule
-        self.timesteps = timesteps
-        self.linear_start = linear_start
-        self.linear_end = linear_end
-        self.cosine_s = cosine_s
-        # if ckpt_path is not None:
-        #     self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
-        #
-        # self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
-        #                        linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
-
-        self.loss_type = loss_type
-
-        self.learn_logvar = learn_logvar
-        self.logvar_init = logvar_init
-        # self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
-        # if self.learn_logvar:
-        #     self.logvar = nn.Parameter(self.logvar, requires_grad=True)
-        #     self.logvar = nn.Parameter(self.logvar, requires_grad=True)
-
-        self.use_fp16 = use_fp16
-        if use_fp16:
-            self.unet_config["params"].update({"use_fp16": True})
-            rank_zero_info("Using FP16 for UNet = {}".format(self.unet_config["params"]["use_fp16"]))
-        else:
-            self.unet_config["params"].update({"use_fp16": False})
-            rank_zero_info("Using FP16 for UNet = {}".format(self.unet_config["params"]["use_fp16"]))
-
-    def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
-                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
-        if exists(given_betas):
-            betas = given_betas
-        else:
-            betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
-                                       cosine_s=cosine_s)
-        alphas = 1. - betas
-        alphas_cumprod = np.cumprod(alphas, axis=0)
-        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
-
-        timesteps, = betas.shape
-        self.num_timesteps = int(timesteps)
-        self.linear_start = linear_start
-        self.linear_end = linear_end
-        assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
-
-        to_torch = partial(torch.tensor, dtype=torch.float32)
-
-        self.register_buffer('betas', to_torch(betas))
-        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
-        self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
-
-        # calculations for diffusion q(x_t | x_{t-1}) and others
-        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
-        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
-        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
-        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
-        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
-
-        # calculations for posterior q(x_{t-1} | x_t, x_0)
-        posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
-                    1. - alphas_cumprod) + self.v_posterior * betas
-        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
-        self.register_buffer('posterior_variance', to_torch(posterior_variance))
-        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
-        self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
-        self.register_buffer('posterior_mean_coef1', to_torch(
-            betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
-        self.register_buffer('posterior_mean_coef2', to_torch(
-            (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
-
-        if self.parameterization == "eps":
-            lvlb_weights = self.betas ** 2 / (
-                        2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
-        elif self.parameterization == "x0":
-            lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
-        else:
-            raise NotImplementedError("mu not supported")
-        # TODO how to choose this term
-        lvlb_weights[0] = lvlb_weights[1]
-        self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
-        assert not torch.isnan(self.lvlb_weights).all()
-
-    @contextmanager
-    def ema_scope(self, context=None):
-        if self.use_ema:
-            self.model_ema.store(self.model.parameters())
-            self.model_ema.copy_to(self.model)
-            if context is not None:
-                print(f"{context}: Switched to EMA weights")
-        try:
-            yield None
-        finally:
-            if self.use_ema:
-                self.model_ema.restore(self.model.parameters())
-                if context is not None:
-                    print(f"{context}: Restored training weights")
-
-    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
-        sd = torch.load(path, map_location="cpu")
-        if "state_dict" in list(sd.keys()):
-            sd = sd["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
-                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
-            sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
-        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
-        if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
-
-    def q_mean_variance(self, x_start, t):
-        """
-        Get the distribution q(x_t | x_0).
-        :param x_start: the [N x C x ...] tensor of noiseless inputs.
-        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
-        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
-        """
-        mean = (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start)
-        variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
-        log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
-        return mean, variance, log_variance
-
-    def predict_start_from_noise(self, x_t, t, noise):
-        return (
-                extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
-                extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
-        )
-
-    def q_posterior(self, x_start, x_t, t):
-        posterior_mean = (
-                extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
-                extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
-        )
-        posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape)
-        posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
-        return posterior_mean, posterior_variance, posterior_log_variance_clipped
-
-    def p_mean_variance(self, x, t, clip_denoised: bool):
-        model_out = self.model(x, t)
-        if self.parameterization == "eps":
-            x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
-        elif self.parameterization == "x0":
-            x_recon = model_out
-        if clip_denoised:
-            x_recon.clamp_(-1., 1.)
-
-        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
-        return model_mean, posterior_variance, posterior_log_variance
-
-    @torch.no_grad()
-    def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
-        b, *_, device = *x.shape, x.device
-        model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised)
-        noise = noise_like(x.shape, device, repeat_noise)
-        # no noise when t == 0
-        nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
-        return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
-
-    @torch.no_grad()
-    def p_sample_loop(self, shape, return_intermediates=False):
-        device = self.betas.device
-        b = shape[0]
-        img = torch.randn(shape, device=device)
-        intermediates = [img]
-        for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps):
-            img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long),
-                                clip_denoised=self.clip_denoised)
-            if i % self.log_every_t == 0 or i == self.num_timesteps - 1:
-                intermediates.append(img)
-        if return_intermediates:
-            return img, intermediates
-        return img
-
-    @torch.no_grad()
-    def sample(self, batch_size=16, return_intermediates=False):
-        image_size = self.image_size
-        channels = self.channels
-        return self.p_sample_loop((batch_size, channels, image_size, image_size),
-                                  return_intermediates=return_intermediates)
-
-    def q_sample(self, x_start, t, noise=None):
-        noise = default(noise, lambda: torch.randn_like(x_start))
-        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
-                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
-
-    def get_loss(self, pred, target, mean=True):
-
-        if pred.isnan().any():
-            print("Warning: Prediction has nan values")
-            lr = self.optimizers().param_groups[0]['lr']
-            # self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
-            print(f"lr: {lr}")
-        if pred.isinf().any():
-            print("Warning: Prediction has inf values")
-
-        if self.use_fp16:
-            target = target.half()
-
-        if self.loss_type == 'l1':
-            loss = (target - pred).abs()
-            if mean:
-                loss = loss.mean()
-        elif self.loss_type == 'l2':
-            if mean:
-                loss = torch.nn.functional.mse_loss(target, pred)
-            else:
-                loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
-        else:
-            raise NotImplementedError("unknown loss type '{loss_type}'")
-                   
-        if loss.isnan().any():
-            print("Warning: loss has nan values")
-            print("loss: ", loss[0][0][0])
-            raise ValueError("loss has nan values")
-        if loss.isinf().any():
-            print("Warning: loss has inf values")
-            print("loss: ", loss)
-            raise ValueError("loss has inf values")
-
-        return loss
-
-    def p_losses(self, x_start, t, noise=None):
-        noise = default(noise, lambda: torch.randn_like(x_start))
-        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
-        model_out = self.model(x_noisy, t)
-
-        loss_dict = {}
-        if self.parameterization == "eps":
-            target = noise
-        elif self.parameterization == "x0":
-            target = x_start
-        else:
-            raise NotImplementedError(f"Paramterization {self.parameterization} not yet supported")
-
-        loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3])
-
-        log_prefix = 'train' if self.training else 'val'
-
-        loss_dict.update({f'{log_prefix}/loss_simple': loss.mean()})
-        loss_simple = loss.mean() * self.l_simple_weight
-
-        loss_vlb = (self.lvlb_weights[t] * loss).mean()
-        loss_dict.update({f'{log_prefix}/loss_vlb': loss_vlb})
-
-        loss = loss_simple + self.original_elbo_weight * loss_vlb
-
-        loss_dict.update({f'{log_prefix}/loss': loss})
-
-        return loss, loss_dict
-
-    def forward(self, x, *args, **kwargs):
-        # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size
-        # assert h == img_size and w == img_size, f'height and width of image must be {img_size}'
-        t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
-        return self.p_losses(x, t, *args, **kwargs)
-
-    def get_input(self, batch, k):
-        # print("+" * 30)
-        # print(batch['jpg'].shape)
-        # print(len(batch['txt']))
-        # print(k)
-        # print("=" * 30)
-        if not isinstance(batch, torch.Tensor):
-            x = batch[k]
-        else:
-            x = batch
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = rearrange(x, 'b h w c -> b c h w')
-
-        if self.use_fp16:
-            x = x.to(memory_format=torch.contiguous_format).float().half()
-        else:
-            x = x.to(memory_format=torch.contiguous_format).float()
-
-        return x
-
-    def shared_step(self, batch):
-        x = self.get_input(batch, self.first_stage_key)
-        loss, loss_dict = self(x)
-        return loss, loss_dict
-
-    def training_step(self, batch, batch_idx):
-        loss, loss_dict = self.shared_step(batch)
-
-        self.log_dict(loss_dict, prog_bar=True,
-                      logger=True, on_step=True, on_epoch=True)
-
-        self.log("global_step", self.global_step,
-                 prog_bar=True, logger=True, on_step=True, on_epoch=False)
-
-        if self.use_scheduler:
-            lr = self.optimizers().param_groups[0]['lr']
-            self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
-
-        return loss
-
-    @torch.no_grad()
-    def validation_step(self, batch, batch_idx):
-        _, loss_dict_no_ema = self.shared_step(batch)
-        with self.ema_scope():
-            _, loss_dict_ema = self.shared_step(batch)
-            loss_dict_ema = {key + '_ema': loss_dict_ema[key] for key in loss_dict_ema}
-        self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
-        self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
-
-    def on_train_batch_end(self, *args, **kwargs):
-        if self.use_ema:
-            self.model_ema(self.model)
-
-    def _get_rows_from_list(self, samples):
-        n_imgs_per_row = len(samples)
-        denoise_grid = rearrange(samples, 'n b c h w -> b n c h w')
-        denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
-        denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
-        return denoise_grid
-
-    @torch.no_grad()
-    def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs):
-        log = dict()
-        x = self.get_input(batch, self.first_stage_key)
-        N = min(x.shape[0], N)
-        n_row = min(x.shape[0], n_row)
-        x = x.to(self.device)[:N]
-        log["inputs"] = x
-
-        # get diffusion row
-        diffusion_row = list()
-        x_start = x[:n_row]
-
-        for t in range(self.num_timesteps):
-            if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
-                t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
-                t = t.to(self.device).long()
-                noise = torch.randn_like(x_start)
-                x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
-                diffusion_row.append(x_noisy)
-
-        log["diffusion_row"] = self._get_rows_from_list(diffusion_row)
-
-        if sample:
-            # get denoise row
-            with self.ema_scope("Plotting"):
-                samples, denoise_row = self.sample(batch_size=N, return_intermediates=True)
-
-            log["samples"] = samples
-            log["denoise_row"] = self._get_rows_from_list(denoise_row)
-
-        if return_keys:
-            if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
-                return log
-            else:
-                return {key: log[key] for key in return_keys}
-        return log
-
-    def configure_optimizers(self):
-        lr = self.learning_rate
-        params = list(self.model.parameters())
-        if self.learn_logvar:
-            params = params + [self.logvar]
-        opt = torch.optim.AdamW(params, lr=lr)
-        return opt
-
-
-class LatentDiffusion(DDPM):
-    """main class"""
-    def __init__(self,
-                 first_stage_config,
-                 cond_stage_config,
-                 num_timesteps_cond=None,
-                 cond_stage_key="image",
-                 cond_stage_trainable=False,
-                 concat_mode=True,
-                 cond_stage_forward=None,
-                 conditioning_key=None,
-                 scale_factor=1.0,
-                 scale_by_std=False,
-                 use_fp16=True,
-                 *args, **kwargs):
-        self.num_timesteps_cond = default(num_timesteps_cond, 1)
-        self.scale_by_std = scale_by_std
-        assert self.num_timesteps_cond <= kwargs['timesteps']
-        # for backwards compatibility after implementation of DiffusionWrapper
-        if conditioning_key is None:
-            conditioning_key = 'concat' if concat_mode else 'crossattn'
-        if cond_stage_config == '__is_unconditional__':
-            conditioning_key = None
-        ckpt_path = kwargs.pop("ckpt_path", None)
-        ignore_keys = kwargs.pop("ignore_keys", [])
-        super().__init__(conditioning_key=conditioning_key, use_fp16=use_fp16, *args, **kwargs)
-        self.concat_mode = concat_mode
-        self.cond_stage_trainable = cond_stage_trainable
-        self.cond_stage_key = cond_stage_key
-        try:
-            self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
-        except:
-            self.num_downs = 0
-        if not scale_by_std:
-            self.scale_factor = scale_factor
-        else:
-            self.register_buffer('scale_factor', torch.tensor(scale_factor))
-        self.first_stage_config = first_stage_config
-        self.cond_stage_config = cond_stage_config
-        if self.use_fp16:
-            self.cond_stage_config["params"].update({"use_fp16": True})
-            rank_zero_info("Using fp16 for conditioning stage = {}".format(self.cond_stage_config["params"]["use_fp16"]))
-        else:
-            self.cond_stage_config["params"].update({"use_fp16": False})
-            rank_zero_info("Using fp16 for conditioning stage = {}".format(self.cond_stage_config["params"]["use_fp16"]))
-        # self.instantiate_first_stage(first_stage_config)
-        # self.instantiate_cond_stage(cond_stage_config)
-        self.cond_stage_forward = cond_stage_forward
-        self.clip_denoised = False
-        self.bbox_tokenizer = None  
-
-        self.restarted_from_ckpt = False
-        if ckpt_path is not None:
-            self.init_from_ckpt(ckpt_path, ignore_keys)
-            self.restarted_from_ckpt = True
-
-
-
-    def configure_sharded_model(self) -> None:
-        self.model = DiffusionWrapper(self.unet_config, self.conditioning_key)
-        count_params(self.model, verbose=True)
-        if self.use_ema:
-            self.model_ema = LitEma(self.model)
-            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
-
-
-        self.register_schedule(given_betas=self.given_betas, beta_schedule=self.beta_schedule, timesteps=self.timesteps,
-                               linear_start=self.linear_start, linear_end=self.linear_end, cosine_s=self.cosine_s)
-
-        self.logvar = torch.full(fill_value=self.logvar_init, size=(self.num_timesteps,))
-        if self.learn_logvar:
-            self.logvar = nn.Parameter(self.logvar, requires_grad=True)
-            # self.logvar = nn.Parameter(self.logvar, requires_grad=True)
-        if self.ckpt_path is not None:
-            self.init_from_ckpt(self.ckpt_path, self.ignore_keys)
-            self.restarted_from_ckpt = True
-
-        # TODO()
-        # for p in self.model.modules():
-        #     if not p.parameters().data.is_contiguous:
-        #     p.data = p.data.contiguous()
-    
-        self.instantiate_first_stage(self.first_stage_config)
-        self.instantiate_cond_stage(self.cond_stage_config)
-
-    def make_cond_schedule(self, ):
-        self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
-        ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
-        self.cond_ids[:self.num_timesteps_cond] = ids
-
-
-
-    @rank_zero_only
-    @torch.no_grad()
-    # def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
-    def on_train_batch_start(self, batch, batch_idx):
-        # only for very first batch
-        if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt:
-            assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
-            # set rescale weight to 1./std of encodings
-            print("### USING STD-RESCALING ###")
-            x = super().get_input(batch, self.first_stage_key)
-            x = x.to(self.device)
-            encoder_posterior = self.encode_first_stage(x)
-            z = self.get_first_stage_encoding(encoder_posterior).detach()
-            del self.scale_factor
-            self.register_buffer('scale_factor', 1. / z.flatten().std())
-            print(f"setting self.scale_factor to {self.scale_factor}")
-            print("### USING STD-RESCALING ###")
-
-    def register_schedule(self,
-                          given_betas=None, beta_schedule="linear", timesteps=1000,
-                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
-        super().register_schedule(given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s)
-
-        self.shorten_cond_schedule = self.num_timesteps_cond > 1
-        if self.shorten_cond_schedule:
-            self.make_cond_schedule()
-
-    def instantiate_first_stage(self, config):
-        model = instantiate_from_config(config)
-        self.first_stage_model = model.eval()
-        self.first_stage_model.train = disabled_train
-        for param in self.first_stage_model.parameters():
-            param.requires_grad = False
-
-    def instantiate_cond_stage(self, config):
-        if not self.cond_stage_trainable:
-            if config == "__is_first_stage__":
-                print("Using first stage also as cond stage.")
-                self.cond_stage_model = self.first_stage_model
-            elif config == "__is_unconditional__":
-                print(f"Training {self.__class__.__name__} as an unconditional model.")
-                self.cond_stage_model = None
-                # self.be_unconditional = True
-            else:
-                model = instantiate_from_config(config)
-                self.cond_stage_model = model.eval()
-                self.cond_stage_model.train = disabled_train
-                for param in self.cond_stage_model.parameters():
-                    param.requires_grad = False
-        else:
-            assert config != '__is_first_stage__'
-            assert config != '__is_unconditional__'
-            model = instantiate_from_config(config)
-            self.cond_stage_model = model
-
-    def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
-        denoise_row = []
-        for zd in tqdm(samples, desc=desc):
-            denoise_row.append(self.decode_first_stage(zd.to(self.device),
-                                                            force_not_quantize=force_no_decoder_quantization))
-        n_imgs_per_row = len(denoise_row)
-        denoise_row = torch.stack(denoise_row)  # n_log_step, n_row, C, H, W
-        denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
-        denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
-        denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
-        return denoise_grid
-
-    def get_first_stage_encoding(self, encoder_posterior):
-        if isinstance(encoder_posterior, DiagonalGaussianDistribution):
-            z = encoder_posterior.sample()
-        elif isinstance(encoder_posterior, torch.Tensor):
-            z = encoder_posterior
-        else:
-            raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
-        return self.scale_factor * z
-
-    def get_learned_conditioning(self, c):
-        if self.cond_stage_forward is None:
-            if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
-                c = self.cond_stage_model.encode(c)
-                if isinstance(c, DiagonalGaussianDistribution):
-                    c = c.mode()
-            else:
-                c = self.cond_stage_model(c)
-        else:
-            assert hasattr(self.cond_stage_model, self.cond_stage_forward)
-            c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
-        return c
-
-    def meshgrid(self, h, w):
-        y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1)
-        x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1)
-
-        arr = torch.cat([y, x], dim=-1)
-        return arr
-
-    def delta_border(self, h, w):
-        """
-        :param h: height
-        :param w: width
-        :return: normalized distance to image border,
-         wtith min distance = 0 at border and max dist = 0.5 at image center
-        """
-        lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
-        arr = self.meshgrid(h, w) / lower_right_corner
-        dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0]
-        dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0]
-        edge_dist = torch.min(torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0]
-        return edge_dist
-
-    def get_weighting(self, h, w, Ly, Lx, device):
-        weighting = self.delta_border(h, w)
-        weighting = torch.clip(weighting, self.split_input_params["clip_min_weight"],
-                               self.split_input_params["clip_max_weight"], )
-        weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device)
-
-        if self.split_input_params["tie_braker"]:
-            L_weighting = self.delta_border(Ly, Lx)
-            L_weighting = torch.clip(L_weighting,
-                                     self.split_input_params["clip_min_tie_weight"],
-                                     self.split_input_params["clip_max_tie_weight"])
-
-            L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device)
-            weighting = weighting * L_weighting
-        return weighting
-
-    def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once not every time, shorten code
-        """
-        :param x: img of size (bs, c, h, w)
-        :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1])
-        """
-        bs, nc, h, w = x.shape
-
-        # number of crops in image
-        Ly = (h - kernel_size[0]) // stride[0] + 1
-        Lx = (w - kernel_size[1]) // stride[1] + 1
-
-        if uf == 1 and df == 1:
-            fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
-            unfold = torch.nn.Unfold(**fold_params)
-
-            fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params)
-
-            weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, Lx, x.device).to(x.dtype)
-            normalization = fold(weighting).view(1, 1, h, w)  # normalizes the overlap
-            weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx))
-
-        elif uf > 1 and df == 1:
-            fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
-            unfold = torch.nn.Unfold(**fold_params)
-
-            fold_params2 = dict(kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf),
-                                dilation=1, padding=0,
-                                stride=(stride[0] * uf, stride[1] * uf))
-            fold = torch.nn.Fold(output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2)
-
-            weighting = self.get_weighting(kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device).to(x.dtype)
-            normalization = fold(weighting).view(1, 1, h * uf, w * uf)  # normalizes the overlap
-            weighting = weighting.view((1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx))
-
-        elif df > 1 and uf == 1:
-            fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
-            unfold = torch.nn.Unfold(**fold_params)
-
-            fold_params2 = dict(kernel_size=(kernel_size[0] // df, kernel_size[0] // df),
-                                dilation=1, padding=0,
-                                stride=(stride[0] // df, stride[1] // df))
-            fold = torch.nn.Fold(output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2)
-
-            weighting = self.get_weighting(kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device).to(x.dtype)
-            normalization = fold(weighting).view(1, 1, h // df, w // df)  # normalizes the overlap
-            weighting = weighting.view((1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx))
-
-        else:
-            raise NotImplementedError
-
-        return fold, unfold, normalization, weighting
-
-    @torch.no_grad()
-    def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
-                  cond_key=None, return_original_cond=False, bs=None):
-        x = super().get_input(batch, k)
-        if bs is not None:
-            x = x[:bs]
-        x = x.to(self.device)
-        encoder_posterior = self.encode_first_stage(x)
-        z = self.get_first_stage_encoding(encoder_posterior).detach()
-
-        if self.model.conditioning_key is not None:
-            if cond_key is None:
-                cond_key = self.cond_stage_key
-            if cond_key != self.first_stage_key:
-                if cond_key in ['caption', 'coordinates_bbox', 'txt']:
-                    xc = batch[cond_key]
-                elif cond_key == 'class_label':
-                    xc = batch
-                else:
-                    xc = super().get_input(batch, cond_key).to(self.device)
-            else:
-                xc = x
-            if not self.cond_stage_trainable or force_c_encode:
-                if isinstance(xc, dict) or isinstance(xc, list):
-                    # import pudb; pudb.set_trace()
-                    c = self.get_learned_conditioning(xc)
-                else:
-                    c = self.get_learned_conditioning(xc.to(self.device))
-            else:
-                c = xc
-            if bs is not None:
-                c = c[:bs]
-
-            if self.use_positional_encodings:
-                pos_x, pos_y = self.compute_latent_shifts(batch)
-                ckey = __conditioning_keys__[self.model.conditioning_key]
-                c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y}
-
-        else:
-            c = None
-            xc = None
-            if self.use_positional_encodings:
-                pos_x, pos_y = self.compute_latent_shifts(batch)
-                c = {'pos_x': pos_x, 'pos_y': pos_y}
-        out = [z, c]
-        if return_first_stage_outputs:
-            xrec = self.decode_first_stage(z)
-            out.extend([x, xrec])
-        if return_original_cond:
-            out.append(xc)
-        return out
-
-    @torch.no_grad()
-    def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
-        if predict_cids:
-            if z.dim() == 4:
-                z = torch.argmax(z.exp(), dim=1).long()
-            z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
-            z = rearrange(z, 'b h w c -> b c h w').contiguous()
-
-        z = 1. / self.scale_factor * z
-
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                uf = self.split_input_params["vqf"]
-                bs, nc, h, w = z.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-
-                fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
-
-                z = unfold(z)  # (bn, nc * prod(**ks), L)
-                # 1. Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                # 2. apply model loop over last dim
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
-                                                                 force_not_quantize=predict_cids or force_not_quantize)
-                                   for i in range(z.shape[-1])]
-                else:
-
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
-                                   for i in range(z.shape[-1])]
-
-                o = torch.stack(output_list, axis=-1)  # # (bn, nc, ks[0], ks[1], L)
-                o = o * weighting
-                # Reverse 1. reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization  # norm is shape (1, 1, h, w)
-                return decoded
-            else:
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-                else:
-                    return self.first_stage_model.decode(z)
-
-        else:
-            if isinstance(self.first_stage_model, VQModelInterface):
-                return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-            else:
-                return self.first_stage_model.decode(z)
-
-    # same as above but without decorator
-    def differentiable_decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
-        if predict_cids:
-            if z.dim() == 4:
-                z = torch.argmax(z.exp(), dim=1).long()
-            z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
-            z = rearrange(z, 'b h w c -> b c h w').contiguous()
-
-        z = 1. / self.scale_factor * z
-
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                uf = self.split_input_params["vqf"]
-                bs, nc, h, w = z.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-
-                fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
-
-                z = unfold(z)  # (bn, nc * prod(**ks), L)
-                # 1. Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                # 2. apply model loop over last dim
-                if isinstance(self.first_stage_model, VQModelInterface):  
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
-                                                                 force_not_quantize=predict_cids or force_not_quantize)
-                                   for i in range(z.shape[-1])]
-                else:
-
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
-                                   for i in range(z.shape[-1])]
-
-                o = torch.stack(output_list, axis=-1)  # # (bn, nc, ks[0], ks[1], L)
-                o = o * weighting
-                # Reverse 1. reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization  # norm is shape (1, 1, h, w)
-                return decoded
-            else:
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-                else:
-                    return self.first_stage_model.decode(z)
-
-        else:
-            if isinstance(self.first_stage_model, VQModelInterface):
-                return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-            else:
-                return self.first_stage_model.decode(z)
-
-    @torch.no_grad()
-    def encode_first_stage(self, x):
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                df = self.split_input_params["vqf"]
-                self.split_input_params['original_image_size'] = x.shape[-2:]
-                bs, nc, h, w = x.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-
-                fold, unfold, normalization, weighting = self.get_fold_unfold(x, ks, stride, df=df)
-                z = unfold(x)  # (bn, nc * prod(**ks), L)
-                # Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                output_list = [self.first_stage_model.encode(z[:, :, :, :, i])
-                               for i in range(z.shape[-1])]
-
-                o = torch.stack(output_list, axis=-1)
-                o = o * weighting
-
-                # Reverse reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization
-                return decoded
-
-            else:
-                return self.first_stage_model.encode(x)
-        else:
-            return self.first_stage_model.encode(x)
-
-    def shared_step(self, batch, **kwargs):
-        x, c = self.get_input(batch, self.first_stage_key)
-        loss = self(x, c)
-        return loss
-
-    def forward(self, x, c, *args, **kwargs):
-        t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
-        if self.model.conditioning_key is not None:
-            assert c is not None
-            if self.cond_stage_trainable:
-                c = self.get_learned_conditioning(c)
-            if self.shorten_cond_schedule:  # TODO: drop this option
-                tc = self.cond_ids[t].to(self.device)
-                c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float()))
-        return self.p_losses(x, c, t, *args, **kwargs)
-
-    def _rescale_annotations(self, bboxes, crop_coordinates):  # TODO: move to dataset
-        def rescale_bbox(bbox):
-            x0 = clamp((bbox[0] - crop_coordinates[0]) / crop_coordinates[2])
-            y0 = clamp((bbox[1] - crop_coordinates[1]) / crop_coordinates[3])
-            w = min(bbox[2] / crop_coordinates[2], 1 - x0)
-            h = min(bbox[3] / crop_coordinates[3], 1 - y0)
-            return x0, y0, w, h
-
-        return [rescale_bbox(b) for b in bboxes]
-
-    def apply_model(self, x_noisy, t, cond, return_ids=False):
-        if isinstance(cond, dict):
-            # hybrid case, cond is exptected to be a dict
-            pass
-        else:
-            if not isinstance(cond, list):
-                cond = [cond]
-            key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
-            cond = {key: cond}
-
-        if hasattr(self, "split_input_params"):
-            assert len(cond) == 1  # todo can only deal with one conditioning atm
-            assert not return_ids
-            ks = self.split_input_params["ks"]  # eg. (128, 128)
-            stride = self.split_input_params["stride"]  # eg. (64, 64)
-
-            h, w = x_noisy.shape[-2:]
-
-            fold, unfold, normalization, weighting = self.get_fold_unfold(x_noisy, ks, stride)
-
-            z = unfold(x_noisy)  # (bn, nc * prod(**ks), L)
-            # Reshape to img shape
-            z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-            z_list = [z[:, :, :, :, i] for i in range(z.shape[-1])]
-            if self.cond_stage_key in ["image", "LR_image", "segmentation",
-                                       'bbox_img'] and self.model.conditioning_key:  # todo check for completeness
-                c_key = next(iter(cond.keys()))  # get key
-                c = next(iter(cond.values()))  # get value
-                assert (len(c) == 1)  # todo extend to list with more than one elem
-                c = c[0]  # get element
-
-                c = unfold(c)
-                c = c.view((c.shape[0], -1, ks[0], ks[1], c.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                cond_list = [{c_key: [c[:, :, :, :, i]]} for i in range(c.shape[-1])]
-
-            elif self.cond_stage_key == 'coordinates_bbox':
-                assert 'original_image_size' in self.split_input_params, 'BoudingBoxRescaling is missing original_image_size'
-
-                # assuming padding of unfold is always 0 and its dilation is always 1
-                n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
-                full_img_h, full_img_w = self.split_input_params['original_image_size']
-                # as we are operating on latents, we need the factor from the original image size to the
-                # spatial latent size to properly rescale the crops for regenerating the bbox annotations
-                num_downs = self.first_stage_model.encoder.num_resolutions - 1
-                rescale_latent = 2 ** (num_downs)
-
-                # get top left postions of patches as conforming for the bbbox tokenizer, therefore we
-                # need to rescale the tl patch coordinates to be in between (0,1)
-                tl_patch_coordinates = [(rescale_latent * stride[0] * (patch_nr % n_patches_per_row) / full_img_w,
-                                         rescale_latent * stride[1] * (patch_nr // n_patches_per_row) / full_img_h)
-                                        for patch_nr in range(z.shape[-1])]
-
-                # patch_limits are tl_coord, width and height coordinates as (x_tl, y_tl, h, w)
-                patch_limits = [(x_tl, y_tl,
-                                 rescale_latent * ks[0] / full_img_w,
-                                 rescale_latent * ks[1] / full_img_h) for x_tl, y_tl in tl_patch_coordinates]
-                # patch_values = [(np.arange(x_tl,min(x_tl+ks, 1.)),np.arange(y_tl,min(y_tl+ks, 1.))) for x_tl, y_tl in tl_patch_coordinates]
-
-                # tokenize crop coordinates for the bounding boxes of the respective patches
-                patch_limits_tknzd = [torch.LongTensor(self.bbox_tokenizer._crop_encoder(bbox))[None].to(self.device)
-                                      for bbox in patch_limits]  # list of length l with tensors of shape (1, 2)
-                print(patch_limits_tknzd[0].shape)
-                # cut tknzd crop position from conditioning
-                assert isinstance(cond, dict), 'cond must be dict to be fed into model'
-                cut_cond = cond['c_crossattn'][0][..., :-2].to(self.device)
-                print(cut_cond.shape)
-
-                adapted_cond = torch.stack([torch.cat([cut_cond, p], dim=1) for p in patch_limits_tknzd])
-                adapted_cond = rearrange(adapted_cond, 'l b n -> (l b) n')
-                print(adapted_cond.shape)
-                adapted_cond = self.get_learned_conditioning(adapted_cond)
-                print(adapted_cond.shape)
-                adapted_cond = rearrange(adapted_cond, '(l b) n d -> l b n d', l=z.shape[-1])
-                print(adapted_cond.shape)
-
-                cond_list = [{'c_crossattn': [e]} for e in adapted_cond]
-
-            else:
-                cond_list = [cond for i in range(z.shape[-1])]  # Todo make this more efficient
-
-            # apply model by loop over crops
-            output_list = [self.model(z_list[i], t, **cond_list[i]) for i in range(z.shape[-1])]
-            assert not isinstance(output_list[0],
-                                  tuple)  # todo cant deal with multiple model outputs check this never happens
-
-            o = torch.stack(output_list, axis=-1)
-            o = o * weighting
-            # Reverse reshape to img shape
-            o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-            # stitch crops together
-            x_recon = fold(o) / normalization
-
-        else:
-            x_recon = self.model(x_noisy, t, **cond)
-
-        if isinstance(x_recon, tuple) and not return_ids:
-            return x_recon[0]
-        else:
-            return x_recon
-
-    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
-        return (extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart) / \
-               extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
-
-    def _prior_bpd(self, x_start):
-        """
-        Get the prior KL term for the variational lower-bound, measured in
-        bits-per-dim.
-        This term can't be optimized, as it only depends on the encoder.
-        :param x_start: the [N x C x ...] tensor of inputs.
-        :return: a batch of [N] KL values (in bits), one per batch element.
-        """
-        batch_size = x_start.shape[0]
-        t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
-        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
-        kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
-        return mean_flat(kl_prior) / np.log(2.0)
-
-    def p_losses(self, x_start, cond, t, noise=None):
-        noise = default(noise, lambda: torch.randn_like(x_start))
-        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
-        model_output = self.apply_model(x_noisy, t, cond)
-
-        loss_dict = {}
-        prefix = 'train' if self.training else 'val'
-
-        if self.parameterization == "x0":
-            target = x_start
-        elif self.parameterization == "eps":
-            target = noise
-        else:
-            raise NotImplementedError()
-
-        loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
-        loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()})
-
-        logvar_t = self.logvar[t].to(self.device)
-        loss = loss_simple / torch.exp(logvar_t) + logvar_t
-        # loss = loss_simple / torch.exp(self.logvar) + self.logvar
-        if self.learn_logvar:
-            loss_dict.update({f'{prefix}/loss_gamma': loss.mean()})
-            loss_dict.update({'logvar': self.logvar.data.mean()})
-
-        loss = self.l_simple_weight * loss.mean()
-
-        loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3))
-        loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
-        loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
-        loss += (self.original_elbo_weight * loss_vlb)
-        loss_dict.update({f'{prefix}/loss': loss})
-
-        return loss, loss_dict
-
-    def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
-                        return_x0=False, score_corrector=None, corrector_kwargs=None):
-        t_in = t
-        model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids)
-
-        if score_corrector is not None:
-            assert self.parameterization == "eps"
-            model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs)
-
-        if return_codebook_ids:
-            model_out, logits = model_out
-
-        if self.parameterization == "eps":
-            x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
-        elif self.parameterization == "x0":
-            x_recon = model_out
-        else:
-            raise NotImplementedError()
-
-        if clip_denoised:
-            x_recon.clamp_(-1., 1.)
-        if quantize_denoised:
-            x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon)
-        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
-        if return_codebook_ids:
-            return model_mean, posterior_variance, posterior_log_variance, logits
-        elif return_x0:
-            return model_mean, posterior_variance, posterior_log_variance, x_recon
-        else:
-            return model_mean, posterior_variance, posterior_log_variance
-
-    @torch.no_grad()
-    def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
-                 return_codebook_ids=False, quantize_denoised=False, return_x0=False,
-                 temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None):
-        b, *_, device = *x.shape, x.device
-        outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised,
-                                       return_codebook_ids=return_codebook_ids,
-                                       quantize_denoised=quantize_denoised,
-                                       return_x0=return_x0,
-                                       score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
-        if return_codebook_ids:
-            raise DeprecationWarning("Support dropped.")
-            model_mean, _, model_log_variance, logits = outputs
-        elif return_x0:
-            model_mean, _, model_log_variance, x0 = outputs
-        else:
-            model_mean, _, model_log_variance = outputs
-
-        noise = noise_like(x.shape, device, repeat_noise) * temperature
-        if noise_dropout > 0.:
-            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
-        # no noise when t == 0
-        nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
-
-        if return_codebook_ids:
-            return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, logits.argmax(dim=1)
-        if return_x0:
-            return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0
-        else:
-            return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
-
-    @torch.no_grad()
-    def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False,
-                              img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0.,
-                              score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None,
-                              log_every_t=None):
-        if not log_every_t:
-            log_every_t = self.log_every_t
-        timesteps = self.num_timesteps
-        if batch_size is not None:
-            b = batch_size if batch_size is not None else shape[0]
-            shape = [batch_size] + list(shape)
-        else:
-            b = batch_size = shape[0]
-        if x_T is None:
-            img = torch.randn(shape, device=self.device)
-        else:
-            img = x_T
-        intermediates = []
-        if cond is not None:
-            if isinstance(cond, dict):
-                cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
-                list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
-            else:
-                cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
-
-        if start_T is not None:
-            timesteps = min(timesteps, start_T)
-        iterator = tqdm(reversed(range(0, timesteps)), desc='Progressive Generation',
-                        total=timesteps) if verbose else reversed(
-            range(0, timesteps))
-        if type(temperature) == float:
-            temperature = [temperature] * timesteps
-
-        for i in iterator:
-            ts = torch.full((b,), i, device=self.device, dtype=torch.long)
-            if self.shorten_cond_schedule:
-                assert self.model.conditioning_key != 'hybrid'
-                tc = self.cond_ids[ts].to(cond.device)
-                cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
-
-            img, x0_partial = self.p_sample(img, cond, ts,
-                                            clip_denoised=self.clip_denoised,
-                                            quantize_denoised=quantize_denoised, return_x0=True,
-                                            temperature=temperature[i], noise_dropout=noise_dropout,
-                                            score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
-            if mask is not None:
-                assert x0 is not None
-                img_orig = self.q_sample(x0, ts)
-                img = img_orig * mask + (1. - mask) * img
-
-            if i % log_every_t == 0 or i == timesteps - 1:
-                intermediates.append(x0_partial)
-            if callback: callback(i)
-            if img_callback: img_callback(img, i)
-        return img, intermediates
-
-    @torch.no_grad()
-    def p_sample_loop(self, cond, shape, return_intermediates=False,
-                      x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False,
-                      mask=None, x0=None, img_callback=None, start_T=None,
-                      log_every_t=None):
-
-        if not log_every_t:
-            log_every_t = self.log_every_t
-        device = self.betas.device
-        b = shape[0]
-        if x_T is None:
-            img = torch.randn(shape, device=device)
-        else:
-            img = x_T
-
-        intermediates = [img]
-        if timesteps is None:
-            timesteps = self.num_timesteps
-
-        if start_T is not None:
-            timesteps = min(timesteps, start_T)
-        iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed(
-            range(0, timesteps))
-
-        if mask is not None:
-            assert x0 is not None
-            assert x0.shape[2:3] == mask.shape[2:3]  # spatial size has to match
-
-        for i in iterator:
-            ts = torch.full((b,), i, device=device, dtype=torch.long)
-            if self.shorten_cond_schedule:
-                assert self.model.conditioning_key != 'hybrid'
-                tc = self.cond_ids[ts].to(cond.device)
-                cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
-
-            img = self.p_sample(img, cond, ts,
-                                clip_denoised=self.clip_denoised,
-                                quantize_denoised=quantize_denoised)
-            if mask is not None:
-                img_orig = self.q_sample(x0, ts)
-                img = img_orig * mask + (1. - mask) * img
-
-            if i % log_every_t == 0 or i == timesteps - 1:
-                intermediates.append(img)
-            if callback: callback(i)
-            if img_callback: img_callback(img, i)
-
-        if return_intermediates:
-            return img, intermediates
-        return img
-
-    @torch.no_grad()
-    def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
-               verbose=True, timesteps=None, quantize_denoised=False,
-               mask=None, x0=None, shape=None,**kwargs):
-        if shape is None:
-            shape = (batch_size, self.channels, self.image_size, self.image_size)
-        if cond is not None:
-            if isinstance(cond, dict):
-                cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
-                list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
-            else:
-                cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
-        return self.p_sample_loop(cond,
-                                  shape,
-                                  return_intermediates=return_intermediates, x_T=x_T,
-                                  verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised,
-                                  mask=mask, x0=x0)
-
-    @torch.no_grad()
-    def sample_log(self,cond,batch_size,ddim, ddim_steps,**kwargs):
-
-        if ddim:
-            ddim_sampler = DDIMSampler(self)
-            shape = (self.channels, self.image_size, self.image_size)
-            samples, intermediates =ddim_sampler.sample(ddim_steps,batch_size,
-                                                        shape,cond,verbose=False,**kwargs)
-
-        else:
-            samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
-                                                 return_intermediates=True,**kwargs)
-
-        return samples, intermediates
-
-
-    @torch.no_grad()
-    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
-                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
-                   plot_diffusion_rows=True, **kwargs):
-
-        use_ddim = ddim_steps is not None
-
-        log = dict()
-        z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key,
-                                           return_first_stage_outputs=True,
-                                           force_c_encode=True,
-                                           return_original_cond=True,
-                                           bs=N)
-        N = min(x.shape[0], N)
-        n_row = min(x.shape[0], n_row)
-        log["inputs"] = x
-        log["reconstruction"] = xrec
-        if self.model.conditioning_key is not None:
-            if hasattr(self.cond_stage_model, "decode"):
-                xc = self.cond_stage_model.decode(c)
-                log["conditioning"] = xc
-            elif self.cond_stage_key in ["caption"]:
-                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["caption"])
-                log["conditioning"] = xc
-            elif self.cond_stage_key == 'class_label':
-                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
-                log['conditioning'] = xc
-            elif isimage(xc):
-                log["conditioning"] = xc
-            if ismap(xc):
-                log["original_conditioning"] = self.to_rgb(xc)
-
-        if plot_diffusion_rows:
-            # get diffusion row
-            diffusion_row = list()
-            z_start = z[:n_row]
-            for t in range(self.num_timesteps):
-                if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
-                    t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
-                    t = t.to(self.device).long()
-                    noise = torch.randn_like(z_start)
-                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
-                    diffusion_row.append(self.decode_first_stage(z_noisy))
-
-            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
-            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
-            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
-            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
-            log["diffusion_row"] = diffusion_grid
-
-        if sample:
-            # get denoise row
-            with self.ema_scope("Plotting"):
-                samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
-                                                         ddim_steps=ddim_steps,eta=ddim_eta)
-                # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
-            x_samples = self.decode_first_stage(samples)
-            log["samples"] = x_samples
-            if plot_denoise_rows:
-                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
-                log["denoise_row"] = denoise_grid
-
-            if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
-                    self.first_stage_model, IdentityFirstStage):
-                # also display when quantizing x0 while sampling
-                with self.ema_scope("Plotting Quantized Denoised"):
-                    samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
-                                                             ddim_steps=ddim_steps,eta=ddim_eta,
-                                                             quantize_denoised=True)
-                    # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
-                    #                                      quantize_denoised=True)
-                x_samples = self.decode_first_stage(samples.to(self.device))
-                log["samples_x0_quantized"] = x_samples
-
-            if inpaint:
-                # make a simple center square
-                b, h, w = z.shape[0], z.shape[2], z.shape[3]
-                mask = torch.ones(N, h, w).to(self.device)
-                # zeros will be filled in
-                mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
-                mask = mask[:, None, ...]
-                with self.ema_scope("Plotting Inpaint"):
-
-                    samples, _ = self.sample_log(cond=c,batch_size=N,ddim=use_ddim, eta=ddim_eta,
-                                                ddim_steps=ddim_steps, x0=z[:N], mask=mask)
-                x_samples = self.decode_first_stage(samples.to(self.device))
-                log["samples_inpainting"] = x_samples
-                log["mask"] = mask
-
-                # outpaint
-                with self.ema_scope("Plotting Outpaint"):
-                    samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,eta=ddim_eta,
-                                                ddim_steps=ddim_steps, x0=z[:N], mask=mask)
-                x_samples = self.decode_first_stage(samples.to(self.device))
-                log["samples_outpainting"] = x_samples
-
-        if plot_progressive_rows:
-            with self.ema_scope("Plotting Progressives"):
-                img, progressives = self.progressive_denoising(c,
-                                                               shape=(self.channels, self.image_size, self.image_size),
-                                                               batch_size=N)
-            prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
-            log["progressive_row"] = prog_row
-
-        if return_keys:
-            if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
-                return log
-            else:
-                return {key: log[key] for key in return_keys}
-        return log
-
-    def configure_optimizers(self):
-        lr = self.learning_rate
-        params = list(self.model.parameters())
-        if self.cond_stage_trainable:
-            print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
-            params = params + list(self.cond_stage_model.parameters())
-        if self.learn_logvar:
-            print('Diffusion model optimizing logvar')
-            params.append(self.logvar)
-        from colossalai.nn.optimizer import HybridAdam
-        opt = HybridAdam(params, lr=lr)
-        # opt = torch.optim.AdamW(params, lr=lr)
-        if self.use_scheduler:
-            assert 'target' in self.scheduler_config
-            scheduler = instantiate_from_config(self.scheduler_config)
-
-            rank_zero_info("Setting up LambdaLR scheduler...")
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                }]
-            return [opt], scheduler
-        return opt
-
-    @torch.no_grad()
-    def to_rgb(self, x):
-        x = x.float()
-        if not hasattr(self, "colorize"):
-            self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x)
-        x = nn.functional.conv2d(x, weight=self.colorize)
-        x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
-        return x
-
-
-class DiffusionWrapper(pl.LightningModule):
-    def __init__(self, diff_model_config, conditioning_key):
-        super().__init__()
-        self.diffusion_model = instantiate_from_config(diff_model_config)
-        self.conditioning_key = conditioning_key
-        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm']
-
-    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None):
-        if self.conditioning_key is None:
-            out = self.diffusion_model(x, t)
-        elif self.conditioning_key == 'concat':
-            xc = torch.cat([x] + c_concat, dim=1)
-            out = self.diffusion_model(xc, t)
-        elif self.conditioning_key == 'crossattn':
-            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(x, t, context=cc)
-        elif self.conditioning_key == 'hybrid':
-            xc = torch.cat([x] + c_concat, dim=1)
-            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(xc, t, context=cc)
-        elif self.conditioning_key == 'adm':
-            cc = c_crossattn[0]
-            out = self.diffusion_model(x, t, y=cc)
-        else:
-            raise NotImplementedError()
-
-        return out
-
-
-class Layout2ImgDiffusion(LatentDiffusion):
-    # TODO: move all layout-specific hacks to this class
-    def __init__(self, cond_stage_key, *args, **kwargs):
-        assert cond_stage_key == 'coordinates_bbox', 'Layout2ImgDiffusion only for cond_stage_key="coordinates_bbox"'
-        super().__init__(cond_stage_key=cond_stage_key, *args, **kwargs)
-
-    def log_images(self, batch, N=8, *args, **kwargs):
-        logs = super().log_images(batch=batch, N=N, *args, **kwargs)
-
-        key = 'train' if self.training else 'validation'
-        dset = self.trainer.datamodule.datasets[key]
-        mapper = dset.conditional_builders[self.cond_stage_key]
-
-        bbox_imgs = []
-        map_fn = lambda catno: dset.get_textual_label(dset.get_category_id(catno))
-        for tknzd_bbox in batch[self.cond_stage_key][:N]:
-            bboximg = mapper.plot(tknzd_bbox.detach().cpu(), map_fn, (256, 256))
-            bbox_imgs.append(bboximg)
-
-        cond_img = torch.stack(bbox_imgs, dim=0)
-        logs['bbox_image'] = cond_img
-        return logs
diff --git a/examples/tutorial/stable_diffusion/ldm/models/diffusion/plms.py b/examples/tutorial/stable_diffusion/ldm/models/diffusion/plms.py
deleted file mode 100644
index 78eeb1003aa4..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/models/diffusion/plms.py
+++ /dev/null
@@ -1,236 +0,0 @@
-"""SAMPLING ONLY."""
-
-import torch
-import numpy as np
-from tqdm import tqdm
-from functools import partial
-
-from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
-
-
-class PLMSSampler(object):
-    def __init__(self, model, schedule="linear", **kwargs):
-        super().__init__()
-        self.model = model
-        self.ddpm_num_timesteps = model.num_timesteps
-        self.schedule = schedule
-
-    def register_buffer(self, name, attr):
-        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
-        setattr(self, name, attr)
-
-    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
-        if ddim_eta != 0:
-            raise ValueError('ddim_eta must be 0 for PLMS')
-        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
-        alphas_cumprod = self.model.alphas_cumprod
-        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
-        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
-
-        self.register_buffer('betas', to_torch(self.model.betas))
-        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
-        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
-
-        # calculations for diffusion q(x_t | x_{t-1}) and others
-        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
-        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
-
-        # ddim sampling parameters
-        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
-                                                                                   ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
-        self.register_buffer('ddim_sigmas', ddim_sigmas)
-        self.register_buffer('ddim_alphas', ddim_alphas)
-        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
-        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
-        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
-            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
-        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
-
-    @torch.no_grad()
-    def sample(self,
-               S,
-               batch_size,
-               shape,
-               conditioning=None,
-               callback=None,
-               normals_sequence=None,
-               img_callback=None,
-               quantize_x0=False,
-               eta=0.,
-               mask=None,
-               x0=None,
-               temperature=1.,
-               noise_dropout=0.,
-               score_corrector=None,
-               corrector_kwargs=None,
-               verbose=True,
-               x_T=None,
-               log_every_t=100,
-               unconditional_guidance_scale=1.,
-               unconditional_conditioning=None,
-               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
-               **kwargs
-               ):
-        if conditioning is not None:
-            if isinstance(conditioning, dict):
-                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
-                if cbs != batch_size:
-                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
-            else:
-                if conditioning.shape[0] != batch_size:
-                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
-
-        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
-        # sampling
-        C, H, W = shape
-        size = (batch_size, C, H, W)
-        print(f'Data shape for PLMS sampling is {size}')
-
-        samples, intermediates = self.plms_sampling(conditioning, size,
-                                                    callback=callback,
-                                                    img_callback=img_callback,
-                                                    quantize_denoised=quantize_x0,
-                                                    mask=mask, x0=x0,
-                                                    ddim_use_original_steps=False,
-                                                    noise_dropout=noise_dropout,
-                                                    temperature=temperature,
-                                                    score_corrector=score_corrector,
-                                                    corrector_kwargs=corrector_kwargs,
-                                                    x_T=x_T,
-                                                    log_every_t=log_every_t,
-                                                    unconditional_guidance_scale=unconditional_guidance_scale,
-                                                    unconditional_conditioning=unconditional_conditioning,
-                                                    )
-        return samples, intermediates
-
-    @torch.no_grad()
-    def plms_sampling(self, cond, shape,
-                      x_T=None, ddim_use_original_steps=False,
-                      callback=None, timesteps=None, quantize_denoised=False,
-                      mask=None, x0=None, img_callback=None, log_every_t=100,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
-        device = self.model.betas.device
-        b = shape[0]
-        if x_T is None:
-            img = torch.randn(shape, device=device)
-        else:
-            img = x_T
-
-        if timesteps is None:
-            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
-        elif timesteps is not None and not ddim_use_original_steps:
-            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
-            timesteps = self.ddim_timesteps[:subset_end]
-
-        intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
-        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
-        print(f"Running PLMS Sampling with {total_steps} timesteps")
-
-        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
-        old_eps = []
-
-        for i, step in enumerate(iterator):
-            index = total_steps - i - 1
-            ts = torch.full((b,), step, device=device, dtype=torch.long)
-            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
-
-            if mask is not None:
-                assert x0 is not None
-                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
-                img = img_orig * mask + (1. - mask) * img
-
-            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
-                                      quantize_denoised=quantize_denoised, temperature=temperature,
-                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
-                                      corrector_kwargs=corrector_kwargs,
-                                      unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning,
-                                      old_eps=old_eps, t_next=ts_next)
-            img, pred_x0, e_t = outs
-            old_eps.append(e_t)
-            if len(old_eps) >= 4:
-                old_eps.pop(0)
-            if callback: callback(i)
-            if img_callback: img_callback(pred_x0, i)
-
-            if index % log_every_t == 0 or index == total_steps - 1:
-                intermediates['x_inter'].append(img)
-                intermediates['pred_x0'].append(pred_x0)
-
-        return img, intermediates
-
-    @torch.no_grad()
-    def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
-        b, *_, device = *x.shape, x.device
-
-        def get_model_output(x, t):
-            if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-                e_t = self.model.apply_model(x, t, c)
-            else:
-                x_in = torch.cat([x] * 2)
-                t_in = torch.cat([t] * 2)
-                c_in = torch.cat([unconditional_conditioning, c])
-                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
-
-            if score_corrector is not None:
-                assert self.model.parameterization == "eps"
-                e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
-
-            return e_t
-
-        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
-        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
-        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
-        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
-
-        def get_x_prev_and_pred_x0(e_t, index):
-            # select parameters corresponding to the currently considered timestep
-            a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
-            a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
-            sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
-
-            # current prediction for x_0
-            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
-            if quantize_denoised:
-                pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
-            # direction pointing to x_t
-            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
-            noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
-            if noise_dropout > 0.:
-                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
-            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
-            return x_prev, pred_x0
-
-        e_t = get_model_output(x, t)
-        if len(old_eps) == 0:
-            # Pseudo Improved Euler (2nd order)
-            x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
-            e_t_next = get_model_output(x_prev, t_next)
-            e_t_prime = (e_t + e_t_next) / 2
-        elif len(old_eps) == 1:
-            # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
-            e_t_prime = (3 * e_t - old_eps[-1]) / 2
-        elif len(old_eps) == 2:
-            # 3nd order Pseudo Linear Multistep (Adams-Bashforth)
-            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
-        elif len(old_eps) >= 3:
-            # 4nd order Pseudo Linear Multistep (Adams-Bashforth)
-            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
-
-        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
-
-        return x_prev, pred_x0, e_t
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/attention.py b/examples/tutorial/stable_diffusion/ldm/modules/attention.py
deleted file mode 100644
index 3401ceafddb4..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/attention.py
+++ /dev/null
@@ -1,314 +0,0 @@
-from inspect import isfunction
-import math
-import torch
-import torch.nn.functional as F
-from torch import nn, einsum
-from einops import rearrange, repeat
-
-from torch.utils import checkpoint
-
-try:
-    from ldm.modules.flash_attention import flash_attention_qkv, flash_attention_q_kv
-    FlASH_AVAILABLE = True
-except:
-    FlASH_AVAILABLE = False
-
-USE_FLASH = False
-
-
-def enable_flash_attention():
-    global USE_FLASH
-    USE_FLASH = True
-    if FlASH_AVAILABLE is False:
-        print("Please install flash attention to activate new attention kernel.\n" + 
-              "Use \'pip install git+https://github.com/HazyResearch/flash-attention.git@c422fee3776eb3ea24e011ef641fd5fbeb212623#egg=flash_attn\'")
-
-
-def exists(val):
-    return val is not None
-
-
-def uniq(arr):
-    return{el: True for el in arr}.keys()
-
-
-def default(val, d):
-    if exists(val):
-        return val
-    return d() if isfunction(d) else d
-
-
-def max_neg_value(t):
-    return -torch.finfo(t.dtype).max
-
-
-def init_(tensor):
-    dim = tensor.shape[-1]
-    std = 1 / math.sqrt(dim)
-    tensor.uniform_(-std, std)
-    return tensor
-
-
-# feedforward
-class GEGLU(nn.Module):
-    def __init__(self, dim_in, dim_out):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out * 2)
-
-    def forward(self, x):
-        x, gate = self.proj(x).chunk(2, dim=-1)
-        return x * F.gelu(gate)
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = default(dim_out, dim)
-        project_in = nn.Sequential(
-            nn.Linear(dim, inner_dim),
-            nn.GELU()
-        ) if not glu else GEGLU(dim, inner_dim)
-
-        self.net = nn.Sequential(
-            project_in,
-            nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out)
-        )
-
-    def forward(self, x):
-        return self.net(x)
-
-
-def zero_module(module):
-    """
-    Zero out the parameters of a module and return it.
-    """
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
-
-
-def Normalize(in_channels):
-    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-class LinearAttention(nn.Module):
-    def __init__(self, dim, heads=4, dim_head=32):
-        super().__init__()
-        self.heads = heads
-        hidden_dim = dim_head * heads
-        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
-        self.to_out = nn.Conv2d(hidden_dim, dim, 1)
-
-    def forward(self, x):
-        b, c, h, w = x.shape
-        qkv = self.to_qkv(x)
-        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
-        k = k.softmax(dim=-1)  
-        context = torch.einsum('bhdn,bhen->bhde', k, v)
-        out = torch.einsum('bhde,bhdn->bhen', context, q)
-        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
-        return self.to_out(out)
-
-
-class SpatialSelfAttention(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.in_channels = in_channels
-
-        self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.k = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.v = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.proj_out = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=1,
-                                        stride=1,
-                                        padding=0)
-
-    def forward(self, x):
-        h_ = x
-        h_ = self.norm(h_)
-        q = self.q(h_)
-        k = self.k(h_)
-        v = self.v(h_)
-
-        # compute attention
-        b,c,h,w = q.shape
-        q = rearrange(q, 'b c h w -> b (h w) c')
-        k = rearrange(k, 'b c h w -> b c (h w)')
-        w_ = torch.einsum('bij,bjk->bik', q, k)
-
-        w_ = w_ * (int(c)**(-0.5))
-        w_ = torch.nn.functional.softmax(w_, dim=2)
-
-        # attend to values
-        v = rearrange(v, 'b c h w -> b c (h w)')
-        w_ = rearrange(w_, 'b i j -> b j i')
-        h_ = torch.einsum('bij,bjk->bik', v, w_)
-        h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
-        h_ = self.proj_out(h_)
-
-        return x+h_
-
-
-class CrossAttention(nn.Module):
-    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
-        super().__init__()
-        inner_dim = dim_head * heads
-        context_dim = default(context_dim, query_dim)
-
-        self.scale = dim_head ** -0.5
-        self.heads = heads
-
-        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
-        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
-
-        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, query_dim),
-            nn.Dropout(dropout)
-        )
-
-    def forward(self, x, context=None, mask=None):
-        q = self.to_q(x)
-        context = default(context, x)
-        k = self.to_k(context)
-        v = self.to_v(context)
-        dim_head = q.shape[-1] / self.heads
-
-        if USE_FLASH and FlASH_AVAILABLE and q.dtype in (torch.float16, torch.bfloat16) and \
-            dim_head <= 128 and (dim_head % 8) == 0:
-            # print("in flash")
-            if q.shape[1] == k.shape[1]:
-                out = self._flash_attention_qkv(q, k, v)
-            else:
-                out = self._flash_attention_q_kv(q, k, v)
-        else:
-            out = self._native_attention(q, k, v, self.heads, mask)
-
-        return self.to_out(out)
-
-    def _native_attention(self, q, k, v, h, mask):
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
-        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
-        if exists(mask):
-            mask = rearrange(mask, 'b ... -> b (...)')
-            max_neg_value = -torch.finfo(sim.dtype).max
-            mask = repeat(mask, 'b j -> (b h) () j', h=h)
-            sim.masked_fill_(~mask, max_neg_value)
-        # attention, what we cannot get enough of
-        out = sim.softmax(dim=-1)
-        out = einsum('b i j, b j d -> b i d', out, v)
-        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
-        return out
-
-    def _flash_attention_qkv(self, q, k, v):
-        qkv = torch.stack([q, k, v], dim=2)
-        b = qkv.shape[0]
-        n = qkv.shape[1]
-        qkv = rearrange(qkv, 'b n t (h d) -> (b n) t h d', h=self.heads)
-        out = flash_attention_qkv(qkv, self.scale, b, n)
-        out = rearrange(out, '(b n) h d -> b n (h d)', b=b, h=self.heads)
-        return out
-    
-    def _flash_attention_q_kv(self, q, k, v):
-        kv = torch.stack([k, v], dim=2)
-        b = q.shape[0]
-        q_seqlen = q.shape[1]
-        kv_seqlen = kv.shape[1]
-        q = rearrange(q, 'b n (h d) -> (b n) h d', h=self.heads)
-        kv = rearrange(kv, 'b n t (h d) -> (b n) t h d', h=self.heads)
-        out = flash_attention_q_kv(q, kv, self.scale, b, q_seqlen, kv_seqlen)
-        out = rearrange(out, '(b n) h d -> b n (h d)', b=b, h=self.heads)
-        return out
-
-
-class BasicTransformerBlock(nn.Module):
-    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, use_checkpoint=False):
-        super().__init__()
-        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)  # is a self-attention
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
-                                    heads=n_heads, dim_head=d_head, dropout=dropout)  # is self-attn if context is none
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.norm3 = nn.LayerNorm(dim)
-        self.use_checkpoint = use_checkpoint
-
-    def forward(self, x, context=None):
-
- 
-        if self.use_checkpoint:
-            return checkpoint(self._forward, x, context)
-        else:
-            return self._forward(x, context)
-
-    def _forward(self, x, context=None):
-        x = self.attn1(self.norm1(x)) + x
-        x = self.attn2(self.norm2(x), context=context) + x
-        x = self.ff(self.norm3(x)) + x
-        return x
-        
-
-
-class SpatialTransformer(nn.Module):
-    """
-    Transformer block for image-like data.
-    First, project the input (aka embedding)
-    and reshape to b, t, d.
-    Then apply standard transformer action.
-    Finally, reshape to image
-    """
-    def __init__(self, in_channels, n_heads, d_head,
-                 depth=1, dropout=0., context_dim=None, use_checkpoint=False):
-        super().__init__()
-        self.in_channels = in_channels
-        inner_dim = n_heads * d_head
-        self.norm = Normalize(in_channels)
-
-        self.proj_in = nn.Conv2d(in_channels,
-                                 inner_dim,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-
-        self.transformer_blocks = nn.ModuleList(
-            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim, use_checkpoint=use_checkpoint)
-                for d in range(depth)]
-        )
-
-        self.proj_out = zero_module(nn.Conv2d(inner_dim,
-                                              in_channels,
-                                              kernel_size=1,
-                                              stride=1,
-                                              padding=0))
-
-
-    def forward(self, x, context=None):
-        # note: if no context is given, cross-attention defaults to self-attention
-        b, c, h, w = x.shape
-        x_in = x
-        x = self.norm(x)
-        x = self.proj_in(x)
-        x = rearrange(x, 'b c h w -> b (h w) c')
-        x = x.contiguous()
-        for block in self.transformer_blocks:
-            x = block(x, context=context)
-        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
-        x = x.contiguous()
-        x = self.proj_out(x)
-        return x + x_in
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/__init__.py b/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/model.py b/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/model.py
deleted file mode 100644
index 3c28492c5502..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/model.py
+++ /dev/null
@@ -1,862 +0,0 @@
-# pytorch_diffusion + derived encoder decoder
-import math
-import torch
-import torch.nn as nn
-import numpy as np
-from einops import rearrange
-
-from ldm.util import instantiate_from_config
-from ldm.modules.attention import LinearAttention
-
-
-def get_timestep_embedding(timesteps, embedding_dim):
-    """
-    This matches the implementation in Denoising Diffusion Probabilistic Models:
-    From Fairseq.
-    Build sinusoidal embeddings.
-    This matches the implementation in tensor2tensor, but differs slightly
-    from the description in Section 3.5 of "Attention Is All You Need".
-    """
-    assert len(timesteps.shape) == 1
-
-    half_dim = embedding_dim // 2
-    emb = math.log(10000) / (half_dim - 1)
-    emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
-    emb = emb.to(device=timesteps.device)
-    emb = timesteps.float()[:, None] * emb[None, :]
-    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
-    if embedding_dim % 2 == 1:  # zero pad
-        emb = torch.nn.functional.pad(emb, (0,1,0,0))
-    return emb
-
-
-def nonlinearity(x):
-    # swish
-    return x*torch.sigmoid(x)
-
-
-def Normalize(in_channels, num_groups=32):
-    return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-class Upsample(nn.Module):
-    def __init__(self, in_channels, with_conv):
-        super().__init__()
-        self.with_conv = with_conv
-        if self.with_conv:
-            self.conv = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
-
-    def forward(self, x):
-        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
-        if self.with_conv:
-            x = self.conv(x)
-        return x
-
-
-class Downsample(nn.Module):
-    def __init__(self, in_channels, with_conv):
-        super().__init__()
-        self.with_conv = with_conv
-        if self.with_conv:
-            # no asymmetric padding in torch conv, must do it ourselves
-            self.conv = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=3,
-                                        stride=2,
-                                        padding=0)
-
-    def forward(self, x):
-        if self.with_conv:
-            pad = (0,1,0,1)
-            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
-            x = self.conv(x)
-        else:
-            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
-        return x
-
-
-class ResnetBlock(nn.Module):
-    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
-                 dropout, temb_channels=512):
-        super().__init__()
-        self.in_channels = in_channels
-        out_channels = in_channels if out_channels is None else out_channels
-        self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut
-
-        self.norm1 = Normalize(in_channels)
-        self.conv1 = torch.nn.Conv2d(in_channels,
-                                     out_channels,
-                                     kernel_size=3,
-                                     stride=1,
-                                     padding=1)
-        if temb_channels > 0:
-            self.temb_proj = torch.nn.Linear(temb_channels,
-                                             out_channels)
-        self.norm2 = Normalize(out_channels)
-        self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = torch.nn.Conv2d(out_channels,
-                                     out_channels,
-                                     kernel_size=3,
-                                     stride=1,
-                                     padding=1)
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                self.conv_shortcut = torch.nn.Conv2d(in_channels,
-                                                     out_channels,
-                                                     kernel_size=3,
-                                                     stride=1,
-                                                     padding=1)
-            else:
-                self.nin_shortcut = torch.nn.Conv2d(in_channels,
-                                                    out_channels,
-                                                    kernel_size=1,
-                                                    stride=1,
-                                                    padding=0)
-
-    def forward(self, x, temb):
-        h = x
-        h = self.norm1(h)
-        h = nonlinearity(h)
-        h = self.conv1(h)
-
-        if temb is not None:
-            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
-
-        h = self.norm2(h)
-        h = nonlinearity(h)
-        h = self.dropout(h)
-        h = self.conv2(h)
-
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                x = self.conv_shortcut(x)
-            else:
-                x = self.nin_shortcut(x)
-
-        return x+h
-
-
-class LinAttnBlock(LinearAttention):
-    """to match AttnBlock usage"""
-    def __init__(self, in_channels):
-        super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
-
-
-class AttnBlock(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.in_channels = in_channels
-
-        self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.k = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.v = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.proj_out = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=1,
-                                        stride=1,
-                                        padding=0)
-
-
-    def forward(self, x):
-        h_ = x
-        h_ = self.norm(h_)
-        q = self.q(h_)
-        k = self.k(h_)
-        v = self.v(h_)
-
-        # compute attention
-        b,c,h,w = q.shape
-        q = q.reshape(b,c,h*w)
-        q = q.permute(0,2,1)   # b,hw,c
-        k = k.reshape(b,c,h*w) # b,c,hw
-        w_ = torch.bmm(q,k)     # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
-        w_ = w_ * (int(c)**(-0.5))
-        w_ = torch.nn.functional.softmax(w_, dim=2)
-
-        # attend to values
-        v = v.reshape(b,c,h*w)
-        w_ = w_.permute(0,2,1)   # b,hw,hw (first hw of k, second of q)
-        h_ = torch.bmm(v,w_)     # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
-        h_ = h_.reshape(b,c,h,w)
-
-        h_ = self.proj_out(h_)
-
-        return x+h_
-
-
-def make_attn(in_channels, attn_type="vanilla"):
-    assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
-    print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
-    if attn_type == "vanilla":
-        return AttnBlock(in_channels)
-    elif attn_type == "none":
-        return nn.Identity(in_channels)
-    else:
-        return LinAttnBlock(in_channels)
-
-class temb_module(nn.Module):
-    def __init__(self):
-        super().__init__()
-        pass
-
-class Model(nn.Module):
-    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
-                 resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
-        super().__init__()
-        if use_linear_attn: attn_type = "linear"
-        self.ch = ch
-        self.temb_ch = self.ch*4
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-
-        self.use_timestep = use_timestep
-        if self.use_timestep:
-            # timestep embedding
-            # self.temb = nn.Module()
-            self.temb = temb_module()
-            self.temb.dense = nn.ModuleList([
-                torch.nn.Linear(self.ch,
-                                self.temb_ch),
-                torch.nn.Linear(self.temb_ch,
-                                self.temb_ch),
-            ])
-
-        # downsampling
-        self.conv_in = torch.nn.Conv2d(in_channels,
-                                       self.ch,
-                                       kernel_size=3,
-                                       stride=1,
-                                       padding=1)
-
-        curr_res = resolution
-        in_ch_mult = (1,)+tuple(ch_mult)
-        self.down = nn.ModuleList()
-        for i_level in range(self.num_resolutions):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_in = ch*in_ch_mult[i_level]
-            block_out = ch*ch_mult[i_level]
-            for i_block in range(self.num_res_blocks):
-                block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(make_attn(block_in, attn_type=attn_type))
-            # down = nn.Module()
-            down = Down_module()
-            down.block = block
-            down.attn = attn
-            if i_level != self.num_resolutions-1:
-                down.downsample = Downsample(block_in, resamp_with_conv)
-                curr_res = curr_res // 2
-            self.down.append(down)
-
-        # middle
-        # self.mid = nn.Module()
-        self.mid = Mid_module()
-        self.mid.block_1 = ResnetBlock(in_channels=block_in,
-                                       out_channels=block_in,
-                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
-        self.mid.block_2 = ResnetBlock(in_channels=block_in,
-                                       out_channels=block_in,
-                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-
-        # upsampling
-        self.up = nn.ModuleList()
-        for i_level in reversed(range(self.num_resolutions)):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_out = ch*ch_mult[i_level]
-            skip_in = ch*ch_mult[i_level]
-            for i_block in range(self.num_res_blocks+1):
-                if i_block == self.num_res_blocks:
-                    skip_in = ch*in_ch_mult[i_level]
-                block.append(ResnetBlock(in_channels=block_in+skip_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(make_attn(block_in, attn_type=attn_type))
-            # up = nn.Module()
-            up = Up_module()
-            up.block = block
-            up.attn = attn
-            if i_level != 0:
-                up.upsample = Upsample(block_in, resamp_with_conv)
-                curr_res = curr_res * 2
-            self.up.insert(0, up) # prepend to get consistent order
-
-        # end
-        self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(block_in,
-                                        out_ch,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
-
-    def forward(self, x, t=None, context=None):
-        #assert x.shape[2] == x.shape[3] == self.resolution
-        if context is not None:
-            # assume aligned context, cat along channel axis
-            x = torch.cat((x, context), dim=1)
-        if self.use_timestep:
-            # timestep embedding
-            assert t is not None
-            temb = get_timestep_embedding(t, self.ch)
-            temb = self.temb.dense[0](temb)
-            temb = nonlinearity(temb)
-            temb = self.temb.dense[1](temb)
-        else:
-            temb = None
-
-        # downsampling
-        hs = [self.conv_in(x)]
-        for i_level in range(self.num_resolutions):
-            for i_block in range(self.num_res_blocks):
-                h = self.down[i_level].block[i_block](hs[-1], temb)
-                if len(self.down[i_level].attn) > 0:
-                    h = self.down[i_level].attn[i_block](h)
-                hs.append(h)
-            if i_level != self.num_resolutions-1:
-                hs.append(self.down[i_level].downsample(hs[-1]))
-
-        # middle
-        h = hs[-1]
-        h = self.mid.block_1(h, temb)
-        h = self.mid.attn_1(h)
-        h = self.mid.block_2(h, temb)
-
-        # upsampling
-        for i_level in reversed(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks+1):
-                h = self.up[i_level].block[i_block](
-                    torch.cat([h, hs.pop()], dim=1), temb)
-                if len(self.up[i_level].attn) > 0:
-                    h = self.up[i_level].attn[i_block](h)
-            if i_level != 0:
-                h = self.up[i_level].upsample(h)
-
-        # end
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
-        return h
-
-    def get_last_layer(self):
-        return self.conv_out.weight
-
-class Down_module(nn.Module):
-    def __init__(self):
-        super().__init__()
-        pass
-
-class Up_module(nn.Module):
-    def __init__(self):
-        super().__init__()
-        pass
-
-class Mid_module(nn.Module):
-    def __init__(self):
-        super().__init__()
-        pass
-
-
-class Encoder(nn.Module):
-    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
-                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
-                 **ignore_kwargs):
-        super().__init__()
-        if use_linear_attn: attn_type = "linear"
-        self.ch = ch
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-
-        # downsampling
-        self.conv_in = torch.nn.Conv2d(in_channels,
-                                       self.ch,
-                                       kernel_size=3,
-                                       stride=1,
-                                       padding=1)
-
-        curr_res = resolution
-        in_ch_mult = (1,)+tuple(ch_mult)
-        self.in_ch_mult = in_ch_mult
-        self.down = nn.ModuleList()
-        for i_level in range(self.num_resolutions):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_in = ch*in_ch_mult[i_level]
-            block_out = ch*ch_mult[i_level]
-            for i_block in range(self.num_res_blocks):
-                block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(make_attn(block_in, attn_type=attn_type))
-            # down = nn.Module()
-            down = Down_module()
-            down.block = block
-            down.attn = attn
-            if i_level != self.num_resolutions-1:
-                down.downsample = Downsample(block_in, resamp_with_conv)
-                curr_res = curr_res // 2
-            self.down.append(down)
-
-        # middle
-        # self.mid = nn.Module()
-        self.mid = Mid_module()
-        self.mid.block_1 = ResnetBlock(in_channels=block_in,
-                                       out_channels=block_in,
-                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
-        self.mid.block_2 = ResnetBlock(in_channels=block_in,
-                                       out_channels=block_in,
-                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-
-        # end
-        self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(block_in,
-                                        2*z_channels if double_z else z_channels,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
-
-    def forward(self, x):
-        # timestep embedding
-        temb = None
-
-        # downsampling
-        hs = [self.conv_in(x)]
-        for i_level in range(self.num_resolutions):
-            for i_block in range(self.num_res_blocks):
-                h = self.down[i_level].block[i_block](hs[-1], temb)
-                if len(self.down[i_level].attn) > 0:
-                    h = self.down[i_level].attn[i_block](h)
-                hs.append(h)
-            if i_level != self.num_resolutions-1:
-                hs.append(self.down[i_level].downsample(hs[-1]))
-
-        # middle
-        h = hs[-1]
-        h = self.mid.block_1(h, temb)
-        h = self.mid.attn_1(h)
-        h = self.mid.block_2(h, temb)
-
-        # end
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
-        return h
-
-
-class Decoder(nn.Module):
-    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
-                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
-                 attn_type="vanilla", **ignorekwargs):
-        super().__init__()
-        if use_linear_attn: attn_type = "linear"
-        self.ch = ch
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-        self.give_pre_end = give_pre_end
-        self.tanh_out = tanh_out
-
-        # compute in_ch_mult, block_in and curr_res at lowest res
-        in_ch_mult = (1,)+tuple(ch_mult)
-        block_in = ch*ch_mult[self.num_resolutions-1]
-        curr_res = resolution // 2**(self.num_resolutions-1)
-        self.z_shape = (1,z_channels,curr_res,curr_res)
-        print("Working with z of shape {} = {} dimensions.".format(
-            self.z_shape, np.prod(self.z_shape)))
-
-        # z to block_in
-        self.conv_in = torch.nn.Conv2d(z_channels,
-                                       block_in,
-                                       kernel_size=3,
-                                       stride=1,
-                                       padding=1)
-
-        # middle
-        # self.mid = nn.Module()
-        self.mid = Mid_module()
-        self.mid.block_1 = ResnetBlock(in_channels=block_in,
-                                       out_channels=block_in,
-                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
-        self.mid.block_2 = ResnetBlock(in_channels=block_in,
-                                       out_channels=block_in,
-                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-
-        # upsampling
-        self.up = nn.ModuleList()
-        for i_level in reversed(range(self.num_resolutions)):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_out = ch*ch_mult[i_level]
-            for i_block in range(self.num_res_blocks+1):
-                block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(make_attn(block_in, attn_type=attn_type))
-            # up = nn.Module()
-            up = Up_module()
-            up.block = block
-            up.attn = attn
-            if i_level != 0:
-                up.upsample = Upsample(block_in, resamp_with_conv)
-                curr_res = curr_res * 2
-            self.up.insert(0, up) # prepend to get consistent order
-
-        # end
-        self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(block_in,
-                                        out_ch,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
-
-    def forward(self, z):
-        #assert z.shape[1:] == self.z_shape[1:]
-        self.last_z_shape = z.shape
-
-        # timestep embedding
-        temb = None
-
-        # z to block_in
-        h = self.conv_in(z)
-
-        # middle
-        h = self.mid.block_1(h, temb)
-        h = self.mid.attn_1(h)
-        h = self.mid.block_2(h, temb)
-
-        # upsampling
-        for i_level in reversed(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks+1):
-                h = self.up[i_level].block[i_block](h, temb)
-                if len(self.up[i_level].attn) > 0:
-                    h = self.up[i_level].attn[i_block](h)
-            if i_level != 0:
-                h = self.up[i_level].upsample(h)
-
-        # end
-        if self.give_pre_end:
-            return h
-
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
-        if self.tanh_out:
-            h = torch.tanh(h)
-        return h
-
-
-class SimpleDecoder(nn.Module):
-    def __init__(self, in_channels, out_channels, *args, **kwargs):
-        super().__init__()
-        self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
-                                     ResnetBlock(in_channels=in_channels,
-                                                 out_channels=2 * in_channels,
-                                                 temb_channels=0, dropout=0.0),
-                                     ResnetBlock(in_channels=2 * in_channels,
-                                                out_channels=4 * in_channels,
-                                                temb_channels=0, dropout=0.0),
-                                     ResnetBlock(in_channels=4 * in_channels,
-                                                out_channels=2 * in_channels,
-                                                temb_channels=0, dropout=0.0),
-                                     nn.Conv2d(2*in_channels, in_channels, 1),
-                                     Upsample(in_channels, with_conv=True)])
-        # end
-        self.norm_out = Normalize(in_channels)
-        self.conv_out = torch.nn.Conv2d(in_channels,
-                                        out_channels,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
-
-    def forward(self, x):
-        for i, layer in enumerate(self.model):
-            if i in [1,2,3]:
-                x = layer(x, None)
-            else:
-                x = layer(x)
-
-        h = self.norm_out(x)
-        h = nonlinearity(h)
-        x = self.conv_out(h)
-        return x
-
-
-class UpsampleDecoder(nn.Module):
-    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
-                 ch_mult=(2,2), dropout=0.0):
-        super().__init__()
-        # upsampling
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        block_in = in_channels
-        curr_res = resolution // 2 ** (self.num_resolutions - 1)
-        self.res_blocks = nn.ModuleList()
-        self.upsample_blocks = nn.ModuleList()
-        for i_level in range(self.num_resolutions):
-            res_block = []
-            block_out = ch * ch_mult[i_level]
-            for i_block in range(self.num_res_blocks + 1):
-                res_block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
-                block_in = block_out
-            self.res_blocks.append(nn.ModuleList(res_block))
-            if i_level != self.num_resolutions - 1:
-                self.upsample_blocks.append(Upsample(block_in, True))
-                curr_res = curr_res * 2
-
-        # end
-        self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(block_in,
-                                        out_channels,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
-
-    def forward(self, x):
-        # upsampling
-        h = x
-        for k, i_level in enumerate(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks + 1):
-                h = self.res_blocks[i_level][i_block](h, None)
-            if i_level != self.num_resolutions - 1:
-                h = self.upsample_blocks[k](h)
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
-        return h
-
-
-class LatentRescaler(nn.Module):
-    def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
-        super().__init__()
-        # residual block, interpolate, residual block
-        self.factor = factor
-        self.conv_in = nn.Conv2d(in_channels,
-                                 mid_channels,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1)
-        self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
-                                                     out_channels=mid_channels,
-                                                     temb_channels=0,
-                                                     dropout=0.0) for _ in range(depth)])
-        self.attn = AttnBlock(mid_channels)
-        self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
-                                                     out_channels=mid_channels,
-                                                     temb_channels=0,
-                                                     dropout=0.0) for _ in range(depth)])
-
-        self.conv_out = nn.Conv2d(mid_channels,
-                                  out_channels,
-                                  kernel_size=1,
-                                  )
-
-    def forward(self, x):
-        x = self.conv_in(x)
-        for block in self.res_block1:
-            x = block(x, None)
-        x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor))))
-        x = self.attn(x)
-        for block in self.res_block2:
-            x = block(x, None)
-        x = self.conv_out(x)
-        return x
-
-
-class MergedRescaleEncoder(nn.Module):
-    def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
-                 ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
-        super().__init__()
-        intermediate_chn = ch * ch_mult[-1]
-        self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
-                               z_channels=intermediate_chn, double_z=False, resolution=resolution,
-                               attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
-                               out_ch=None)
-        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn,
-                                       mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)
-
-    def forward(self, x):
-        x = self.encoder(x)
-        x = self.rescaler(x)
-        return x
-
-
-class MergedRescaleDecoder(nn.Module):
-    def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
-                 dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
-        super().__init__()
-        tmp_chn = z_channels*ch_mult[-1]
-        self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
-                               resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
-                               ch_mult=ch_mult, resolution=resolution, ch=ch)
-        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
-                                       out_channels=tmp_chn, depth=rescale_module_depth)
-
-    def forward(self, x):
-        x = self.rescaler(x)
-        x = self.decoder(x)
-        return x
-
-
-class Upsampler(nn.Module):
-    def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
-        super().__init__()
-        assert out_size >= in_size
-        num_blocks = int(np.log2(out_size//in_size))+1
-        factor_up = 1.+ (out_size % in_size)
-        print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
-        self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels,
-                                       out_channels=in_channels)
-        self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2,
-                               attn_resolutions=[], in_channels=None, ch=in_channels,
-                               ch_mult=[ch_mult for _ in range(num_blocks)])
-
-    def forward(self, x):
-        x = self.rescaler(x)
-        x = self.decoder(x)
-        return x
-
-
-class Resize(nn.Module):
-    def __init__(self, in_channels=None, learned=False, mode="bilinear"):
-        super().__init__()
-        self.with_conv = learned
-        self.mode = mode
-        if self.with_conv:
-            print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
-            raise NotImplementedError()
-            assert in_channels is not None
-            # no asymmetric padding in torch conv, must do it ourselves
-            self.conv = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=4,
-                                        stride=2,
-                                        padding=1)
-
-    def forward(self, x, scale_factor=1.0):
-        if scale_factor==1.0:
-            return x
-        else:
-            x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
-        return x
-
-class FirstStagePostProcessor(nn.Module):
-
-    def __init__(self, ch_mult:list, in_channels,
-                 pretrained_model:nn.Module=None,
-                 reshape=False,
-                 n_channels=None,
-                 dropout=0.,
-                 pretrained_config=None):
-        super().__init__()
-        if pretrained_config is None:
-            assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
-            self.pretrained_model = pretrained_model
-        else:
-            assert pretrained_config is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
-            self.instantiate_pretrained(pretrained_config)
-
-        self.do_reshape = reshape
-
-        if n_channels is None:
-            n_channels = self.pretrained_model.encoder.ch
-
-        self.proj_norm = Normalize(in_channels,num_groups=in_channels//2)
-        self.proj = nn.Conv2d(in_channels,n_channels,kernel_size=3,
-                            stride=1,padding=1)
-
-        blocks = []
-        downs = []
-        ch_in = n_channels
-        for m in ch_mult:
-            blocks.append(ResnetBlock(in_channels=ch_in,out_channels=m*n_channels,dropout=dropout))
-            ch_in = m * n_channels
-            downs.append(Downsample(ch_in, with_conv=False))
-
-        self.model = nn.ModuleList(blocks)
-        self.downsampler = nn.ModuleList(downs)
-
-
-    def instantiate_pretrained(self, config):
-        model = instantiate_from_config(config)
-        self.pretrained_model = model.eval()
-        # self.pretrained_model.train = False
-        for param in self.pretrained_model.parameters():
-            param.requires_grad = False
-
-
-    @torch.no_grad()
-    def encode_with_pretrained(self,x):
-        c = self.pretrained_model.encode(x)
-        if isinstance(c, DiagonalGaussianDistribution):
-            c = c.mode()
-        return  c
-
-    def forward(self,x):
-        z_fs = self.encode_with_pretrained(x)
-        z = self.proj_norm(z_fs)
-        z = self.proj(z)
-        z = nonlinearity(z)
-
-        for submodel, downmodel in zip(self.model,self.downsampler):
-            z = submodel(z,temb=None)
-            z = downmodel(z)
-
-        if self.do_reshape:
-            z = rearrange(z,'b c h w -> b (h w) c')
-        return z
-
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/openaimodel.py b/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/openaimodel.py
deleted file mode 100644
index 3aedc2205e13..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/openaimodel.py
+++ /dev/null
@@ -1,1152 +0,0 @@
-from abc import abstractmethod
-from functools import partial
-import math
-from typing import Iterable
-
-import numpy as np
-import torch
-import torch as th
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.utils import checkpoint
-
-from ldm.modules.diffusionmodules.util import (
-    conv_nd,
-    linear,
-    avg_pool_nd,
-    zero_module,
-    normalization,
-    timestep_embedding,
-)
-from ldm.modules.attention import SpatialTransformer
-
-
-# dummy replace
-def convert_module_to_f16(x):
-    # for n,p in x.named_parameter():
-    #     print(f"convert module {n} to_f16")
-    #     p.data = p.data.half()
-    pass
-
-def convert_module_to_f32(x):
-    pass
-
-
-## go
-class AttentionPool2d(nn.Module):
-    """
-    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
-    """
-
-    def __init__(
-        self,
-        spacial_dim: int,
-        embed_dim: int,
-        num_heads_channels: int,
-        output_dim: int = None,
-    ):
-        super().__init__()
-        self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5)
-        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
-        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
-        self.num_heads = embed_dim // num_heads_channels
-        self.attention = QKVAttention(self.num_heads)
-
-    def forward(self, x):
-        b, c, *_spatial = x.shape
-        x = x.reshape(b, c, -1)  # NC(HW)
-        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
-        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
-        x = self.qkv_proj(x)
-        x = self.attention(x)
-        x = self.c_proj(x)
-        return x[:, :, 0]
-
-
-class TimestepBlock(nn.Module):
-    """
-    Any module where forward() takes timestep embeddings as a second argument.
-    """
-
-    @abstractmethod
-    def forward(self, x, emb):
-        """
-        Apply the module to `x` given `emb` timestep embeddings.
-        """
-
-
-class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
-    """
-    A sequential module that passes timestep embeddings to the children that
-    support it as an extra input.
-    """
-
-    def forward(self, x, emb, context=None):
-        for layer in self:
-            if isinstance(layer, TimestepBlock):
-                x = layer(x, emb)
-            elif isinstance(layer, SpatialTransformer):
-                x = layer(x, context)
-            else:
-                x = layer(x)
-        return x
-
-
-class Upsample(nn.Module):
-    """
-    An upsampling layer with an optional convolution.
-    :param channels: channels in the inputs and outputs.
-    :param use_conv: a bool determining if a convolution is applied.
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 upsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.dims = dims
-        if use_conv:
-            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
-
-    def forward(self, x):
-        assert x.shape[1] == self.channels
-        if self.dims == 3:
-            x = F.interpolate(
-                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
-            )
-        else:
-            x = F.interpolate(x, scale_factor=2, mode="nearest")
-        if self.use_conv:
-            x = self.conv(x)
-        return x
-
-class TransposedUpsample(nn.Module):
-    'Learned 2x upsampling without padding'
-    def __init__(self, channels, out_channels=None, ks=5):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-
-        self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2)
-
-    def forward(self,x):
-        return self.up(x)
-
-
-class Downsample(nn.Module):
-    """
-    A downsampling layer with an optional convolution.
-    :param channels: channels in the inputs and outputs.
-    :param use_conv: a bool determining if a convolution is applied.
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-                 downsampling occurs in the inner-two dimensions.
-    """
-
-    def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.dims = dims
-        stride = 2 if dims != 3 else (1, 2, 2)
-        if use_conv:
-            self.op = conv_nd(
-                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
-            )
-        else:
-            assert self.channels == self.out_channels
-            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
-
-    def forward(self, x):
-        assert x.shape[1] == self.channels
-        return self.op(x)
-
-
-class ResBlock(TimestepBlock):
-    """
-    A residual block that can optionally change the number of channels.
-    :param channels: the number of input channels.
-    :param emb_channels: the number of timestep embedding channels.
-    :param dropout: the rate of dropout.
-    :param out_channels: if specified, the number of out channels.
-    :param use_conv: if True and out_channels is specified, use a spatial
-        convolution instead of a smaller 1x1 convolution to change the
-        channels in the skip connection.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param use_checkpoint: if True, use gradient checkpointing on this module.
-    :param up: if True, use this block for upsampling.
-    :param down: if True, use this block for downsampling.
-    """
-
-    def __init__(
-        self,
-        channels,
-        emb_channels,
-        dropout,
-        out_channels=None,
-        use_conv=False,
-        use_scale_shift_norm=False,
-        dims=2,
-        use_checkpoint=False,
-        up=False,
-        down=False,
-    ):
-        super().__init__()
-        self.channels = channels
-        self.emb_channels = emb_channels
-        self.dropout = dropout
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_checkpoint = use_checkpoint
-        self.use_scale_shift_norm = use_scale_shift_norm
-
-        self.in_layers = nn.Sequential(
-            normalization(channels),
-            nn.SiLU(),
-            conv_nd(dims, channels, self.out_channels, 3, padding=1),
-        )
-
-        self.updown = up or down
-
-        if up:
-            self.h_upd = Upsample(channels, False, dims)
-            self.x_upd = Upsample(channels, False, dims)
-        elif down:
-            self.h_upd = Downsample(channels, False, dims)
-            self.x_upd = Downsample(channels, False, dims)
-        else:
-            self.h_upd = self.x_upd = nn.Identity()
-
-        self.emb_layers = nn.Sequential(
-            nn.SiLU(),
-            linear(
-                emb_channels,
-                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
-            ),
-        )
-        self.out_layers = nn.Sequential(
-            normalization(self.out_channels),
-            nn.SiLU(),
-            nn.Dropout(p=dropout),
-            zero_module(
-                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
-            ),
-        )
-
-        if self.out_channels == channels:
-            self.skip_connection = nn.Identity()
-        elif use_conv:
-            self.skip_connection = conv_nd(
-                dims, channels, self.out_channels, 3, padding=1
-            )
-        else:
-            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
-
-    def forward(self, x, emb):
-        """
-        Apply the block to a Tensor, conditioned on a timestep embedding.
-        :param x: an [N x C x ...] Tensor of features.
-        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
-        :return: an [N x C x ...] Tensor of outputs.
-        """
-        if self.use_checkpoint:
-            return checkpoint(self._forward, x, emb)
-        else:
-            return self._forward(x, emb)
-
-
-    def _forward(self, x, emb):
-        if self.updown:
-            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
-            h = in_rest(x)
-            h = self.h_upd(h)
-            x = self.x_upd(x)
-            h = in_conv(h)
-        else:
-            h = self.in_layers(x)
-        emb_out = self.emb_layers(emb).type(h.dtype)
-        while len(emb_out.shape) < len(h.shape):
-            emb_out = emb_out[..., None]
-        if self.use_scale_shift_norm:
-            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
-            scale, shift = th.chunk(emb_out, 2, dim=1)
-            h = out_norm(h) * (1 + scale) + shift
-            h = out_rest(h)
-        else:
-            h = h + emb_out
-            h = self.out_layers(h)
-        return self.skip_connection(x) + h
-
-
-class AttentionBlock(nn.Module):
-    """
-    An attention block that allows spatial positions to attend to each other.
-    Originally ported from here, but adapted to the N-d case.
-    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
-    """
-
-    def __init__(
-        self,
-        channels,
-        num_heads=1,
-        num_head_channels=-1,
-        use_checkpoint=False,
-        use_new_attention_order=False,
-    ):
-        super().__init__()
-        self.channels = channels
-        if num_head_channels == -1:
-            self.num_heads = num_heads
-        else:
-            assert (
-                channels % num_head_channels == 0
-            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
-            self.num_heads = channels // num_head_channels
-        self.use_checkpoint = use_checkpoint
-        self.norm = normalization(channels)
-        self.qkv = conv_nd(1, channels, channels * 3, 1)
-        if use_new_attention_order:
-            # split qkv before split heads
-            self.attention = QKVAttention(self.num_heads)
-        else:
-            # split heads before split qkv
-            self.attention = QKVAttentionLegacy(self.num_heads)
-
-        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
-
-    def forward(self, x):
-        if self.use_checkpoint:
-            return checkpoint(self._forward, x)   # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
-        #return pt_checkpoint(self._forward, x)  # pytorch
-        else:
-            return self._forward(x)
-
-    def _forward(self, x):
-        b, c, *spatial = x.shape
-        x = x.reshape(b, c, -1)
-        qkv = self.qkv(self.norm(x))
-        h = self.attention(qkv)
-        h = self.proj_out(h)
-        return (x + h).reshape(b, c, *spatial)
-
-
-def count_flops_attn(model, _x, y):
-    """
-    A counter for the `thop` package to count the operations in an
-    attention operation.
-    Meant to be used like:
-        macs, params = thop.profile(
-            model,
-            inputs=(inputs, timestamps),
-            custom_ops={QKVAttention: QKVAttention.count_flops},
-        )
-    """
-    b, c, *spatial = y[0].shape
-    num_spatial = int(np.prod(spatial))
-    # We perform two matmuls with the same number of ops.
-    # The first computes the weight matrix, the second computes
-    # the combination of the value vectors.
-    matmul_ops = 2 * b * (num_spatial ** 2) * c
-    model.total_ops += th.DoubleTensor([matmul_ops])
-
-
-class QKVAttentionLegacy(nn.Module):
-    """
-    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
-    """
-
-    def __init__(self, n_heads):
-        super().__init__()
-        self.n_heads = n_heads
-
-    def forward(self, qkv):
-        """
-        Apply QKV attention.
-        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
-        :return: an [N x (H * C) x T] tensor after attention.
-        """
-        bs, width, length = qkv.shape
-        assert width % (3 * self.n_heads) == 0
-        ch = width // (3 * self.n_heads)
-        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
-        scale = 1 / math.sqrt(math.sqrt(ch))
-        weight = th.einsum(
-            "bct,bcs->bts", q * scale, k * scale
-        )  # More stable with f16 than dividing afterwards
-        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v)
-        return a.reshape(bs, -1, length)
-
-    @staticmethod
-    def count_flops(model, _x, y):
-        return count_flops_attn(model, _x, y)
-
-
-class QKVAttention(nn.Module):
-    """
-    A module which performs QKV attention and splits in a different order.
-    """
-
-    def __init__(self, n_heads):
-        super().__init__()
-        self.n_heads = n_heads
-
-    def forward(self, qkv):
-        """
-        Apply QKV attention.
-        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
-        :return: an [N x (H * C) x T] tensor after attention.
-        """
-        bs, width, length = qkv.shape
-        assert width % (3 * self.n_heads) == 0
-        ch = width // (3 * self.n_heads)
-        q, k, v = qkv.chunk(3, dim=1)
-        scale = 1 / math.sqrt(math.sqrt(ch))
-        weight = th.einsum(
-            "bct,bcs->bts",
-            (q * scale).view(bs * self.n_heads, ch, length),
-            (k * scale).view(bs * self.n_heads, ch, length),
-        )  # More stable with f16 than dividing afterwards
-        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
-        return a.reshape(bs, -1, length)
-
-    @staticmethod
-    def count_flops(model, _x, y):
-        return count_flops_attn(model, _x, y)
-
-
-class UNetModel(nn.Module):
-    """
-    The full UNet model with attention and timestep embedding.
-    :param in_channels: channels in the input Tensor.
-    :param model_channels: base channel count for the model.
-    :param out_channels: channels in the output Tensor.
-    :param num_res_blocks: number of residual blocks per downsample.
-    :param attention_resolutions: a collection of downsample rates at which
-        attention will take place. May be a set, list, or tuple.
-        For example, if this contains 4, then at 4x downsampling, attention
-        will be used.
-    :param dropout: the dropout probability.
-    :param channel_mult: channel multiplier for each level of the UNet.
-    :param conv_resample: if True, use learned convolutions for upsampling and
-        downsampling.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param num_classes: if specified (as an int), then this model will be
-        class-conditional with `num_classes` classes.
-    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
-    :param num_heads: the number of attention heads in each attention layer.
-    :param num_heads_channels: if specified, ignore num_heads and instead use
-                               a fixed channel width per attention head.
-    :param num_heads_upsample: works with num_heads to set a different number
-                               of heads for upsampling. Deprecated.
-    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
-    :param resblock_updown: use residual blocks for up/downsampling.
-    :param use_new_attention_order: use a different attention pattern for potentially
-                                    increased efficiency.
-    """
-
-    def __init__(
-        self,
-        image_size,
-        in_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
-        dropout=0,
-        channel_mult=(1, 2, 4, 8),
-        conv_resample=True,
-        dims=2,
-        num_classes=None,
-        use_checkpoint=False,
-        use_fp16=False,
-        num_heads=-1,
-        num_head_channels=-1,
-        num_heads_upsample=-1,
-        use_scale_shift_norm=False,
-        resblock_updown=False,
-        use_new_attention_order=False,
-        use_spatial_transformer=False,    # custom transformer support
-        transformer_depth=1,              # custom transformer support
-        context_dim=None,                 # custom transformer support
-        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
-        legacy=True,
-        from_pretrained: str=None
-    ):
-        super().__init__()
-        if use_spatial_transformer:
-            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
-
-        if context_dim is not None:
-            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
-            from omegaconf.listconfig import ListConfig
-            if type(context_dim) == ListConfig:
-                context_dim = list(context_dim)
-
-        if num_heads_upsample == -1:
-            num_heads_upsample = num_heads
-
-        if num_heads == -1:
-            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
-
-        if num_head_channels == -1:
-            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
-
-        self.image_size = image_size
-        self.in_channels = in_channels
-        self.model_channels = model_channels
-        self.out_channels = out_channels
-        self.num_res_blocks = num_res_blocks
-        self.attention_resolutions = attention_resolutions
-        self.dropout = dropout
-        self.channel_mult = channel_mult
-        self.conv_resample = conv_resample
-        self.num_classes = num_classes
-        self.use_checkpoint = use_checkpoint
-        self.dtype = th.float16 if use_fp16 else th.float32
-        self.num_heads = num_heads
-        self.num_head_channels = num_head_channels
-        self.num_heads_upsample = num_heads_upsample
-        self.predict_codebook_ids = n_embed is not None
-
-        time_embed_dim = model_channels * 4
-        self.time_embed = nn.Sequential(
-            linear(model_channels, time_embed_dim),
-            nn.SiLU(),
-            linear(time_embed_dim, time_embed_dim),
-        )
-
-        if self.num_classes is not None:
-            self.label_emb = nn.Embedding(num_classes, time_embed_dim)
-
-        self.input_blocks = nn.ModuleList(
-            [
-                TimestepEmbedSequential(
-                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
-                )
-            ]
-        )
-        self._feature_size = model_channels
-        input_block_chans = [model_channels]
-        ch = model_channels
-        ds = 1
-        for level, mult in enumerate(channel_mult):
-            for _ in range(num_res_blocks):
-                layers = [
-                    ResBlock(
-                        ch,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=mult * model_channels,
-                        dims=dims,
-                        use_checkpoint=use_checkpoint,
-                        use_scale_shift_norm=use_scale_shift_norm,
-                    )
-                ]
-                ch = mult * model_channels
-                if ds in attention_resolutions:
-                    if num_head_channels == -1:
-                        dim_head = ch // num_heads
-                    else:
-                        num_heads = ch // num_head_channels
-                        dim_head = num_head_channels
-                    if legacy:
-                        #num_heads = 1
-                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads,
-                            num_head_channels=dim_head,
-                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, use_checkpoint=use_checkpoint,
-                        )
-                    )
-                self.input_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-                input_block_chans.append(ch)
-            if level != len(channel_mult) - 1:
-                out_ch = ch
-                self.input_blocks.append(
-                    TimestepEmbedSequential(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            down=True,
-                        )
-                        if resblock_updown
-                        else Downsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch
-                        )
-                    )
-                )
-                ch = out_ch
-                input_block_chans.append(ch)
-                ds *= 2
-                self._feature_size += ch
-
-        if num_head_channels == -1:
-            dim_head = ch // num_heads
-        else:
-            num_heads = ch // num_head_channels
-            dim_head = num_head_channels
-        if legacy:
-            #num_heads = 1
-            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-        self.middle_block = TimestepEmbedSequential(
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-            AttentionBlock(
-                ch,
-                use_checkpoint=use_checkpoint,
-                num_heads=num_heads,
-                num_head_channels=dim_head,
-                use_new_attention_order=use_new_attention_order,
-            ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
-                        ),
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-        )
-        self._feature_size += ch
-
-        self.output_blocks = nn.ModuleList([])
-        for level, mult in list(enumerate(channel_mult))[::-1]:
-            for i in range(num_res_blocks + 1):
-                ich = input_block_chans.pop()
-                layers = [
-                    ResBlock(
-                        ch + ich,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=model_channels * mult,
-                        dims=dims,
-                        use_checkpoint=use_checkpoint,
-                        use_scale_shift_norm=use_scale_shift_norm,
-                    )
-                ]
-                ch = model_channels * mult
-                if ds in attention_resolutions:
-                    if num_head_channels == -1:
-                        dim_head = ch // num_heads
-                    else:
-                        num_heads = ch // num_head_channels
-                        dim_head = num_head_channels
-                    if legacy:
-                        #num_heads = 1
-                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads_upsample,
-                            num_head_channels=dim_head,
-                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
-                        )
-                    )
-                if level and i == num_res_blocks:
-                    out_ch = ch
-                    layers.append(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            up=True,
-                        )
-                        if resblock_updown
-                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
-                    )
-                    ds //= 2
-                self.output_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-
-        self.out = nn.Sequential(
-            normalization(ch),
-            nn.SiLU(),
-            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
-        )
-        if self.predict_codebook_ids:
-            self.id_predictor = nn.Sequential(
-            normalization(ch),
-            conv_nd(dims, model_channels, n_embed, 1),
-            #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
-        )
-        # if use_fp16:
-            # self.convert_to_fp16()
-        from diffusers.modeling_utils import load_state_dict
-        if from_pretrained is not None:
-            state_dict = load_state_dict(from_pretrained)
-            self._load_pretrained_model(state_dict)
-
-    def _input_blocks_mapping(self, input_dict):
-        res_dict = {}
-        for key_, value_ in input_dict.items():
-            id_0 = int(key_[13])
-            if "resnets" in key_:
-                id_1 = int(key_[23])
-                target_id = 3 * id_0 + 1 + id_1
-                post_fix = key_[25:].replace('time_emb_proj', 'emb_layers.1')\
-                    .replace('norm1', 'in_layers.0')\
-                    .replace('norm2', 'out_layers.0')\
-                    .replace('conv1', 'in_layers.2')\
-                    .replace('conv2', 'out_layers.3')\
-                    .replace('conv_shortcut', 'skip_connection')
-                res_dict["input_blocks." + str(target_id) + '.0.' + post_fix] = value_
-            elif "attentions" in key_:
-                id_1 = int(key_[26])
-                target_id = 3 * id_0 + 1 + id_1
-                post_fix = key_[28:]
-                res_dict["input_blocks." + str(target_id) + '.1.' + post_fix] = value_
-            elif "downsamplers" in key_:
-                post_fix = key_[35:]
-                target_id = 3 * (id_0 + 1)
-                res_dict["input_blocks." + str(target_id) + '.0.op.' + post_fix] = value_
-        return res_dict
-
-
-    def _mid_blocks_mapping(self, mid_dict):
-        res_dict = {}
-        for key_, value_ in mid_dict.items():
-            if "resnets" in key_:
-                temp_key_ =key_.replace('time_emb_proj', 'emb_layers.1') \
-                    .replace('norm1', 'in_layers.0') \
-                    .replace('norm2', 'out_layers.0') \
-                    .replace('conv1', 'in_layers.2') \
-                    .replace('conv2', 'out_layers.3') \
-                    .replace('conv_shortcut', 'skip_connection')\
-                    .replace('middle_block.resnets.0', 'middle_block.0')\
-                    .replace('middle_block.resnets.1', 'middle_block.2')
-                res_dict[temp_key_] = value_
-            elif "attentions" in key_:
-                res_dict[key_.replace('attentions.0', '1')] = value_
-        return res_dict
-
-    def _other_blocks_mapping(self, other_dict):
-        res_dict = {}
-        for key_, value_ in other_dict.items():
-            tmp_key = key_.replace('conv_in', 'input_blocks.0.0')\
-                            .replace('time_embedding.linear_1', 'time_embed.0')\
-                            .replace('time_embedding.linear_2', 'time_embed.2')\
-                            .replace('conv_norm_out', 'out.0')\
-                            .replace('conv_out', 'out.2')
-            res_dict[tmp_key] = value_
-        return res_dict
-
-
-    def _output_blocks_mapping(self, output_dict):
-        res_dict = {}
-        for key_, value_ in output_dict.items():
-            id_0 = int(key_[14])
-            if "resnets" in key_:
-                id_1 = int(key_[24])
-                target_id = 3 * id_0 + id_1
-                post_fix = key_[26:].replace('time_emb_proj', 'emb_layers.1') \
-                    .replace('norm1', 'in_layers.0') \
-                    .replace('norm2', 'out_layers.0') \
-                    .replace('conv1', 'in_layers.2') \
-                    .replace('conv2', 'out_layers.3') \
-                    .replace('conv_shortcut', 'skip_connection')
-                res_dict["output_blocks." + str(target_id) + '.0.' + post_fix] = value_
-            elif "attentions" in key_:
-                id_1 = int(key_[27])
-                target_id = 3 * id_0 + id_1
-                post_fix = key_[29:]
-                res_dict["output_blocks." + str(target_id) + '.1.' + post_fix] = value_
-            elif "upsamplers" in key_:
-                post_fix = key_[34:]
-                target_id = 3 * (id_0 + 1) - 1
-                mid_str = '.2.conv.' if target_id != 2 else '.1.conv.'
-                res_dict["output_blocks." + str(target_id) + mid_str + post_fix] = value_
-        return res_dict
-
-    def _state_key_mapping(self, state_dict: dict):
-        import re
-        res_dict = {}
-        input_dict = {}
-        mid_dict = {}
-        output_dict = {}
-        other_dict = {}
-        for key_, value_ in state_dict.items():
-            if "down_blocks" in key_:
-                input_dict[key_.replace('down_blocks', 'input_blocks')] = value_
-            elif "up_blocks" in key_:
-                output_dict[key_.replace('up_blocks', 'output_blocks')] = value_
-            elif "mid_block" in key_:
-                mid_dict[key_.replace('mid_block', 'middle_block')] = value_
-            else:
-                other_dict[key_] = value_
-
-        input_dict = self._input_blocks_mapping(input_dict)
-        output_dict = self._output_blocks_mapping(output_dict)
-        mid_dict = self._mid_blocks_mapping(mid_dict)
-        other_dict = self._other_blocks_mapping(other_dict)
-        # key_list = state_dict.keys()
-        # key_str = " ".join(key_list)
-
-        # for key_, val_ in state_dict.items():
-        #     key_ = key_.replace("down_blocks", "input_blocks")\
-        #         .replace("up_blocks", 'output_blocks')
-        #     res_dict[key_] = val_
-        res_dict.update(input_dict)
-        res_dict.update(output_dict)
-        res_dict.update(mid_dict)
-        res_dict.update(other_dict)
-
-        return res_dict
-
-    def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False):
-        state_dict = self._state_key_mapping(state_dict)
-        model_state_dict = self.state_dict()
-        loaded_keys = [k for k in state_dict.keys()]
-        expected_keys = list(model_state_dict.keys())
-        original_loaded_keys = loaded_keys
-        missing_keys = list(set(expected_keys) - set(loaded_keys))
-        unexpected_keys = list(set(loaded_keys) - set(expected_keys))
-
-        def _find_mismatched_keys(
-            state_dict,
-            model_state_dict,
-            loaded_keys,
-            ignore_mismatched_sizes,
-        ):
-            mismatched_keys = []
-            if ignore_mismatched_sizes:
-                for checkpoint_key in loaded_keys:
-                    model_key = checkpoint_key
-
-                    if (
-                        model_key in model_state_dict
-                        and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
-                    ):
-                        mismatched_keys.append(
-                            (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
-                        )
-                        del state_dict[checkpoint_key]
-            return mismatched_keys
-        if state_dict is not None:
-            # Whole checkpoint
-            mismatched_keys = _find_mismatched_keys(
-                state_dict,
-                model_state_dict,
-                original_loaded_keys,
-                ignore_mismatched_sizes,
-            )
-            error_msgs = self._load_state_dict_into_model(state_dict)
-        return missing_keys, unexpected_keys, mismatched_keys, error_msgs
-
-    def _load_state_dict_into_model(self, state_dict):
-        # Convert old format to new format if needed from a PyTorch state_dict
-        # copy state_dict so _load_from_state_dict can modify it
-        state_dict = state_dict.copy()
-        error_msgs = []
-
-        # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-        # so we need to apply the function recursively.
-        def load(module: torch.nn.Module, prefix=""):
-            args = (state_dict, prefix, {}, True, [], [], error_msgs)
-            module._load_from_state_dict(*args)
-
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + ".")
-
-        load(self)
-
-        return error_msgs
-
-    def convert_to_fp16(self):
-        """
-        Convert the torso of the model to float16.
-        """
-        self.input_blocks.apply(convert_module_to_f16)
-        self.middle_block.apply(convert_module_to_f16)
-        self.output_blocks.apply(convert_module_to_f16)
-
-    def convert_to_fp32(self):
-        """
-        Convert the torso of the model to float32.
-        """
-        self.input_blocks.apply(convert_module_to_f32)
-        self.middle_block.apply(convert_module_to_f32)
-        self.output_blocks.apply(convert_module_to_f32)
-
-    def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
-        """
-        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :param context: conditioning plugged in via crossattn
-        :param y: an [N] Tensor of labels, if class-conditional.
-        :return: an [N x C x ...] Tensor of outputs.
-        """
-        assert (y is not None) == (
-            self.num_classes is not None
-        ), "must specify y if and only if the model is class-conditional"
-        hs = []
-        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
-        emb = self.time_embed(t_emb)
-
-        if self.num_classes is not None:
-            assert y.shape == (x.shape[0],)
-            emb = emb + self.label_emb(y)
-
-        h = x.type(self.dtype)
-        for module in self.input_blocks:
-            h = module(h, emb, context)
-            hs.append(h)
-        h = self.middle_block(h, emb, context)
-        for module in self.output_blocks:
-            h = th.cat([h, hs.pop()], dim=1)
-            h = module(h, emb, context)
-        h = h.type(self.dtype)
-        if self.predict_codebook_ids:
-            return self.id_predictor(h)
-        else:
-            return self.out(h)
-
-
-class EncoderUNetModel(nn.Module):
-    """
-    The half UNet model with attention and timestep embedding.
-    For usage, see UNet.
-    """
-
-    def __init__(
-        self,
-        image_size,
-        in_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
-        dropout=0,
-        channel_mult=(1, 2, 4, 8),
-        conv_resample=True,
-        dims=2,
-        use_checkpoint=False,
-        use_fp16=False,
-        num_heads=1,
-        num_head_channels=-1,
-        num_heads_upsample=-1,
-        use_scale_shift_norm=False,
-        resblock_updown=False,
-        use_new_attention_order=False,
-        pool="adaptive",
-        *args,
-        **kwargs
-    ):
-        super().__init__()
-
-        if num_heads_upsample == -1:
-            num_heads_upsample = num_heads
-
-        self.in_channels = in_channels
-        self.model_channels = model_channels
-        self.out_channels = out_channels
-        self.num_res_blocks = num_res_blocks
-        self.attention_resolutions = attention_resolutions
-        self.dropout = dropout
-        self.channel_mult = channel_mult
-        self.conv_resample = conv_resample
-        self.use_checkpoint = use_checkpoint
-        self.dtype = th.float16 if use_fp16 else th.float32
-        self.num_heads = num_heads
-        self.num_head_channels = num_head_channels
-        self.num_heads_upsample = num_heads_upsample
-
-        time_embed_dim = model_channels * 4
-        self.time_embed = nn.Sequential(
-            linear(model_channels, time_embed_dim),
-            nn.SiLU(),
-            linear(time_embed_dim, time_embed_dim),
-        )
-
-        self.input_blocks = nn.ModuleList(
-            [
-                TimestepEmbedSequential(
-                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
-                )
-            ]
-        )
-        self._feature_size = model_channels
-        input_block_chans = [model_channels]
-        ch = model_channels
-        ds = 1
-        for level, mult in enumerate(channel_mult):
-            for _ in range(num_res_blocks):
-                layers = [
-                    ResBlock(
-                        ch,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=mult * model_channels,
-                        dims=dims,
-                        use_checkpoint=use_checkpoint,
-                        use_scale_shift_norm=use_scale_shift_norm,
-                    )
-                ]
-                ch = mult * model_channels
-                if ds in attention_resolutions:
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads,
-                            num_head_channels=num_head_channels,
-                            use_new_attention_order=use_new_attention_order,
-                        )
-                    )
-                self.input_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-                input_block_chans.append(ch)
-            if level != len(channel_mult) - 1:
-                out_ch = ch
-                self.input_blocks.append(
-                    TimestepEmbedSequential(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            down=True,
-                        )
-                        if resblock_updown
-                        else Downsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch
-                        )
-                    )
-                )
-                ch = out_ch
-                input_block_chans.append(ch)
-                ds *= 2
-                self._feature_size += ch
-
-        self.middle_block = TimestepEmbedSequential(
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-            AttentionBlock(
-                ch,
-                use_checkpoint=use_checkpoint,
-                num_heads=num_heads,
-                num_head_channels=num_head_channels,
-                use_new_attention_order=use_new_attention_order,
-            ),
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-        )
-        self._feature_size += ch
-        self.pool = pool
-        if pool == "adaptive":
-            self.out = nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                nn.AdaptiveAvgPool2d((1, 1)),
-                zero_module(conv_nd(dims, ch, out_channels, 1)),
-                nn.Flatten(),
-            )
-        elif pool == "attention":
-            assert num_head_channels != -1
-            self.out = nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                AttentionPool2d(
-                    (image_size // ds), ch, num_head_channels, out_channels
-                ),
-            )
-        elif pool == "spatial":
-            self.out = nn.Sequential(
-                nn.Linear(self._feature_size, 2048),
-                nn.ReLU(),
-                nn.Linear(2048, self.out_channels),
-            )
-        elif pool == "spatial_v2":
-            self.out = nn.Sequential(
-                nn.Linear(self._feature_size, 2048),
-                normalization(2048),
-                nn.SiLU(),
-                nn.Linear(2048, self.out_channels),
-            )
-        else:
-            raise NotImplementedError(f"Unexpected {pool} pooling")
-
-    def convert_to_fp16(self):
-        """
-        Convert the torso of the model to float16.
-        """
-        self.input_blocks.apply(convert_module_to_f16)
-        self.middle_block.apply(convert_module_to_f16)
-
-    def convert_to_fp32(self):
-        """
-        Convert the torso of the model to float32.
-        """
-        self.input_blocks.apply(convert_module_to_f32)
-        self.middle_block.apply(convert_module_to_f32)
-
-    def forward(self, x, timesteps):
-        """
-        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :return: an [N x K] Tensor of outputs.
-        """
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
-
-        results = []
-        h = x.type(self.dtype)
-        for module in self.input_blocks:
-            h = module(h, emb)
-            if self.pool.startswith("spatial"):
-                results.append(h.type(x.dtype).mean(dim=(2, 3)))
-        h = self.middle_block(h, emb)
-        if self.pool.startswith("spatial"):
-            results.append(h.type(x.dtype).mean(dim=(2, 3)))
-            h = th.cat(results, axis=-1)
-            return self.out(h)
-        else:
-            h = h.type(self.dtype)
-            return self.out(h)
-
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/util.py b/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/util.py
deleted file mode 100644
index a7db9369c58a..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/diffusionmodules/util.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# adopted from
-# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-# and
-# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
-# and
-# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
-#
-# thanks!
-
-
-import os
-import math
-import torch
-import torch.nn as nn
-import numpy as np
-from einops import repeat
-
-from ldm.util import instantiate_from_config
-
-
-def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
-    if schedule == "linear":
-        betas = (
-                torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
-        )
-
-    elif schedule == "cosine":
-        timesteps = (
-                torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
-        )
-        alphas = timesteps / (1 + cosine_s) * np.pi / 2
-        alphas = torch.cos(alphas).pow(2)
-        alphas = alphas / alphas[0]
-        betas = 1 - alphas[1:] / alphas[:-1]
-        betas = np.clip(betas, a_min=0, a_max=0.999)
-
-    elif schedule == "sqrt_linear":
-        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
-    elif schedule == "sqrt":
-        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
-    else:
-        raise ValueError(f"schedule '{schedule}' unknown.")
-    return betas.numpy()
-
-
-def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
-    if ddim_discr_method == 'uniform':
-        c = num_ddpm_timesteps // num_ddim_timesteps
-        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
-    elif ddim_discr_method == 'quad':
-        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
-    else:
-        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
-
-    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
-    # add one to get the final alpha values right (the ones from first scale to data during sampling)
-    steps_out = ddim_timesteps + 1
-    if verbose:
-        print(f'Selected timesteps for ddim sampler: {steps_out}')
-    return steps_out
-
-
-def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
-    # select alphas for computing the variance schedule
-    alphas = alphacums[ddim_timesteps]
-    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
-
-    # according the the formula provided in https://arxiv.org/abs/2010.02502
-    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
-    if verbose:
-        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
-        print(f'For the chosen value of eta, which is {eta}, '
-              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
-    return sigmas, alphas, alphas_prev
-
-
-def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function,
-    which defines the cumulative product of (1-beta) over time from t = [0,1].
-    :param num_diffusion_timesteps: the number of betas to produce.
-    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
-                      produces the cumulative product of (1-beta) up to that
-                      part of the diffusion process.
-    :param max_beta: the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-    """
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return np.array(betas)
-
-
-def extract_into_tensor(a, t, x_shape):
-    b, *_ = t.shape
-    out = a.gather(-1, t)
-    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
-
-
-def checkpoint(func, inputs, params, flag):
-    """
-    Evaluate a function without caching intermediate activations, allowing for
-    reduced memory at the expense of extra compute in the backward pass.
-    :param func: the function to evaluate.
-    :param inputs: the argument sequence to pass to `func`.
-    :param params: a sequence of parameters `func` depends on but does not
-                   explicitly take as arguments.
-    :param flag: if False, disable gradient checkpointing.
-    """
-    if flag:
-        args = tuple(inputs) + tuple(params)
-        return CheckpointFunction.apply(func, len(inputs), *args)
-    else:
-        return func(*inputs)
-
-
-class CheckpointFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, run_function, length, *args):
-        ctx.run_function = run_function
-        ctx.input_tensors = list(args[:length])
-        ctx.input_params = list(args[length:])
-
-        with torch.no_grad():
-            output_tensors = ctx.run_function(*ctx.input_tensors)
-        return output_tensors
-
-    @staticmethod
-    def backward(ctx, *output_grads):
-        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
-        with torch.enable_grad():
-            # Fixes a bug where the first op in run_function modifies the
-            # Tensor storage in place, which is not allowed for detach()'d
-            # Tensors.
-            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
-            output_tensors = ctx.run_function(*shallow_copies)
-        input_grads = torch.autograd.grad(
-            output_tensors,
-            ctx.input_tensors + ctx.input_params,
-            output_grads,
-            allow_unused=True,
-        )
-        del ctx.input_tensors
-        del ctx.input_params
-        del output_tensors
-        return (None, None) + input_grads
-
-
-def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False, use_fp16=True):
-    """
-    Create sinusoidal timestep embeddings.
-    :param timesteps: a 1-D Tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param dim: the dimension of the output.
-    :param max_period: controls the minimum frequency of the embeddings.
-    :return: an [N x dim] Tensor of positional embeddings.
-    """
-    if not repeat_only:
-        half = dim // 2
-        freqs = torch.exp(
-            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
-        ).to(device=timesteps.device)
-        args = timesteps[:, None].float() * freqs[None]
-        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        if dim % 2:
-            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-    else:
-        embedding = repeat(timesteps, 'b -> b d', d=dim)
-    if use_fp16:
-        return embedding.half()
-    else:
-        return embedding
-
-
-def zero_module(module):
-    """
-    Zero out the parameters of a module and return it.
-    """
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
-
-
-def scale_module(module, scale):
-    """
-    Scale the parameters of a module and return it.
-    """
-    for p in module.parameters():
-        p.detach().mul_(scale)
-    return module
-
-
-def mean_flat(tensor):
-    """
-    Take the mean over all non-batch dimensions.
-    """
-    return tensor.mean(dim=list(range(1, len(tensor.shape))))
-
-
-def normalization(channels, precision=16):
-    """
-    Make a standard normalization layer.
-    :param channels: number of input channels.
-    :return: an nn.Module for normalization.
-    """
-    if precision == 16:
-        return GroupNorm16(16, channels)
-    else:
-        return GroupNorm32(32, channels)
-
-
-# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
-class SiLU(nn.Module):
-    def forward(self, x):
-        return x * torch.sigmoid(x)
-
-class GroupNorm16(nn.GroupNorm):
-    def forward(self, x):
-        return super().forward(x.half()).type(x.dtype)
-
-class GroupNorm32(nn.GroupNorm):
-    def forward(self, x):
-        return super().forward(x.float()).type(x.dtype)
-
-def conv_nd(dims, *args, **kwargs):
-    """
-    Create a 1D, 2D, or 3D convolution module.
-    """
-    if dims == 1:
-        return nn.Conv1d(*args, **kwargs)
-    elif dims == 2:
-        return nn.Conv2d(*args, **kwargs)
-    elif dims == 3:
-        return nn.Conv3d(*args, **kwargs)
-    raise ValueError(f"unsupported dimensions: {dims}")
-
-
-def linear(*args, **kwargs):
-    """
-    Create a linear module.
-    """
-    return nn.Linear(*args, **kwargs)
-
-
-def avg_pool_nd(dims, *args, **kwargs):
-    """
-    Create a 1D, 2D, or 3D average pooling module.
-    """
-    if dims == 1:
-        return nn.AvgPool1d(*args, **kwargs)
-    elif dims == 2:
-        return nn.AvgPool2d(*args, **kwargs)
-    elif dims == 3:
-        return nn.AvgPool3d(*args, **kwargs)
-    raise ValueError(f"unsupported dimensions: {dims}")
-
-
-class HybridConditioner(nn.Module):
-
-    def __init__(self, c_concat_config, c_crossattn_config):
-        super().__init__()
-        self.concat_conditioner = instantiate_from_config(c_concat_config)
-        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
-
-    def forward(self, c_concat, c_crossattn):
-        c_concat = self.concat_conditioner(c_concat)
-        c_crossattn = self.crossattn_conditioner(c_crossattn)
-        return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}
-
-
-def noise_like(shape, device, repeat=False):
-    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
-    noise = lambda: torch.randn(shape, device=device)
-    return repeat_noise() if repeat else noise()
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/distributions/__init__.py b/examples/tutorial/stable_diffusion/ldm/modules/distributions/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/distributions/distributions.py b/examples/tutorial/stable_diffusion/ldm/modules/distributions/distributions.py
deleted file mode 100644
index f2b8ef901130..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/distributions/distributions.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import torch
-import numpy as np
-
-
-class AbstractDistribution:
-    def sample(self):
-        raise NotImplementedError()
-
-    def mode(self):
-        raise NotImplementedError()
-
-
-class DiracDistribution(AbstractDistribution):
-    def __init__(self, value):
-        self.value = value
-
-    def sample(self):
-        return self.value
-
-    def mode(self):
-        return self.value
-
-
-class DiagonalGaussianDistribution(object):
-    def __init__(self, parameters, deterministic=False):
-        self.parameters = parameters
-        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
-        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
-        self.deterministic = deterministic
-        self.std = torch.exp(0.5 * self.logvar)
-        self.var = torch.exp(self.logvar)
-        if self.deterministic:
-            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
-
-    def sample(self):
-        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
-        return x
-
-    def kl(self, other=None):
-        if self.deterministic:
-            return torch.Tensor([0.])
-        else:
-            if other is None:
-                return 0.5 * torch.sum(torch.pow(self.mean, 2)
-                                       + self.var - 1.0 - self.logvar,
-                                       dim=[1, 2, 3])
-            else:
-                return 0.5 * torch.sum(
-                    torch.pow(self.mean - other.mean, 2) / other.var
-                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
-                    dim=[1, 2, 3])
-
-    def nll(self, sample, dims=[1,2,3]):
-        if self.deterministic:
-            return torch.Tensor([0.])
-        logtwopi = np.log(2.0 * np.pi)
-        return 0.5 * torch.sum(
-            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
-            dim=dims)
-
-    def mode(self):
-        return self.mean
-
-
-def normal_kl(mean1, logvar1, mean2, logvar2):
-    """
-    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
-    Compute the KL divergence between two gaussians.
-    Shapes are automatically broadcasted, so batches can be compared to
-    scalars, among other use cases.
-    """
-    tensor = None
-    for obj in (mean1, logvar1, mean2, logvar2):
-        if isinstance(obj, torch.Tensor):
-            tensor = obj
-            break
-    assert tensor is not None, "at least one argument must be a Tensor"
-
-    # Force variances to be Tensors. Broadcasting helps convert scalars to
-    # Tensors, but it does not work for torch.exp().
-    logvar1, logvar2 = [
-        x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
-        for x in (logvar1, logvar2)
-    ]
-
-    return 0.5 * (
-        -1.0
-        + logvar2
-        - logvar1
-        + torch.exp(logvar1 - logvar2)
-        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
-    )
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/ema.py b/examples/tutorial/stable_diffusion/ldm/modules/ema.py
deleted file mode 100644
index c8c75af43565..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/ema.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import torch
-from torch import nn
-
-
-class LitEma(nn.Module):
-    def __init__(self, model, decay=0.9999, use_num_upates=True):
-        super().__init__()
-        if decay < 0.0 or decay > 1.0:
-            raise ValueError('Decay must be between 0 and 1')
-
-        self.m_name2s_name = {}
-        self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
-        self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
-                             else torch.tensor(-1,dtype=torch.int))
-
-        for name, p in model.named_parameters():
-            if p.requires_grad:
-                #remove as '.'-character is not allowed in buffers
-                s_name = name.replace('.','')
-                self.m_name2s_name.update({name:s_name})
-                self.register_buffer(s_name,p.clone().detach().data)
-
-        self.collected_params = []
-
-    def forward(self,model):
-        decay = self.decay
-
-        if self.num_updates >= 0:
-            self.num_updates += 1
-            decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
-
-        one_minus_decay = 1.0 - decay
-
-        with torch.no_grad():
-            m_param = dict(model.named_parameters())
-            shadow_params = dict(self.named_buffers())
-
-            for key in m_param:
-                if m_param[key].requires_grad:
-                    sname = self.m_name2s_name[key]
-                    shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
-                    shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
-                else:
-                    assert not key in self.m_name2s_name
-
-    def copy_to(self, model):
-        m_param = dict(model.named_parameters())
-        shadow_params = dict(self.named_buffers())
-        for key in m_param:
-            if m_param[key].requires_grad:
-                m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
-            else:
-                assert not key in self.m_name2s_name
-
-    def store(self, parameters):
-        """
-        Save the current parameters for restoring later.
-        Args:
-          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
-            temporarily stored.
-        """
-        self.collected_params = [param.clone() for param in parameters]
-
-    def restore(self, parameters):
-        """
-        Restore the parameters stored with the `store` method.
-        Useful to validate the model with EMA parameters without affecting the
-        original optimization process. Store the parameters before the
-        `copy_to` method. After validation (or model saving), use this to
-        restore the former parameters.
-        Args:
-          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
-            updated with the stored parameters.
-        """
-        for c_param, param in zip(self.collected_params, parameters):
-            param.data.copy_(c_param.data)
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/encoders/__init__.py b/examples/tutorial/stable_diffusion/ldm/modules/encoders/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/encoders/modules.py b/examples/tutorial/stable_diffusion/ldm/modules/encoders/modules.py
deleted file mode 100644
index 8cfc01e5ded4..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/encoders/modules.py
+++ /dev/null
@@ -1,264 +0,0 @@
-import types 
-
-import torch
-import torch.nn as nn
-from functools import partial
-import clip
-from einops import rearrange, repeat
-from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextConfig
-import kornia
-from transformers.models.clip.modeling_clip import CLIPTextTransformer
-
-from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
-
-
-class AbstractEncoder(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def encode(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-
-class ClassEmbedder(nn.Module):
-    def __init__(self, embed_dim, n_classes=1000, key='class'):
-        super().__init__()
-        self.key = key
-        self.embedding = nn.Embedding(n_classes, embed_dim)
-
-    def forward(self, batch, key=None):
-        if key is None:
-            key = self.key
-        # this is for use in crossattn
-        c = batch[key][:, None]
-        c = self.embedding(c)
-        return c
-
-
-class TransformerEmbedder(AbstractEncoder):
-    """Some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
-        super().__init__()
-        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer))
-
-    def forward(self, tokens):
-        tokens = tokens.to(self.device)  # meh
-        z = self.transformer(tokens, return_embeddings=True)
-        return z
-
-    def encode(self, x):
-        return self(x)
-
-
-class BERTTokenizer(AbstractEncoder):
-    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
-    def __init__(self, device="cuda", vq_interface=True, max_length=77):
-        super().__init__()
-        from transformers import BertTokenizerFast  # TODO: add to reuquirements
-        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-        self.device = device
-        self.vq_interface = vq_interface
-        self.max_length = max_length
-
-    def forward(self, text):
-        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
-                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
-        tokens = batch_encoding["input_ids"].to(self.device)
-        return tokens
-
-    @torch.no_grad()
-    def encode(self, text):
-        tokens = self(text)
-        if not self.vq_interface:
-            return tokens
-        return None, None, [None, None, tokens]
-
-    def decode(self, text):
-        return text
-
-
-class BERTEmbedder(AbstractEncoder):
-    """Uses the BERT tokenizr model and add some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
-                 device="cuda",use_tokenizer=True, embedding_dropout=0.0):
-        super().__init__()
-        self.use_tknz_fn = use_tokenizer
-        if self.use_tknz_fn:
-            self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
-        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer),
-                                              emb_dropout=embedding_dropout)
-
-    def forward(self, text):
-        if self.use_tknz_fn:
-            tokens = self.tknz_fn(text)#.to(self.device)
-        else:
-            tokens = text
-        z = self.transformer(tokens, return_embeddings=True)
-        return z
-
-    def encode(self, text):
-        # output of length 77
-        return self(text)
-
-
-class SpatialRescaler(nn.Module):
-    def __init__(self,
-                 n_stages=1,
-                 method='bilinear',
-                 multiplier=0.5,
-                 in_channels=3,
-                 out_channels=None,
-                 bias=False):
-        super().__init__()
-        self.n_stages = n_stages
-        assert self.n_stages >= 0
-        assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
-        self.multiplier = multiplier
-        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
-        self.remap_output = out_channels is not None
-        if self.remap_output:
-            print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
-            self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
-
-    def forward(self,x):
-        for stage in range(self.n_stages):
-            x = self.interpolator(x, scale_factor=self.multiplier)
-
-
-        if self.remap_output:
-            x = self.channel_mapper(x)
-        return x
-
-    def encode(self, x):
-        return self(x)
-
-
-class CLIPTextModelZero(CLIPTextModel):
-    config_class = CLIPTextConfig
-
-    def __init__(self, config: CLIPTextConfig):
-        super().__init__(config)
-        self.text_model = CLIPTextTransformerZero(config)
-
-class CLIPTextTransformerZero(CLIPTextTransformer):
-    def _build_causal_attention_mask(self, bsz, seq_len):
-        # lazily create causal attention mask, with full attention between the vision tokens
-        # pytorch uses additive attention mask; fill with -inf
-        mask = torch.empty(bsz, seq_len, seq_len)
-        mask.fill_(float("-inf"))
-        mask.triu_(1)  # zero out the lower diagonal
-        mask = mask.unsqueeze(1)  # expand mask
-        return mask.half()
-
-class FrozenCLIPEmbedder(AbstractEncoder):
-    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
-    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77, use_fp16=True):
-        super().__init__()
-        self.tokenizer = CLIPTokenizer.from_pretrained(version)
-        
-        if use_fp16:
-            self.transformer = CLIPTextModelZero.from_pretrained(version)
-        else:
-            self.transformer = CLIPTextModel.from_pretrained(version)
-
-        # print(self.transformer.modules())
-        # print("check model dtyoe: {}, {}".format(self.tokenizer.dtype, self.transformer.dtype))
-        self.device = device
-        self.max_length = max_length
-        self.freeze() 
-
-    def freeze(self):
-        self.transformer = self.transformer.eval()
-        for param in self.parameters():
-            param.requires_grad = False
-
-    def forward(self, text):
-        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
-                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
-        # tokens = batch_encoding["input_ids"].to(self.device)
-        tokens = batch_encoding["input_ids"].to(self.device)
-        # print("token type: {}".format(tokens.dtype))
-        outputs = self.transformer(input_ids=tokens)
-
-        z = outputs.last_hidden_state
-        return z
-
-    def encode(self, text):
-        return self(text)
-
-
-class FrozenCLIPTextEmbedder(nn.Module):
-    """
-    Uses the CLIP transformer encoder for text.
-    """
-    def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
-        super().__init__()
-        self.model, _ = clip.load(version, jit=False, device="cpu")
-        self.device = device
-        self.max_length = max_length
-        self.n_repeat = n_repeat
-        self.normalize = normalize
-
-    def freeze(self):
-        self.model = self.model.eval()
-        for param in self.parameters():
-            param.requires_grad = False
-
-    def forward(self, text):
-        tokens = clip.tokenize(text).to(self.device)
-        z = self.model.encode_text(tokens)
-        if self.normalize:
-            z = z / torch.linalg.norm(z, dim=1, keepdim=True)
-        return z
-
-    def encode(self, text):
-        z = self(text)
-        if z.ndim==2:
-            z = z[:, None, :]
-        z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
-        return z
-
-
-class FrozenClipImageEmbedder(nn.Module):
-    """
-        Uses the CLIP image encoder.
-        """
-    def __init__(
-            self,
-            model,
-            jit=False,
-            device='cuda' if torch.cuda.is_available() else 'cpu',
-            antialias=False,
-        ):
-        super().__init__()
-        self.model, _ = clip.load(name=model, device=device, jit=jit)
-
-        self.antialias = antialias
-
-        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
-        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
-
-    def preprocess(self, x):
-        # normalize to [0,1]
-        x = kornia.geometry.resize(x, (224, 224),
-                                   interpolation='bicubic',align_corners=True,
-                                   antialias=self.antialias)
-        x = (x + 1.) / 2.
-        # renormalize according to clip
-        x = kornia.enhance.normalize(x, self.mean, self.std)
-        return x
-
-    def forward(self, x):
-        # x is assumed to be in range [-1,1]
-        return self.model.encode_image(self.preprocess(x))
-
-
-if __name__ == "__main__":
-    from ldm.util import count_params
-    model = FrozenCLIPEmbedder()
-    count_params(model, verbose=True)
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/flash_attention.py b/examples/tutorial/stable_diffusion/ldm/modules/flash_attention.py
deleted file mode 100644
index 2a7a73879857..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/flash_attention.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
-Fused Attention
-===============
-This is a Triton implementation of the Flash Attention algorithm
-(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf; Triton https://github.com/openai/triton)
-"""
-
-import torch
-try:
-    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func, flash_attn_unpadded_kvpacked_func
-except ImportError:
-    raise ImportError('please install flash_attn from https://github.com/HazyResearch/flash-attention')
-
-
-
-def flash_attention_qkv(qkv, sm_scale, batch_size, seq_len):
-    """
-    Arguments:
-        qkv: (batch*seq, 3, nheads, headdim)
-        batch_size: int.
-        seq_len: int.
-        sm_scale: float. The scaling of QK^T before applying softmax.
-    Return:
-        out: (total, nheads, headdim).
-    """
-    max_s = seq_len
-    cu_seqlens = torch.arange(0, (batch_size + 1) * seq_len, step=seq_len, dtype=torch.int32,
-        device=qkv.device)
-    out = flash_attn_unpadded_qkvpacked_func(
-        qkv, cu_seqlens, max_s, 0.0,
-        softmax_scale=sm_scale, causal=False
-    )
-    return out
-
-
-def flash_attention_q_kv(q, kv, sm_scale, batch_size, q_seqlen, kv_seqlen):
-    """
-    Arguments:
-        q: (batch*seq, nheads, headdim)
-        kv: (batch*seq, 2, nheads, headdim)
-        batch_size: int.
-        seq_len: int.
-        sm_scale: float. The scaling of QK^T before applying softmax.
-    Return:
-        out: (total, nheads, headdim).
-    """
-    cu_seqlens_q = torch.arange(0, (batch_size + 1) * q_seqlen, step=q_seqlen, dtype=torch.int32, device=q.device)
-    cu_seqlens_k = torch.arange(0, (batch_size + 1) * kv_seqlen, step=kv_seqlen, dtype=torch.int32, device=kv.device)
-    out = flash_attn_unpadded_kvpacked_func(q, kv, cu_seqlens_q, cu_seqlens_k, q_seqlen, kv_seqlen, 0.0, sm_scale)
-    return out
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/__init__.py b/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/__init__.py
deleted file mode 100644
index 7836cada81f9..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
-from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan.py b/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan.py
deleted file mode 100644
index 32ef56169978..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan.py
+++ /dev/null
@@ -1,730 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-# --------------------------------------------
-# Super-Resolution
-# --------------------------------------------
-#
-# Kai Zhang (cskaizhang@gmail.com)
-# https://github.com/cszn
-# From 2019/03--2021/08
-# --------------------------------------------
-"""
-
-import numpy as np
-import cv2
-import torch
-
-from functools import partial
-import random
-from scipy import ndimage
-import scipy
-import scipy.stats as ss
-from scipy.interpolate import interp2d
-from scipy.linalg import orth
-import albumentations
-
-import ldm.modules.image_degradation.utils_image as util
-
-
-def modcrop_np(img, sf):
-    '''
-    Args:
-        img: numpy image, WxH or WxHxC
-        sf: scale factor
-    Return:
-        cropped image
-    '''
-    w, h = img.shape[:2]
-    im = np.copy(img)
-    return im[:w - w % sf, :h - h % sf, ...]
-
-
-"""
-# --------------------------------------------
-# anisotropic Gaussian kernels
-# --------------------------------------------
-"""
-
-
-def analytic_kernel(k):
-    """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
-    k_size = k.shape[0]
-    # Calculate the big kernels size
-    big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2))
-    # Loop over the small kernel to fill the big one
-    for r in range(k_size):
-        for c in range(k_size):
-            big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
-    # Crop the edges of the big kernel to ignore very small values and increase run time of SR
-    crop = k_size // 2
-    cropped_big_k = big_k[crop:-crop, crop:-crop]
-    # Normalize to 1
-    return cropped_big_k / cropped_big_k.sum()
-
-
-def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
-    """ generate an anisotropic Gaussian kernel
-    Args:
-        ksize : e.g., 15, kernel size
-        theta : [0,  pi], rotation angle range
-        l1    : [0.1,50], scaling of eigenvalues
-        l2    : [0.1,l1], scaling of eigenvalues
-        If l1 = l2, will get an isotropic Gaussian kernel.
-    Returns:
-        k     : kernel
-    """
-
-    v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
-    V = np.array([[v[0], v[1]], [v[1], -v[0]]])
-    D = np.array([[l1, 0], [0, l2]])
-    Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
-    k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize)
-
-    return k
-
-
-def gm_blur_kernel(mean, cov, size=15):
-    center = size / 2.0 + 0.5
-    k = np.zeros([size, size])
-    for y in range(size):
-        for x in range(size):
-            cy = y - center + 1
-            cx = x - center + 1
-            k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov)
-
-    k = k / np.sum(k)
-    return k
-
-
-def shift_pixel(x, sf, upper_left=True):
-    """shift pixel for super-resolution with different scale factors
-    Args:
-        x: WxHxC or WxH
-        sf: scale factor
-        upper_left: shift direction
-    """
-    h, w = x.shape[:2]
-    shift = (sf - 1) * 0.5
-    xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
-    if upper_left:
-        x1 = xv + shift
-        y1 = yv + shift
-    else:
-        x1 = xv - shift
-        y1 = yv - shift
-
-    x1 = np.clip(x1, 0, w - 1)
-    y1 = np.clip(y1, 0, h - 1)
-
-    if x.ndim == 2:
-        x = interp2d(xv, yv, x)(x1, y1)
-    if x.ndim == 3:
-        for i in range(x.shape[-1]):
-            x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1)
-
-    return x
-
-
-def blur(x, k):
-    '''
-    x: image, NxcxHxW
-    k: kernel, Nx1xhxw
-    '''
-    n, c = x.shape[:2]
-    p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
-    x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
-    k = k.repeat(1, c, 1, 1)
-    k = k.view(-1, 1, k.shape[2], k.shape[3])
-    x = x.view(1, -1, x.shape[2], x.shape[3])
-    x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
-    x = x.view(n, c, x.shape[2], x.shape[3])
-
-    return x
-
-
-def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
-    """"
-    # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
-    # Kai Zhang
-    # min_var = 0.175 * sf  # variance of the gaussian kernel will be sampled between min_var and max_var
-    # max_var = 2.5 * sf
-    """
-    # Set random eigen-vals (lambdas) and angle (theta) for COV matrix
-    lambda_1 = min_var + np.random.rand() * (max_var - min_var)
-    lambda_2 = min_var + np.random.rand() * (max_var - min_var)
-    theta = np.random.rand() * np.pi  # random theta
-    noise = -noise_level + np.random.rand(*k_size) * noise_level * 2
-
-    # Set COV matrix using Lambdas and Theta
-    LAMBDA = np.diag([lambda_1, lambda_2])
-    Q = np.array([[np.cos(theta), -np.sin(theta)],
-                  [np.sin(theta), np.cos(theta)]])
-    SIGMA = Q @ LAMBDA @ Q.T
-    INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
-
-    # Set expectation position (shifting kernel for aligned image)
-    MU = k_size // 2 - 0.5 * (scale_factor - 1)  # - 0.5 * (scale_factor - k_size % 2)
-    MU = MU[None, None, :, None]
-
-    # Create meshgrid for Gaussian
-    [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
-    Z = np.stack([X, Y], 2)[:, :, :, None]
-
-    # Calcualte Gaussian for every pixel of the kernel
-    ZZ = Z - MU
-    ZZ_t = ZZ.transpose(0, 1, 3, 2)
-    raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
-
-    # shift the kernel so it will be centered
-    # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
-
-    # Normalize the kernel and return
-    # kernel = raw_kernel_centered / np.sum(raw_kernel_centered)
-    kernel = raw_kernel / np.sum(raw_kernel)
-    return kernel
-
-
-def fspecial_gaussian(hsize, sigma):
-    hsize = [hsize, hsize]
-    siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
-    std = sigma
-    [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
-    arg = -(x * x + y * y) / (2 * std * std)
-    h = np.exp(arg)
-    h[h < scipy.finfo(float).eps * h.max()] = 0
-    sumh = h.sum()
-    if sumh != 0:
-        h = h / sumh
-    return h
-
-
-def fspecial_laplacian(alpha):
-    alpha = max([0, min([alpha, 1])])
-    h1 = alpha / (alpha + 1)
-    h2 = (1 - alpha) / (alpha + 1)
-    h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]]
-    h = np.array(h)
-    return h
-
-
-def fspecial(filter_type, *args, **kwargs):
-    '''
-    python code from:
-    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
-    '''
-    if filter_type == 'gaussian':
-        return fspecial_gaussian(*args, **kwargs)
-    if filter_type == 'laplacian':
-        return fspecial_laplacian(*args, **kwargs)
-
-
-"""
-# --------------------------------------------
-# degradation models
-# --------------------------------------------
-"""
-
-
-def bicubic_degradation(x, sf=3):
-    '''
-    Args:
-        x: HxWxC image, [0, 1]
-        sf: down-scale factor
-    Return:
-        bicubicly downsampled LR image
-    '''
-    x = util.imresize_np(x, scale=1 / sf)
-    return x
-
-
-def srmd_degradation(x, k, sf=3):
-    ''' blur + bicubic downsampling
-    Args:
-        x: HxWxC image, [0, 1]
-        k: hxw, double
-        sf: down-scale factor
-    Return:
-        downsampled LR image
-    Reference:
-        @inproceedings{zhang2018learning,
-          title={Learning a single convolutional super-resolution network for multiple degradations},
-          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
-          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
-          pages={3262--3271},
-          year={2018}
-        }
-    '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
-    x = bicubic_degradation(x, sf=sf)
-    return x
-
-
-def dpsr_degradation(x, k, sf=3):
-    ''' bicubic downsampling + blur
-    Args:
-        x: HxWxC image, [0, 1]
-        k: hxw, double
-        sf: down-scale factor
-    Return:
-        downsampled LR image
-    Reference:
-        @inproceedings{zhang2019deep,
-          title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
-          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
-          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
-          pages={1671--1681},
-          year={2019}
-        }
-    '''
-    x = bicubic_degradation(x, sf=sf)
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
-    return x
-
-
-def classical_degradation(x, k, sf=3):
-    ''' blur + downsampling
-    Args:
-        x: HxWxC image, [0, 1]/[0, 255]
-        k: hxw, double
-        sf: down-scale factor
-    Return:
-        downsampled LR image
-    '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
-    # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
-    st = 0
-    return x[st::sf, st::sf, ...]
-
-
-def add_sharpening(img, weight=0.5, radius=50, threshold=10):
-    """USM sharpening. borrowed from real-ESRGAN
-    Input image: I; Blurry image: B.
-    1. K = I + weight * (I - B)
-    2. Mask = 1 if abs(I - B) > threshold, else: 0
-    3. Blur mask:
-    4. Out = Mask * K + (1 - Mask) * I
-    Args:
-        img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
-        weight (float): Sharp weight. Default: 1.
-        radius (float): Kernel size of Gaussian blur. Default: 50.
-        threshold (int):
-    """
-    if radius % 2 == 0:
-        radius += 1
-    blur = cv2.GaussianBlur(img, (radius, radius), 0)
-    residual = img - blur
-    mask = np.abs(residual) * 255 > threshold
-    mask = mask.astype('float32')
-    soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0)
-
-    K = img + weight * residual
-    K = np.clip(K, 0, 1)
-    return soft_mask * K + (1 - soft_mask) * img
-
-
-def add_blur(img, sf=4):
-    wd2 = 4.0 + sf
-    wd = 2.0 + 0.2 * sf
-    if random.random() < 0.5:
-        l1 = wd2 * random.random()
-        l2 = wd2 * random.random()
-        k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
-    else:
-        k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random())
-    img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
-
-    return img
-
-
-def add_resize(img, sf=4):
-    rnum = np.random.rand()
-    if rnum > 0.8:  # up
-        sf1 = random.uniform(1, 2)
-    elif rnum < 0.7:  # down
-        sf1 = random.uniform(0.5 / sf, 1)
-    else:
-        sf1 = 1.0
-    img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
-    img = np.clip(img, 0.0, 1.0)
-
-    return img
-
-
-# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
-#     noise_level = random.randint(noise_level1, noise_level2)
-#     rnum = np.random.rand()
-#     if rnum > 0.6:  # add color Gaussian noise
-#         img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
-#     elif rnum < 0.4:  # add grayscale Gaussian noise
-#         img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
-#     else:  # add  noise
-#         L = noise_level2 / 255.
-#         D = np.diag(np.random.rand(3))
-#         U = orth(np.random.rand(3, 3))
-#         conv = np.dot(np.dot(np.transpose(U), D), U)
-#         img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
-#     img = np.clip(img, 0.0, 1.0)
-#     return img
-
-def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
-    noise_level = random.randint(noise_level1, noise_level2)
-    rnum = np.random.rand()
-    if rnum > 0.6:  # add color Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
-    elif rnum < 0.4:  # add grayscale Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
-    else:  # add  noise
-        L = noise_level2 / 255.
-        D = np.diag(np.random.rand(3))
-        U = orth(np.random.rand(3, 3))
-        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
-    img = np.clip(img, 0.0, 1.0)
-    return img
-
-
-def add_speckle_noise(img, noise_level1=2, noise_level2=25):
-    noise_level = random.randint(noise_level1, noise_level2)
-    img = np.clip(img, 0.0, 1.0)
-    rnum = random.random()
-    if rnum > 0.6:
-        img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
-    elif rnum < 0.4:
-        img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
-    else:
-        L = noise_level2 / 255.
-        D = np.diag(np.random.rand(3))
-        U = orth(np.random.rand(3, 3))
-        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
-    img = np.clip(img, 0.0, 1.0)
-    return img
-
-
-def add_Poisson_noise(img):
-    img = np.clip((img * 255.0).round(), 0, 255) / 255.
-    vals = 10 ** (2 * random.random() + 2.0)  # [2, 4]
-    if random.random() < 0.5:
-        img = np.random.poisson(img * vals).astype(np.float32) / vals
-    else:
-        img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
-        img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
-        noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
-        img += noise_gray[:, :, np.newaxis]
-    img = np.clip(img, 0.0, 1.0)
-    return img
-
-
-def add_JPEG_noise(img):
-    quality_factor = random.randint(30, 95)
-    img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
-    result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
-    img = cv2.imdecode(encimg, 1)
-    img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
-    return img
-
-
-def random_crop(lq, hq, sf=4, lq_patchsize=64):
-    h, w = lq.shape[:2]
-    rnd_h = random.randint(0, h - lq_patchsize)
-    rnd_w = random.randint(0, w - lq_patchsize)
-    lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
-
-    rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
-    hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
-    return lq, hq
-
-
-def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
-    """
-    This is the degradation model of BSRGAN from the paper
-    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
-    ----------
-    img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf)
-    sf: scale factor
-    isp_model: camera ISP model
-    Returns
-    -------
-    img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
-    hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
-    """
-    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
-    sf_ori = sf
-
-    h1, w1 = img.shape[:2]
-    img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
-    h, w = img.shape[:2]
-
-    if h < lq_patchsize * sf or w < lq_patchsize * sf:
-        raise ValueError(f'img size ({h1}X{w1}) is too small!')
-
-    hq = img.copy()
-
-    if sf == 4 and random.random() < scale2_prob:  # downsample1
-        if np.random.rand() < 0.5:
-            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
-                             interpolation=random.choice([1, 2, 3]))
-        else:
-            img = util.imresize_np(img, 1 / 2, True)
-        img = np.clip(img, 0.0, 1.0)
-        sf = 2
-
-    shuffle_order = random.sample(range(7), 7)
-    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
-    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
-
-    for i in shuffle_order:
-
-        if i == 0:
-            img = add_blur(img, sf=sf)
-
-        elif i == 1:
-            img = add_blur(img, sf=sf)
-
-        elif i == 2:
-            a, b = img.shape[1], img.shape[0]
-            # downsample2
-            if random.random() < 0.75:
-                sf1 = random.uniform(1, 2 * sf)
-                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
-                                 interpolation=random.choice([1, 2, 3]))
-            else:
-                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
-                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
-                img = img[0::sf, 0::sf, ...]  # nearest downsampling
-            img = np.clip(img, 0.0, 1.0)
-
-        elif i == 3:
-            # downsample3
-            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
-            img = np.clip(img, 0.0, 1.0)
-
-        elif i == 4:
-            # add Gaussian noise
-            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
-
-        elif i == 5:
-            # add JPEG noise
-            if random.random() < jpeg_prob:
-                img = add_JPEG_noise(img)
-
-        elif i == 6:
-            # add processed camera sensor noise
-            if random.random() < isp_prob and isp_model is not None:
-                with torch.no_grad():
-                    img, hq = isp_model.forward(img.copy(), hq)
-
-    # add final JPEG compression noise
-    img = add_JPEG_noise(img)
-
-    # random crop
-    img, hq = random_crop(img, hq, sf_ori, lq_patchsize)
-
-    return img, hq
-
-
-# todo no isp_model?
-def degradation_bsrgan_variant(image, sf=4, isp_model=None):
-    """
-    This is the degradation model of BSRGAN from the paper
-    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
-    ----------
-    sf: scale factor
-    isp_model: camera ISP model
-    Returns
-    -------
-    img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
-    hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
-    """
-    image = util.uint2single(image)
-    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
-    sf_ori = sf
-
-    h1, w1 = image.shape[:2]
-    image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
-    h, w = image.shape[:2]
-
-    hq = image.copy()
-
-    if sf == 4 and random.random() < scale2_prob:  # downsample1
-        if np.random.rand() < 0.5:
-            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
-                               interpolation=random.choice([1, 2, 3]))
-        else:
-            image = util.imresize_np(image, 1 / 2, True)
-        image = np.clip(image, 0.0, 1.0)
-        sf = 2
-
-    shuffle_order = random.sample(range(7), 7)
-    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
-    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
-
-    for i in shuffle_order:
-
-        if i == 0:
-            image = add_blur(image, sf=sf)
-
-        elif i == 1:
-            image = add_blur(image, sf=sf)
-
-        elif i == 2:
-            a, b = image.shape[1], image.shape[0]
-            # downsample2
-            if random.random() < 0.75:
-                sf1 = random.uniform(1, 2 * sf)
-                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
-                                   interpolation=random.choice([1, 2, 3]))
-            else:
-                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
-                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
-                image = image[0::sf, 0::sf, ...]  # nearest downsampling
-            image = np.clip(image, 0.0, 1.0)
-
-        elif i == 3:
-            # downsample3
-            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
-            image = np.clip(image, 0.0, 1.0)
-
-        elif i == 4:
-            # add Gaussian noise
-            image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25)
-
-        elif i == 5:
-            # add JPEG noise
-            if random.random() < jpeg_prob:
-                image = add_JPEG_noise(image)
-
-        # elif i == 6:
-        #     # add processed camera sensor noise
-        #     if random.random() < isp_prob and isp_model is not None:
-        #         with torch.no_grad():
-        #             img, hq = isp_model.forward(img.copy(), hq)
-
-    # add final JPEG compression noise
-    image = add_JPEG_noise(image)
-    image = util.single2uint(image)
-    example = {"image":image}
-    return example
-
-
-# TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc...
-def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
-    """
-    This is an extended degradation model by combining
-    the degradation models of BSRGAN and Real-ESRGAN
-    ----------
-    img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf)
-    sf: scale factor
-    use_shuffle: the degradation shuffle
-    use_sharp: sharpening the img
-    Returns
-    -------
-    img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
-    hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
-    """
-
-    h1, w1 = img.shape[:2]
-    img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
-    h, w = img.shape[:2]
-
-    if h < lq_patchsize * sf or w < lq_patchsize * sf:
-        raise ValueError(f'img size ({h1}X{w1}) is too small!')
-
-    if use_sharp:
-        img = add_sharpening(img)
-    hq = img.copy()
-
-    if random.random() < shuffle_prob:
-        shuffle_order = random.sample(range(13), 13)
-    else:
-        shuffle_order = list(range(13))
-        # local shuffle for noise, JPEG is always the last one
-        shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6)))
-        shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13)))
-
-    poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1
-
-    for i in shuffle_order:
-        if i == 0:
-            img = add_blur(img, sf=sf)
-        elif i == 1:
-            img = add_resize(img, sf=sf)
-        elif i == 2:
-            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
-        elif i == 3:
-            if random.random() < poisson_prob:
-                img = add_Poisson_noise(img)
-        elif i == 4:
-            if random.random() < speckle_prob:
-                img = add_speckle_noise(img)
-        elif i == 5:
-            if random.random() < isp_prob and isp_model is not None:
-                with torch.no_grad():
-                    img, hq = isp_model.forward(img.copy(), hq)
-        elif i == 6:
-            img = add_JPEG_noise(img)
-        elif i == 7:
-            img = add_blur(img, sf=sf)
-        elif i == 8:
-            img = add_resize(img, sf=sf)
-        elif i == 9:
-            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
-        elif i == 10:
-            if random.random() < poisson_prob:
-                img = add_Poisson_noise(img)
-        elif i == 11:
-            if random.random() < speckle_prob:
-                img = add_speckle_noise(img)
-        elif i == 12:
-            if random.random() < isp_prob and isp_model is not None:
-                with torch.no_grad():
-                    img, hq = isp_model.forward(img.copy(), hq)
-        else:
-            print('check the shuffle!')
-
-    # resize to desired size
-    img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
-                     interpolation=random.choice([1, 2, 3]))
-
-    # add final JPEG compression noise
-    img = add_JPEG_noise(img)
-
-    # random crop
-    img, hq = random_crop(img, hq, sf, lq_patchsize)
-
-    return img, hq
-
-
-if __name__ == '__main__':
-	print("hey")
-	img = util.imread_uint('utils/test.png', 3)
-	print(img)
-	img = util.uint2single(img)
-	print(img)
-	img = img[:448, :448]
-	h = img.shape[0] // 4
-	print("resizing to", h)
-	sf = 4
-	deg_fn = partial(degradation_bsrgan_variant, sf=sf)
-	for i in range(20):
-		print(i)
-		img_lq = deg_fn(img)
-		print(img_lq)
-		img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img)["image"]
-		print(img_lq.shape)
-		print("bicubic", img_lq_bicubic.shape)
-		print(img_hq.shape)
-		lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-		                        interpolation=0)
-		lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-		                        interpolation=0)
-		img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
-		util.imsave(img_concat, str(i) + '.png')
-
-
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan_light.py b/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan_light.py
deleted file mode 100644
index 9e1f823996bf..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/bsrgan_light.py
+++ /dev/null
@@ -1,650 +0,0 @@
-# -*- coding: utf-8 -*-
-import numpy as np
-import cv2
-import torch
-
-from functools import partial
-import random
-from scipy import ndimage
-import scipy
-import scipy.stats as ss
-from scipy.interpolate import interp2d
-from scipy.linalg import orth
-import albumentations
-
-import ldm.modules.image_degradation.utils_image as util
-
-"""
-# --------------------------------------------
-# Super-Resolution
-# --------------------------------------------
-#
-# Kai Zhang (cskaizhang@gmail.com)
-# https://github.com/cszn
-# From 2019/03--2021/08
-# --------------------------------------------
-"""
-
-
-def modcrop_np(img, sf):
-    '''
-    Args:
-        img: numpy image, WxH or WxHxC
-        sf: scale factor
-    Return:
-        cropped image
-    '''
-    w, h = img.shape[:2]
-    im = np.copy(img)
-    return im[:w - w % sf, :h - h % sf, ...]
-
-
-"""
-# --------------------------------------------
-# anisotropic Gaussian kernels
-# --------------------------------------------
-"""
-
-
-def analytic_kernel(k):
-    """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
-    k_size = k.shape[0]
-    # Calculate the big kernels size
-    big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2))
-    # Loop over the small kernel to fill the big one
-    for r in range(k_size):
-        for c in range(k_size):
-            big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
-    # Crop the edges of the big kernel to ignore very small values and increase run time of SR
-    crop = k_size // 2
-    cropped_big_k = big_k[crop:-crop, crop:-crop]
-    # Normalize to 1
-    return cropped_big_k / cropped_big_k.sum()
-
-
-def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
-    """ generate an anisotropic Gaussian kernel
-    Args:
-        ksize : e.g., 15, kernel size
-        theta : [0,  pi], rotation angle range
-        l1    : [0.1,50], scaling of eigenvalues
-        l2    : [0.1,l1], scaling of eigenvalues
-        If l1 = l2, will get an isotropic Gaussian kernel.
-    Returns:
-        k     : kernel
-    """
-
-    v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
-    V = np.array([[v[0], v[1]], [v[1], -v[0]]])
-    D = np.array([[l1, 0], [0, l2]])
-    Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
-    k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize)
-
-    return k
-
-
-def gm_blur_kernel(mean, cov, size=15):
-    center = size / 2.0 + 0.5
-    k = np.zeros([size, size])
-    for y in range(size):
-        for x in range(size):
-            cy = y - center + 1
-            cx = x - center + 1
-            k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov)
-
-    k = k / np.sum(k)
-    return k
-
-
-def shift_pixel(x, sf, upper_left=True):
-    """shift pixel for super-resolution with different scale factors
-    Args:
-        x: WxHxC or WxH
-        sf: scale factor
-        upper_left: shift direction
-    """
-    h, w = x.shape[:2]
-    shift = (sf - 1) * 0.5
-    xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
-    if upper_left:
-        x1 = xv + shift
-        y1 = yv + shift
-    else:
-        x1 = xv - shift
-        y1 = yv - shift
-
-    x1 = np.clip(x1, 0, w - 1)
-    y1 = np.clip(y1, 0, h - 1)
-
-    if x.ndim == 2:
-        x = interp2d(xv, yv, x)(x1, y1)
-    if x.ndim == 3:
-        for i in range(x.shape[-1]):
-            x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1)
-
-    return x
-
-
-def blur(x, k):
-    '''
-    x: image, NxcxHxW
-    k: kernel, Nx1xhxw
-    '''
-    n, c = x.shape[:2]
-    p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
-    x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
-    k = k.repeat(1, c, 1, 1)
-    k = k.view(-1, 1, k.shape[2], k.shape[3])
-    x = x.view(1, -1, x.shape[2], x.shape[3])
-    x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
-    x = x.view(n, c, x.shape[2], x.shape[3])
-
-    return x
-
-
-def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
-    """"
-    # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
-    # Kai Zhang
-    # min_var = 0.175 * sf  # variance of the gaussian kernel will be sampled between min_var and max_var
-    # max_var = 2.5 * sf
-    """
-    # Set random eigen-vals (lambdas) and angle (theta) for COV matrix
-    lambda_1 = min_var + np.random.rand() * (max_var - min_var)
-    lambda_2 = min_var + np.random.rand() * (max_var - min_var)
-    theta = np.random.rand() * np.pi  # random theta
-    noise = -noise_level + np.random.rand(*k_size) * noise_level * 2
-
-    # Set COV matrix using Lambdas and Theta
-    LAMBDA = np.diag([lambda_1, lambda_2])
-    Q = np.array([[np.cos(theta), -np.sin(theta)],
-                  [np.sin(theta), np.cos(theta)]])
-    SIGMA = Q @ LAMBDA @ Q.T
-    INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
-
-    # Set expectation position (shifting kernel for aligned image)
-    MU = k_size // 2 - 0.5 * (scale_factor - 1)  # - 0.5 * (scale_factor - k_size % 2)
-    MU = MU[None, None, :, None]
-
-    # Create meshgrid for Gaussian
-    [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
-    Z = np.stack([X, Y], 2)[:, :, :, None]
-
-    # Calcualte Gaussian for every pixel of the kernel
-    ZZ = Z - MU
-    ZZ_t = ZZ.transpose(0, 1, 3, 2)
-    raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
-
-    # shift the kernel so it will be centered
-    # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
-
-    # Normalize the kernel and return
-    # kernel = raw_kernel_centered / np.sum(raw_kernel_centered)
-    kernel = raw_kernel / np.sum(raw_kernel)
-    return kernel
-
-
-def fspecial_gaussian(hsize, sigma):
-    hsize = [hsize, hsize]
-    siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
-    std = sigma
-    [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
-    arg = -(x * x + y * y) / (2 * std * std)
-    h = np.exp(arg)
-    h[h < scipy.finfo(float).eps * h.max()] = 0
-    sumh = h.sum()
-    if sumh != 0:
-        h = h / sumh
-    return h
-
-
-def fspecial_laplacian(alpha):
-    alpha = max([0, min([alpha, 1])])
-    h1 = alpha / (alpha + 1)
-    h2 = (1 - alpha) / (alpha + 1)
-    h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]]
-    h = np.array(h)
-    return h
-
-
-def fspecial(filter_type, *args, **kwargs):
-    '''
-    python code from:
-    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
-    '''
-    if filter_type == 'gaussian':
-        return fspecial_gaussian(*args, **kwargs)
-    if filter_type == 'laplacian':
-        return fspecial_laplacian(*args, **kwargs)
-
-
-"""
-# --------------------------------------------
-# degradation models
-# --------------------------------------------
-"""
-
-
-def bicubic_degradation(x, sf=3):
-    '''
-    Args:
-        x: HxWxC image, [0, 1]
-        sf: down-scale factor
-    Return:
-        bicubicly downsampled LR image
-    '''
-    x = util.imresize_np(x, scale=1 / sf)
-    return x
-
-
-def srmd_degradation(x, k, sf=3):
-    ''' blur + bicubic downsampling
-    Args:
-        x: HxWxC image, [0, 1]
-        k: hxw, double
-        sf: down-scale factor
-    Return:
-        downsampled LR image
-    Reference:
-        @inproceedings{zhang2018learning,
-          title={Learning a single convolutional super-resolution network for multiple degradations},
-          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
-          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
-          pages={3262--3271},
-          year={2018}
-        }
-    '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
-    x = bicubic_degradation(x, sf=sf)
-    return x
-
-
-def dpsr_degradation(x, k, sf=3):
-    ''' bicubic downsampling + blur
-    Args:
-        x: HxWxC image, [0, 1]
-        k: hxw, double
-        sf: down-scale factor
-    Return:
-        downsampled LR image
-    Reference:
-        @inproceedings{zhang2019deep,
-          title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
-          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
-          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
-          pages={1671--1681},
-          year={2019}
-        }
-    '''
-    x = bicubic_degradation(x, sf=sf)
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
-    return x
-
-
-def classical_degradation(x, k, sf=3):
-    ''' blur + downsampling
-    Args:
-        x: HxWxC image, [0, 1]/[0, 255]
-        k: hxw, double
-        sf: down-scale factor
-    Return:
-        downsampled LR image
-    '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
-    # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
-    st = 0
-    return x[st::sf, st::sf, ...]
-
-
-def add_sharpening(img, weight=0.5, radius=50, threshold=10):
-    """USM sharpening. borrowed from real-ESRGAN
-    Input image: I; Blurry image: B.
-    1. K = I + weight * (I - B)
-    2. Mask = 1 if abs(I - B) > threshold, else: 0
-    3. Blur mask:
-    4. Out = Mask * K + (1 - Mask) * I
-    Args:
-        img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
-        weight (float): Sharp weight. Default: 1.
-        radius (float): Kernel size of Gaussian blur. Default: 50.
-        threshold (int):
-    """
-    if radius % 2 == 0:
-        radius += 1
-    blur = cv2.GaussianBlur(img, (radius, radius), 0)
-    residual = img - blur
-    mask = np.abs(residual) * 255 > threshold
-    mask = mask.astype('float32')
-    soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0)
-
-    K = img + weight * residual
-    K = np.clip(K, 0, 1)
-    return soft_mask * K + (1 - soft_mask) * img
-
-
-def add_blur(img, sf=4):
-    wd2 = 4.0 + sf
-    wd = 2.0 + 0.2 * sf
-
-    wd2 = wd2/4
-    wd = wd/4
-
-    if random.random() < 0.5:
-        l1 = wd2 * random.random()
-        l2 = wd2 * random.random()
-        k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
-    else:
-        k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random())
-    img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
-
-    return img
-
-
-def add_resize(img, sf=4):
-    rnum = np.random.rand()
-    if rnum > 0.8:  # up
-        sf1 = random.uniform(1, 2)
-    elif rnum < 0.7:  # down
-        sf1 = random.uniform(0.5 / sf, 1)
-    else:
-        sf1 = 1.0
-    img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
-    img = np.clip(img, 0.0, 1.0)
-
-    return img
-
-
-# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
-#     noise_level = random.randint(noise_level1, noise_level2)
-#     rnum = np.random.rand()
-#     if rnum > 0.6:  # add color Gaussian noise
-#         img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
-#     elif rnum < 0.4:  # add grayscale Gaussian noise
-#         img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
-#     else:  # add  noise
-#         L = noise_level2 / 255.
-#         D = np.diag(np.random.rand(3))
-#         U = orth(np.random.rand(3, 3))
-#         conv = np.dot(np.dot(np.transpose(U), D), U)
-#         img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
-#     img = np.clip(img, 0.0, 1.0)
-#     return img
-
-def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
-    noise_level = random.randint(noise_level1, noise_level2)
-    rnum = np.random.rand()
-    if rnum > 0.6:  # add color Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
-    elif rnum < 0.4:  # add grayscale Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
-    else:  # add  noise
-        L = noise_level2 / 255.
-        D = np.diag(np.random.rand(3))
-        U = orth(np.random.rand(3, 3))
-        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
-    img = np.clip(img, 0.0, 1.0)
-    return img
-
-
-def add_speckle_noise(img, noise_level1=2, noise_level2=25):
-    noise_level = random.randint(noise_level1, noise_level2)
-    img = np.clip(img, 0.0, 1.0)
-    rnum = random.random()
-    if rnum > 0.6:
-        img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
-    elif rnum < 0.4:
-        img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
-    else:
-        L = noise_level2 / 255.
-        D = np.diag(np.random.rand(3))
-        U = orth(np.random.rand(3, 3))
-        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
-    img = np.clip(img, 0.0, 1.0)
-    return img
-
-
-def add_Poisson_noise(img):
-    img = np.clip((img * 255.0).round(), 0, 255) / 255.
-    vals = 10 ** (2 * random.random() + 2.0)  # [2, 4]
-    if random.random() < 0.5:
-        img = np.random.poisson(img * vals).astype(np.float32) / vals
-    else:
-        img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
-        img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
-        noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
-        img += noise_gray[:, :, np.newaxis]
-    img = np.clip(img, 0.0, 1.0)
-    return img
-
-
-def add_JPEG_noise(img):
-    quality_factor = random.randint(80, 95)
-    img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
-    result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
-    img = cv2.imdecode(encimg, 1)
-    img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
-    return img
-
-
-def random_crop(lq, hq, sf=4, lq_patchsize=64):
-    h, w = lq.shape[:2]
-    rnd_h = random.randint(0, h - lq_patchsize)
-    rnd_w = random.randint(0, w - lq_patchsize)
-    lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
-
-    rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
-    hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
-    return lq, hq
-
-
-def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
-    """
-    This is the degradation model of BSRGAN from the paper
-    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
-    ----------
-    img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf)
-    sf: scale factor
-    isp_model: camera ISP model
-    Returns
-    -------
-    img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
-    hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
-    """
-    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
-    sf_ori = sf
-
-    h1, w1 = img.shape[:2]
-    img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
-    h, w = img.shape[:2]
-
-    if h < lq_patchsize * sf or w < lq_patchsize * sf:
-        raise ValueError(f'img size ({h1}X{w1}) is too small!')
-
-    hq = img.copy()
-
-    if sf == 4 and random.random() < scale2_prob:  # downsample1
-        if np.random.rand() < 0.5:
-            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
-                             interpolation=random.choice([1, 2, 3]))
-        else:
-            img = util.imresize_np(img, 1 / 2, True)
-        img = np.clip(img, 0.0, 1.0)
-        sf = 2
-
-    shuffle_order = random.sample(range(7), 7)
-    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
-    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
-
-    for i in shuffle_order:
-
-        if i == 0:
-            img = add_blur(img, sf=sf)
-
-        elif i == 1:
-            img = add_blur(img, sf=sf)
-
-        elif i == 2:
-            a, b = img.shape[1], img.shape[0]
-            # downsample2
-            if random.random() < 0.75:
-                sf1 = random.uniform(1, 2 * sf)
-                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
-                                 interpolation=random.choice([1, 2, 3]))
-            else:
-                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
-                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
-                img = img[0::sf, 0::sf, ...]  # nearest downsampling
-            img = np.clip(img, 0.0, 1.0)
-
-        elif i == 3:
-            # downsample3
-            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
-            img = np.clip(img, 0.0, 1.0)
-
-        elif i == 4:
-            # add Gaussian noise
-            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8)
-
-        elif i == 5:
-            # add JPEG noise
-            if random.random() < jpeg_prob:
-                img = add_JPEG_noise(img)
-
-        elif i == 6:
-            # add processed camera sensor noise
-            if random.random() < isp_prob and isp_model is not None:
-                with torch.no_grad():
-                    img, hq = isp_model.forward(img.copy(), hq)
-
-    # add final JPEG compression noise
-    img = add_JPEG_noise(img)
-
-    # random crop
-    img, hq = random_crop(img, hq, sf_ori, lq_patchsize)
-
-    return img, hq
-
-
-# todo no isp_model?
-def degradation_bsrgan_variant(image, sf=4, isp_model=None):
-    """
-    This is the degradation model of BSRGAN from the paper
-    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
-    ----------
-    sf: scale factor
-    isp_model: camera ISP model
-    Returns
-    -------
-    img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
-    hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
-    """
-    image = util.uint2single(image)
-    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
-    sf_ori = sf
-
-    h1, w1 = image.shape[:2]
-    image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
-    h, w = image.shape[:2]
-
-    hq = image.copy()
-
-    if sf == 4 and random.random() < scale2_prob:  # downsample1
-        if np.random.rand() < 0.5:
-            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
-                               interpolation=random.choice([1, 2, 3]))
-        else:
-            image = util.imresize_np(image, 1 / 2, True)
-        image = np.clip(image, 0.0, 1.0)
-        sf = 2
-
-    shuffle_order = random.sample(range(7), 7)
-    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
-    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
-
-    for i in shuffle_order:
-
-        if i == 0:
-            image = add_blur(image, sf=sf)
-
-        # elif i == 1:
-        #     image = add_blur(image, sf=sf)
-
-        if i == 0:
-            pass
-
-        elif i == 2:
-            a, b = image.shape[1], image.shape[0]
-            # downsample2
-            if random.random() < 0.8:
-                sf1 = random.uniform(1, 2 * sf)
-                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
-                                   interpolation=random.choice([1, 2, 3]))
-            else:
-                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
-                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
-                image = image[0::sf, 0::sf, ...]  # nearest downsampling
-
-            image = np.clip(image, 0.0, 1.0)
-
-        elif i == 3:
-            # downsample3
-            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
-            image = np.clip(image, 0.0, 1.0)
-
-        elif i == 4:
-            # add Gaussian noise
-            image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2)
-
-        elif i == 5:
-            # add JPEG noise
-            if random.random() < jpeg_prob:
-                image = add_JPEG_noise(image)
-        #
-        # elif i == 6:
-        #     # add processed camera sensor noise
-        #     if random.random() < isp_prob and isp_model is not None:
-        #         with torch.no_grad():
-        #             img, hq = isp_model.forward(img.copy(), hq)
-
-    # add final JPEG compression noise
-    image = add_JPEG_noise(image)
-    image = util.single2uint(image)
-    example = {"image": image}
-    return example
-
-
-
-
-if __name__ == '__main__':
-    print("hey")
-    img = util.imread_uint('utils/test.png', 3)
-    img = img[:448, :448]
-    h = img.shape[0] // 4
-    print("resizing to", h)
-    sf = 4
-    deg_fn = partial(degradation_bsrgan_variant, sf=sf)
-    for i in range(20):
-        print(i)
-        img_hq = img
-        img_lq = deg_fn(img)["image"]
-        img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
-        print(img_lq)
-        img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
-        print(img_lq.shape)
-        print("bicubic", img_lq_bicubic.shape)
-        print(img_hq.shape)
-        lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-                                interpolation=0)
-        lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic),
-                                        (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-                                        interpolation=0)
-        img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
-        util.imsave(img_concat, str(i) + '.png')
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/utils/test.png b/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/utils/test.png
deleted file mode 100644
index 4249b43de0f22707758d13c240268a401642f6e6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 441072
zcmWh!c|6nqAO8$7B{n3LV`kK(93v(n=FF9&gWOr7x#ec=DLIy6$XOP(=y2x<5$5{3
zs+mc-V`-Qp{Pz3DAA5K__ISMae!rgQE7jW4_~_x2hXDXMYHEV90RS#N006atxj3JE
zF4jW;AOJAMT(%1<xV7h)fw)POC-kfolvQHqR-suFm2ch@PA!sVR~5w&)w^F9S0K~U
zU#0c9q8uPcbg0$j$oq}tN>vnml1{bTxP?g+DiynQo9o!I6N_%E*vbgZuO|L|mjk7P
zI+d=K`&W>AKZIh#!o$NOBX`NMJA*)>jW^|y3Q#;Aq4n&kr^<VWWpM*cuGfbt2N5WI
zshPR!>~q#OBBtfvCT(8H#W{9o?KF0OXT!$_mv{Kc%5DquBFg3b@sO7_q?^dupWPXl
z54e1i%uFqg$z=NZ`PI>IX={rkWUC^bXM^*czmHU$U0g`pQ7yUKjc+^zLamVJ`t&iC
zhXDc@z;14{=4mUN9YVU<+VqJhq?`3MyZ|P+*|}<BxfTbH(t1YyUB`%Na;_r&S`5rH
zyEZo7SSh^L*Vm_Al#6~5+~mY$FV6<vd@7>Zzzq~wlF8)L?v){TxVRY055O3&vbrg{
zA{o<(b&h;RX>9lo!|;7Uqfqe5%F4|tQh4Ef-*!PDFMfB=nY|a|vb(S<<#G>;$qqX2
zIe;GfzRJ$OsO?f{*~dj#N(O_&niw&AvlF|Go5O4z(*ri6szhcjMxh^?P<HoXQ0nT(
zpz=_to^-R2WM#)>*8(MDie??6!N&){dv4x%IdQ+0(SPrz81#ezRI<%+xlBmx>e#T6
zUq7hrDyIByUXJI@r^JW(+`^n|0)2ph+o1p$0<MY*4Go1y#O(apWgLukXGqf}CHkq8
zpyHa%&3y;9w>O!!J-dAZDp@>Hi=#!fPK;CSaCn+CZSTJ0g!<}JmE`;e5Cp(i=ACVn
zB_^PtC~nSu#5ZmKw0!9DQ-eUj&+$%Uey#fQ60p2dp@#vyGPgUkqaQj<4;mnkq!R4<
z>0nSsT}EGEo)t@b(3Uh8K9?OV;3idhuuhvts2cgzpt(RGK#DQZZ((n1ihdE6u>jy#
zeGPt!1cma2s@ogNa|Qa_;wYcVy~Rb&)3N_T$+2w4TKG<0y~D(KvR1Cp1}_5BlREYl
z?>K>@efNTET9Ev0!oIJP54PB})&n6njk2EAfA?iq^ozsjoRPZ$-Fuq%Az8T?dr&4J
zSr9Ab0g<lbIoW0Vdm8&J7vOLE6h~@1_u_UXIGM-W3~VAgaTwJ9j6fKYgn&pxbV)EM
z3j$3A0a-H|q6b`rMEtZb;Y%mAATV$41@+3!0dgNpO$kljh}cWve7m2EPFcRpFV%w|
ze*0Th`(v>vr8|hg#PRPNJDi*8$MoBXp|R<~5E&U6`0(0U>wh5lkAQ$IP>&=ijvyI#
zQ)1@f@Xt9OJwA9KpS-+0CNMPdr&O>%+(=Ikh6VmLF$Zb2b=Ud@+PW8ZYagl1g}ck3
z_yG9_Kl_|+B1~=6)ls2bXKXK5JNPjBjjA}0S7O*=Ogq(lq#!VmHANHemFTXi_};?Q
z;)N4_)pH^5h{?F~`FDrw$jAVPPa|wrY|I)M%-t6D)WJGgm+o7qdAQr_Dz6!G&DYip
zJMQo>XoUW=<I#w=)ZE=X>gyV*V{1)TMb6I7)Zh1;=)M}Eu`w|bjoKo;jTG9o9M<P&
z-jr37PUP&Av5(W3jY5%5w1Xx2+H#}sJS&wjbNeIRmr^sbIGSSVDV#O#u-u&SW0j?P
z<jN?3YPxlvJhCXg|3i8+T4_X0DQNVkiqXf1eX~THoqyY`S~`PaY}Qc2nF@FKM)nsw
zQs$!>E-o(6?T!?o<;L0zbKwDO9L*ayGU~X@-c8024k|S-(`b>%6F?fQ<kboMATf(X
zr?AH>o489W-9&-+-!H-tS@S~D7)(emDeqNfUd4%5MoCwY7A%P;gVN*-QiV5V%)Acg
zGI4HRwacrS<e+f~ux+kd-rZ=fS?$(^wIlCjbz(fy>gw3LE7!`Sbc)ETAXia=^S2;v
z{nYX35JwABdK)s8$}%?*Oa`YWrS2|dv>O5G(-`p$Kmw3?@o$B)G2CDeHHE{!(L)3<
z!FTv<4G0e1-Q2&gLa1*hmSg{A9K2=kPsHv`nD#oeX&VnP#IM2iyL~A_jM#%q@TpR(
z@YXlW&j`6;jM_Js<W`$&ol{M5Fc4I$I9>*SG5%ub)x~6RcY|qwS>tCRBTS-6V#d-F
z8*KTw19N4|js9uRam^hLS9k#{{q~(ATa6%<-z~fYysr7aHhES>Ru#T5G}TxQ0H}F{
zE%JaFyOok{n20yL428BqGjsc2*I5EYk<-GLdHh{@M%@gaK)`LI{Q}Pl#M_`>K0yI0
ziI58Vc&&;)^(KTtCO5zYIxqh&cM2;O;=8ZxpLRBJl*(MC7uY{~ciQM&tzur#6{6(x
zqkwYA^$@p0G7+&+VlKclXQ|lUGnxe<B@!FBKrk6C5&p*wBC7aPkDFdy%+l1#l|8KG
ziAA0Ie(xn0OhM+;@V$DsQIu5@<$CBH1UH}d173g&)s&m3|L>v}0M9+a<?rrRn+3+_
z%2`p=^{`me#lZ0IZzisV>M5dipA{kGc>L?eyROxZFEvh0F<L(C_K!_Ul(hmdeRp{{
zALfFdrkOYd$_;^lin-x{tl{t>4Bx-;UoyoB+(Z!(VuCERE9huC#1EW%2;_IfrHa}9
z1+K*l5KIbIz(iESDV3(UZ?L&+#A>*|baTEpQ=Pvl|It*pvc0WjWu*baf^+*HU;J?O
zCm~YwBwwgJk33349ple^+a0Q5%gRQfM4+(QTZFJ+;?(yR3OF5L({PLn7_(G+^%sdI
z$QLR`19I~pnUNIrIm*jFc;zmjGrTZW?zqy(2PSPVhUO#p+`$Jq8`ywxnRF<UkBA9x
zxpjWGMb;hpnI6F`(v3wI4N|j-AI2poj|vCrJe#{)86r)$Gk&F>H#^l>siWIkV0qf@
zJ_<8ghg;wO_fLE9N{!Y%^AS5U5MF%Lh)Hv1OifXLN9nknw}Qjr9%&Atp}FOp7b{dp
zqime?Y-PV??rJL`<=}QW>^E}^#wIX@&1N^(dO8D>w;WG(nt*AzQ_+67pt=lcT`DWv
zhU-T(Z9IfROE+0l)cook%7bXT-p<-C2pS*uIknvQv_iSG0?s8v;*Lkn1bm}|Tm=sO
zDG)(5?21P_V@++!-RC@<94QobG=s1eb)GV&!YeX+tGuGq*p3~Y_ExcPHc+cb>4iD?
zWjQuI5%VRjIrM;Qw-&_3Wnwm>mip(a+hm;b?62wF+Kh5Iyq$U*Tj-YNE7;BzKQx?@
z=gl+-`!G%f!}Ig=RAji~E`Mm$dtPqR+3q`MnV6o)84b*XpA2$A?7tt~Ax=IN<Q*(x
zhD_ex-u{oazp=4#znN0EyAre9jC%WAjWcH6uzgp{d4UBL)MWvC-#K3(Chf29uLUWW
zMU}gTSw?*N4~$neC;zwr0AX`V66l}o=ge@?xkNAlLF~t;8s5NA&Bs8Qsgw95f;bfO
z-*(^tl08+2^KcuOWgL)Wsrq<jqeFNb1g{+r#K?7^UFDjvrVIkgH&N0y4GcHDs#Jhw
zR;X6zzdFe^gy^|x@d>17$DWwjh?vbm`D5{&R02=->sPXIk0W^ziEd?F0>N?xkfJvJ
ztEtSKI}tIP(eF!mfF&bfo;)8;GOZ5viC(`j^Imm@d#wL5v_JReF+dzY16IWVu43E|
zD<96yrDOHpVAZJ5+`EN=K0`*=N4l?CrDY->4W}wU#OR(V^H+lp7Yo_f#R0~;eA8H}
zJ~dHuRAT6A_>F7+L8$8!&2^n>=WKgTYfk7D&f8((0q@<uJJ4#Qo2xcrn6&1Jb2}IG
zxyXp9QM2F?Hw*eB66p{L5Y4=iNTLthf%r3R%uJNKUQ#xv2xszi_0-r~B+#%V^>=Q2
z|BMdL^9|3-q5ea|nL}gHfI@lbWjIE>qr2L}^|}wGyZe}iK=CVYzZ&)hqtgh4Dl3`+
zg3ZIJ-y@{U*g8htVJ4GQML89g3a_Rn4^RB+RD|qI_5+iXmCEKe4}S0fzjih&n{x_4
zFaVx)oBNYnlV3<0=i;J*n3s~@mnGfi#k<vRba|7>cl7U3D$bfZ4BRnTcVpAeb=8L@
zafoGeiv=r6t0>Hs(nLx%8R&WKN4un~g8880JHd{oK}u?_vG;bRV>FANDiyV=+8{lh
zCWdz-n#OT^e|{uD4!s%KjOaMa{h*r6q1AqM`IW1?EfgPV?^X02tS}S~HLVQRdS*#R
zaoF=6`*SbMgDi>mI9laN0$4?{@3${yr81iFO6#?w=Um@xRCt6L(sccZmM?8*yKjCY
z2DfWwzPd?gGny*%RwJWhTbUtzdSh{5YT7j6CEF3VTZ==cR*rusg)4ju&gJ4#J_66J
zgurZYC&iWE5S3EdcD32@2Nhaht;b3z<IUsad8F2l+RH<`d*conQ<d586d!(rmEO%e
zKhFwfM0dldEy39F@79tg)XrW(9(vc%WdiyRl$XcYKNNv6)*CG;*ZHm-aAx)Jnl~s8
z@>Y-=p~nr^`&~KOwC)?=({PcHe+msfS)ZUv%!1m8g0a64$exY8oud6U=|uFbO}S~V
zq#gn_ys@$};Sw7i9XVFwz2t2w3{RVKctz0wG=livL*ECA$_HxjVR(UHlm@pyHy@yW
zX+W2U2SZ4K+{^tQ=aex8YBTQ_1<Ab-@0VzUl9G_0#j{oWN2p&qI~4)VWJzVDCiDRq
zrGkPfUq!wcuVwDqU+00zyYu`}<cCnx(M<7JC{+^5gOqcVH~n|?9KsmpnqDYH_<EEs
zAYt>7^>a&2l6&Zr7ky{r+HNNLeWbBJf?L11ZHK1-+6khzS<sIT11TOL4@^EbJRIrL
zy5-CVe^76}{XtZ$g)tHBhfNgsItFjdQ@u+3?YiWQs30{MGRlF(=apYf6fvXrSNT*)
zXHha&&GggKGOzT?35_juC8E8Bp)!pp@iOmWQn<rVTTv9){NfuhfCqszi)Bmwqkd?-
z&d5EHI(bwgsE4K6Bxs=2fcIb;fWCW0`ccNCX>}Vq-VcLd$q~>8ryhb&aKGV27$KBl
z?O{i{{~fY4Pt3OIMWgZQtKVy`8^Yii|4@5rFi};eqDioZFVW*d8x%O0I9NH@h~1Ii
zkHo6lhT7Wm5NKBY-Qpf+pl~=!5|4(#1;w!jxt{`nX+8U8t;uF~7j-a)9DXy`Yhi&>
z@knoyA1xOJ6L}B=YlBx%MZh1%Nj5|QJuEO?*=vqjm=k_{&5R%FLkSS&4YtI*_%;31
zF2so)UKlvg%r35oU{cieMcpLJ@>h0slJg#A|LW-DTZwkmK;_SGFLb0jFj}LwZG854
zpJ1GVk3&=c>s4HC+~1`6O&eicT4N+VqPDgIoacg8nlp-ra?#2=I9iwZZcEYN{K%qq
zS6HiaQDGtQV`T-$VB-zQcNIjmVDK)$bFT6M0iDCa$x#Qxtw6NyrJ_2VK_};*YKtt%
zIT=c<cj95v9$#NJ<7iUSk!Gq&o_VT3+Hi21O4me{OH0&tsg^{6UDZTBp?-0NL+Qth
zCwuMeG7gJ|n!AppTVHf4DM{<ZuscS!%`sX=%M`Yt4){j@5FQgRyk%sd`u-x}%WUA)
zu@EN@J8P82DUaO6#&ji@mX9qhl@908?0eqS-)~lR23gxtmgxMulw*@!*7gQGYC|gC
zw8)jp2lxVn11ji<e+Ce;J{3Ws)Fq#pyT(?~2V7Un)A7E7)A|^qP~(wQe8h6O){_eJ
zLm;~SSe9+!-;dN6**KcfDaF_HlnSBarVZIDJ2cwz$O!64J~2+@)5iU#=GFtw@zQDy
z`#yIPz=u83#v2=q_%+n+gE0QaS||Onyiu+nzLcK~%Do6V0W=suM6efvVcJ@u`u0c7
z)#oDNTza-5C9Yy+hwf|q@o+{3@pulv?-^moBSYH|71><)W_BaHzyi_3ryyn#jQ@Zq
z%t<BGH@y-BtcX-Pbw<t7v{W)!*%YIw;v%ek3Vv13J<L^qbxv^FQX3Gtd7WHqPs>vh
zsfK;^UoMNJ9L8YYdjx(i(bQV<f4-%85nq2AT1fu)Ptmi||9VgAM?^B>wv_+7{K|`P
zp5Eg_GaTAwCQ6<i`XPwHV4Br!PhA#&1&`+zhvkycskz)Rs}ao0@-%`)O8#3DGZN&x
za|*Xif{$p)^jSmAc^;s0&Q<*5Ng1eg=E<z;&W0;g6XKNt<O36|so>P^klUIu!ra{P
zl_%p$&zd4nwVwwBDAsH!X&@!!H>F?B&deQphClOFrQP^a^erz~DWDKhWl&Q?zX#zf
zyA#JJa=C5t)6K0<j%cl1WPL)J?B*2cXE~q0;T-UkDgh0A>Nj#$3Jl5ZatYOkiRo#0
z`ujDD3`aR|gyqw_?qaAhdS(JmUS5z8kTz^|3YVsmD<^M=P*c|z#<oqbi1Ks8K?v9}
zjJc4vhGs|ib9<ZK0$b)KqtXzL=i@l=jarIN8q&kW-^b3`JgA)Vy^abA1qymas|Kku
z<2RmiP;hy}3^Ni4=Yk8nVeV0q2xa3Dtql8PbQ?3w2j*_k{`%?A*oX*p5ldVWura$D
z%w+HG?nW}5Faz`N!5dS~E{r9wPfO3-+qH_IZ-qkGD(PadrERq**}pj8(BND}W+7j!
z)BIZHOg$BH`etWm7q_?3;^iu8OLBnf<BRubI~&V);I3HPWeRh5bGC0{cTdM+MB8d1
znFCrp_o9D|?=B>|R<0T)V#^I2tIBy-*WzAAkOo=WMdgdZIt<^sH`jsNmWi(ecDV_J
zCNct!)RMJVOzIknX4K-!G;2WA-!U$ni4)l56v-sqGE-rlc@#-!J6QG20ChBrZt-aR
z?$E;R6E)nQ7PtYjw%g?%;iDpf>kqxWqrK>kRsEwkxo-1ibaSwZs$I;PY;gUP7vgL0
z+aF>!LuFJNE~;2oL>+XHGm3Pc*i1Py_SaqZUq?UBHVQ@Ao@$@$-WuT?VovKnuIac}
z$}BIO)5N#}o;yB4Rv$OE9(J;9LQo+qHS_DIF}0;3jq?6}$@KO)-c_toCm@*aTB#DI
z5>#!A$wqvR(@$&{ekUSkgy8?WGK6<!KARc!PboN!%@Xb3694X?QR8)@UmM@c+}|{y
znywK(b6*M;sJVTWcuA|)HT=?_YnrZu?Vn2x9<jm3C+_2+OJq&Sqoa0XlT@g185xsZ
z>l?`(BKXE@;p=82Zm6G{k2pK4Hu|CLK4|?@XL{N~S{r^rQMsSkIsBja9B<hvmAA?>
zdYzg4^%WO&oeE<Qb{Vn%{-_*(ifCp#<X1m02pZ5*1I9uaOxk`-lT+2in~wf5E!176
z0UzuEp#&YqC4fu^gWMHP@P`NlDhKJ^X`kLOxNuC=(){w)&^%B~c#beQ+Ln6a%q()A
z*r)cQAnprYKy>nP_3U%sKgA!6zsLyIBt7N^q45dAS+aR&Ww>5i=LK>7@qNR0B$@D1
z1)JY^c~r-E;)i|Y@=*x_1TQteud)mifp6$Ysn+ExJWIIG4g8sMWU8Ok<gu0n7b=qr
zK7BG@lnGE_7mJT(gONw@uO3Ztl&=pBR6%9k<@f=I1>P^;n221am>)XP->-Ky6SCag
zNXjk<N*A>12eL9jnMod#SK8qS5~)YhkO<*;gj9F^2QK}=PRy0)YLjdT{3K@th)YRR
zKg<{8%!<NfppR50IdD@ue$UHY@SCCNN6l^yzP*|7sg|u&>v}n+|LkjIRZZ7~uC6X$
z;nw=Posa$4@<xQifV6HDqV&w^)$xdoQ|FUEZdGhXYtU-1v7-<6cbU3TTh=}tCfj%S
zGU7e-bfh+=@)_8^yLKg#{?zlq=~(Bv`J8#2lVSaMw9~}MU77Z&$kEoRl58SJ?3N-X
znthi~{)FL00+`&B-qO^+9PWB4Hms$kmARXa_jXFDyqQU&Z0Z;>d~o(-ZzgtI57-Ak
zqz~3~qj%QVLR)uFK-tawD1da+&!WFJx{1CzqIOAFmm7w92rk{6O3-R%Fnm_Z8*z>}
z9HVY|V?6Tsk8ELBBdukHLjZ6%Ay8puc|k_dNq%TQVBT*>H?PTV|95W{-;#lS1HK$n
zg2rt8=av`+Ip(XQwtp6YxqaC5PF_e>S%ttM@8g74zFyWN;B9(?^5%Yfu~()X4TBM-
zo$+5CHEN3Uy(zTXjA0wgcH#ARq<NyWa&#MyAkZ21&#;qwGdRp-GyPjfKQ%=<WJYsu
zZ7p`Ic<obMsvLiN*#aCY!d3a`%+#7LoR<$(46H)zWeU&n!P>)}ApvPwL51b$4>cZX
zI9i!4qP%E-C6q5OBy(Pr?66GNF17^s@Yl=Q_-|ltUzmaEAi@A_`Td23(Ttc$b5IsO
zf;lJbQA&zCtND0IXPn|;D-6e&5!K(HdhC8`H66FE^7`7nNH?*^pPvl(>Rq!|=bA6L
zo%i4FSj5O(1p)>Wg#2Ekaa>G;?*~&inynGbs)}K=n1KU8ZzrWj$HC0dhKtAlx;md4
zyO|@0R+k&cPHI&}H!~(2nH_WtkKt(cED(JYpPJnn1q76chQ53L3u|)5++>t)ed&8=
z*cmRHD@d6VNZiFEj`$Qf`bGBb+*jK}Dn^W2I>%I5K#ZoRBUV4?c{x(zgr(b|ZP{VH
zvm9Tgz_NLR@<=N<4LT?&E4i*vPcqPuv`h@>z;i#$J*A03g~EPfuu^ys8d}1Q#(yW|
z2#fJZYk`q!PZPn4oxz#1<=#ew<d@!39jb5$Z^Bq)6T@$}Hit;w&JJ`d2S|kKjqgVG
zBW|yqy1lU7tg|9Va#)M*f|P4K1v%k`IS5nxR0v^mG-|ZP50{%yI$R9B60a7o`#kK8
z+U@AWCK9smtwwG?wdaG+Q9<agVe936$^gglTDaN+xoj_4JpSF`$h0MeyG?s=O`Vgq
zEH^SaTJv}oCqc>ms{i=HlbKaYP2VgWPT1O5zK$i8r;@V%1UvtZcs3uNSMKL;CSd;p
zeAsGaH1dE|bRdye(7fvLwU*Lc*EhQzrIUYmLD{cvd490<nbK*~cZz*1*2p{aMEt^X
zW8Trw+<8+D#y%!ZIIyFI2pjp+_92tLb@dJeiJ_q0NymeojT&=Lh1tc$|NFkC#D*mf
zNA6C!Mq7yfaE&%)<X1Uv<VV?K>F%+rTK{SF2MugTX_@xQtSwR~v~ust7Tm75Z1Rq^
zYeor$Gf+;_O>eo_9_mC8ukeEc)~$D2j!J@uB8Boavbj|rCYE0q&``f(T3)d}T-VtB
zV|iMCVUAL>(o&-Xhyxavw&I7ZRBS}~F}Jyb7A{O`zd*d8vJ%ZH>X<<}Q!~>ugWFLz
zGyiO?Ebr24R@Jj0woFL@!E%|eQaoZjq8g#&7t*pUS>bu7;Y(#z>>A%DH`u{_@VWFK
z9U=9LU@w{VB<x%U?<OptZqN`%2>1kbOM~h!L3C4wbVrYlKT0Kiz9qCT%q0o^SKh#f
zU$`$_gwoT-+uK{H17|RK<%`Vyd0j5o>}&r1dI+H?RXP4Q`z{LdiTiQ@T=_Wvp<WL{
zwfM}XY2#BD%#XpKP_AP|)X_;`C^&5x8mIpDi?yHAodNV&mYw|f(|0A4eNKf?aAzhN
zxTfPJwob=QPo!cx{a;?Lk~=(Di=!MZ%^K;n!@N^+Hs!C*SPMoU&^WI#mfgfa^(;v0
zjXfFG(}c#qx*yUu6ps=Ka4!G%W41yF*Ua5kOK0PnoEa_*XXshLOT!((6_mq6#UFl{
zUC*B$B$ldq$aNa~CD&9_PX1(cS_wEah!|{_WL=cJ!zI1|OX0*eU+809;*t=<V-iw_
zH8Tb_@bgr>rmw2Z45H6&4q24rIUt8RRa;Io;Cm=|e^f~8Lk?hc2D^Gv;D<^)IosB<
zEQ9Z_SZ;qnnd{K=j-Nvu<q61xx1_jQ3<MPHHuOG%?DwXLU{Y#sYH?^QgTn;1tVURU
z54p<`a#pKvJl3yud$pUiQ#FItUIEFLn5Y)Tk&oH8ym1x&{_W`=ObXv4w{oo=uhlu_
z8H3DWV>JX^V(+_n+4xESBIyfY0ipn42gPIlYWxmKyXtcV***E58Hq%{_<*Ce_{!ZG
z^~;pZyUDD{5CpDrsOVr$-`zrEAE3AyH7vx4zV5h8ImeRdAK=8Evw`6ejj%tBzOg$a
zMGihWWY%mTClo!!btqYEXRG=(j?%p#X0NPS*f$b{Od>hFs<a(I&rdpBK}P>uk2hiO
z9v$Y0O%CwW<FU4<Ui?yh;mP5o!Oc^f(Z^*lcZ@h7Ug3`0ZKP9w&(0<xE0)w|bLMlR
z85|MW4kvq_my2c7$(e3J#57w8kGVv=aKgRgdLLo6Q*4|MWvnqn#estNs*EFwOpmbM
z6F#=KHn+A`Esq<tEhd|+o}310|Ej*~MVFZ`u$@C7=MntS$6%F|W~D#3-PaQOj;<U7
zw%#ei?^U9+;&4pPP`lzD-rUmM(n7Ba4NjX#(~_YE+#)Ii=@$TrtW0H3Qu#Y7>tjK0
zHVAfx!4bkmIx!BGEb(KRnLH=_Ch|!o5U$VFU=u-zuCg#M4Uzh(xkmoQFQV1_<BFtf
zUIe_LGN18jcK2QPxur`VMy|dT8a-rfz<0u0s0(t;>0CoYzVSvNA75yQn@oA8SD__2
zLt1C^O&u*H4QhC1Ui8qtG^jxaA)DAeR9D9#_veXS;wo=R7aN*7w8;l^u{#D#NvNP~
z!DYLvAN+!T#M+Cs_Pc}e#c$>S@#tfcxQj9((%fQ~zs&Z><&sW7fleyua>|!8Je@JU
zXF6(C%%2#I#8HmYPhIeY0a=LZR})=0$2^zYy0fYzp#-x6i2(ZI%JN3v{IQZ-1LSbx
zi1yp(Dz4{kO|R7@>*b6Pla_1q8cC{LDTM;oH3{*D@+|~h!C%B1&CK<wpC<!jBEn;K
zwiai9Zp`-O=#E7^Ha7vMl^Z3l$_tqDw`FyzRVdWUK%q%H`9M}8#86n}KbSptfB`q)
z4+D(g<+;=a%{3V{$&v&<z}A=x3Vm8%lB^{qe18Uk0Z&sA`uf?4^S2VeC`#!`mSvO}
zoYV3^g^s*@T=H8Z)E*V8{_vr=e0ApfOC%`3OtIjd^v9|O!RQ564={R;ED>=u2<6V>
zF2?tg!XG4YNa$1NCt=k4%AlFqkDU_VLLe}N4434Eh-D8AYxp1<`f#=Xvd4^)J}X?O
z$SR~NvZ?L@_$uApSo`7Hs#Ku_5R5qu|5kVIfg=Yf8rOBY!~>{@K5{|MYrLsx-0f&^
zXYcOpbGX^{F(GN4OOrWTU9k27+tCYQ0%yo0NdJcMp4H8rot@3i@yLVq#gP;tX)~mi
zl@(C^h8;Fwp^gbyjnR5G!*X~!qIQl@6}!(Wirw3o7<Bn>WCZ=&z|_W!baSTJd;|f1
zk^QoBO{-?y^JaOt+Z-pzq{KD!v$T!w%oPN^yzujk_A|?QR?n@2zw^3xh#b48>-fFp
z&CN}*2N?xHZAaXQO$;V56d4;EYt>Nv7@U7|z|h{9Iq}Nb&((KfDB@Ik5E6OXUFU_i
zT^;V3f9*Z&1D*zxfr>h*>3l&7Wwkk}T<^xH9o`V};+DLzR#boDFR2Lh&i!ghk>vl+
zA_<*N)hD^+1f^6#7(&B9ombQT(a#tcCXraNsUj*0`V<GSaji~O&FcC*cpWXx++{Tj
ztgB8YSTHChDf3cJ!NXhUb392x7rs8q;+}cH-7d*whb+9TsTmi(;&txlTUBmBmOKz-
zjq4Z_N|Bd=;%K!TzUl{x?>dFHu21Ne^f&`ceyNyDEF++!@}JHKEkK%*<+f>{lOqyn
zJc*p`e*XW*zZkspch+a9>*~OKxTz`ND&RDs?jHg#lvjzYtl5~NKZ1}sy^a%;lK)%|
ztYUHZO;UbbC28NQndbG+<>FsE)3YWi<0==jYvjadH~mBH@N2bwRbHOO>2$$LSv4g=
zJkJ+_u1@sZCYE@#<6dp66VuO8(jutNoS&6QjcRhJdi?FgivHg;=iqz1w;!}cwNm`5
z?3<n`_1#6!ciwEHr})5mMRP7Ezwu~pMQy1S5tZF?*PcNv`ZHvgwiYDYJJRO@N~IGj
z5+3)xQ|u^oJRV}e=_bSxN!%iF33;T3ykx;l5<v9+svrR(eSXycu`b5C7+;Rn8P&U;
zrHD55N)j~Wfu@?6dVqQUgUPh;sXvGK!?k2ogk|3=r=03Y`ZhU^GY0Hw5z?~jZt4DK
zrs*K3jz<YpnG$g=Dm*j}^yyY)Xi$*<=}fBcsxp^#fM#wnUqtFb(E&%@3mxqEajmL$
zt6DT9n&^`t{J{eDKU4m^%T`RS!^w5!jEMY~QnDA8O9u}h%dh2A(ETsFykc}p@>$ZY
zF}e?pNej{G*BdgXEvK6Z^15yn{{gkNExIgd1^c^YLBz%#B9~1*Qv1{_cBQ!3*+E8~
z1w>NUND^VU#n`+{99MWJlvewQ;NVjk(R>Yym@8nl-~ekg<cp~MA^KiK^Zw3q6;;)O
z<+A_x%DOnmvs@oH=so!Rx4o0mXb0Sw*yzhqV#J;f`2J=&qUn*!4kt2TudibguSVOe
zC)ax3)a1A{!kts^6o;iL%e=sd+Gocjl*)wGmgYChE5UiKyX(Vr$5(`E#rF-uBB=QS
zyNwvmefMFWtDLC&5tud{snn2N!097&i#NV8(Y8QmBU#$AGb{J2YQ`N%97-t-cAg7A
zZ+ku<;PjOXM$(nOhdWJ)7Pe|If@bn+2(!7VDR0bSSKPNN^)rt$M&-xN5PcrwOO|gI
z#*}qRA&>_qmgq0H9zhO=@_A9h|4unbOF}n5RW(?k1s6#P$&)A9&}ft?Z~8<?5Q<0K
z`zj_$&Y`>bvFz_@wR0>r5fSBb#k*n<2?~=Y2vE6z33do$N!y~btY!|Vd>V9F-z@-z
z@oKKnw?v<Ii(jG-WuBM}QOmLm{50JoMdTZSN&br?XyMs9)F*fk`IzDd1MQfwKYVyV
zCpmfYihY@f+-)t13lfGZFK(I1lq2p6?Do0r2R6|S>$6Wlxm?vyorELe!=ws@t9kR=
zyUf;5_7EE`6}sqhART+y=LUGN#jWUSFt?@}YvF-ZEntgMKdL1NQT%H-nfi4ULZ9qO
zzmaU<DdL<yToVdcivZsMpr$nmZ}H4yetw#IOZF{MYm|{|M-ki}3OCF^uIt(VDQbe!
z-Ij_4zdd8LP9FTrVMmXQ1P&0LaJd;yDK*-&(@XOe0;Th07Bgo5?oG?`?hluFFy@^J
zUCIyhzkQN4@3)X7NR&>M8a@Xfxd{6~Dx^U!Id>*+YQ`HRJOG@IO|Hc;lWds4OX(Y2
zu)MtVG`;EKB@Z5@-&DmCQNk`)I^iS+k^V*ibk*Y1v)qixstqkISR)KPS1?JLSOua5
zf+nV9OF;w)>y(O<aOk^>FgF6wffIBE!%Q=094}hClEl8qsJtH%_g+X(|LsK(xD8GZ
zOpMl}sGGux71`NAFE{#mg<oz?HRW>}EBg0q#x<JhZ$?xvS=_-s{qVd13h*0E&}f|G
z+|ApS*dTg><LESA<2HaRTaPTS@Zwn?0GT5dC&DzZBoLF0|M6>K6b12*F+)ZLX;pqz
zKwGDq&!e=W>>xTjy2?Z}V&{x7^2Pl8eD*?Ai@9wgujH*O1yIl;_{zE@rG^vVFFffI
zUwbW&%<1za<>*8(B_#&u$<y2bKFK~(qWf|h2;V@?j{g=B8S7b66th#2{f@cY_=df!
zPyh*VsuNF>$`j?3(&h_-Qp4c`VARE;jIEb!_QaPYckEbJkm|(vE7EL1mpFU(()@41
z<vl6Y=egHXy__b#l;@RK)Fq1>MWq_W<(6{<=!q=4Opg8+BpLA=#c3+~weIhP=RE`u
zdKQ)=XA$k-eG6Ly%t<dnSERqzGfr$%7tQeTsg$)l<lBWAQA^U}rb9mo*rha~q@_K4
z`i0;ra;D}BrihPAZ@llDco0a6w^Fj$KUc5$L`O#(Vqnzix;h{6V}i|q%>eq%Nf0q}
zY2gCqzs10a2rZ>~Qj*Wbze<>|=8>m%os)=e8hoc*kv`Wk*HQAwaD@gv<tPAOk`*Mk
z8vQHVtOu~fu4_(3N}D^`%<vH?<-SIpt8M1PQGwR*I8cm8K2#;ZjH>8=<1-&Tk-At7
zxzv7AFv|Iyx8uSD=-+*gV<vkQ88m<vy~W@$t1Qd#&A6?I75l;37WMrW;*?;u$J`%0
zz<JH^o6skZDXYf0>mNOb64!R{P86>YR6tb98O951r~l5Bl@3{cxv-ijDsvoSP%T)a
z{Infv<@O)F@n%Ya%zKt+jN3K;6@Q*P_#~n0nIuip4{Q6=&!Zw42Y+*D%RV6xp8BdP
z;LnGG)`P9ZzfmzU;ikwsElw-MnbGpJfM|_u7?b+i*z_G#2p<OJNmj?@aTOmz%uf>(
zzktob@edHGGG%Aqi<E?AD)P8p*Mlf)c4D4arnI+bJU_>M#3JQX{YgM3nP>8rBtXxt
z?<Dz(?acfP{AHtG+X?r6;~o6<SH>@*nqieEyp+Pnb>e8iN^?#5Ny{o_SVF!mTIwEd
zVNG%<%O;m|ad<vYL+<cGWaZ4-@#(*)CVJ;@FX8og%EfDVV52wWbE)ewk;d&W30c=O
zQL?GvZ{l-}RUxzq9j!a}qCI4jp6lqd2G8UOS$XC;yKt}~+HLs9&)O-E4~BYrs@L@c
zSe$Cu@`lvw(pR7opX#$bV*mK~aK<KrdTsCCoP{3h%?a|Y;iw=^$d_OVF@+yh`R~ZQ
z=iz00lX++fySBg*Ec1PeW8#-qO6x?`-?CJEjZ<VvJib}LM8B~6Pt9VU6)v!XgSjK+
z9*I5?l4w6Vw<?t3aP4TG#pK>{juP6c^<a<5zu5Y3u_#N=z77`|8Of>3a!965e_vEn
zbCV<KxNf>s6jiRCL%47pLR-JA#IYjx{%)}52L}gptcqGhN;odbn$KqLe|_5Y)~JmT
z3Z?c!ul69z9lN};nob@u9P6&`n~f*1mlX<*s?RH$js{oJMn+!z`bcLQbaV2!`g9#4
z!fgQgY>+&%%?ba9BDt#-PrLV`<xH_Et&c@Or586HIy%~@^CJ}-I$g@_a1!Xxr+1iK
z>AVI7ZoOdPIGxW&dBPC=u<1aD8QTZ~r^~7lUpD_lwElgI3#V7i^hoR5u6SPRfiLqH
zehPbPug-hO*6L>9dGC&;`{5Bg`zg$Fxl`hh+tf}-y|2^qf_F!wMkru>%C{day=HDM
zWs1%4V1r!+V(%<F^<U{l9z$J$&-wUJ>L_)!ihWm`*Inb|Vd);<=vpNjTjki!l;>Qj
z!YTfj6tDd}HH_J68;9wA5fA%!s<CAylGw&?Y-XBBZ>}l4BJb{w(Z4Rhs*qObmd&@Y
z|Cy!6YTYh6pp7d$hDtT6Y7}$N@w|5fWCKGbB%&k=ee~deG(QSJ`m=IBQMGxGU;6K|
zgk*o)<TC<j^xjq+DxcK78yJi^q=(PO%bl~;v4ZIt*synXiqivv?su2!nG<cWr~U7h
zCt{}S9QTwCW;qG(8v{u^hrw)jY<Ek?{#)yoj@jAY?HjQTPVxB4W<+mo)UhV2s#^Cp
z^ZkX9hs(=V+J&LH3CXj4YxS86zIfjS9f=cPdQyJI@id{EjsWY!xss;+qmtt2^KVfy
zo#tYD8jxZI@iTDcPRQpr2+17C7LIF@QgQ3iP70I+EvNu53UaNgMT1bsHSBHg2kODk
zH+&6W<LA$75@~!=aD_KW*>((WXy#4fJN&v5TfB7JgetE0Hw$_)P*x8PGl!cj7}t6%
zh$9MCI$<UQ3ri1@O;T+q@Nyu)foUo{7dAva8fb<*4WNX6i%q7Kp#|JyV_FjB_gh-7
zn4z8#QgeZ;PA9hi1?#w2O6xdH{Xg>Fv&UiDA8|LJfzN-0@RShj0MgV9JZvc=!zCe%
z#0a~=6&lPvg*D{hwjSku+wTI7iVK39j()vn$*GBz-wj0h`_xpVd)^EjVAE=RclI}4
zop`ylcb_(~yZAR)>)eQ%$otdWDdTw{F+JG%7rzQ-%z$a}J@Lhz>V!lIO-=V>+{L!6
zlIfBFy{}7+b@z2#_Wx+a{@d?naz;q<#~51eR!G`Z#L=^+q`8s6{dGF|?oG&Dh1p;S
zPFbGe?6TbQ`PRnla!%buonn;Ev!t6LxoD{#y-R9=<i&ryOg2rd_qXo94e8jO#d!zX
zgpy19z^&ffyi^;r?d?$A*zLcD{g$jW%*1j?@CwDs)A9vJeo`rsK9K_zZ0sC|IY)(s
zyU84WL~@*zL(5^lvzu$q6V1ecmysdYWwOK5FWhmQ7{A-w16(pXjBl$kH)i&NB`m`J
z>~+SA3Qc{QQa*G-77iYYU^X+}T!-GA`%ItURE`+*4{T-PPqimD<v2$Y`i(r7Tl`)9
zF<CvC9073jOTiT!_)1TZWK%+g4<+Qo`0kL7k;b2tOCI7}Ueb-E20jgE<EDZEx#K~c
zXZgKL+*GyrP*0J#p{l#S@z`*r2v(gVfUmY`M`-Hf2|x8QFfo+u=$DJvLIobGN=c~%
zCYyW=)bJ0gE4kX0_^m^m2XBIufg1lRs}R%_O+}i%OoZ>r45Cnr)|iO!aNaiB#`lQp
z>T{aU)5Hl2<bxj!kO2PmxM=BLuyB0>S_?08U-Bd?>nvBEtsUwC##!KIFVHQ!Gte^(
zK|aWl_TH8KHep~SeL}#SSE~FT4E*aF1!P6EB_<&gfSu%2SMlEeBATmwdbZzD8>r9K
zc3k5NZcv(Aofyuo&Q<K%<;8#NP13QPKx4PNb^j+bhgER+PETBjEmt}E;9rpCb{(T7
zO^{<VeDZo)@hck&X)E1_OIWsaGo8lT$4_O&zvR=@${FGP+CAPqUL$;#h!PeldHZ=8
z7N4XJx|d5jy5fC!LEkhkcTe*wTF3ule34Z0<-`sJ;do(Vxlh@s;#`_n%2fyH!)@7}
z@4kG5Vf}+__5I&-dQQ!dRAo@wh1!}`$M?}>lPy(dSyMPqd&A>jop7i|O@Wwcd^|M_
z(165SSlgm_^du{v>z!$z&V~73=Wd(ICkWWem^Kisdn-2fTAcfh)3yXn2ztDNx4|ZE
zQ)fo(=DrPQ;YkPy?_Z|B5XW7=F4eMYSIz=l;KvXy_eA5%Jv|^W(o~Q-)KBt6KYJRU
zM{ZDLsVXHF1l=q*EiY*DW}Jl1s?OfZMbGjOpnA^BIu=1l&kwb@5KiWUyX15psGq3R
zstpOk+i(gbR#wM}or)NVHPuy1s@v-0?8#<61L4;K0Z-NX)%we7?zg%)R(bbQi7d52
zPJXdsLXDprNF32_ZEa;wR4FMb4Js)CQt&N3njNPUwz9D?X4ju>yT3Xj)VYrAv6~y`
z@LM$5=I`z<P3?(F1ig;T*~f^IpNV$-LHKu)v+g490iGw5b3%@m#ClT(M*{N(S--4u
z$ef(Bl_8sY17k`4EEMrsY5od5RI*XAZ!t=+CYb+BPnh$7BL4@DIrM}`{FNPFO=wCa
zoe*@9EaiYNB6}R{?=WZgpn*NEHg6Cbl*tnR%EvXzt89nhXK{<7iHGgF4L%>`!x$L@
z7`t~R5v`nJ{Zz+PJ#!c8cqpvl)|}^k-C!tRcCUF_v;d&=BD)|fj5fXzQ&ofhI9uSd
z^uFx=D?PFM{|%3>C_7;-0qbT{cXc0{bxp-DPb5pNVYkH(D`hw;3E|bYp*!5c$~@m%
z&Dj1O<}+L<1wG0U<)RR~(KJ^u8nIEX!z=ti^>4?bBC$TvJxR7uZw1dtg}~%`woO_#
zQ?~YlwUUe$Bbt+i|D)Ppy0jmV@%BHD=Tq#H5%4WKBW<Ftpsl3gzUgm_KE`|+#_=c#
z9R-?(T+&?1WCK@sDKFs@m~lyNluRy29I(b@)*lKI>rw_zAFlPUXB#YX#p|i?l{Lu<
zA#!*MYR+c!_uq1))NtDr+8~KUfBC~HzUy<#N*rX2X<Di2XZ>wr9IS^P%rRrwO+`5@
zMN*a|*WzuS<M2CKMX~!0*^6}yXWfGxpS*I`qS_!!7TyV4s@HYjhF4wivH(IyejrM*
zuDk%%)Fhc3R1v8N7oScPRe7+q0y};w{jagzA-wE`mjtOMD3Ha=Ero)FX}+60mh=lU
zEGzt0(BDq^5sxiG+gy|pB?!O^c)+rqN&G!_S0UoW=M|z{3zSnOT)nqACAoPOwf|1t
zu;~de<k<gQ1d6?}6=y968Kd<uiGNzg@LhNRU4t^&1i$lHDb?tWp>h?JIZN#WW1Kcs
ztD|6(JM&30<=dL=sc4jWhRTlkYcm5VSeU?L^&0y$aDP9gNNI3zd9T)&z3cGllY|V{
zuRjZiP8cE{e#!o;t(4Qp8X2)gzQ{Hgjk)4xiGj`OM6|ZJWGxC5j)=ZKrjlbLv2ed>
zipj1J#qI6wHP?vAyN5EPO$JUwF}I(pq~%(YZDan}cYlLoP3K(O|NKyRq$|{tNFv`o
z95YKReOzJAuoGUjOmtH`GEgz@VD_La$oVNpkuqBk_BnjDs>*L-*%22~SWcdwZ{68*
zc{X_3U#MZag*l?Ox6f|nWRVqYvutPQLg=tLgTa_QXC<OyEMYWYOM#NU{78$4h%l=t
zcp7xuHK^3{COlWRk{kM{WL3%(-Y&t_7oJx&;gh*J&*J2c$0fbXrxj32eMDQq3X_*V
z;9*LNFOhOth;|_BV=9no)`L!Y*YOWrV2IHy<+?2WiknDyE}9I}`&~|WiLHEUbDu}u
z17n)HxYa~!$ry$fa|)2xLS3GYY)4&qkE5l+$Z3R$<;eOSZaiFHQtYw|oQuHOpxdo|
zbwY%#qZ7Ly<>F`aC-~-o)fMFD<c#vMizurG|K76DfXCTU0`EWR4=Pzn>$X6Ca4JjE
zWzVUKtD0SeHf<nyW7z)jLu+DoprEBF^s$N8!{9Wq)|zV~NH)0R;B2@XPYO>M@4iy|
zaZ}SkVN<PS2)STVQ9uDMf!+KQGv*-q??Zt;-w^<LTJj{Loo?T~7`6Xnm)U=hp`l@h
zT%k=-UR4MhbjJRfIoK`&rwz$IKqdx&O%i>dCUPTZI#-p=h4$JK{O|Bf9^*%;92TkQ
zmH8U1)hpczHoA%)B0=M*7EeB<l7<BbZjB|QxmBJtJ*l#|Ag`=8e=O>bQ^nc$Ff7Ub
z=_k|~0fhNo+QcBo)LY(Yxh}T-N_YPUbAN@gx0Vrm<0;zA$2_jYDs?<Q)25WOWq7Wy
zg8jq9_Pk6}7!O{OSdj1wB)+-|NjH3CVQ^CPu=Q(+Kv%eq@O$h2(PWa0GGKC;mm4A=
zAu!o?^b+<+2%m;6_vfz*mofZ25*e+YiG>R<FK<1v2;ob(JjXqGOY@|)vA>48BrXj!
zmB|MI8?Tp?TqYfXYmyo-UX;%?oC_CR^Jj9ao_VEg^`gLv+&5Ceev4B!n*ZfF*O9eJ
z$%y>7>g8d;#s6!S=XSC274B)~c{q|BZrNE)Uvg#&KDAB9>7_(>s9U3SYgOxiLKSW=
zVc-R4u(#U%4u37M8BijRcsfo@u&X#*P~{#smJ>)<daQC-WGlv{OP6wP36nc-+PGPk
zkOZnH%{%lI*D~jAGBXMN_vjOxw91*&+$NHrb6eDmYc0L~aGNy$QjenpXC*2`=3vII
zd>JLvZuVV%WCJy(@tSVn_U{9w<bb?h&aPp%IIVVLYGPxcpnIX@ySPClgMR5(6sKq5
zhn~0T6+f4wYGr%7heJ0xZ97hJfHNbvQ>0@~8blJ*eIC6}lPb9h-4y?Zr_@wrlZBKx
zWajF%oZ0N4ikg_cotS24dUG}>&Xk{SWZNk753>HP{p`-Hd!B7WoN`pWBvUG?sy#L_
zF%jZqAYh6SykXW*#SWp7k>u=N?cuCMpK{Hvg)-TCNo2aAO<)4<;Y$XFP`T63eFT6u
zrC_iQj?Csd2k2XB&~2~MOSR`PLd%61GX+nDj5ocGK2@AaQsvT-pBWSp%Oq%8aLNXz
zV>9y^(Q>=a#u#xDw`Pey5&Qy2srvt!=U)sGb_-_IQZ{zhc5^s^=*Wm_^3-O?E8I(q
zAWK`LndTKwl1|i4J^i{~ky&_z4)pO7%m{?!m=g|>Om2zyw+)tc;N!yo^0^iMC}&um
zhC8&iKlNFyJou|@ka;%a+t?$5^jmqNu<+lv-5{GnP0Pz|#MABy=<c`g-f-43CW^1i
z^?an7+A!&M6B|;YF_aJijs{6iANHGYd~Gx1k$?SATdbnhG|<Tt1@X9Hq<qy^XYyzX
zx*q;G!7cH4gzKMf8N+)zT|eaXFv@{HfBvrVC?e>7*d!$C6|0nV@o@`HxGH<6{~nk-
z-$`N|K6t>ZGb$Ue`@_|C`FYIw2nC1wcc6OJncAuSzsnnqtGw$?oZtF->~3A`Mhc_<
zN>;E04o}5om8St>_B~lA=EKdtxz}<Mr)bkL!9TES%|tw|-!;)P1^pF+l5>Xz$L3~d
zwe_Tdl23HyUC>jV^_PQ`7&|DPxiLh6w#TKc1E~bj(G+R)Exl=H;nS)9YH68$)^D5c
zw^wUPJQsCGv|?V8YNx(vsn);$t_LK1S#Mu6QN1E!TT(#y0$hB2d?qJQz8!(|l=}L}
z9t*elqWPN7GuXsS2JrwN{F>-yH20<Qv_I^ZazpOTltP>H=tXe~yI^a3yA+ETp1RzV
z=H=c0I;qFW!ak+a^sf!ag)u!0=T`Mch@2Asq4(lOhAVt_cKfHDWwh5Td%Dd`P7aI3
z+73i31-Y3eetQOS^<p+>Or>ma(r{X|Q>1-(Y;1<b6RE#GPs0mAP^bgNiI3|!F*o;9
zKlZ)NT<VR|m(Oh^v@fU*{b(5Zy83>iOMsEtoNGB#obi`aRQbvybt}{)vrPE)vV)Hm
zKe+-Dz;kYj$sv#)xAM#Hra|q#?e1QLRX8wldF31fK!s|~(#B=kgIbs=gGe#I{}<3H
zE5J1$&N637X4-S(=o>?3Nc5oX-I|q&<^LjsQm#4nJZ`G=E)gv<K*;&hYB}8m+h1vk
zU)&4lqv`wq=5_sO7gCfi-hqGjrjLiL$FW3(bs<>!V8Lg{xDp+N`J3&RmR8vzD;@<(
z$1VAxA!#K-^LUe9^y~U8GaZXTs_;djNIz&J^yzuAfIolsGgKm$>vp5p?>BKeuK5)$
z95EUbfo=D@D~q*E98r6inKxA%LaQ4#`U0PsX>3A(5^=bi3+g{_JUit7dVu@5rQDOw
zhE;a8jF!H1S(Ch;yTf@75y~cO7h%D$V1_<l0m3deg_bK}FJzxb@|noumB|bL?LJl_
zb9n5_n9;bw$NA~Dc7mp$d1Rt8!rhs);MyR?cg@^`sHqm+F8oB1DDLAE!jOnUpOe}6
z&52W5n2}yCwq}%#60Oa>zWG7QHTS7Hb$>&*fTtxpt-1$btgG02n=evMl6&G(Q2ZiT
z4fIfPTb6yH@i*kPQT4AM4&46LVnKYoX`&0o7j-6iuz??jMGF&Tul5N*x|GX)x1GFv
z!x=iXqkO4Y+bqoup)B{6C-s@I9@pUX)KWbqdYThDA8>Y$H>>uyQbuMKQ~JjVU=T?k
zS2}E!7=OM}N2Kv+(w|HL`-@LUID1B%r1i_4&~?Or5yp5O-sI>)(cDyzs$*OPbpBaA
zu9Pn`fn{!@ZYp!)z4`#~x8tsubSb($K!eBsoQ#XHaNgWqQ&kz_i3Mx>Q^OTL$3VvN
zCMnx9`G3X=2z2C3HAE;M`<pG(89=ZGzL0K}G(4^VaUJm&){s^0aBh;CE#L=|k$mvq
zLd@cX2A%(&wtFU|_#)f*biJK|KTsY;GbFqFQpiG9cM$rwZ_D%ER)2-Nwen>OVLv8A
zL25qjnM*Qr3vK`Em7HjawM5F@xA&wvN2Oged)PTonQ~}-e6Mb0Glpq;TY;QC;7ipc
z^(?$S-`+p=sr-K&opn@`|NF*AH*A0i(j$j}G>j5qgtU~TG)gx}hs5X*$$@~<pOKF4
zMp{y7k&<?Vq`&9;<6jPEXNTu~p8I{>*Y&z8P}}^mBM(6!^$FMq-Ti^YIk9?i+vD)I
zrB|05(mG^NHw>=E=MO>z4aF&4hf1o>e2NZqvFo;9`&0V{>Tp46C7e)e42f@0aFSX<
zDRsIU)J7YWsz(Yb{LNbul|lhAp>DvB`r!Tj@-WLXR4bi}3y)a$0Vwbo&{J0~<+$7c
znYQ1LiOWbYJZUU=_AJL+8&Ft*Us8+=8aSlQ26e5S`$&IC&uPd3T*C_sHDk0-7J~q}
zDYs1TYoojMzj$@HmcBDOMOe!|ce`lQuWbkR1j`Bi#Z-u@9LGZ8EkRWwYyOD9&``Lg
zVCdVN!ue7q4Ook&ClmywIW_PSWEU1{;t(n(7={;LE&;FD)j|4CDXvQfzH3dZkI3H1
zL}meo?mK^suXmLzRqsfTfp13*+DK@aYs{VDl=u~+>eeg0MijNOc6wzbyXj9v|EHvz
zyCce{_qXqJFs3G)J7OP8QQrF>vM0;7?hXNiE%Aiq*WNJ)E9>|B4zWuA%%ZXflCyVT
zne-pjViA{z_`m})PR<VHc|%SCVq1Kl@y&qybKyqOr8Y<+{MWDqve3tSKPUmoCM{Ka
z_UE>@w}bhhwI%vmIL21y*IY6ZeV&nQ9KQPue9HRt&KGeZIv}6$$&)}4FW#S&GISW+
z=a-~Fzk!BGGA%99h9hueR6yPdR|&m8eRO?JJX{%>%yjT@gk&>mS#cDN!_&@%Pw{UM
zWpGG~<6GynVY%Wy1(M<Nq&iSNrG=UMjmXD;*ERRoRAJ{YcdqcalL--tOk~ZF2xE$3
zi8Luw+P!S$)b-RUsqP+`+;=TiaxvZg!HJKlt1Xe=XuobV#e@oPF8PK9g>BI~2g*9N
zve2uDAX9hM%BfQxEZ`@rt10X07K9?fQk6d()fE_!;>L4DN<(!Oe}znF)+Mc(Ssvpf
zvYDWwGao?DIG#i&=Wc=p1?A(n*{S2`B<0C5C+gjhmB_c``D%U322{_Td^m-ovXNAL
zXK5IpH<>Fv`9=TjJ8gHgyh|1}*Ve)A(cXRxWcBMp`_ENf&sl?|s68TkiPzbh<Q2XR
zUyVf}*aM*+J~IE6uNO)Ts%;Mx`CD9iGM-~YEty-}prGRL*J)c<e>MZI3^Jn?kl)@}
zswidvZ+!;P>S|4;k(sEB#1owvAUoLlyXk@IuI}ZJAfD&9Q<h29QF^5<n8st7=Bhy!
zQJ)jvj@S{mME0R0@s#+jA(7*-z2zZb#iu?b5n!CjRWfTzMx!ZKId*${%T7Lge7q3S
zH6)v*=hVURO*Y>Ya9AJn9~9nn?l<F5lRTB{Kux}OSiO{s`U7OM7MJBB6~V$EfKWaj
z=pf|R7*wRY)G=OP#9$iw(`~<Xuv$Wt?32$#2}W*=WJCP^daDXKHu?vklT3bmTC?Qf
zHmNgd9Wo9h8EsuLLpzD$OptM`>#kgcEH&zVjh?|`H9p27&*b&K*4=76h!ywvucOM8
zwU60!$rd66f?~ruFmR9x;7mt1e(euQTsrjYS`o+nfs^g{iVoymdlLvG0|{O-_YudH
zpG&mn!o8)R9BkVc=mAl(keV3-M7r7QpJk)(pYb-`8PmdD%2(W%fE(`EE-?_sGR_=W
z0i-xzhzJm9{#m^kThny&>M@ONycQihO%f@AG>a}ZE_*B`*Hmw6dOYz{!g^gZjl=>K
zBsl23az@V3^tyF=hKAqebS#c0mVd0nUyLX23;v6lRaJDG+&Vt9Is(wPT7F$NHLa?W
zTTjzhI9e?zslvFv$szxK!5?!2o&5`^0fn0tMkwGP(Ot-Qv)S*xa8G{y7eW?E9NM2F
zBZS8x%cMykPJiMV9&>tW_L4<}f=EgH1Mg22RX2JmsTLa5SC6TQH;|FmM@YXD$Dbf8
z<S;KSZz-b`S;eLYQ$YkEhtI6tpQfyZT_@*|@CH(`;Ui)F;I!x-79X@sgbDwP2HS>w
zJRwnGb|xkApODgIP*jl#j)(INB_(1Ezn}IX8t;qs4duez%^SJ?%u^&=o)YIqtbH$N
z3`PH*(~4ETcX7fxqjC6{%R>#CB@!mJfZg+g%hhF^B=+HvVHOjA)A4g#m0P4C=P=^V
zzC8L+*<0<N*EI6dggm6RB0tATAP1J32WbSq!!^l=CfJCm<^)FY6wktuBvjlF6d#6l
z&$8U&vIEYJPh>pMRp-0&CtaG}_i^^G=$^+>jI=7aaKBrWe%L1<np^q3bMD1bM~22o
zS;nYArtu8k(5|MMAbGmEVN+lSy6fPOY&btSp@ZP4aDJ?@!{RQ4wY5O!qxNVlXH+-B
zWjjhq>N$Fj{erI181RU)u*En!3uvZx_=`517fkA8Wu(i1UXUw5#Kc+d*{xx4vzMZB
zDh~ZpTZZBy@<6s@#cw@gti5{wE;J=c`cxXHa9~VqQ0n6(Y>R%vYXU&_EM0^Qp?Lfc
z&@?tuV=SuKj^A$X?)=)G?EKH|281?jaz<O-YJg4C!eg7ydrw%Oh5{c9ta=+8RqH(t
zF5Aot8cOH={B2c4A3!pVy9LxVhMz;f+7;{oE;g5?N1|My{>bc%Z+kwivQI01-`uo?
zELAHiz%fREE;+P|6=^ZSUk<syJqJuklS_xMO8=al0ah=6_)d3^kMSF8Q^1hhx_WHc
z8T!4+!Z&f6@>xa>Cwsb(c63Yg7}xVk48RLY2mDkezgA20)|_0^78Ek#gr0MQ4z*%2
zs~{n+XA0gLoZaETT+F^vG<N{JxSEZFJrK!JbbaF+1n2WIg=;knMfoNLPi6Bb=vV)n
z{?g>eEge(2t*7?(Y&)h@en&)yr<m63c6yE?s-HknWYd}e+OH~|5pxo2l5MEP8FAC+
z{~w-;O&s%g+ucLIrAzgwp(BFN)g6}wUahD}WMQuD)7D%rD5PL}uk9B03f;ug&%@@j
zPnT`S_G|jGJb9H!6kfaZF5i0zu*YBR?d^rNeCJ^2aVfd5T2`W0Rpnv%M?#;>6u+r~
z0^2hA68%&{tgj!b)p2pYEk2=a-t5ZW15ewUkiX%b6Y5sx#`YOMC=e=+4Wc8q+2UbS
zKrlqd#gk9>P(FQe;<8fv8|!u5H~IALzKk^!MfJTfEixh{T>SJ@XBP+yYMX}>73{I7
zKAic~*~(gBS@#8S8{tm~w&NY3sXZrP0~wBQ!YL~NI|bF~pdBKaxEnUUJ~g=OHmGE=
z65Bxit|-s!C5Qk`_xp+-pJaU5yLWz{{<6B?U}C2?5hDW<AyfC;txJz)okkK{QHCrY
z`rTZg0}fit5c0tO!Kbu{h=g8YS8(iPF4%@aOhnl-GOA|*)NfPRH>E;#mX{3$<0zul
z!Sj`W*+|$kZ`s&rlIF|o<ps(@R)Q{z?w*9*+pjruG9r|W_d}`{to<7{s`+^}VO&J`
zyZum<yDWGuN0^J75Q6Z*T1$W)L@X~pyIE{SQ_6VyS5@Ni3X^^MB*pgSbnCpedcUSu
z|Ep*bTe=LcPaFmJ(Ztwjd-F!;F%GWV%~6!_TH-v;o7~cBj^`64<$MF$IQ_mBfI!Kf
z3@Tj2IcAx%{G~>Kr5!^AH+vy_H}c4Fx*^sDJG>-4AES?@x(8?WsO_J0h8FCUGo1<`
zK4&-<YzM3Xf8Tjrk^zW&f}`nxmJoP<VgK~*%$7ggFE-#+!AAg=0V@Gy6LoZZM~AXY
zENp%r9K6)@rW0(vbmVXiXeewFEWegR60J)S3{>dGfe4n{HQ;Dulx6K~dhb$zHJ(Ed
zjErQe3-d#}`N##|yW1t;mdANo({+E5^6zg7`*iXHAwT@<fQ34DaX%Y^Mdqi}e|pri
z%wwEpS718ZbOA3%_YBmzxf$;Z{<I!-P;v5;OpJ<Xz3)SMTk|P%4)DgPL~%olMqE06
z6CeMl%w&H65)0y7&(3)yoYRt`CQ-a<xUj+;LkXQN)6gbi>Jf@0qJE77(KNiFpGYn9
z%Kc+giry>V<TXV+Pp@E4`%If7a-VAa{q(VrFz$B7Pe0sPVL}SQ^imN|>VCj^OZ?m`
zK7BcGrf8dvK~YtLo9!1sOV|#u{+VH)%dLO2m1Sx2cdL)8^pV}~ru)R~(uyzhX8Smb
z#0hB{{ZDDAA!PraTq^w}A9|*(?Xj4?UPnO>3-$`fccW#0;*he#E#?lP+)sv#pMZvc
z4xFC){#7gd(|1fvxE@|t2>}VshQC$Y$5Ft6Yo4797n8k|%N>xOu`N}^6}#oGQn*}v
zc)K!`^)c-BNbCW5)r`k$qRWl6iGhA{g|{c}>qO&wL+T<#WPBoxto<=8-c5K{TttKl
zD&C)?G!2^WLfalYjSxf#|J+E^D=0yw5p9j>na4i@)iY|&WH8<AB1>1tWfWen#2ASw
zNq9)ji^JL2g>a~|`Tl?yx?^l`W^jdyP3RNg5_$b^iPi}>1Y=#@n}RH=<|F32gPF9R
zEe8#q<8miY@<u%-qR^lslVB-idV$}=6eul@_w=Xo&x1hzA~D<)E!v_}l8xuL>xog6
z|F*A4x<TTv9YzoamvkYOPD7ufPj0pywAEl5+Uef$-7I#hyZuW(eCePH&qwJW-jvS~
z!+n%I0HTHxA*Z*lgJL3V8IsnMmNv3{u@Q7UWr+J;nTIGVc1&)S^t^U4@$}+4uwx|?
zdvzIhe&g!$Dak6vhbYm?;b-<IGv~T9cDE;o<13m(kkSmZD$Kk)r;)BDgkoT9=oQnY
z0>QXSwiOF0RDW*i5b$bq*ARONDh%73bfRM?TEJ;C2LR>?n4*NWuyLtfG&z}EJI@Vm
z8NO7OW&oi=sTimT^e~9APaU>i-Zue&O|o9U{JXW#b-VQ>Y_;)lZ|~2UkI^|WImVhE
z2g_%P4A_x?Nunw+ejTg5F5uWb$vyR70?Kp#*rmft=?^JSo^u+|_X~>(C;ZaWE~8<t
z`tilz^)S@u_3!DG*znL@4{Sirwj&_%F@aa|as@}J14r%QA7D-M&~>T#JocVWSIm)Z
zc@<Bwnzj=<HwoBydZ|a-9(%hxd^+c5N?1Od+7>D`$W~65Qg9ZyP7x*qm+~X*oU{*C
zHYYg1s`Of2p#iV8XJYMhxL>xf9e>JAh&*fpU_Pt46Eg;X4&u=lu2sJ7N7YXJQ6SjR
zN`^8bwi3o}t@4ONx>%`{jyPQgN;q8ZVEbn38&38l_M7i5;J#g=dse9DbxI`Oi<mJG
z%u4ud=XScGsuM$unJ4bG$Rxng*M=O@tG4{8J8a6pMTYW+T{!06A9t>A63L~qG9!vp
zdVSU}BUGP#_GHEUM9zv*+}R=9SYIgFvDb>K{?awGp+zcHBoC({iPZ2Rs7IIs`b89p
zIO#_Z<1ocknxh@1ZU!X1O`$P6t18rhhfP(fSoQ-T|KFbMaS5}P=g|~KUrs;|N61kq
zxmk(`nXo)XVv^muATeV_MyE8E2e#^(4&n5pB?Ifh(ymLd%%V!$^4Q<wQ18fE*hy@E
zpRqFsA+u=$)JekaeGHXc-LyIy<9{O#c%ZSwvAB|CtinuGxl~csb_)E|SNy`egUUDw
zJ+$QQmz4d<u`JW!oDZ}cY~z2%&>{~%RTLQyh0|Wt|Lvxn)I4w`@ZhBOS7P!k!AoUU
zP3CM7r9bPtc}S6tgWx{ia7x+BMJgQL`|QKtB~{QWEIV5s*VrchaQb@+8BW9Jfx*ju
z5#n>wH#jJ>`P1~wh<X1em6@WfW$@t{P&-it!Z7=O_<T&cp?*@uk&0JVr3%(tMBUF+
zZ<N<*H{R0z_mBROZBm&2NY?HBJtgH*h_`=QhF8Hh{0kQ(DNFCC9yG@wXvvpJ`(Bv5
zD20W?;tlPlWGMb7t8*borbnV@+e)$8`Rc)`PNLLkeQ;2*?fD2q;<(e@Z1{b#3uuuZ
zFPn3xMe{|GnKK%Y3s_)(JR?8onKCZy3zRT@0*1cdwrABPCmF9f7oMi<pZ~5#X3hUT
zzvy($W9;`Do!Ntx_#l1@4?LqIJHUIG#LoZQGiqIVt%DzX9_W3Es%LJg5{yJ2T{3<E
z-~~_bPBwv*{<+`ErTTCW?Yt=wIp|>;iiYg~gS!qm)?~F>YESBdkpv`JSQ5}@iRVlz
z<-&uza&Ky<SaaKiyaY==qh>lK>BdZY*QrZ*$EYzz3V$V1A?esU_FfzV!*PxWKXAMX
zkiuDs;p_5)5qRUH6&Z>M*Rxi4SJvn1>h;&sx$LC8UxWic6K{)XkwNEv%wy)!%BdiB
zQVs2v4C>c!XnnUA6Zlp7`?sxZ5#WsEB9LbLnCO$TRWs-D6;9>G?*l!@mJ9T&V5@?%
zfZTLWhd9lDLi6OzZq|G7dBzL*3)e|53&AWDknA#9I0uBLy^cInn0+n}ck<S=@I@yI
z(?9=4UVm}Vc}3}j|Mn|(6iXH#9C{PDViddBK|<8vLp(&4xv!fh_cJ8?r!VDCQ9EWT
zs!Pcr-{zG&PztD<f+Vefu3O2Dl@Eh714;SC1hgadyWguq`|#vR-u~uw1A)du$Qd?=
zX8yFXc0T<`7g|^lF~uaID&KejDx{I2A?*XHP;*<85`Up`#p|mlKME#(k&hHEUS{RS
zQSh^t{W!SmzTnUKl7O3<Piy1Hf2Qew>@uV#70COC>k@;c%GnE3byXf3J}X;M#_+9+
zJy22WCkD*!(zE|1P2aq!3}K=vilp+O_%c_R;x+}D>Rx%y%tihdlCYrw?*lx-aV3|Y
zLVl+V-y(1*6+^p2(hM2i&)BNnG&WCzx|2sQ6yBu}vxrH`+;Vs<KS-Q@fg^QYXURN(
zccv)WFqePPapVht!>HNb*$z`Go^qm8BoWZzxc9=;FVscykpm!q2ZDo%K6WoQhKN-9
z+B_=7qD>wGL`*aI2w}4(0glS#5+bougxYyP6rb}?s20@7XL76dC|HX-V;bdwE79@g
z<fs?lj90*Q;WnIyPLU0_5yt^YDhJA3sKPzcQGdE24}=GoK&Lh+QhbqOCR|(LP1hx}
zD!@2-b@eVsn8z?_Yk&Vb9to`I%A<v2goXm@j)TVh#)4byB-bv`6_A9B(4dgxj-i+Y
z;@+2Y!eUu}W+tBk_ENy04Ty?2exjmu{Q9q&%v=s`fxJ8p8<)hs^kCyksiWX~P5oz*
zK1xyZ7nlE&FbSa<=^8n4#+((8HApSQp}CZEt3ej-Uw0u_X~yxK94hz@FZTE6pSHK(
zo@`!z47>RQxRO?D7EJfWbUHAml8BGndR}oZdnLZ!d0F-a+vZ-p++g7nRGDTJ+Q?sm
zaj7*o$8l{QKxzcNJjY&%d|=Y_ON`SO<kCrx24CEP&mDOOoN3Zlue+c1hqLix#dP#5
zFs^3I=!+i$$WW|F=}a)GJq(fo>_)ia5K1bjQGQPA@exN;I(tr`g`#zGNX3@CX$`u?
zB&SqZIy(!cuMW@3n0Zx|Q<@<Xt3wd94=+jgUch<&bC*sVlKK@U*Cm^Gl{NWM_1JH$
z;^eC7$xSEEb~=!iH}6e6cocE{1=w|V%6GK`B9rzWU}ISDU1_v00OFVirK@IW@siJ_
zC3a~XCh^8LmE{;}QBSX?e;8SsZrjRdAiQ__f@>D9<mE%cfn0jf%G_P{HlAK2QT)S#
z8X{D(DpO&LYIeqexvxTWR4mb1c^Iuin1NhyVqmfwFV(V8IV|~t9t0w1x~=Pa1R__u
zcTKS(@oG}tRSM?9{foUiJN1G%kn2%Q89>N;Xgu}6JTIL)sGxk&WhT39bH>kJ^!dBn
zHp}2f1%Cub=tdz)HaT(0AlDv~$gG)Pt7ek;oZ5K1MoatBZg>@A2pAxqt$bM^9PXoq
zOWAU&=sJwG=&H0Fxi8#>EM3C3;9T6)6GyU|ao*7Gy7xj*vnUPRT$w-v3i02>UKs)F
z#4?_uAjOd}wQ>qjDr&EgYX$eAzErp>6#p_d5dxjL@N~2(<;IUe`j8JVCJDXm<TbSh
zDG%6+kz?1J%%_NJR6)##t{wZk9TLxB_ioy8Y34~ZqWVSDhsAjPn51S3SrzEf${<?n
z^&}0vhy$;dp;*?<lpkD#>yb@_M8-wqCMkfZAs!yyn&nRG<=fj*vzQjm8EPMcZUjzE
z^qv$Dqc3*Ceu=uE3MJv}8+T2l9Cj-2yX?pbd^4x$Dr+iAq{t8OP8mgT*v=jbKgTx&
zpE9Lz+2I!!k;aX<6aWqo07shT8<KMQUu9hmmw7yTE<-QTat8}P+ec=5{r)ucF%}h4
zumjfb=cO6^HM4i-^BE}(9Sk>Ae{qO0Y7o}qvI%ouX*|rW|Ahi~uK@2IO~mr=&ch|(
zrx86`FGQnYPsgba*9p*L-soJO2OL!(kOSJ^*qU#v9hJ(aVY8w4Rpbf6!0V`ENap%>
z3wRmgT|ThN<wl^k<5F8GpQAMBwjB%91yP*g_0vQN8R;wS&k~IOJFiy4@yspdEQWm=
z1~&Llvap?p$3PNj1$T(kf!*&U9c~bHtMw$XzwKJ~`N(_XExqvJgW*XdOc7s-0Fc^@
z{6uAg@{0T9Med9L@<Rr(W5-Hdd7~4D6$POJ`yY-kBr#4l1!W`TzdMJLGW4X!%TP4z
z%G=0XE^6zZM6N9Z5kDr|Uk0TC>gi1(06}fPqvrAhSYv`%)g&Y=3~)YHa^M0OztQ##
zJw-hPGJ*#29Z`JP8G3cQ71$B4Ca4_Sc~oOdj=$LGY68$`ArU#tAxjrGtw~B>drC6?
zx!%)DJ3TdUpzPDg3B5lp)5&_x**+JtVkAo&^FmvZE|i!C4<Y-=6~V??IQ5F~??tM4
z*#2l&hkCcRDoBW0db&xNOxRQ;T*(>S{POIcIJN}@68g1y`oQDM;IwiOEe@fV$MZk8
z|Fih6Y3mAkNc!+dN-kZRJ+Jtc=sN2<gwlC3Z0{v+BW)|Gd0QJwtA`KbVdGR#k)?G5
zACsmGB$GFOM#wKuF}b1FOxI#%?AhEx$WM&P+_obl9{Ye^DlkqU;z-6cV1ikdu5C4+
z+!!&aZ&w+!C6A%cbJUCVxe304aJ$#Fhgq`X6$QP;aASSGy~rmAE9*H-Ti51LYQwng
zf1af#DDkrrYfIzGgK&_bE)&qoTn{tlG?Sd^dv`G>&@>%)s_M?WHQ5Kr>)L%(Wpn4(
ztENrUD-pi^6NSQrO%6wxMj%GnX`bEijvbu(ES%=32;a}25tQ5^qT$J+My+TB@@56+
zSn#jWUhw}Sl?DJak{l*wt149;hqh~j^z4H_SG8i*nZPePIuDiNUc}`DrHGI7K>@QQ
zLiXBf+qZ)wlCLtrwPU_OUt2R=Z7fYyv7ZwB0oJL}9kX%aidKetC?tSXZ`tk>rYUV#
zEdK`*ry8TR#%7Ij`GAql$IfGh&l=i-K3jl5Pc#vy9og`mTjL>LvT0Ii!NhCOUx2J6
z#%w?bQMqa#@XCd|NVC80)&urvjRGx7&WE9vae6tNye9z#VC!4}bsL>t(HIhz^J=@|
zOUyWMt6p_mKmo`DAxTlr%Ah&nZn=JuqTrlSgeI=y1Isla%1#A8I1qiB>6+_AI1Z=N
zAzX6^x2nYHuGdX|4)x_eLW_<zy1_~SZS#ZqSO%wZvFF#8449&+3$G|%aoZCtKTOVT
zjpdhmm3Vk^dV1w*^@6LrgNi0xhuC6b2S6;_<a2{P;pQRKSKFh#l@1OSzhGIW&J<;A
z#Z{zb#1QTx6m<f^rlK}c{!Oky5^>5)&5ClIpPlGZz8NvCf$`0!+x#2jFEK?Nv{ue&
z`Z1&QtuMb&zPqii?6MHy=OR4M;W!G~Bw&t*H5p#=A4yIDpxly#exADUr7N)9ux!F)
z{5kE5HFjh10r>471+%c{em9f<Zo=-*o7PNSVOw4QPGo{T=>7P=h@_qUIlJw<T>Iz+
zoX}AKx8c>c#x5*s^5$oXL0REhr?ux=<phM{S1jcN7J^NH*|Rq9du{tmTK&=WsYsm`
z*?kxe0^dvYZzI{mo)kH@*hGN>V@WZ_7gv-aphBVitUnvTSkPY{n@J5?8P4zSNWKX5
z?FTTjze*Pvg&w~aszsSg#Rmr?`pbVy&;Hc(^OqD;LfDAC#G}}VXHy}~vU7;_z4Udq
zYz#d#N+Qa;rZ4^M;MON#x0tx7BC1a$;!B=6&7WoP^^aGPzT^M<>yoT7YgjS7I?A=7
z(1H?8N6AjZvXl2McuY$<(Y*idrBuaGx+wHnXD8@Ol6lv&cJ{iz#924%C55in#Y;6m
z3%8Xs5`(T0))|+Q)P-$jBR8F1aCY@|(Zf0qV-x9Ox^Wl)b!mV=9NhY0JyEDp^}O0C
ztL*i2>cp7b^HSA2@~Lm(&EcizE4%`uux~eQ0eE`cM2f8IY;MbKO%~I3<xQ1!^BByJ
zr77!bsac4KwZlX|14t|O%MVb`O3-l;A^DX#DqWKary0qsru^;72ME0Vz7_K2`56ol
z_{4CGs~(3p3b(={+gh9Yo;`TA362A>_`stYvna>?SvUDA%--)p^$!iSU~;G2n}|e*
z_D{sLYIh7|^%3{{-;iG~IyyQ^GJvan&VaN72+5}E(bd@{(~ZS?^UkgaG&3|bTPG*R
z*eVm#Lo{cYQXOE*>1^q01+T><BOijuiZ<_Bk1*6l6&HZ45A*>5;t2qc2>p9HgwjW%
zP1f%YUEhoXer|HmX{ZJO^)yL0uL06iZ53KGU-;w7;<6ETxd7z(Q%lvm7Bh2s5mI^y
z-jA!fGC~7-kJ<V;PB{S`KGKeOsj;gwlp$nxUS+?|2|vxL%8gq%rR0}WDdl>ZV?h~^
zmIyLn-j;nJ=Fj=aLZb+~C89M0K#?1P4Dl99U2yE5W&Qns&od>S(?l7ZuZ)dl8Ed1q
zMxTg2uBvZsYmMH+VX$+c7c{{KM}&PP=p|qiV#DR&pAq1o9n(Db(f?p_<@!2qTv9aX
zq2ZR|_$?|*ZDfoF!g9p2v0YOsf6cFLV1umo{)IG&q>`6ntHgYnHxR?83KxzUuU$Fz
zV<$kgn+x`mD_|saciTE=zd6xln#ONfS!hlN3EAbNBB={Gd{%R^uCOy2f-UoYTPcjH
z93`JYSh0W|8+B5vzgMNKdYWU0!JSdNkf~RX+P*}U%sF&a!PqEXG<URocWElg7#dQA
zwzhQzT^?Ded!o4i+;49;Vy^v{{K+1U(T6n~kb=M;o}{FN{ksm7Vj`1Fr1XThXQIC;
zhKtVC3Oc(gd)Updj5^uYd1sd3{QElEtZhdo#|-KDZ~5V((}orZ{qEz+$G(A&mRaRO
z36D(uEJuwUV)=1!a7=~1`|YxRbL(15&qp<~nEe&QEsO{!DG_s=3;5lx&&E$+qi~$j
zqpMBp@$!>;s&8Q}N#--!JTQzeZ+)~#wTxnprZ`G3SFAG0K<xnrrXkA0RCP7?F4E|O
zk%nyrKfi7slv>J5zhlk4$?@1+@D-=k<~(V`gdhS(p?8!YzMoSoHXgZDq~y^}|IS|!
zr!bX>4J7=A+!g&>795weZ5dl(U;4^Y?yhv=KMs0+g(F42yY0T=Og86_4WO}oW`<D%
zfYaV}l^d^h_C;+C2Li*dDO?RYMhV`0+cGtr#%}33wY7CM(Zj%g1vnmcusR|-Ck7Ln
z*;h)mcGFPZxZHqOUk1?;^&fZq;15-MURW6k`ZAR{8hro<p9uO)z#pbEiOzo0eDGU8
zi=45AG>Jl@&O%J;*cQ>h7wq^$kr+|VyUf|Yj<S7$NQ4b!gXlkxUy1CDjb)OIwMU6F
z5;zyUHmv<VTKiS+t5oug7y~OlsQuCtWBeb+bTE6G)*0>K^~Pne^SF(+r$u(M#BL`z
zvEsjg^wpcTHW_DBmgHK~?>%}v1*B)!nkA2rLS4~#kfk$PJQmzqt?I$gwKM&Ah#s(F
z_qa>m)vmb5;6P%m@xI2e0aHem*NM;DkdS~tlsC`@5Eu}GNhll7$?={*TBXHUEMWA~
zgm&7EB~3oVte&0;bIYir{AC-Ess7;xEzhgwjdoh3b|4nfgve=CF#XVr2a%Vs(imgs
z@fL84XZx(4=DO1eY(@;Dr$h`Z9YoLDgjJ<$R0zbd6|c73jjtXEY{LP9a!+nU^}Y=`
z$k?f2;B!EHT+ZU)Y>9T%3!#|WuN@5mMNP6<x%$MyC3kMe9s7D~{z{2;p5i9fX8>(#
z1|SE$AfMJeaaMju>cQ2_$15oj);s#PTFY+<EkM*P4q9|;YP(CS$5qUdK3+k+cDPtA
z0Yhq!TkqW5(~6*+y)#k~PMh|IM$Y-$BL=Ij^g?{lgT^KQ<F%FL<pP~D)K11XzovYR
z`@NlyK!(efys1g@-_ZNi7Ov@Hc8dvm0>ThD^N=IIH=W+uGm`#HJ0~38h2@$pUbAec
z$7WiYKS2A}qzlhn9J^|a;`Rw`z8eaxG`W7Di~6d<3u;(1KAT*VWt+ZM7GD!lok)Dq
z*}~quE|FKX|NfKxZ$(gDT6~5X2f;(RdV}iKXu)VBWsP}iHmUw_B>pZFJE%%ZA$I!}
z1t>lWe?4<9OWHIBa;#tyR~V=6Qx_wx{`f-mnK%{IgS1lOiP*vP7SaWW&Pixe&j77W
z?MeKS^#a^dc)5Ko8T&S8(zakwHlen>(8_*c%JAEsZ}9lxhF=q7G0o>}X=o|~Qi16a
znJwIP9=G16#q03NynTtVm_k=*J&U~+!*rm4<>0zWOG1K6_ch}?Qh^WO1Y1hjeu{K|
zf4b01P&i>i%L27oIL{kbdFkyzqhIy=Dwt(xI;d;KMN!?Ho+OH3I1!cW-9P5*hNLxL
z*j{If=ggcBAAy&4kMpXtkP=zBnVRMSB_*2K7fV3~y4Hx={vP-w{NW4X;c==yU3Com
zV9?}PY4-{_BU`(sC<y<U2gK<12)l6(R#fQC4n$n6M$=D2Dre^fv!;iwV7?&5s1iE|
zmY5{i4>0>qONO~KLAP@RPPp^%^>2=?Ll{H!<l;NbeBZ`;2Ww1`4lITj^r?<Y8Y`|K
zdb}4~v@dNImtzi4@)>2;8l7+MI#~%#n`Fjr|6Kb3Jra)fYC78vYl<G6$nxrzjMjh&
zX#IDW#w8q3bjxw;yn4a5=cBSck(5tqZ8}4n`x}^FUht3Ojz4{*gIH}6f3>ThPqe8`
z1Q-gmByJjbapQwMCvL#o0fY*_zoB09Bh)6^i~v0E<oDVr)BP<kx()jGSiynW^L#Pk
zLsxZ+M64_jEk^Bp2W==*ZV^ND<6)dB+J=X2$^-rBJwvm;t3ir?*zsANssbi}v2*hm
zGsY-{_|p9-w)$hKiH~3lQ?jap5YQpgba>NqO=TDd^Q|E3N#U4iIiVi-DWUXldjt6X
zZUTe9LJ$aRxFwM5YlvuySd7|W>*hmiihr5F#UImOZVMH~_mZF<P$1^zUu~n0Bu>4A
zf>_$U`y2p&LfOp7XO((<xD(oUqGkKQ_8>Mix7742AHJ9d52h=QfcRH{LmF_S9(T}J
zcN+^?8_IrFV9C-I%rKNT<r{Jt*7_f4UGdQJDhl~EG1<U=JHF>T$!8Usm%>A&ih5u!
znTE_DkRo2t!h2_es4;p|x@SrG@nQ27VKWU&3~F|?<jvKZZ=O*nS=VoKES|RV0<}%!
z->JYz@UN;rkDfIff(#wM#lN@VQvrKFGEe~HuldsA1rlX8e5f)?70JtEY+VOWvlkf{
zQSl}J_s7g9N6F$jMbyN$A}7daik6mye&<KB?Sy8cUAu%L_*FUi*#ELg!Y|8~i5^`i
z;krpIS&M!K$H-GSR+%Jc>3`T3!(TY|53!cl+B^+@fxt=GW%yu-UEW?8Wt`LUm~B@*
z?!hC4n=M4dd)aOqIjPVtEsuzt{<!~!fJhiEh0#_^g3LRb=QXT-U#YZTBLs5^KN|+(
z>`QJ0zS|NpQFzk+&D@io&@F+sa{p%5m+z5&StTYnDq=)NKqz_h^lf`f#~c@{LNi0%
zcaAqO69Ror77nEC^nAHE6+Lp<=00LI=9U(dA*&(4g?Hl6cHH{P7%N-h<cgX9=7hP9
zk7C=M+<DKu4|tI23F%;5Pjcroz;zks4v!_iqAOeMpc6C8xgdfr&gb9;sLct^?8QkT
z-^BD+xXZkoap%Vn0BH?9y1W7Y*;P925%DjlCXc@IW%ksoWE;&n=0@dx_z8Hazq{Bb
zP6Q+nDIwVOwoR*qoAbTd@##+4$wSu8@0#qtP1MiqNlktaTJz2Ccv<Gi?IRq$v?K#{
zL?>>R%*P-t9;!QHGpcgBCTFCycV=ER!xt8u9+rAk!D5Pl0Qzcxaf_|P9U+KVTHAJ{
z1XDQ{8HMwXD&E-Z0iABQOCxStw3+j!RKeuK2hTVS#SdK*1xnt^Ck=`mUvol%s+uth
zh_@ip*ja`}haG=sxR}DZqUXw*-uUn7sI8!ha)*DPgBtAcvdwq)&Hqm3pd-p_WJc`V
zqG`qL`1t5z=}va1?-Yeyb`gOlvR~YUin=6@TG>|T*OV9_)M1ZEW&(b=N#3j^n`C^M
z%iS?`0vbOy-&|AFI90nDJ7W%PtCrCi^LTGT#Bn}rOhJyBE8jO?$2Ml0c&@BLa<yXg
z1XjxacxB~q<tq68&tI-d??edb@DG6BFK|Xz><6EqCEO?=npCZ=&AkrvD5}*o3zW)Q
zhq+47O*S&H;PtjTqGkSHue*^SD?goX{n>m~Sqv^T`>?#+Q;gWCOWs6doSFddF}Q5O
z(`D~J&kD-X5Nd%UaQ$j@gcs7XiF-7aa6c>apK3#tai?qdx;lB!`RhcjpGcETIg0M$
zbv@s~GnI_NR}9%BM69w^AgS|Y5HQpkIB4XlsP_KnZRDlCPA&CNVeTE9z$;CoN<+F=
z+?4?l>+yX8+w7ksX+QVc=T7PiE=H6=6G~*?v02%VXnDC(c1J9`-ZV+JQ601R-5idO
zj{}`2JJQD^L`ILiL*<wXEK@$Ru!jA;$|796+I4S(XN|U}GCw2W)udpVv`;LjXTEs1
zKSB~gd|kRul2%^F#vUQsyJ7QR^qS_21)N7&;hNU^0-Nb<g=<q@AcGIY26Okcb|3j$
zk?!uz?csk9XFHK!GxN?YvRqe9aZR<j5l|Ne3kyeoIFLJiLR$OXtxb?24HtoE`A}Lc
z{27c3<rlyZ7yD-rGY4w4zHlHEQbRp3O{BN|C|zcnnu^NQGj-)(w>4JdL8$FM*}U=y
zW-dD&-Q<CY7q@u3F-^jvJ9X$VUsfnF^PZ0wu^9<M`sj%)x5d(G9*|)HcG=WBilPn>
z4e~=g`le#RW92sVgk6Dub2(^17USe-1}b**d?}YMd*_A~x7TIa0qQyDvsZ85P5?*h
z^6tptDY+bI_J@=61UyBfdQ)r?F?$}e;M*sZt)G$Bb8zN4VKF!=mLxoQb0aw;)><;A
zOZ@7A>6|I4KLlh$?qDu6zB!7ub^eNGew7ltfG2&DtfvWcResC#r0`q70<mOUo^=C7
zg2tam_s&5UoL<%A-Xl3?9L3-c@1C?2m>O|qWiKX9y<q1%?*p!#T#X3=TD`1dvh`0K
zdJ(&d9-aHGv;H5de0SgvK_1Daj*9(lOH0+q*V(4d-UT&!?XxAF$KEjZ=A}Hq$*Gz=
zQ3mWImKVo5)CEQ#&z)cTyI<U_NKH&m>g<dTs5#3=(JQS{g)C5nxElePh4Z_qk|<&<
zYP>r!`q}JNww{-ocTURC=9Y-|%or4HcpQQh-qA$DfY0clYF39O$M%hG2u;2(*$p_x
z$!K9u=b+tM@3`!VN1PNWZ+lW(8%i^!z$bfcybaakh6NaPA<!SkmnE5idQ|E@LnSF0
zw1ZSW3vSVDpjwrQMYd#^?q3D3y2Ok-<mnA1DH}#BFRu;Camz5`yIgi6gmJ6q<wxr<
zjYDKZAwd4-hpshVp6P4BF*2O{$(-3WCSB5P24Z!!NZf@7Sz5OHDb<T<FBbT%M&+*@
z`0>Q1zB;HuaCH$vx4L#Y?U`C6(6o^lduu|H?7a*;5?cJY2g3wpcw2hU4H=ODK}hsV
zWl8E5x}2@ZjNd1#lo?c$Y}oh*ffF+j1U4}EJS*bdrYZHRUil0E1#v>PRe&2-cHzhB
zL2K;Yy?<BWzpS;e%kA+MhX$P9LG*ab7K9Y(J#urrx+2@&a&>-r?B8~{cAxd{d~?&b
zsViw^FxqFrn*-q+&a0rWq|yyBw%T!=X+!?-B_XNu5U=5b)L{zvO<!4@={o<JzV5B8
zZn#u`h%pn8VvoGKAi-njeJoA;>TF8mJwAvo=>pS*BZAWa@gX+!IakXVcbG99#mXi%
z@b%Z?OQzRlgb>Sv!aYXeU7ek?Ml}<N)mXiHA)l4f{Xo+~O*XKyPcrC1DoL7&Y1P=b
zN1|qB#61D{D8Yw}b1lM>%Ejx;kt~lNP3-6=c3sca7|i)iS2_u{4%V*crdc(umC$Oq
z`CW9dB$tg6#5FFtYRY-!m68=zwRoVDz6TApsN1rOD175(zYw91nELf?_0xH~M9}o3
zXZ0&?HRO~*+=B;Q>hB(ws=#{3XQx(!Y+u)^I~y8T_lJ-P3kNC__o#o$A6PXTj*P6l
z#Ce;;Toe0z;T-0RHK2_Bp9+XjcVz%&Uu|uj2g~y9%L0%2lal#$Icmy~<7J~~ib!Ej
z(3@h5HCM?H;^&4>HnY9A=k*dTvOp1_N-P1aiB1tjkRV4=MCB>;0gy(WMCIeG`FbEU
z(yB@yZ4yBq^7&2`O_EJLG~W3<)^2&##}a*8UO6h3PQDYu-mU^-onNMHj10uG%r$%`
z258%=8Lu;13vw)9y%O96TwHF!b17@f%Wjf+w4W;5+uQjmVwH2)b5CRk!ykXoWr9qJ
zCDp{f#7`7X=ZNj^P0D*cG?wMq3g8Gw?F&SqrSx%AZyJE<`}l@_vy{~dT@(Ax!a$x7
z%DJPC{>Ddb<n{{1VZ~m*C?9PKUvMfb@TMI1U|xjo(@o`>FI*<SnJ~%rZ_5_~YA+QQ
z-7i<B0=$xW2Pdx9hncm9(#gUb2d!dMq3|q0VbL9Pe%@dI@-HkGj4Ur1$lPtNihtzV
z5O`<)ey0v9-i{%v)vg)!V;u>wIQV`zYgWNvNyhL~{PW+|8&i!bD0lsneQDb2$AO9l
zhURaPjS26!@}LVC5-4xZK=ZSNc%#y+Pr4BvFWPz8tku&}73SCjcDmuLC=MR>c~8{n
ztSN_ryDMS@Ow5Ff(;AL+D+#w;@Qau5gyNd-=n+7+b2VTkLIpa(@;bb7ym*kD?5t-_
z1Z)qGyO)xEHODt$fAWCn!~WVqOhIHDD&?akrDcKT#LhI{%8JWcSC|^?+~Q%}a%$+m
ztge92kO1j+7E6{`v(>d_anCaI9=N?Su17T=^JBv_YIBFxz+I@7E~4_=BT!ZSBk@!p
z-_OP}q=vS4m1v%>Lp_g;*y;vJ5I>>*KD9ws%t-BW^bc>Yn%>_1s|%Ja$V%q}8*=&Z
z-~7^9&yAaRGSab>AfFFO@qF-yk?v^b6ji+H?SNGm34|SbN`#1yh&5f~KVlI77}R{)
zi*d2HzZv!h_Q5%VE0@w6)+^#7QCg7x17U1P!XCBmethIH{$6uGRsavFW-!dg@<;v+
zRS2;seWU)!jBHsohw4l=#NweIakU)>{!QdAQ#9D6T<n;fW1Qn>yD9Udp2_T^1+5QA
zfiV=)eB$*x-XxOx(pqO&w259kUkAhZ-JVX^R}Ao^-o#1@mtgn>f~SC)72FH3duL|e
zcl>?n&~;8LTslrTNTOY)GyxxUYg;i+VX#G<U}ag8Cov9Q3A{3s-UKZuKpu@|#48tS
zGJ29FN_KqNHp^w;lw{vWNO2ZI6+UsZc`DUt<V0C;9lCqtwkru^7R<8ao>JjJ?X<5P
zjjab;^Bc>?!yg2(UJ6GQ@`>-r?rfeKJ99;~wcUUft3DXAO(tm-4PY|$s)Rl!51|@(
z>a(63FvHh^AR9k&`PgTFXzyqU1_;ZM3`WdY(;pqLxipzoCz<8_{?BRRXo6naVhv(b
zfl==W#D(uPpV~7ScADNKAmPvn@5a!lgY=3_5@v=0A#%Veq<=qtnv8;qxe){G2><{f
zsBGZc_=*mmtX=`~rH|=k)q5J1;V0R|UJB@zjpItTJIfAjEgc==)w<5(GRN(bZBGpI
zy)RbR4lXR#XkNJ5GYyF*M7FL&h9Lmh;``0_w6?^}4UadN{3oxS`OKW30{8}d+X%}m
z+s9WPB_GhvRA$qU)Bf{dW#^0dDjkpWN+5=|2ksP|breV-(FOl?@Wu4n+qr676Ff#u
z3icE*O;~^HS*2K?TRSFQUe3w3A5lR{O4brKLf^Nw*x-V=u|OJpA({MO(j9ah2kJ)O
zH%L?hyha%=qE17UXM}_!NrD5Rb;66fGe()kB&mk`%*xtD4*`|Li$U%)b}0qNWl}tm
zlh#riIy&^+&3gXQ`HKHq$4%baYS`sPHCbol6}D{Q>FwXs8SJzCt}yJ;#f4iJt6pMW
zCsvrZ`$~k>(sEn&y;6SJ=rdh7<<f3OG!AWdMLFmJmp!NAe@F$`;|4#NSczUqJ!zrY
zGA~8Vr^gRGzrTN#BfdZx<54n~=bHV(jo1@%$o|80Je%dSv)!0!FOWsxIv8ylKM&go
zWBVzl1KPFKRaEd<O{m1|ZU5t}jGDcaqfpY!7uIw_I}rW2l+%#0)GpC3h59b#&HDD6
zw!qz=Ia5ONFEK_u|NR~UNY(HiSu<p|r?{;;LNF_R$HcJxlKpSBPDs~j%=Gu%o?LU&
z{KOpi^?FAY#^kfh$+SYvx_Mnx*}SO$h*zK~9f%z$o-|2HRwNXEyG)>*g%BJEkrhYN
zb?`u0WxYFMBF_7!E`b?rMr_;V*8S;rT|NDudEdHyY40QUUQ}7xlaFNqzx6&U1_uT^
zE$bmK;%CyE-jx^}w^NDj?46(VCN;HLkWYJPhz{a`uv#ZQ(d$6-Y9{@=OPn<R*F2?D
z=P<4zy&<y3K|YkXI0yF70U)d4sg=!`WRN@u$eV#{z9Vn3n7IHJ{e=N~zojITA=hj9
zQn9mlmw-|<PaQ0l?&In@0l2tExKd{N(b4qaVAl^HqPfmt_#eG<u!#!711q7DS`Aos
zUzo4ONJ*oUZ)EDWr>vleRFS~prKD1p4U$wk`4d_N@YNaYbhx%OJ1$(dtw`Wc@{gf2
z;=?f+^G;{-QV(rvC8Nrt!2ES38GKOTXuuw4v;-ua$~^1O=|LHKZJi11**Rb~<M<=7
z$@!u4KC5d_5QX*gY=fzJK9E}@<F<o-GMUBNxv<K1riR0|dA*p5#h97?Z@;#D0#xpZ
zwO};3!yk<SietaK5jz&_oQllPgSvydmfY4lgPIs(=6_)KodsJ&-ti@5;}X8q(7}GC
zo<r?yGlMGUJ6Z4DZa@q51m#ePGjkp;pD%8#o%0ROT($7M$4RL!Smk~CGN%lhzh763
z>5LPeePpm34zw|ujDP9*SP+4Tocs2$EB#p}yKBqzPhK1=U#d3&F@E<Ir;Iqn;NcK`
z{qDB9(~99!k$pb}9ZId%{zT$RoDK;(1lQSo3q;*sk2<E8ATr@VPEriZCT{M1iWTCN
z$ok6Pb3@x#FZ{)|;IijN@a<<@&~#fR-?!mlR#3V(&Qf3Jzfk}kyAm6_l$QjO&8K`C
z5`5bRfSUUD-+FuB+|{i9J9^1_<lbblL&Cn04!;cPTJc%yy754rZ3<7`CM0gQ$Pb}d
zS38`;v*7Q_Z0-uw@MIkwlLkxQf0byJ!gOkoD+@~s{(SLf1KMlISxeceZg<b-aTqns
zW}mk(7s_m`dOD!1L3cD6D2y&(F2(J)msVA6oHt*@Za+j9lEPn$L<ofY3!>XSg{Bk;
z_@BQZ0NJQt6h@t0YzRQXE%d!tUOA=kw`)`#44HHlkFDZLb$5)S^U6J(OU9rs1#~fn
zgb!1ZX8C_yE{{WYTYsV2P^w{uZ*oN6L%41_C8uik36DE|?{>(!j{!*S$<3{w?I{&_
z3Pb?zA(Ojz#^<YO>26!K4(zRapBC!L=FHBJqo|7nqYmc-<40sEn=UDCLa}?XrSO!j
zv}g@M`?&P&aR;@!DoipUvjlp3D@Ex~Y>MGo#h;GfSrDI&_r2qgW}z&0+Iu&V=DmW&
zerjQ$xY1hRdSK;%Q1HrqsH%Z&>7?uOWP(_nISzjNoVXcHoF;4VT$s2iee~+B>_<Td
zzoxZgWdp-@EFD}&VayN_c`4+RK=V&loDK>==nrkAKWe9>Sn4etHnz>bW#Wmh)46kK
zz)aC?_`Q{5w4I9W?)^+}Q&u^VCO&WR+te2N<8a2WDFOEV+|`buDtbn20zL%x%M*Zf
z2E6@yvY|vOyc67lg4BA-<m+tO+il++Rv*WP?vmU((CcOWCvQSgg&{m<W-ojc^va;a
zd-VpjgVL~d!;oO_R@%ajyUw2sdIF`3HIp@OQvK2F!z+h}4x&g?2S&?5jjcfW{IUQ)
z#6$;MIWJI?48_nc`0{<LjQ=AxzWi2{{L)`Cd_5m8YM#I8q0(`bc+{KZ&`jp*zkjp6
z_gWDMy04;ISgDDUMzAadV?-e&w9(_xrUgs$M$;M!H$8-$vp73r#?hI2qq370YgKBS
zF~q>pUn#8ox9}UX{xwf`>hXCuUsC>~$9fcxuNxE9t%8`UXy_c#@wis2WX;CQ>^OW<
z_;e<~n%8=WK&SWdOE8_$Oue#+1W(n*e~|xPzMa;t+mCm_5#LbHi#l)F=$+tEd~kbx
zh{@wACQME8-()K6PNysb^?y0A>c=5%sEuso<}-J;f3x^#K4z7MEFCxJTmo0Bs#st_
zkCaU%e$;8G`4^wUF6aYhcG(myLMrW5z>vYH&KPr26?+48qPwqlwP^H^V6hu#?)UdY
z|0bW_>JEhbyK@gczh5~F&0{JwP*jbO_AU7prz1Fc7y54@>@;s@CVS`4GQMe!j%st;
z4bQ({A3K?zg#A5z$VQX|B0wT4aIKW`&8)wFo+ADGg@oT%8qdnL{=W;Oz03_djg>TC
zwTH^Fe5B2!Xj+3=xGC7Ic5!z<DrS_oBcAbOvJGmS%X1effC~b;N{#Z$*&_Ty+rIn2
z;s(CN=U6$O^b8Rfj%9vILeeUI86CR`4=;QxpRdq!1NzL5$*7wpB^^x%uJo0P0^g;+
zF$Rq;;36joAQkWgV)%k@ehNF#0z0%BP~Z@TW?i$(?cVZ=nEL`&KoifIMrT=uqw;nF
zn1;q3Y!}0^@+IT<4ntb5<JIwF)KK`&*alEhw&fw!5ppwFMj!dIR&X9qVZm3v-!V6P
z0PY^YNu4nF5`=NR`hUne4M=K7y#l$;G@tuqCI_s1wRh@$AmhUcg*Q|N9uJ;sB0?72
zB|;X?+56`oi3%mm4vzVKmw9bt_)uR`$JW`fWZc%TfHul30qC#d81pU%Du;m!na}EZ
z$WYS1ze8bTTwN_;-dGQ%WS8j4Z%zWenh67;N<cIMxNOxj0ieX-d)ed}hd#bk4Z4l_
z<~fO-mi;z*JW0B=6}Ox1alq(ve*Qoa6h8mg=TVIcPi6NQ=_6a(%u?|va64;^b-{Mu
z+qWTdJ{H&a3tgda-`0+{Wc1F3F8eckVGB*YARpH6#>We~;eY64?KGP8Dn~jb^R(hm
z)m<g=G7kL`)X=A=Z{Lgof+~O!-@Um9z&3nb?v0Gc?IZ5jonhD$?kEoiolP-@t)@9T
z{U0ukdDgnori`FLFp$XqMF1?AP=Ym2YI<%Cg`}W#67<=5E#_q90`hG@`Z2G~40L<8
zw+XPI*v^iQ+`774(IHoVwtPC-ipwXmgiD%NTy8>JWGBjIHqL!dm7QJXYI*{WUs}oT
zxa5@`I>=1e!df&c_P>P%y6g|4)+e8ORM562!}edUn{sr*=$(~ZH<K#Lj6QPf>9R!*
z=%(O5Or1(JsqydpsjabRD#2ZaE)KovzPK-Y8m6}<TT;_BYuN74RWk0MoPiz|SW_}o
z)Uz@I$Ns`GF~Bz*rWS7&0SB7=eXpS9vu%{QZr)E<KF2#velkAeyw}(=ULr;}%KM_<
zyv57Vn~VBn;_KrIvFaQ-QbJwbc~Je5f7sPS*3io(Dt}2yZN~zDw-$UK3vf!;?(b4^
zPM?GUt)2&$T|xgYCej*n$AebkKVo)qwCzz}cY_xPk6vE7hqi}dWs?mQ{`drktX%Xt
z1Y!G9p3W_I!=zqosk)XY8#BG5>8<-f9~_^jwOe}1KaTS@Ry$lv$$D-GPEBX-mkjzp
ziq1Qp>i>`8myjgxwMoX6zS$|6H(O-8_O(Kk9T%6(WZcZi%te$vQo8mC*<8uqWL%NN
zm7D#0|L&hXdPw))&wHHLInTq^=ghI=7y92=RC=8+XJhks9ex&@XN6Aqz!1x!cZVWb
zJ&*jH6>6%Ftk%T+`Kea&E-2GJ@9oq!yiROkJo<Z3nASbWAKg0ZYcq**Ns!1umpXAo
zqfn*e$;_tCE^uk-_|Yj>{F-Xtw13#(y64SGJcr|?;AKdIwRq3U^WH=1ibv8nheb1f
z4Owc-<>;^TKA~4;x6yvyJ49N=l~<yE1nRa|2@|asHB}#%romIdmOQ&d#$Yh`T1nzk
z$RwGc#uO{LO%770=a)(!SaT?UXj>yLlYIp;hH~wjlP&x_yA9M1aKjwpPA{46ve1UX
zsOR0KXSdm2x|U}QOb1Ey&y`(%#PayEwRA&LOO`3e$bnma>g`;KjyI|owFWEr@U`6)
z_)B%j+cFfUE~4<xTT^e|kEyDQ;&krcd7=5p4Dw`-TlBU-xou(UFzEhNyf=wd!;;Au
zo5|CED+V7vTqHiZ%hngU%H5Sodx@gRJ2)f|ww{^dw@)~65K}?&XZ*@Rw;MW&3TB`r
zR;ILd_!k!DXowQUXj)qqfN#MSbG$|qr0tvLtG48cAsH#Vs<1v;)wwC>)*<KfdC)74
zsY{ss`P72R6zf_E{}sFuQ-zlvmMcm{9{t+|_#M*F_V%qmhrX(5Q-1u>1G3NH)GbXd
zvz{1fQKkawVv2}ZX;3HtTobaOPe$CQrJJ7$ttzRugDf}Cb8~~!@d*nWbQZOR)z7+1
zCnY5Ta0k%8#v7LBo506FmK$c9drcID*MWQZwkNK8^l-Je3o2Inl}qB?Ud)old%Ol@
z2`3XbJ@jpHZeig^LP;v}tj>Tmd4Uo(sp7h;`7ga`*DtE|52EU%aZN<zP6G4_AEkvJ
z{gs<dW<u7@a%{*3@z*fXi|JJi`-_NGnJ&Eyzl^S}luCjb=ZJY>`ROE5+;{hqW&^`x
z?8dhU0kQX!p@Bw^YQCst3vj0YVu-VHWR)%!q3G?%z-3Xls9kiwde+U4bv3?k#!rO2
z2LmBp{`aXqm1qw-6W8*)uT|L{*qNcv#>FE!f??E^Z#PwT7Uxa?Lho$bYr#vVH0_zJ
zE{L7(?wl{j*eNQK=YckR^cRdtFgDywg{!De)cab|$f0BbUdJEOdKn{G@2ZkisYKgH
z)_hOadU${HEW9fr+@UcgK4*&)rx7Czi&<;G%&pB%;1i^ay;jdqD7qqZd&#e+-j>O2
z?oG(Z5hK**&Gm7=*Djq0t|j*B;ZevVRv#*=yWM}dq8~E9$#S0Y%S0mACf-nvAx$E)
z9CbaTS}QSB5Y4Y;l@r~p6t0y$qmuuY7G%+4kY3_|g%z_s1ohlkMfLG<JL=M<m%0va
zaBgC-BMnJF{dxz|)uR|+_EC5y=zIO)Wy#?|NP=L!KiPHEqOx$+rxmw{n6-G;fOGR0
zaVUX6M~wW%EoeM9e*gq1!XPQH?JLKzJyxh_ED9<n1`dx!S1m5KA~z94H-#q)+=HwW
z2E05AMX3xi7w&jTzX0#~mDa@_ULGFlrW~W^$N3kRyt%E)wPc4!5$vAU+6y~g4!S~O
zv{#YWufxFV0N7;Kvz~2RP5SC?ofBmOs;`iH2!oY1hqE*OXqMOBe#-o^68d`xC8c}J
z4)LX1P0)Kr?)WrosCUzBQ7S+Oo*a%oX1no)<9id?Yq2#1BcG=&L|FF!zoc?CT16w$
zDVJ!yH$WNyLtSfeFlSmgtn*s*S1d6GHuYXnf`aDfc~6tHE(8)>UbBd$6PvyBb3kp&
z9soYN*J57Zei&J?E>C=uQ=$hC$Bw7hjsxweY_2%b8;AX-Ji_6CT|PLFj(jrnuXRU9
zES<xlCUb%g0!9|R;%Z8uG#GKO1@a}dPKigq1Lsy+_+8i8ixwNQh-R)q-6%j_y`w;!
zoPRV)J{7{{+U0JysAFvwyP&4;YmwTlLh5dcSz48H>R?2`b}7#;7qE^&+V_%Vmv2x|
z&Eigv_y6(N`o%RuzY&42QF#)?K*B=u;kV(@M<<Cmy;}?-rAfp<8VHOg90dU8j*}Sd
zzqYVGxA`Nwv728SZ=k-lefTAzAf|im#E+A@<Yrvq@BZ&kKi^gu9Q3R-tHx3P@q2Yt
z!?~%gdtCI)CU86;q*jOZvt}C{H;`00Y@V?4+@S@L-2Z({Vvx!D^T%#B6J-|Yb2<y^
z!dkO3V}4XT{S{%WeMki`U^^Y{`S1a<xtFys8>w(`ZYr?t6;wmRGRins{60mBwK(Y)
z@L$M7klT%^jghqIfimH_FUYp$xweMm^0t$0uP~DRMo8b`+U{E0VO`k2PTo-N;-fzY
zol1w<VctO)iJ27kSE|q$MJHFZlZP037d8@jQ+#+E!5bDF9IV)?O0a(X{e~7eh$#gh
z?aAgc(9vEoQfHsEp|F5IH7UKzl+&Q|oqEv=gV`zSq5rO{LUA))h=1<YoIL+@gn`P0
z^v0H;gL!+$cNVByV4JqU!x679cRvq#Y9FK|izEN86*wewIL7B-Z_BW~1iq9ZL&C-E
zFKoFgR<G};D_`wcdeJToPa&mbUDRP_B*>Zas}fapf!}5N*NU2ZrBDgEUC!%>zUi5l
zCwPlIwLM~1M&904cdZnA4r-QcOmUFvDFeP4mcqtc*S1@6YP?tw7XVmi$$VW9AwH<u
zQb^-!Rw;<I>>+{E@aWG}2j2xw=Qlbxd<cQ)0zBpwbwaQ9x6jF*QJ%dpq2v|?T?1)7
zV<9;zM!55L`@+fFhDZ5z+}{TRgbRO@Wx64VJg^}E7fzp)JwMf*kP`f7>*B!m#wR1t
z>eQdNZR^J;W)Mk0i9*z&XeIqy$YKE!3B?1eEh`iCW-h&H*ErQb6o6PpAdui~77v#g
zV>*BO-o`7_gBx&XXJ>XsMuvo)qJkzPqt}t=)bCp0fHEP;UPg<9=0JhoE{@}>okoUB
zIr2msC3+j}&RZp}rGB~Vqr3lnp5dL+T<D?SU^bZQ1-TvTmHrM8WVrJgh{hwYbtD_N
z&4x(RaxjRxefwuDZ!S;UAljcQ)=(#lh!`?17xQ~yMB&{AmZk;hNU;Gg5d1Msr-!~m
z4O3GlI)aZMIXfq(0O|TN%tJccp5v$$^JqXvQ{86LikVprk%7*+3V)x;2e~s@TKS3z
zCig9vO9H!hcDC(*2RB?iIoU}f-Qf@51%uNV#vO{K7~Zpe(r_CjqPm;K5lbYMrpH4%
zzgd}C69(Z8tH}PmX+eppOe1VpvP7Z$PXE@jUDHs3n2ou*Nc+G9Q?>40X&X+^<NFI^
zbNKgRIEavLZQ6X$<@{R6T~$b%e!888t(Q+0u)XR=jbgKF6mFS#@N_<@LLldpZQHr&
z+=-uMU9dSt+Qd3cMOg(3OM@7u4yM_rJu#6Z1dej8H3(>jP$fMywNx=xHdMb1N*fhh
z5DL5<-+DY(f~%)TRNq|UF2Rbge-f94J6LAk<(q2Q$oY?zh=9FWL1PnNX-UeG|E#Zn
zI6tb}S!{d2P()fA?dbszCZkfwGm~)g4)56}x$St!Yw=2UE1s_7$;<!XiiR=v1SB56
z-WxNq@z)w~z0Oy9hI>}Z36G0S>kHzFSG@Z<sXj|%3s#c{3nu+hWy03nvN9i2Q;M15
zC2HsA?YduR<2js^ZD~kdS3|DVi(LxJnixqLXDeHl&*OZ->^J`+bo;&8&qLKYiz-(8
zGdl5d%8fS8-{(O_Z?M{KaO+r7`-Cp`?Ah%&*K&L+<=dwD?uPtvRocW7ymQ~x^gLn&
zCJ`qfqF-$hBMWPY&mbNCdeNZb=equsc3tVANM_)hJd4agzo~GPCTtgv|D1aq&E{EW
zWs1N3ka@}!?p(b9wg}y%zyJQ-?8q4C!#%aL%{>Ti;`FBp0d4kN;jcPl>d5#pq>mG!
zp%MD(=0D{T8d0`nWQNgTqj}IiN(7!YG$0Q{J*zmJbJVuy`LAa6len!ZS|}k4k&cWW
z>OPz!m+mwL=K26b`@lCZ9|G9WoJHJw?QO3V;Lw$|-C_ogIsfh43l|+>g**GSTZ?tH
zv(RE64m2andg&o}{BbH5u)=wBImWlg^z;<FaJ7&m?;Fm0kO3TtxN;bybo$b&rKxSF
z$yVi>oaQR*`oH;5V97};{{Qu@|5qsJIBXEqBq0opJ@Fq&RJ{<A>@|jq>bjDN8Lpqi
zU{?rPAEd$K(>XMhQ1*FdU2gQv8-Do8TCiMRD<V|VkR8I(r_hg9v?GKpwgr;96)^HS
zR&KSoS(c#=i{$xQW`$j}<Ty~pNLyR>HS-ILi$q*;AcG<f=qhyWbsYLcW^;c+mKrro
zZxsAYnQSpgb+rJyF+D$2yq!{*Yr>NEWrP6n+D+kym20;_LDkVXnK$$_+fJb_+!=`a
zFUZT=vvq_h(AV>GcUS1^QjW}Y(XC0kL3c+Ag-PLeclFdKScR1P4v$LFgiSp$J(X)C
zVfq)u!iVr~*4immRF_`#czZiCS>FuY!WQYMg{*0Am^XXh3)_&NDt(ZhaLYNCUF|hn
zH^RD8IAeF?nbLrvlbu!39qVBkx52hOCiB~HVUo{<nsCEWx&ca8CMF10%tcw{6>TI-
zei=w~=jAe{P3dKXurC}QvrsZcxb&(+O2%mj0NL;-fG6ze&@l`#zpy|%O&fFHNI;Vo
zrJb`kr;coUsW>wV{f3MqaQAsMX{k@By(VE3O)dAAe;<HHR3em=O<wpJAg>f6clI+0
zR8Z%6dIFo(4o0RarVcZkv-M1M!_~eDsiWqrNE4rlE;oHYUbej^b^2#uG|3=FBFVrB
zVRY@Dw2D)uFwZoM>84KBh=yNu3mue_`PMrUpZ@0u@4Bh)cpQ0dU?^V^FPmSsRvX}!
zoZGp2fB5@-h^=XFNx73!m9~T_{=v~^-KV!>I>s-ynl7-Kzux$(T9YFp7gMHQ&q-qu
zTznJstkfmE=@JG4&vamqXyp*qlfy6SV_X+pA&Y)Cv>zqQwXmf+eHB(bym?@nFEzAq
zymW!d(!#Uy2F7Kstn3Kd*I-soxo`7<4$pQyk|vZ(({m`DuGXNjHOl?uQ`nTZvyOnN
ziZA~^@(ws^y<nVsgC0XQ#T)1LWM?<E@<O%36Ec@@<<{#Ygg#n#KF03!Mu6AvSKT#v
z#D?7;>W{DG$gxp|Yf(cq35{PTVl}AZu$Zbe(3uF*1;EOA>lZobI6K|j9cd-D`U=`T
zkV*8BORB7u!C)8}caA&*?r~c=LVQ<^sj9YpvaG~xGEgEUsXCNTpE_{W@Xf&|Cr~Ps
zG4CURkU9XbuwwVYo3SypUzQ=xoo;Uf6{mVS6<OrX-MA3X)&kbq1=z^0@2Z*cq=(E^
zoq(73@0(A%zo@qE)+sfmKrZ?TdDL3@T>oV8rKJ@ShAV114nqHDlnjM4MRD}X@v4?z
zE`BR{aR;eQwV}305D+g{xcZ5N)2NpmCb{dMd+aKhzg7|`NH{Dgh!yfXK3$L+fc!Zm
zJ=U4sC9EMc4-eM;n`Xz&+}sl9qzv5XXG3;^SpSGyeF4V1$ll7A7GG{ppiqv^6Z#3v
zP4n(U<N2UGP6NX7b)v}o&3eNGyu)iz@ubV{q;}^BY9pS|d!>^`8Pk+qwWSpD|J_q*
zh=c=NqQ?BKkUxN1{QBj)n4xej{1{GzPoAju2eQijjQ7<a;m5uNJgV45yzpfHOswio
zzpDL1+s1h(3(MM#EyclTepUq$GqbOG%>OO9{Y7y&#976}ewmE<1P{om13ZIR;da-v
zM;oK&d?U@74==?Xt^fL@M&KFT<O{7VaZ8sEud1_{PNUtRjpAUdV1?qy@ZcW@oj+4A
z9R+$S*J*{&);EUILTF9aZ~QQ>YiZds$mqA`+L39|6!E4L&9ziXyIR*>P|HqX?G9mm
zo2sn>DM)jK<)E{4sNp8S=7ho2X+4$Y$puMlM2_Xs<YHLZ`S0cR-$gqGmsukId+hFb
zkq<v6XYTqvvZoYhJKW<P%2{mip@%qmh`=8PPq<ts5H3;1!nIzT^-;s!@nykk9D<Zc
zCPNtC#rV_+>6D_3ZX7cH!e4Rbaru0@0`pgEjmc3J{DYsRVcJ`UfBl+KLD!TmlC5uT
zm9G7um@R3S5p??*kp3XpFGn+$A2~Ta7ZL6p=Q!1uc0pa8p0CV#jHmhXf`CJO`^~Qq
zF5~OOAGcA-Wj-qa_AZ~ZjtDa7X1PE;>N_+lD!dSr+1PGLKgwhdA1pL;W)N@GZ;@R0
znEM#;peZN$1AS>t7<5`fY$f2OBxqM5g-nK!mlYsa+5sN>-#@8D<I2LV#!wE6`qaUK
zj>2_>9=oTQJB`a7W;l`{M&x#!bC+%~iBoG%2lb@=u_cxGK%A?{!G8diGohMM<Whm}
zhMN2hkmb749<vk-^{9+O7_46#Cj$`|^aMRDL=Bzrl-?DO`!A@p@~KC<(saF0+~?>i
z>KzFp-C*3uOxkDj^j49#hS5UP1PS;aL2eK4?D#Zbd8qnM&nl{aR>lj$_w`AY2Hw=(
zKM^db6nw;jXQ~BU0`Ssm^0JSdl2RMcYw{P}r6s8huk}2L%vuAlzkdZIpDO0PAmj1k
ze!yXVT$M+P4@dX)th{u?OFJp-gDJ4hWE8Y0P#7<-`F5$9QStMH;h*g$OyV37Q1UYF
zJoe9RMgw<X+ceo`2lsqXTyT*LTzr>7$KydrUEA~>^debCMkc&^e!Ct&nUNtkEcqVy
zf6)j*9P;mk^GFs!sA&8Jl(lW##_wi(J>;M<x}xZ7Lwtc*9n@HRim08)EF6ES@rg52
zF1FpP_PFh2pbccLOvk+k!EJpMku5$X2lnl@;6U!;>8UT3-kaY&oABhLpTRy0UUjok
zA{DNOxJpplE%c1H8M8X)XCDm8UVBD)7fz36(<bmw+WYk<@U&uco%k;mhB$BBWV`E{
zPN$68=xtH{%nM0B^2ViO9@%P7Zc;DKJwMzOmmC7%Naa*P*1_n?5339(q=xNS5vBt9
z%Y$FHY6Rblm?FcsGwxRp&8m(({`z1Ez4qhD&5D(i{64WjN@;E#7-uSsH_t9KFu0Gh
zj_h?XtEZd8$S<p62!GqPY^c|%NvHnAI9%>I#pRn9cYNEQ2%6vH23Y&|8zxR~x<_{r
z!x^2+Q6fssA^(0KFBI3eOnYFg44u~dZw=GGoqNPx3>@l;2BQdrK;S_xCJwj|ip?bO
z=^Zx{GhdjftGGz_xuQGJ6U}4boMhWl^Iy_iZ8-c1!JvN$Q6eRgL6Z=8$2U8HSHdv1
z#6%VO$l8uMZM;XrTQb8=yy5PL<5~9I;VS0iXfYFyhqj^*$9mswB|HfUvHU96BbwM-
z{LqP#g1*`VZ`*T~+K_FfzlWm*eQ*@Si>jnSlwcX#r&cP(JgeZ}3kh?OUO9Cs#@bAP
zyNw_L>wt4BZg~92(({wUbDqBJ+{vja$?nvYkweHA`Jt^y7GQ&e8VL<7I^l{~mETRg
z$FoH+w#QkZ^i_O97G=aMO?IBt&HwUm8oM&MIpGX}xQ9fo(q~nqRZh2sW*Yqt;G_;{
zx^~ohC*EzNY1b#WsE>w-Blh(4q<*iSeqVLRV^mh}{!6Jur^&yCW2D1CE@Blgj*&kS
z3A~*Zg|a@URU!?8B+>qx9eVF~Wpi~Z74P?xe)=w(HMXjKG1Gp!;Dzze(sDGTZ&%QK
zyZN%Qig~1S`Jq{tVr1)<SzVdnwIKyCCM6I_Tg&5w4;=3VdOCz}>l+KLZFkPjHd*Z;
zVBi*DFRhTm=J;8Q2L|RfSlRv4Y#GKCDISC3VEJ_9ukc?%VVJP<gXPM=WZ1H-LE)fe
zch%L#s^has-~fJgL`V)g{yqOQ7&#6N9$NF^G>$!<|9$mY1ObqFn1LDLsMXPSB8ER2
zm5m|L|CGtD6p+!o!^d_13Zw&UYrIF9DHw+Mt2W?23|ogfW;AA|oC+P~Yrgm9X7z2G
zeOZP!L1z`q9m(#8WOO*o1e43{=6`t+dPWbyyXiu}e}q8l4*u=GFCgK>YUfIzad9^(
z<>u(s0K;hd(^DZ<$jg#c=a*DvWp5>mI4<ygo+)zSl2`$LA@(nO_&IAv`o@%hRa@jP
zCx$9ez3|D%#C)a;^`bSUmkq`kPAPFYDas?ifQCs7v+I{a39-<{x-WX;f7Ubo*xlN#
z74KORS$ldKdo6oF9Uq^Nu61}t!|7D=?tcN-B_}V6q^PTgnz#tp@y`X?iFuii@Y5Pq
zZA*di@w0o}ilJvzieR?<*kx4mzGeZ6;(4~RW9%vpttyOit1NI_ZeXE^*@%c*xLQ^u
zJG4UVDJFKl%qNfh+J)Y7ufv37M7LrV?r3kq-#%UO7R}2`Djty1M^}NVN%82KFLzJr
zgjqwnS$=`U^-U>0R}l&$+BbZY-EarTbaEL49!{mzVcY)vO1xHubk5b_{wa=R<p-Bs
zU%2$!5HU?nqYqq7)0)3MQ#SOwR=aI#X&O?{nE6=T`l}z@&aU%sp|zyzoj5iVL%22k
z^JKDnh?B?h&B1jxH>%Vd$jLig=GT?vdpguX5fVS7MD33ID2h|r1LM>yUsDp{L2wnj
z(SIF&VI=3jC!dZUt7!LC^Fj>Mkg*;X&?lC}*<XHz?i?-sXC6&}iYSwv5M^wEG+!jI
z+bboI5bMYqwC($}+n{GWuxXaDbBTL=0!LcC)rEb(0-R=qo)y^h-WdBS|IP`{SEjKQ
z`l{*<@IO&x3p#Ypx}`~P!}(CyU%S)c^V6`!A78ZzA3h8l7}#;ABpdD_`19lJ+h)D|
zt%B|XGNWrF+_^L@qu%tr_+S2`kJhfGkt{9dp6bwddENc+1TfM%xlPE3IX(tY3x4$u
zL`DdGupxH0a_gCKM_HNJqoG?ab+hyL<`P!+Pxjq~QY1CnQuGEr>eC&>`wEzXtIKb8
zKbpCsv7PdUwmqm$wSLB(#;CQWW!7Cr=D3CR7vR6_@1N}LJ!^=MS>ew}Y5aZKM9v=K
zn`<Tk-j+*y#;-@3U*|)^g3xVE(=Ff<iTLNZsXks%fCd8T(f+_NZ?Q|`H(C0YhL5<U
z4+au3fw`)*ijgbHkgqXMF=3O1RpTTlZnw(LYHvaY%D-uQ{xqk$x?1u{1~hN!imuFb
z*dm5@$}eORW3$=U5ctNmG~v)kQNrtfni>0P*d!(-k0qc9panqN^5NgVsl>rJA%^K$
z1B>1Uj(0iriPmo5cSqRhw=`VZV7j2Jy`V4xfe;QSxZs5>&5X6{xME=9&?f;P+TwI9
zP?{%^;RE~;jc|op*3Pc!zOxg`Mi!n{)Yco*7>j9?ndxM#znGL;eht1tQ<<&XFU()i
zPE=i3nTi#a@}@1<xcvV1)?m!QC5#|u<EWzbbYDd2k(}vdsXqHepC9zZzsk6a-o)l6
zVVF*DoT7wD1i@>-+ZOC;+8dS6>%2bE|1)^b*ZZ|GJM6g%_1MR1Hsx<PG{N*qH?+86
zz5B<7!qw|SOme_!aW7{RXf+LAwvT~)-gf%?m`wIs2-ty*U;J;7PM&zUb_n)~&Mi<a
z@@IF~g&XZr>1|&%_ufoe<|@SgKE?Hm$*R|jDY$f8s4Y`1smAhk=I67UHaftGM(%M}
zk?keZjNHDxSv^_Nw{LH1shD09e(I)Pn0#5%KZxd4tgz*)jJ1rwL4liZg@r5N81(3v
zMzT9=f|Ca8q)?dUQ}Nd_p%)k{R^%ZSVuPV!opY|GklHQQt7}*9@E5@3vDll@UtFmq
z#R~Z#1@IAs*w5(u@mKKE!kb&}B`6*L1(622gF3%e+}#W7x4u-C#*zT^<st2LFZx^R
z`|S!;R~BBAZ%j#j{x$_Q>u#)yljKS2>0B-;1BP<M^b;E$(=-kh6T-@MQ83TrrD1O-
zw+cnRf~tJ>z+uD@_wLzrKggtbr4fF!kg%?_6VWc(@u_0e3LnX7cn$f`plna+-&Wg^
z-PzXp@%g{J)3}CJkY`GeBCN>5AI3`hm2z(Zgg1uK3)C1+7MiS=jypI+cyp`ig3(;f
zv}g1cx&JDmuI$&6nb%1_H*$Cz6HTndSbg1#rH7pef!wc?b{1QPod60hGunP7<W^1T
zkTb1hN{ycsSK{k}MUO@KjsYhe@d2uG4h4Ao+0M7RHhR+<7&>1$Fqz)*a(CO%k9Vn?
zmnT+<4y7WM-1mKqK6E<Qx3nek`w9iJM~6b9ii-h4hzje)yAVvC|FE#2^!DA#^1rnK
zPzCpR_Iong+A3>n=<MM1m~m$TtZOaTFxcMj5H|lhh}LUV?9&=PnNdXp^o@?HEsX#a
zuH*@O2mCrx$jO;))*yhWP3W04U!e)uF*1FrE<3E3IU?tR&}Zfk+!15OrxppXI5Al|
zFh!YaSX!`RZw1WE&->fZj<hwjrI{IP%LE0xen4j9%jY=0$a1abJeH8Y0&=!v*e}Og
z(vmS2S0%QoA9T``kB-agEu;nJ5(69KGt6c)7SBeb=x%C9(v>)D{h?m`NPFXgMf`E0
zj^xMTJ`OvbNw;%>Kdi%QD{N(b4IA=>%MK<Vp81iX$Bo`+=_MUqu^CW9WkZ+Bkj5gp
zuxp&&`Qrj3T*aS0FF_vS>OaIRrdWP@KmMX3r$v|_#s?u4n5$Z(Y$b$+f7x(;%AWq<
zD~xZ+WVRRpW@1LOn_@!RU<I9rKOAlr;)sX0^zin4-bKGmT@onj)6;c4W<Eb<e!4KN
zJNm|7JcUF!p79m#z1<vgevo%M5O&^4_Bz&+ntGYl11wnJR5CoAq;!5Vx*Nb%EU!P1
zc3(YNIh?LWmHMF>%pS>a_=vY*mOhB$*}a_igAj-^B|}M5APIDNk|r53nDc+ddFN+I
zN>YZ4jKZ?nVIFSv*k2rm&k^!S&G0YQhKAoR2?Y>?+2JOV=|#ey$79_Ok88y9XCE=7
zy4AgnJLf;)eAse=vzU(<k2{y|6szF*Dx&((TIKIdJ^ypHNfa|wo$s|agpVJVwF`~?
z^c3X5e=C-|ZZg3?y?mhI>T%<cj$f8nRk!l4k9`Q;6Wig)FCBAVHuJc)%p-d#57~H9
z%bpnus_&Fss+({>_|)%uodMox4UFYry=`r6<w^Q)2jpxX2*HeWy&rU#wlp;0We-wQ
zN~D~5n@O#BGrAs*odx$J`!M-y5*fyBVqXolX})kUL;6$~RMmY~BO7Yz$_l=_dJkk}
zvMz<~8=S9)kwM)|*ul;zMl&2L03QluvR5skj$q6!alO~ts^w<0C|OsvwJ_IawA{Hn
z<_FS-S8BLz^9Ddvu`uoX|5H>Mlap@-syV+NzX2uJUDem3#k-*$YrdWxlHE||<Z{r`
z*Ub4u7@s;v?yTm7e8k79r-$6Oy<eW%)4990mUUI)wre}vG_#)#{42+ch&S3YI)uV0
z&L7PNLRZw1)Mv)_{XzRF&~KGOPyUf4+JGt~UT84y*);2M@KI;sB5yF(ink-Q2D8DF
zm#g2iatKh%p-kiXyg2mieCWpL{QPNWQ_K$d5wklYbx7@B@SE1+@32(7z04IzQ#L_o
zW|x<PY!U9Hp~GNXHC><q;nRk0H3yLiQWG&NKYbB|K>GF_j1}=k?AQeKdBf1?s#-8Q
z$Xr{F#{fbbj@-QY9cBCqc=TnCn_O`5lXnvD2&3K+WnMzT6vcTo;|*;0?Dx>vnuJ~M
zx+G&K-&>MY9QG%5a*4Nqk8-bc*X<xXN9{zdw@1j}KVQC7UW$tV_P8XW(6jx+0NGmS
zVECd(lZ1{b3U`~$pT>3|rs5_8ynrvf(EKM?>PdpZ>v5IYan9x3D(NPXCQdU0Z>sA8
z7Pf)B<$t5ZX`Y*%R!E7N-2W_kyhV?pX7Wh1x~K)ayFcr1>HnsL?$vQWRAoR&EvOSd
zbv-Z#V%GRYdp{=aj7Hsb&HB)(-_bLKo!0ja+7l-|dyHX}3|ItTLqb$>AWv~HS51J-
z^_@#2ccGsB>+HWAO}c5YH(m({n))cWH-$b8;r`C|lc#n^1_+cP=jGot_rB;^?gwxI
z`IiWYyu6Iy7XD#W>UIq+ZCw=Vr<Unenb*qsYL9G`#i>o#QK-s~TQVVxW#}xC3$lyb
z2VsZVi)Vkm!s>XBVzQ6h&Wg`<)nu&+|9_mr&i*;&?l~xY{8q{Sb}(Su;wsHW-43MB
z-*(2+?tFqkIkv2%EF4Pt*6Qq&sPg+rKDYIu%^^mS*>9PM`=5+V-$uQCGRCA9GAS2$
z3d`pG<sqOu;Y~Sab}d;je#Wtc5}jQH8*^u=G<Kx+UBzJ*n#{6eY2MGE2vOPlolCg?
zd=;s93)=bVmU7Um0WV*VnYlUZ+dS@Hg9_xk&1G_$B{tTPuS2BRjPAbr+Rp(r|NWB_
z_eQ~$QY%5J+2CVA_jis=f(is}P|+wZm?u9@50QKm9WLFugms|Dr%%ktvVWZcnt$%M
zzOqd3g?xb_LmkO#U06L(IGQhXo%bo7@0@%%8$VwQ+vPp^4_IG7ney4*&pYq{%>-Nt
zsu>I62HDIEcHR@<CwTAMfGd_H%GDtuCF|e8S2$?}vGAK595k!lC#awrvkxTT{e3NU
z;TgsdZ5QsI8I&#{j})i+5Mm~Ea?#qmblDbv($*N<>l9!C&w^d>{BJwo(ssOM&>;v8
z3u(YvVC(mzuRTw>GwMmiib``qT`Ps|XWOVtNnFqleHQAfhl~ZGPz)otV@V;^4uw4z
z@XLJ-J2L*i_`?PZrUfl^pGfw(#rZ(Zt*q@_Hnh4d8OZ@HsYUwOGRWUxHTwei9X%Y1
zVMhqP*JxkGVZ137cI0+r^A#|iv{aX#T|QWM20g8mP+;%NP_!jv3^~`gH5mxy>Vr;7
zBC6r#=ZV^;?9}gv!T!LytQer6dDN;Fv0ZdA&6{he{LXNe2Cd}R`X^mTUR4|^xMmSH
z0yL*JAO1Z!2*1Ty7#qUUCsgBSPdzt+)EurjL(|NxbiMD;(>{s2_r+<EIb_aLTJW*6
z0YQ#<mOa#oGFzp)ya`va{r9kb$~1E;i7%xJa<yP>XGW?}L|{;uA<K!Hi^X2AZnJ~-
zql|d|U*`JWE9^ylRx|v;PIo|k2&_-W{p)#K($<v>B%R2Nlg-D|IV7aA>HNTR;#0l8
z-?@?<{&Bdfln5^>BxkeXj~-n~iWQ7X;{!I0^O|2sSS}-hdPktljlQr<{wY$K>gA)r
z>%U?sLIw-<<wOkk7J3L|o2g$vJXeg*o!M)5UhF)J&Bu=2Af<v1iO{pdjHEop(8ImV
z9;5dZH*3O6u$k7ycS|Z`u^Qw`Ykecz6HV<t9|V2bsq?7c&)q{O7D(PgYaqF4r!5SF
zVa9SBIfp9p5<bN2{`Hg3yLx%bP225bomg?QWS8w@6@E`W4J+Ex#qhnUGQoS4Z%a}r
zCgHJE@Hlos>*o)xDmUpa+NBK)Y(+$~RoMM&JVIU0O|VbomVIt!<#wx_6e`)N_E}lo
z*~rP%-Wl#2I<5Ax8okj=q3o6rwXM7r)BdU!+98_=|Ah_4N^jqV5wAf~`1rb~+%il?
zg6wX4Bds(BL?eDc+Y4S&JbiNm_A^FLw~t1mbHD1B>rTts1E!JA&KDIwt(!wdJ&G(M
zO{+(?ZzuXFVr=TB-CtqLpE#O&bSM_RYQ&+-BQ}1iGe|N$d)N9t)j^wZ9GBbeVzSKg
zE|$)%ayt1!$@ys5j?#(UdHMN%*guK0l^7XDPz3JuMjX39k&aZB^X=no`VQmcN$ioZ
z<Ix!kk+CKaSGHW1!!Y+EqL&~ekM7@;AI`c8qJ5;F)>cmIV45&Sq52CM|8c9al~?`!
zU{r%-6QC(9?(~gVucJg@u>q`iJvjO0LG;!}T!U5H$-Z_<#;Q<($bwoyUCjXF$lH4n
za!`is_Ujknv9#b4?O$W9qwTVh)9~#`*(Re=+&@Hyh(q&t*f)WM({^YZZ}Fv<1R~n$
zhJkS|_-@FA*eQjHpZ{Lm_B?1i8z*O<CSsMHn&U?i{79MoFFvx|FUTQz*y;&+JrCS{
z8;9i5t>a9ll$!D%>%R|v|MqWc+dd-5Jx%Pe2_XOW5T}M&5eieQbY`~?d>gdZ#=NvE
z!;Y3?CMPe5i-@TD3U$qU*34fS1loM}PaIKIU6NXr-EnRklRZ>D>m}2qN4wBd0=MJI
z11A8;b70SW?mFowo%W1~a)fBv%xwFY>O_7WjOkqd`xlRQ_V#X{C>N}oaCHNN=UR?L
zp-U3N$Ayg_9{*##o+|RX<6BKy+($|;w;<6t%<v2-1ehER6H@i3D;0hv(1d|#u-;uX
z&rT4f9^hFt<ckuVB_FM}9=5r)95s%o-~*$T%4Mas#L#m;hU`mwj)tz2S%;s0xDiAP
z(bE4wPAZ{<e8**LFch}nijG%<7~!I2383(RM@P(NF>Z5tO`SkiBi<{OqTw^G>SC7j
z{S^6D?47`Kc~y2CrixeE1*ix)@AIQrR&Km?e2zSPinynEFXU){tvD|<Ffgzb|Jf`-
zLv6JerY7LSA3(*zy4W24d8QTl&QzT$8K?O$rw9hEZJHn2&bf)pv%BTqdi-a3K3ir)
zAi}nL;C##LJTB~COxtk7#C<OLKNR#Cubp&0DR*{+L)KH<DX*7MQCPv8G$y;widgPL
zC^YQHoo(W`8zB!ZqqAf_MPR>waz6HFZLp=Tw?&&P=m3nR<iEuNMa!Xl&gUBR7~6fF
zg$v;xQiGf6PXQ2x^@j-FPNhA_pk#F}wCyrKS7t8yfd5I&z&uVlGBE4@ON5+(dg3d~
z<X%x*9kBuj4LXV!tmEVQeS6Q@&l>a9%(^SoEy+)W^^alY!G$aW)>BIHVP520w%y5^
zal*{$Ra-5L(wj3mjA|0vv|r$ZsuVBCTGwgAS5<G>aMip*E8Zuan~kJ0y*yz&I}AGk
zPv&{e`9|IO9%v;#1?7D)yFR4_D7Vs8w~vd(V1~1yG0q%u8P9TqJ!G=$GbfEEhw&dm
z&M<K|c$W~x(SY}E9t@-j2M{l5qIIl|_DJjxsiDCXVntcygGzQ&EgOSKk`xgK<EP5(
zh9x(nPj3b6gwUc1m2X7W?$@`pHbZWu=t%mhg@pwvB&_WIdDdW^o0o@~^_XPrUZn{q
zt)-xAZ_1f+L@y6@{y(ca6WIU@tYLev(Q;izrmiA52wg6$bh>v3qAMo@%ho`7EHun}
zmcJzq6pkOP3@fE6Jz~D9yJBH=EjoYLQloevK>phKd|P$}k1h$+ac#SN#a$(B7O6s>
z$|W8D-!I{3^QN0bEY?KVKVHTSAPf%JpA6z_$Nn~L{|=D9r*v-^U8^eqiI=Q3bL*4a
zq57Q0<=ET+{>j?zbrHerdB0pzaZ(;jDz<)nz=_F7bKxKu{F;Dpbd~8DFOK|wMA0~^
zg(M!}Tx*bn7DWuzU`3;?+R5|Pvmk2z#eJGqu#m;Lo-JI2sW|+ta(%Ol3Y(Xy4P%@W
z%N-lxWf>o$;CHIl+F_AQ0avZ1GCk>qd+jocrjY9Ea$;YS5>(tGIjSP^@Aj~r{kq`y
z*TrH<Usc&+-pUNoSK;_F-Kk%u%2X$+tNf*6s2b8{pgKH|m$TXume_!>S-0DcUbUtV
z)%uCR|3uo^GJMQ_1M0<wJlNkcKY?gGw_u5i8XOLWMNhb>??7+K`|%t8vf;Ak{>qr}
z%<t03^Nr=ddnYIRjA17$j7ldbM|;*9P`-f0;`beqBMP4yDA@cO^t4LK5&$5L7?}H?
zi)#(n%R5Z=A`L^Y{}ALZYIjx#*0<@5)Z7ZU?k4plDdCjPDRrUJv7wyFgReV_o;s;h
z1sR$m)u>q^sbCa+_5H@m)6U!8D^VPeED~DGlplrhs%mc4H&?6sb@{aIX_@Ceqp}#q
zP3rfb-2M=M-3YJiZM+1{r{$0-xO?MMET1~hCHKZ#x0CxnNvmCln~3-c)?iQTF}YBg
z&R1?jg{g0{D3s&BJx0(|d*JC(u(jhW1(#;k?$ltJ^6Tn6V@Ldbw}P&GSndj0G#Hgi
zd?(gj@ki9R0tgXu#O7)D_&BA+cTq!E_kOpC$O+t(FMeAv$8ja2n$}s_=YWz4mjASd
z{J4)Zhxf>Slw9_zxKFO}voqDZfdKpUOgP^OIqaPG<Z;;fhx6BqXLUrx`Ol;<1P_Do
z4B4o?L(Mza*_p;r^L^QKZYKVX<)`)5gmz{KmxzhR_gj%1u7c}q%`!^GVk2I_JW{n~
zYiqlnl2^<LiM9vqICUxg*H!O`)ttXr<Z{G?Og&SO;7=9*S$~=*o}SL1tw%!*h=A2E
zKr73R%y6JzIF#{Pe8$^ppv0Sa(NixI^r0&4FEmu`kUo8<g48#)o&O~-9xs6q>|4>W
z?{XO^9glGkx!m4am@C}`&2*J|ra73aZ7!Aa#QBNCrR+c3Lmr!roy)g~Syl?oJVmmA
zyi%+lPLj$<(Gf3eoCk??Ju&>)4EYo>OawClc^h$d(kl>+_-37N`f=x&^z+Y3k`h<P
z+p<Dw&$~syZ;0&2p*N5I@O#S4BKxadAm4;v1DX<1OT(5?6Ly#F1I&M-;%*gwf-9#J
z+Dszc)-%c9I^sAE%#9ikG<UmqEeSiu*kW3$v1h&|JDgyJwcm&t6SPJXTr0I?>9YZ4
zrJgBJV=8EpJl{6KU=9;csj(1ndMA&S;}$g`M>SK>LLwblAAUTyOoUlSL&nD8p_TMH
z-U4xNBi;T{SMDclN%PR}TAA5bZ+@9b`?TFhYm}i{!*HMNGT!j}x6LZ1OwDej(N8y-
z%1Qgnm3kRoU~EQdB3?kL;Ar|V3$9{Ht4zJoaU!2#>~gH9D@heGIxxD$pC=*k);Ie%
zI8%%k-!@204bMi4{uoD1cPFT*qeR;MLNbmNziQ3%xx{?zkt*4_`ZdtOW$kcxH|hLO
zOj6qkKu24<YJ7bd|7W6}&2yX%e{Dhsn46_j<$MTNY!-{fF+YRl31~KIPkP~8L)Vbd
zXWVE`5%i1M?53vdd(WS~E;`xY4|o=O--LKg{?WxNk;=-fkyKy5@Ow~x*|>I7t8}9!
zu*@Rh6^Um=f!CUw>#?CU2gaTt5$2)-H&pRWoO6(_b#L|M-27Ws-0f9T@#&kk>9#6L
zTa+I%NHTHrS6^kbAc%`xSeT45`m>PzF?>3sZl=Nmx$NAY52>u3nZd<jpE?q4v@a!d
z#rc&Nnk;L$eR|7M=G@i+FL0<1%wVePZf-`hocws;bb)T>Npwk>+~Wb@XGZDWj{K&0
zFK@Wq$g!4x@V-o($-#oejWt^5qN>SS22+bm-rIPUNK=hh^=8U7cK=b0O=$P2Z33zQ
zF7X54gjvK-+=J&DJceLHsKU=TWX|`_cRLr);_AY(ibn#L`Rn`t2Im?|`OPDS)&CvL
zTcHan!(HBh6B_6*xD{FbPZ^%DGWFMHk&Kn4S5sQ5o(5A`jMw5&VCcQ(GF964i4>bj
zkE8Y*`ZcC3l1hn*C9x%>F_7?<5bhY4Yi<ca2L(B{vMv~B4EBD!TMD`uh<RbtJHWx<
zDf8lr?NHFL=n7aUU!Py;d<EP$M|L0`3|_#rdsGz^5FpFL5YyTIxz0G(I9}4Pvl>LX
zNV~ojc(^KU{?;?K=h!FQ#h|4IbhQWWL`5~D35;so(a)h-4XMHf30Ap8U}4(c){Flc
zteCL!gw1Y?|56!BrxK86<Z}Q^5-c>JqA$Mvc*>6uBe<Uu)dCtF3oK@(hIqpc>)RS6
z9gE%I9&UFd+XIJ(EH8nBTi~;!y*-~PPidWphYs6XY!iq{vuwT;W799zWw*8(ol7b|
z8<PkzP^7MO0B5;ybUny+&sN%O!&R-Ak~t_Z8;~A_CQS@Z2l2M?TZ#tdb(MMYAKRcs
zd?i!u;QaP$&x9zmn4VnXow%F-jUhvWf&R1lSICqCt6my@kE4>I-zTU`$;!F%{focN
zC>~zcp|O#~w!fH<uA0(}5u%B^PrKwYH1aCKMm}7Y8256WMP2)FC(GkEizw;&E55ka
zu>;w)Y`v+#^X27GMAC}Orho?-%%?-JBSHp5Nt($*Lv&4jcXtQ<+k!!)bJZ8?-WYK|
zXhHV^Ss_nab@}k{-%k?w_)bQ*Mpu3Y2scn-HKD8?!X0=v7|gR;c*<hP*U-E*k}oUB
zEnf0y$$wT1ViWMlxvuwl{<sf<N8EI7J+x{5{pw5gs2<YV?#km5T`d1I-(0+$@-odv
zmDHX_LpatsivDuGT4X{`f2BTiHLa26*yYF9FQO>kOobBW-m3Q|XK}uK^Lhub*Ul<d
z-RBd<AiB$xLFvq1^7m@Vd*O@$PL&d!&r>C3WmTT8&h8t=du5I4pSnukcC3%)|9`Wa
z?0@L3^mamlsez&KV*=e99Y6kqQXQd7m*YSy=T_mZ*)HxE7=A46$5hn7p)P?l?3<N-
zY+v=!(0XJ0Z8vohjmf})y~&|Z^S<qIjtfWj!XG(S5W|FqB}9#x=j^tbdSZ{GcInQ=
z>#Gy%KUJ$`X<u*R9{UguOZsgVnY(s_#YaJ+VwRlqG;Se@$PAmCU;Mhk3W`Wn!FCBk
zn@wTpqfT-ix&5NiZX!d^q(M>=?NTprxi$fXt|cT$mM1SS4+i6S!&qs`>RCz6`^zPO
z7{7Nb@m&`G^z0d!jRD_}uVOMN%ks}%pV{0Epd*aWK<x?BGyck-V45Z=I{<++=?CYI
zz^o$K9qDJcoohHV*8&hBfx!T14Lm%E43{@aF+fidfmnB=359Vm83=75n3)3tY^o{;
z5+l#86n0`_b$n`fJ`QRu4VTx}qG+4M8<y0tKrIBi7JyQcg6-K<Hyf)43%63r^hr3h
z?alS?wX;K2vln@h>``0RWzXxgx4&^26fHI3TFAoB<nv5Yp-1^U6!Fi>%H-HdUCNQj
zPn$RD0jli^oNc{ZSADgaQl8ggiE{1lA0U6lMmqQYz-g+QFxeZjjNszX43H*ILsMti
z1j~zQ%Uc77!Mh7RfUw}^AuLOD|6Er(I2cbS`1k?`pf9vEImX69Rj1r#ica!Cl7)mA
z5-J??E1dAo6`Mk2a6(mS&$Yj6tJK~d?WnXE1%(i|tIlnn3~v#hEWhB?2)!#dR!CnV
z2>a7$fI7qMme_6Vf6i3O4-dsBO#Uqvn`Jjo??b+KkD}O0cUKWIHyn`Wn=smhQ&@_O
z^dqf8pL@|pm)V7VEMNcOcFEMnSblOo&c`vF{*UkxNp_Ja4(VPx-^46}mp=Nw>Ru*V
z2rr*%Ryd`srJSpbzsu(AaiuUYsc>vmmNnyhckwa}QVD;98Wfl-odKAFS6*u{l@q1<
zQ!L+rQ}2Mhjzya3#G<wl8$&@vcnL&D<nAyttdJvG4@}`Hyqn`I34Fh}=_cnaM!cf;
z56)^0cp}TkhIzq9y!%etTWbT#EhGbZDjvJ(!0S9coj6!H?~~kX*yb#eZ+zWTHqKr8
z<Br$*?)BU6V%IvdA{<T)As)5A5AXE<#D|Jix6AY{n}_^RHL^Y_GG$f$bN@b)8eS}n
zm)^0VRWWsSGM%N>sno@?2{>hlnCB9MTdl$HC7!~1ELO6+JHe`^_*PPe{P5|m_o=;s
zJKWH9x#7_<*Sj-ZOfKdAF9QuEsbNOY>-+vW{Gh9CxJI&nDVq^4pEn$RdAiuFz;|t3
zpg{6&MXikvSKNCot*QDn<S&d{Wm=#=;bqx>J_NrIQDF5OCaH`2UZYDMyPs<^<CL2=
z>kp<Fbu*i?xo&RHJF4g51>{yS=H<2Zxzo>qN9D(Mg&gRi*d^6DPk`15x|1shE|;TO
zmdzxa|09dXQQAXt!;tD{D+-rYYxW;8KSU@hrXvSw3V5U4s5aJostRWYiy`enyb&d_
z)oW?)F$tFFAm04Dtei*N$K;)%(OSH;?Rw8}5_pgf_kSAGy0o?R1%`%(qW(S6g3ALh
zqE;zLUpAe3*3DBcNeu&aB&L(!Z}(|1{$x1tB@4CyDpR^6Q<=umv8)Bx#fpN|-7t~l
zd}qJ(YUA%2+pF4!R;5HHHDE4Nnp^A{3p?8TH_(%mF(9k&=z-rk-cT#mFm<E_v9bZ~
zMRyvwxwdq<8gb+7Z0OcL9zVJZbmXFf7%1g(ZvowA^X1JbxvzhgPlI&s+{%sza2n1Y
z)a-6Bn>{6+B;XoZ)ux&K`wlUxpHZmW3exZeYmuvmP~lG4X-jo*;K}Vo<E%Szd@0kR
z<xk@82VLsEeYd4B@EcaEw2PL04(0iao1y@tpAq{gVBUOpu*0=}20m4qwAEe-Nq^8_
z(&R<Mlb!warG%wdYti+vD1Q6X?mfe@%^c(8t^P}XoHOZVAKhn^Dcpm+ZKfo6MdEnb
zb7`v1n^BiTUogVO14j3rioB|P*!5-1qioSc#736e{1%JZgbp<#HZ*r2Y52puUKsPY
zpynpdgV|PZvC3Dd1FCW1e%27fWV1o6vYQa?4E0V#&(z`2mHlJgk9Reat8_jj#pIps
z7iSpcJt!yB#wJ*rhL0uJl>Ndn3&K7PrI#@=p*~61nBKOCU&7z5yc@Uo8_0x5ND(wm
zvRR=JUEt2B=hWP`G((!vV;5eq)H7Z**{`j8ZQ0H*mtDP7ra*&qDIo+;Uy3&ZjoKQZ
zE-gt}xZl)avr(TU`u*S4b)wI9X+%ypLtp&Es!FP=CP~<R>%))L6^||%rn^{a=%Sr{
zpX|pyhRSKu^o`9xJ;+`b=C^XGAY({YJ|aTTv0SI%JvQQkvk?@cdz#|gE7}gnv`7@<
z42sR-7(dE+C0^~&Q)vcN8yFHKrHB*4u)?Z48L0JjbDE2CbsHHU40@5`#K1GqGQ1l=
z72XP@&G~WCq^j<(1LuuIY`33uQ9o|r(8nk}%3o#7_4NdnyN=CRy#NRJl|7^0jn>w>
zb6m+HXa>c>!U-fvD)wnC1#NAU9<#K$BXvnSE(pOjv^x3kfVbb^9C2zIzkAw**|=G_
zyzU<+&l7sIthsV+aCGd={yevFp4lHm^=SKzV}12*??FLKNZYIjYy<SZ1%@h4l{-x*
zEyQeZ|FAd*7NEn?VvuKSN(G`Q$Y;BF+W$Yj`Qbw<a!^h|6EDrGCj>?{qbnK18Ki$F
zC&3NzazuMR;u8)U*tO#6h*n*!z6~ZQh*^~?5_cJr8KWWO1ay-`BB^2LXxm?nFoaH&
zyoW%X9Mpy7!|gzjbh(F@tB&IYN0Lgko^hV~<>l!r20(kt^zCs^!~~0;VlY7%^d!b8
zl9#s)5gLl%X6ONdrBO<zzb!)(Hj{-cmh{|A3N*mCMQ)cmysyNm!;0q<VK=U*Ohsa?
zY3=Lgm6{vK%Xu?VtGLVUZ6U|MJ7X9PhKIw*m@x95kCqM6$mI&eCpV%YQzT%=G1du0
z-y>F|xAoQ0xfWB$z`6vI;X8x4)>VWyh;4&3y3Fxa9m40!e*@%!KP-3nMz{Kxsx4sH
zj<^DyS9S8iZqORj8h)6I^iw<Yo3Q+&rap$B#gm+j*tb;8BBPKLaTYxcm)>`odHxi_
z&ZVe2|9v<gGtCT5cOGlqvJrdyy3o-Y*6GFFKUv&pHZC&El~Dc>F@cX4tEbD%JjxE?
z5$C25u~^4`DARJS&{|9T*GBL2aQhyFDIaRql`+F3vBz)3f_)B!vR;ys^|jfjdRsx6
zkY!~_*IL+J!l1_9LgL@Yc>M<aq4ne8*YWt@4)H=DuHKBeSMYhM(*JV)h3iJnXi;_d
zLAuRvuP?$Q?|#;(68S!1!t%h7E&TH;BH=aTh#Y(KV~;`8NC<y7;MccbRIRjbYi>Q+
z+aH7I`40d?wO{*^D&L2)aKbB7777zXN>i8k$c;-T?ayJBtk9dQ9qQs;?aYHUwa7`~
zo3xah>_>fNKTvjFrseL+aY8j!&fm(bo;GY&pk^0L9yMg4%`R(88XL~SS(~n^S6h^z
z;^+c22N8)YvWW@VRs{<$NVX_rIw3(~Z%J4;6I@l@mDSa4^m>&P<nEyaoFbo;n^iux
zvin4Ym4u@A{xd>E1bubdYMosKDD+e$XkB}B@^5AL-~X$L4GfY9hsVc30&Y`f6LOD5
zX*y{PieFnYP>Sxk!20k*#{A)#Q`*-8OxL`dxs!@H9Dg-(FvGf+YTlWxXK7hL0lU&D
zCvMd`_dgTyhC#Hm$Ljp%I<@*(;pfF2t32ou`RFt(<mfmokZhnz$6plKuLRfzV89Fp
z0pVRs?Wn9F_V4!l?G<bmYlz%!)&!wK@t2zciWFbKT`;tbypX3~Y#o>8Z}G~NC)&Pl
z@#<v>zxis;K~_cfoNi9#n9xzKj1+Y4`F>~8Kf6<#_;&hvJ;?6*@{sdXbuU715ZBl`
z=G^Q$tM;Cysp?|*z#0$(egTAme_^W3|Gj0ceL!2aJCHf9V`Rk2W&lw~?N8WoUDz?u
z3>0&G2;i^KhrA(La$(dK|F)2;Ocb`LU^3Z5r`b(Q!STC)|Ll<r*p0*r$NS+wUMwD6
zfgaejO}odm%8*wF)&}9~Ca!(N@2J@w40w>tUGpySXJ|l$!lRd3Fl9_2ZMA$#UJSF-
zK3&Vm3q4%Ru;*AnAS|BE{a-$gi2=qY!v~V?zdecn<LF!*ng0JTK0+h<PzbqB<bKON
zu|$c8x#TkUyX87^$t}4hM7iH8nfo>7e#xcDU5za*B(YdbExGf1eSiJ~Z14B;d7kGy
zPPiO4{##mr^AS4{xO*p?ucJluTpmnD9qu;e1ck<2a7~r$W!?dQx|^2yv~jGJUtN&g
z`_Vp7!=ke>?_O(m1>O{~)^pG|ksOvSVo%3wFC8btc|G>MxR*0-ndRJ<nYUph@<!~!
z4D3%om10uu&D6Vd_?I6>;;o)SYW+%+QDH1U<L*YM5f~xDbao0UA9Qaaf3hq#uH*_0
z>wmpD$#XJFX+1;qiHF>*C0eRA3MNZX^1|QOasnhfSN8ZdK~s*Dts&d#q}~^*RT6DD
zB!8(nlsh(|I$q*u>@VuF#bE{_;1&CUHdyq#K1RQ+z<KlRzy7fg&YSLn=0Q-Gp%^Ox
z)u=vR3!t(Un_R$mO0ZWeNePRD!&pW3GLn_Dktj<mNb6z)bMEv0jv29UGWr4<Z}@|&
zy81}l0+y?T%HUuC+$m$8Il59<6|>)ZDYXNiOzHYHwpfIv33sgEIlns<OXD=5g4r#S
z`{D*@Ntv?{Ud`Z2%{QE}aS~8A;ZGqSu*4^|Ivff)n(d6sy;kz3pD@+=^H=4*Mli-A
z3@6d40<r1KUiX$=gjk5C^vaupYU%wKPwVOlXpjWHOni74JejXi{WF`{;D-;WhWpVm
zj^}q}RgBg3)h}>aw_ewpg1tYzzRwP;R^_I%xbU8A`=QJD%fi>qF3z-o&mqNgt9@G6
zK+@k3)wjR3w7ObOy$bWxlD%=3E|q;oW`BNYGWluAPJp`br-|sxY4R0sWKUIqGJ+~q
zxsddbO@zN7Ej{C6n^Ga^O~Bm7KgDpeff>439v{|GRE*}BF8olKoZ;{iTrkm#mm!r{
zVW%&Dqli`X$yvW_tKPXxZ5$sv=;3H$@v*VX>M5i1^Tq!~^9_3!)M;PS(SB)inH*_Y
zJm0+IV2McWnz(8CpFD8pK&ux|_JJR7@ddhKSuvNT8pBB+tT*TZoqW2!7F_ui7Y>NG
zhH$S514Hnw`SpOr1Kd0SJ+i!_AReDZ578vI-q5+9C>N3o#s*%;{~B^>V!Pn_p%}k-
z{GJMklt3NaqtV7F6iUCuC3|AO$H5A%Lg#Q$2Pt>H(l1dv{Ibla&$kwL1kMOKI3{oX
z`<KX$Q0opyFY@U`himeuLo2^M8wN?ZNnj|wt@l5P@<4zi1o+q!oz?lz){QjsCcXyP
zqe(o4BCX-ZP09Wr#*FGO)1ZNy#xNwemH-^e!PCWK)!ai6BkS(ww|JBf67ArZ07H<@
z2|0##q3>v-0?ZxP`2GAJ0O$#})w>`P{9#EYsWx%6w$Po;;B%JbVp+#$!=e{&_o;F-
zS#bzV<%$Z~)vUtS`E6A77BO;6)EmT5#hZHk6;)LcRF{<GvyD}T(B5p;0xKUO$h4X;
zX<ep5x>JJqLCjjy#Xcw(lFHu4_f2ka`niKgz_iczV8-vPFZHk{d={1g#+As=%DBIc
z_PmlGLV}xxPFsIvwz}w0bLpN3GxCDCqll66K-snB?fjqEzp`)gAoX9!`bUvH`Sqbm
zM(J9jP<pxyiCA?6*`{iO%VJ3R0mIial?Oli&pN!pzBYG;3(w$YGd**>r8-(BwCBs%
z%0&>nD{Et8%c2sbQh{vi_X%5+#psi%<5hnbRzO3c_3clcY*WUbvS4%a?+q#hLENs|
z!tO-f(!;vQ(pQd_#>sr*yUM`TjBM03zM=MYak2Dqbx{7;Z-pF0S&q~%cw5w$%|yQ!
zlUIb~3N9*hUxKtK!661?s)krgVfss#AavewXHAROzF@1~3Nul!CEqHz&fG*_cjahX
z%rPGx(>j1udqtG7bg-4SssHHyiHUJn=jDn&?lL;;UII@~X0Yy7{LJTbUE_?}PAS#X
z?V7hBtX`429B?Z5)&zR9b@iU-0F^*ZD)k{Q^|7OM<I7*A{6`*A*_)mk`Q17rBf%P;
zqj=#(&MPTpZeTho%{m9md2Uyu%B5L`*SbO@wfU#jni(oKHqCf!-_`>Pg&4d*E$K)b
zd{*jaLC07Q0%7>$E4CQ8GL;<Sc5VO?hk4z0fI82W=}~85NO}%L-SQ^F49Htkb465H
z-cN9!3*V>xRSW9|Dh8JTb)@LOla_<0G26>0G@ZTq7@qF7l?;vx8*^s|O@sDMo^Lza
z=U{I~@Njc9kXG}w@@-aYrb2d{JYpY-G8)9Qh7XIa+4h<1z&|DA=&_V>!><d|rLL81
zuB}aF-9~>^ZFoAZ|Et*2*s@_e#L|i|;`%RI^!t<HtXJ{M7YiCm8wKSc@-nSUda&Lo
zSNi7e&rT)|OCAPD6cI=q7hRPL#TjBAKCIbu&<l>#UM=GC)^k0pc2MvA@-DFZRs@!$
z(c0Q}4(4%oD2x8&WIZAUqM&dOplq8z?qPkq3JZ^%+eAyVnlHaZZ+2}q_I(^~z}%tz
z?OHst2cbSq{RmXu!I8tQrCutj{ycZz->dbz14y#EH2r2Vr4zUuEEhu*aXh)MQ3d5=
zdtf8Qx1?A`fW$Htz$Dbqj&g!`4G7Go+SCTwRDW?BvZi!!*JsncLgik=gXlS06{l{D
z*D{{`Oe(zS(TkL?K7XRiC;4ml=wPRB|Aphxn?7)=KiKd`%aO-MY{}h}{Yl#zW?}5O
zVVu~FF>NffZM=`1UN8XFg(9$05$Sp&@vW5oR#YbwT@=)wDB@y*;%nd9j!2`8%Rt1*
z_gN9dI;<g=nfx!KYkb_ivu}lMU39r2#-0t^$|sTOwWAG~^+xEqIw=7p$A>8ve8^oa
zfAL%Em~Sa|pA3B*Yd0N}{TdrNW7!=IM&;jMAXZnn2Z<@kX@!YZepr7pCH+@ot0RJ9
zZkI1A-P$gQX?Jt>=0!TR3o96`79i3Y+zrPQpijygp+<CWaCSG3OY8`Gjf=Y6k}!CN
z^xeUqB_)H$mVJxw>0fI1p3M74A07T3DrMGX%&gv$YiV%ki#}QnA+=2If$oR3y|oU4
z{>>UI8Kku`gV{R`Qb^B#4zky_dzNesO>ovslF0Q=r(ddtUH2Nk7(Yuyj1az1dk-Qy
zi?ojw__EUc9p(iVy1_sjtP=&s;$aDTi4>IFS=pzjGLr|&q`y{QKm9f=Sdpc&kq2S}
zty1)Y7a8bEKpF2jtdpnohp!e#=;tuojg(5GEE+IVZu{E<HyLi=t_nrMVPsZ-9EUJ4
zqofL4S)GbP2pX#?1Bv5Wx3~_@#QGt+|Kl4h3%zxxDyINlLROwSPk-@-jEn?c%Tmr>
zD1*T#Fc!!DIe*VK+$u-(=iil6d?=J;Dzwx7a!Oz;A&jcLd9)f}pjcv3LD{Z`xl{Og
zYn0#fxB?6PE%#EClp&buD?PcBhw=<12?zC*qGn04i)_g=h=S=X+m>$cn_?VQqL~VO
zt`^jMbO~+^^I`4waCSof!zKclm+er|(I?uIFPbI~w&WL&<d0wW?Ew^A%x_1PG=Gwn
zy91HJ6)b1Mp{H7SJW<$Ye7@X!e|HVP9kI~~klBKFFWc|>w%NI+)?W9)<!wB`@7PN+
zviBYD&)3}B(I8JV)U6OHAte<En>_PL0#A3xoL`&FpUAHeUW)E{K9BrWJqR|(nFr@R
zu8~w8NLVTQ8$8m%Pxh=An`#!crGK@F1j(9@cb3(M!?cR0r@?*1uWd3M_$B~Z_4ZQZ
zgvXoO2>Lsc8q}`VF%5?#Ma*ZFiE?VunPmm7$OxW_Olg2Q%LO`96V6Yy2P?-Aqy9C1
z`hMs|-BGbQF+0itvY<7e3>v>BaYG0z{uIMWdtwrML2LNYs_~YCe70>07S!3y*uK0w
z46>>f7jXO>d+gBHN+;mmojq1a^at<%n8UTpX)5ea0?=BO0=Wkf+BT?2C|uoG6mnm!
zu-v<JT(aVMGcI4vm)Mfl?9oNqu_xc~A`z}u<Oa)G7AD%FBvp1ZkgbFeU%TqmWrcJN
z%wN(53POi?mb#t17xz<`qI;WAjs4o>nDZ9<_M@iY(mKms+l1g4+<Gj$U8*Me=Zd4Z
zKv+ITyYEr^-^o83H;<n>zFkeOyL%ra+&g22Y;$Q(Mmm<LFOAh$OOu3-zK6i^FYeyG
zg_P9?WeaDPEjPsxdi7|{skO;2YjtCEfNGe_B>aPu(kT|+OK>^U+TRQo?dz0P<d6cB
zu991-+wmSX=6I}cpR%KKq>eN}J{7sH>%o-x9AF7F$nzc*5BbtaH)BhmYz-K7S13Ts
ztn*ROv?jN@8kIM6C{~#0TZI{0>8DM&=Zl*ULU%$52jkh9Qn~XF4m@N2UA}mFnMRwB
zM#ntFhml!rE19HZn=`p==EboFXI{N__oHX)iO~2f&27BZ!uPTCaiGd%hRV5k(RWaU
zHrky5ZapnLeeS?mo^J6>$&pf3)Xjn4m<x0q0On(i=erTnBl_C93cHH*vG=;FsUc_j
zy2*;|mh{Cp$7(qrVxc0Eq4GKT+2)rJKCL(&3dFk9LNXgYK9comyLYkkfoL2@7(=gZ
zb)2q~AugQ9JRQeymGv>UmHKd7gCQ32iAKY3Yo;#8Ba;Fn!@{-}gw7_Fdoz5~OqCRK
zO)dAYb(SqnAJlt?oN>I=P6Q&b{uhEMuoHD^(Qr8khWFOXMbf^72;I`ZbKbhil|jYj
ziYqb$wvI-1hFppSCTeZq{$sFp9^7k4S}H&kkiFB_zyVbL#Q$Wi??~t9!bwumi%qT&
zgHcfOx9Lw^=nlt+Z2S&BUaNRHnQOGAO+Gw4EV__9(j9eBPT6<nK^S7ZSiA%h1NnzH
z9y|;UZP+J_?UicwB3Lq;65|c>WdLhQBwp%%Dx)%9T}wx2r6@rf{Ts)71NJGz{z6;`
zsFw_MNBO4yURn~F{M`uN&GJN}lqVfNvpd>e?z*Ks%b*(ssJCjPC;W;1)+7Rf=Tn$D
zRSSkwri_m9xfbB8t1;TzTG6|^zau(uJeJxn#VUFey>pwbgfB;aLInZZj_Jc7zKs1d
zKTikonMf8M?+9AlZgA`~(SX58Ar;Hv=}lBkkL$10r<Oz19VhiFb9o``w7m!cvQ60}
znjr%`tfHSn@xY-~KNzy#wG`RQqqMAE9alzu7(2;RfLmMJ^Dd~EkFGI%CdJp3i3`+=
zFZ!d-3K>AQkv5Kw>ih!g0wK?hI*Q~K^{J1WAMhkD{~j90@y*IxaZr(xD%mbj-}B18
z)GN1KHH5hhaTjXPHYv%qr~8!}yTgua9c=_n9M;);-Ryo!;A(rGiX`O1^W62<v&LHE
z;tvOeY+{t7=AJazb*`Asr0l%i-d|QqO#aY#Cv>dWE$Umg&fDma@JEKPLD|pGD{Jfy
z(;qP_N$#T%&VE>4toFo5JJ<J0`x-0PUNc(t-Gag4kIUnHLLS)!jf}~$4hhc-`cXF>
ziO7?!L2C37rSAZIYk4aA&vDUZ`ix{Mj%S$fFZ~y@yG{x9CC%E7WVE$fF|ni48b>6V
zD|v^N?hp}5`p#pBaJF;SAM#>ltn-=eyXN@+x9{|b=w7NiN7leW$J^ABgx&F2-z~zl
za%%2!#}647OKLI%kv)if#K9)9=cyQPZ_ga~+WsfwpqICeT`yGJ09pr+%EVH3SzcYV
zK{l=DdL-SGS|g*}u6GBT%~8$=Og@g~bN(4>=b1+5@xa7ebd)B1t*NW`G`Fi4|1FeE
zW)neL!vG?MmeiQ!fPZO4pJ}{cG(yv|tcT1p{Zw0;EgbZ?SbI7cDRpnKbsA>peC6zW
zHwJO1rqCoiy{u8Zf)cvmjbVGLV*eZhu676vqyW5w%kwTVTDfr69dzvB6=-{r5n*9J
z2BH@2lw!+_OKDd5JCF=C>{Jw<KGq!9v2d8Ih~a;eemFTNWs2%?sL!6`Sy$@`?d1gV
z=K@b^ZAhy(pF(00rArXPV&ruZW{$e{@+Gj19mni#>m2Ev?40DDd@t-X4LUhG>JfZk
zPux(oLv08hZw*fAoB+9^He`JEY6ySg;F5U!*6ck;x>x`8+dR~MsU5X%kuM8vmiUv8
z=yZ^mua}a#!XhH^`gc$VvVkZRz1z<iBT!GKSzhsq1N;s^@K7uCb7f62Ahpm3bvU@D
zCIgQF6`*eYsfd*RF2tyE+I_azCIZVlb>m@t)sOBuUn5z(a_%31G(p!>4-h`Gp!Wp^
z7Kw<SI5?@r_#!yU0!Nkxe~}n_B_=X5Oxf#8x&HLUpt*ifPFPIbk<C2>1|}e?Z_gJJ
z8xDH=ilzvfdezy5tx-q&vu%sfc81y|>DVlF&z?GKYafl5^5a}jbhk9yA5weJ@fJRl
z8emAD`)f}GC$u0*F^-QD$Kn`^Ad#fW4jNvgD-zw#k?GBEZYX(^QX48)u55zm_XL2x
ztmpY1Y!iNMq3OYKUpICpJJhG9@Obq}$?`lH<gfiR1DY0v|9;(yZ5n>1xhk6(FdA)8
z$G*SihmSfx+qB(t<b2FnY4gGH<z<E4ZtIA>3pMnpguhM+iv;}b->(u74S(58C%4F3
zD;V$SJu3Or>+xZEFK)OOL%8wDIqW6Jih&ES5IcSpD(_aaH8jUO0}>Yg$S1jMcy)iu
zQ3015)A;RKDTkK?t1i?@UL`73Yb;faF-z9e_3>rY6W*bBMjtOTNrWu%LSu!`n39oQ
z^OKTCf0zBE54wczPTiZl?bDRQ{zTgQw0TOI1~9Se{PO!J!hrfkq%If}Dfq!Ra&<kS
zi4x?j{2B#1%PKt!X;_>HCm}W6J<hovIZIdH(e-e9LTxeFclyftj&q0!4dWNr??;{Q
zzBPnc|E;X$n1~DyEXk@iuYz+pUAnFu$I5CndEv46QteCej|CR-5HBy^!YmOVI{Gs(
zi@0jL%9RPb!=@aZ=UM|d{hH?Ex^kB@IqfEE@nx4}IGj0jWjcg2Rnn)eJ(+L!orNT(
z(pq!Q6)-Vak<+=*+*Q~t<TqOJDNRx~LVw8izHM9I#-_tYXWc}|q!x(yvY0Z)jmoeh
z+kcz`@wa7{T1|aPdxwz=EGuh6Z#Xl1saQkF5Vx-R2x`d7*KrlLsF8$>Aj{Thvuz%Z
zwblE8^YJZTI%P&>b58RzbhM)ZlZr1(|E34kT~@b6)rR)B)%njrgyUZ!^dQw3m?gP;
z{}i)Rol9&n+Oa1pi5_fUi^o#kdqUC2+iERcU0q=0l*ELmgF)C@E|&t1u0n!B`el>@
z(jZ+<)Ei?B4V1oHI@)15wRp!DAJ1MyOJ;X1BHPqlfc-$u_!^Dn=~fXb0li2Dx03I)
zwL6Jl%ENbNyL3*Pb$Yt4S$i5yDjMr~6qJK0*m+C)r=+9dr)M|zbk+@?LM6%GJPF}#
zq_uy3%{Dn|lNy+3>C2U?qA*<pbkUQ>#TH|{s_`jlQYFDQC}iO(xp*hu0QVz{<pQeC
z_7N-bQ9=Vznqy&cUe))ivANvJ=B9~W{h+lpWq%@EEq$4Wr}3McjQl7ayFxBrU>0dd
zu+TG5pQOg#|5CTpCdFhn|9&apVN8r;O%@{2IdAo27^wv`^MO2mi2do<4o;=jfQR2~
z(Vy1TuAtl2fJY1REwWeD7>!0RKc+2cQvSCo{9fMHyo0YTcbwb<#XR@eKVGfi+eoP_
zG|+#RDGYSMfV!8Iv|5Gv(%et^%y0p@7A=`@UX+>5uUok@a=W#{4#L-M3Y;*Hpm|v3
zI;{gS_japzF)oL@6}hjsR&GVfUEB&EEO%UDEU}FN9oIKC@kVP$uJ=u+Y_tFMLY~Rn
zss5PFa`XMNEhe*w>;7f&V`aPFwjw`X<MmO-Xobc(L-vlg8Eyschpw!RBH|YLaf)!R
zoarp2;pnV7m85-tA0SkQg<{mTg)%2x?zIKTs2gY6w8uhmywL}wMLwmNf1WY>HIBzF
zq|m;-3IFKB!Dr;L4slhz1&vi0l8(_PzD8K-g-k|<oN9Kn72PG}jbm_nY+kmNt1ve=
z)GSUXQ}Ao!n5^))EZe8ToUQ`6-@(S{M~4{Bfbc$EhIkaCVq!c8w5<Mlz&SDODG`Or
zsRq@H2FdVv1}w{&SE9<vjF%n>3pZQ%)@Aqh$+Z!t0(-2T@<};z>+T5rNi$AF*Zo0q
z@+*&CkKbMFlv%%y>KFND7+q(F2!5wkDa*SRh-#RnkNG6h{LcTZ+o?^Y`TDhA9%TDU
zw3d4bqx6lsY@<5Gz)9p&sQK+UiGh66T93P42d{)l9{4tU&(x6`qlOZ9kkaaaXqr_~
z)1|^@db*g6i4QQ`b)R!-8o40R<>%g6&%A&&J_A9gYGb0C&H9V?Lx*_KhM25Y@8n-v
z6WY7;4_wn6sV3fC_1*>e-@!es?*d4z3mpi<QJb}0%n$FB0l-^o^_8z6A8<yWB|6Sp
z-M&K8s^Bz|vnTzF@}K{H9hqN@<(ubCQN#T}m<+!#i3iAr!KK6whX&AaX7yN3Z$=_N
zkRKOF524ym0;i~%%BjhG#A_u{`0~6*+r#bk<9`8m9a=&!pQHvn3H-O9W*in0b6S`<
zw!ODESGP1K!oab1;E=C`SFXq^DE}BzOmo=qGOE(J4Ew~ene}Nc(ogq%!7HDndK9UZ
zG2WTSq(YQK)p!s?Z%AqdeZs!1pA^!tZuFfF4Hv+8Zk;E+b5!oD>s5eQS2l(q4D-v^
zA_QDU0oSo9>g#<q%#W;duRpc`k7J>@S4Qxt#tLVoDRIb|huy3nlNBWCUUG|V+Af=i
z15()f`;jib8|Wkf>PUHFQFG{VP){tJ_}~F5v};3ainZ~lG9>>I0FBP^NVT;4slMiT
z1hfOJ)ww4aP82^(YW4}r*@KKEVIw9)GwI$`V|7Z<e=EzUhOhNnzgj=`EKYK(yRja)
zrN8!kTTT8|DYuUGFD>|!hthvr5C7A?0=f4E`m@C&EA)}Hkn39O?-kykXIUK#gly{_
zOq#YEwEwd$wP@&UWC=N7q@?!V@mUw^vQ;@&6vuMo<@Y0}|Mao44KaykD2QCI9mjq_
zjSDUW`;i4qx#L;ry*jMK?H?iBN0BwLl!J*)od2um*AIX1Eg6&t_Jd9Ny+yl(;%OcZ
zU&;Sje;d$JlE>dKfc}^Dn!)Ip(1$w|$CHVJ9U((^|J3HzEc#l`L4CiSKZGk8bOwsq
zh_t4I!ok8ef3~^BlX<FKt~B3Oa%888r5k=H4PD$^OsOBtLl?M2YYDn~SPf-Y&vbsV
ztq)M(jq;o!pW&cmv0%@>2QkvWuIEz81%H=;Jqx8vQ*h#f3%`Y3;@3}kE;f6Tdk=H9
z_U6=ErlX#}+MV;rf38MvK`Q{OSz6PK59>!h3CC=i^w`8Ak1P%j-gl$p5VxXJ3?WSL
zOWEhvINfUX<HHM7UJuy#%Y+m*>a^T3zjGs#zAQBELOp`*ttbav2Fbx9k}V}R`7fyB
zXMQ__;zM$g2t9fHtp`$nwA1vqBriEjv67at@Nk{e!jKn-GRfpTY$aG1E&s(IyzFj@
zA_@5YXJ`djWkDSZaaIkCE%ysLA%VtqwYrnuf{9D(0Pw{?y#ei*8C$uL%5fbkyn;Y%
zw!QW)2nV52?{LT3^8aEImO_b1F<-Ykm!>%`@51#GQ=1<E@DvSAYHe`nG1M5=cwlcQ
zees53IVIcQ@4lC=p;)f^RC6`7HYC|v`oF;}#NUM)l@VoQT;AYP<NW5#W2F_~>tdf~
z*VY1n<RP^N2f*7mJ`5sMHwa|zvo@pCu#>jthWdIQf=Nv?rK@+Vzosaj<Cume-Xq{1
zbj_O97Zw7Ec;lzqQ8|Tge#008F~F-L!wbq`rl=KQtJI{dDx>;pE$)!NDjUboJgB2Z
z^Lw%<(;_<@P?9nTmKTaP>g8ct0tqPMP9Z1^`L*BspD`)^Rme(E59?qyYtsj<bmACB
zNk?0o*R_r0AS>irWO3~#t}L(C=hpoeoYks#3Wn3eOvkidMrP8^f5W58d6lPGiZ8!I
zPFwBkc5ySeO8%otDv;{B(W7XN%)p{F5$=wn1`MYwTXJ_8!WAl35#t|WP0q$5dp`Ti
zV7!VGdSC4vef%i=wsb{4rY35^HlBS3Z*Mex(pPc7a?t(v!*0G76lG?&+)ohd`E%8Q
zsat+*J=aLdx~Amaxz#fRtfOLdNTY3pe1eUYsAClKn3cZ@jQ*qZc^gas<)(2D6LF;=
zcEA4OxpyvJ@CZjA$pq$vN=7{;IELj4-2G$5M+>nl4`GYSy(m@VzAkXV>N$A{I$n1F
zZvFbqhdTUGzpO7sF5oRA$iyYC?f(5sT+?C8r8yoh_l40`Cte<{=|C5<ix&G%;vtE(
zA}`Q!j2$cPr!1v~`7Xf_VcuP&`T2z>?*N_nrJ#^&D(0;)_w}0v@$Wl)j0<i;^3f?(
zLrDEI97?6(bfvSmA78Ia&;2V9m1N2Q><>=~Fi4|;q!-;ZtaN|u-P?yV)<!pehB!HB
zpP3DT(C2aWJk0;tSJakEL3M1+NDuQQaM~2V?N@%y+<zun+GNKeIhMOlao{<ciGy4e
zVnk?8;p0kq$dMGdGPeC3{aPg;`$NFW^4cBMQ$Y?i4IB7pMNy@KgnWxSch@Qc@{b#-
z)qa-)BmDZ*DJKVmL~L?|Wvu2Yp22ZeqpvIA=3x9;!2-@d{5wo*Rs9*>ItT#V+jc_1
z;ALnXZengy6?BdNtO&!UYOEDI`K#8{pn}(wksD%^phrap^gnl2nj(|`A|D)Fd@Gk5
z2LTipUpr?2a~oxKI4giE6rV|@1qd|Ukkog^q}@OHGf0hTdT0rfTaRN^nAlxXJu(X}
zdo0{<eZ<RQShcL|AoqAwFE6}nBSRrR_Ue@>n(~AquH5@U7f^D46xFGWic&jGo(7Vz
zETk~1ksRoQMS@usbiCYi>_OSRM4Kp1qWKd66pXk2Ua?=XT~K{Rpj5Ve?o*bJPgv=B
zT)AeZRoV)Ms1LdTi3h|%mwyLq31WL7?$&|8^+?kaeYXUhFzKLJ*OnmlPGfjS=fukw
zJ}^;!kN?lo@3n}IRb`nouaReRw@Y3iFof@s%_P5CY{w60{^UcyHk7U%N+6LgGrq=!
zdzdVG!<VGGk`?6l#Y)~!_Y54*nLnq|Oxmag*O}%b)Y90V*4Z1-2^wvL58wzNdOrIf
zflC}l<>&P&0H?0k-qvA!m+6>AvMlO}C`qyM__E_^3!{zw)#FW*jw>)FfI}P3DwtlF
z7f5ujl+o~{Qy}~t$WDGm#fr;_ExGf=<%SH5#SqcU&!(8H)g1rmKMD^_bhKiiA4tX3
zo_XT#atdiV*tk!Ni_FREKhw;vbD<?+lnx@%&cG8L4PzLi*Ml%){_yoSl{h-4CdSG~
zMp+-*a>sI7wFJy%%Ko@vZQ>EOH>e4(XCbeUy>=@;?AKox?}R|n2Gcwo=aNjzIQz$8
z5X=lxoy$aK!CIkQ!F9yo-vo(>oryW2o>Yv5Lq*hGi?PGa!tb<(5oeT`Vdu^g$EYWp
zp@AjGb2=xUjxk3YZHtlU|8+sq0A*$`x9W?GkQ&-%QBsNJqM9*Kz@%vhFJpR<4K#JB
zOHy%6BN;EAE9LSzpJOUTcBHU#C>Yi`)qXLB2Dr1Em3A%JniUtCvZ)hSex~4k#I9ee
zsx8U3){oC*Wq%Z6`uqoijVCCAdJ6U8<j-qprP-Cksp#JqrbXhSO<5-svRoL&VWML%
zPU$%*;zrM9l-o#!yr2DedIxcih;m^6nDM)xI9h>6%g)(~(u;}1mPN8)h#FJ<Q?b|I
ztrS?B+osTOT{6hV5a`X|dD6DqxCX41oJr0nF5T%<7875vfp%rY<biXpoE-WBbnulq
z*xPyESFQ73&kjO_!Lq#v*Grn$UO@(l!{9TzmE8i@+_U;#<@rtS4Llw&#$}+-!fVll
zFL;`{bG^($!Yie9+tiRB_LAFSS}e`YGfWL5)twvA{)?{Pc8{qI&rWaol6Ya*t8a0C
zk38B)ZN7Gfa$=$OV3+bQKG<CvTnl4hpdL<49M9HVg{uqRB}bUK`SYU;t0<i$k{!7l
zyBehN(!>*GObii@WpXV@qGj9EHYF7vMaMMH-T%^>U*ViC3Izwxu@y~YbSm1g$+wGL
z3&#5xm*<u``bYWnw}8)KWR$zFFa*xdYL;rY-6y0I9T@>^^YT7;u5Pu&w*?Ppt4BVP
zY5MQ2g=f{tMWK^^t%V0&UGvA^YOfVf0;MU)W6sZKhm#TV-Uc|FyXpmS&h2OE6^xrZ
zITz;VrFK=3!{O<%WeTB_VxU${mFZc<8#7M6k2XfQiDj)cbGiHu`W7{{)2_d?Zt1?j
z<Y>50I;ThUmt}R<7neWjfWal-KmWLJX4OXT13NTerRYycdk(^PlpcZNSatG+`ll*N
zD4jj^7{U&7kjo8Er!GWAfz7f5Dfsf)7%Rhbmp>xiS@kQ*4x{P=W5F=#xONMv3>EB;
zXDpx3ylt<wba*$2Wst*9{TZIpewXY7ADNCBa>@<KKY4#;#+YPu!x#>Ex4Xyud;gkz
z(Ru<`97-`hp$3+06t*>X`)@}z9lhMT4ac@qR&K5+%H7_vyjE8niQ+Mes`766V$b!8
zZXBIypCfJ!k%K?!2w?1&vSuNUVrDqUXn60gm|txVVI==n|K14N<-uLGm_14$4`Xby
zYN2n{Y`aT{=w%m(CCeVLf6#s4$Iov7&;pn=N?cdyA&D8e_fZM6%E5Ndu3!Fv5h0(l
zqF(vjQimwz642BHXNO5hz7VPB$}kXZj%RKnAfz7$Cg03p;E9a|lFvT(<8)(as+Zmp
zuam>_Duls|MIy=!1awjvaItc-kYv`_^UkrI2CKQUC6L%4Byr3g`x`vtr}lNJkL&BW
zq1PJ;1?Gf|70eswjQ12a$WR{RhIQ+QMlY#6aHhg&0c|vuf<i8N!Eab4^EBoRn9{Ki
z(O;lvxBYgO4*phbrSs=Xnz7l7+t*_|vVKSScDADI5QS2ldectV6V96(#$KZL3G=CQ
z=Dr~@ZcI}5{I2}eK+?V40pu(F&Vu4`@!@>#20GKWr`u<NRpZC~dpf(b^v^2-o#gOg
zZ2cE0U0n6bI3LHpuU>=<6=QtB3Qq+^L66<jzt%Fdt_GvWK^=_wH~;b|=46XF#dFfX
z(FJfP-t>>Y6?3m0MdzaBu&TZlqqJVCV_&o@>&DJOxp(9iE^ln5`aQ|kweEl0qCy!b
zsJbv}st)xdJ?LBXZ@a3rySuv<q<Imw(^afUuq{M)I^5Opr0~P;3?e&tTnoxAH!Mj1
zU|PoJskz=MEcZ4;uU-Oehtw~AK+!4Ud~2T}oJm>HOg|6JMF6pMXemM<vq*(L6+Wq<
zloiBRpIc-E?M0~-Zv2})swo`Yc+e%A@N5`A{0tnum;U?jzh{a!*{7#fsI@eQ^TKH#
zn;MAQmZ^#>?Eur@PfdM&_oX0!u*X8X=6k6RU7>W+L^vJ1elR-&l9394uqFXTNu0D}
zkBUR!G8ZgrUPuac?CWx908~b$6^leIVPV?yim$45ie=1UgwE%G)J-C~Z7Dvo*%ze5
z%NZb(wy>EspAt_F?UG3te?e<bih&q7sQxMvb>#q1R&#Nsg|5C$zuQ?inx;>J|8YSo
zE!q3o*=6gR{<Ssg+4;=6aNAxcx~r9)pEZ=n<-Y&K^3U7M?0b9bd-)e>k!G_D5bJls
z3Lt7HKeUs(32MM(f30x|&Ro6?sX1-Q$Zl=kIxFt;Tb_x*_pVCe4eOI7+j<NCT7r;_
zd;+`Psn%VL)2Ty_E}@kx$s>&(gRdY1rt9$eY+dJEsbOVHCP~J8>ttzjj8?f&^g+^^
zX5YaorR&~~#Pl}FG4dphZg1>lH}~${Q)sc$dJ6CB^cN)omQsqAvCjO_I|qGT>3Fap
zCOMBwzR-O$5NE)&omSP|9YY_hCl^265~~i%fFf2$<Ase_vMtACu^~T3buTqa;F+Ib
z`QoHb?pXFYy^V`EePsP6i-YwQ-6aM+6rqhNR+|15({Lc8+X%gnkmQHd*dLM7xS$k-
z4;Ry02n@?|iH`0P+oOlxoxv}qp_-Vw+L>HMK8~T&N%0uYb1jqAuW!oEK77_5@678;
zSC&<+P4M~I7eHa-K!CctAT3}&m8xZQVmn;6h5Aa4>?LNvBV`K??>(B{bCR+7I7@d0
zUH7XN>m<tyz|=;Tiwm#0=@wS(fnZm1{Z-JY<Vqfb@Yg8KrYhkIv78zAcQU-bDk~4|
z+&X^07**iGKMeW_ZbNkO793Rj#SlHcRT&t82ew@FTRYe{7cJ%rFQbMg2lSL6CewCu
z?r$CYwRt`Y<jJ^0dO-MMi=jqb=c~wMfWFskrat_pgshLyc}C-`=%92W)G?W<hcy4=
zzl$dTIvBpcB(xCYS*(^=+a!rui@d3a^4($+acFp~7;nJB^uYeE9(LyW8EE3P`mGK(
zCAXeibnN+w&IO)aX~y_?Xv1Rn+QD%oALI^6IchN+V26kl72Hqc)+7-YpKn&w40}Z|
zHhjUG^gCGNiPNNa*BW)PR~5&jM40o#H+`u6n?I0Uqhch*22lX^#Q5Q59#_`1KjQ65
zU;lPZl!F8ibTR4x3{TOI4J20r`E{;orMiL+r`$0~5RU(w6Hr3VgPpqSY3JFqQvlPL
z5*49yAtgRXk5=UXD{-WDaY@O`G)3?%JtJbf-K575>kSG$6oz@eY(1sM>`k;S#x*Kt
zA-?{-NcV9Xlp?QA)PVb}mf%6XP$JkPW^FsH<XqB1ChZ>M*m+?J8bCA}#R8ypn?xMF
zFDa|Qs2-+^Q#O8_tOwUixp#MYXkfW0K`i%6>xS0w%Y{BQTm*II=8(p{>IVg9*1}Z$
zBQzd+i|NR$+fSQc&nbKEewe|SmpLW+san`>^y9i7cKV+iZ&B8*#ys(Lxsu%HEkTNe
z6S=XvDVda&M~*Eu`ws`+>$!??NrmrY%H_Vj;nv`O<-V_H{UJq@=dIWN>jC?Lmb3Pb
zPKoj%KC<nl!z9hdXqz#(iqAV}k$ycgT5veaNSgIY$198ep_C{qxFTnAIZp$TGC2H<
z@O-oE26Qo+wyn;pSb+zbo@!;RD%oQDvxjUAz04lCJn!x08C7AcCGNr5nDs-a-Wt*q
zxSMYbQ544U)+i23*r`~laQ(|hW|nl`a;s8Nr^>`S^xLvS!cWmL#9n7ji8WDu>TAtN
zNQ$uuc^h#|&#WM#Ff&TAlt|Q@-WN8|m_YE*jX+#E>IXuYA^7aM&Pt?wp?m#c?lY4&
z-0WEuxB*-{yH;`*B3?R{{W(<OY2c4;&(-JVE7c}&9FH>qcvhBArd0QR(iqBcmmD9+
zsLGfqyUfE9_&u*Q1De!&#us1BjN>QAd2Q(47l)4t<M=%Mbn`sxv&lR{C4#`xMWP=I
z;U}v<O;+j6Dh(cT&QtC3U2Gdi1_=(Endxzw>Pm@hwsj%L6j-(6kts7A99^xxy3@IB
zd1u%)iR%9q7`<HtwnJlzdFo`iE1PT^^S`KPZfae=#Ma!gS?lv(aEvw7@Vb1~2&#B#
z1)01QVHPNNrOg*r%ozLasPFK-4rsO75jtM|out!uVrB2vy%XR5V>oDH%<eu)c3Ih5
zce*npYxy_N^8NcjFR5QfV9J&5F0VyeneT<CXHyhdL0ll#+8po;%Qa?fU9-YV%tsBM
zT1H4b{!t{KYgFVq`Xt}q&##~ykUh0byzYYotQQ+^{nONbo{O&OiW3^E8X_X4pm&L#
z?CnuCNou9mqVZZ!VpUWtg3%9YN1=-sOO-*-d8B3uu#j2%(3R0vVDvi;j*IgPzN%4o
zBqx3cT^syu%_2P@12e|AY&>EQ`WP?~m#k-{7bFRKffhmQ$f<7|8+ALq|0}RbbXL?g
z))-a~0oii--jja^REM6375ny*so~*UtJU{!bG-*n$%BIfZ%|dExn8xgqIDO?Lsy9r
z&QlppLX5_K{@3$gKl@4%dcwS@cxT}aRy+P%x9E+)bWE0aDxrPQp3>7`nLbrvo6la`
zToAe1gO5`)#yhICy_~=6*!d|zeE03?h&jlb*)xhn)0g9cg8cT9vyS}V#~#bgy6K1L
z&HUMPe|@rdy6&A|T{c?Q+EHuak8sT!u3poB#5pLTVT(+?_bGRMJoRj5+!<PiBg<tO
zMr(nLoY53iagMy>%7xh{!M-#`htckLRuE|qtUBv?XZ_Icn!_E8Tl7Qm*>P+V^zrwf
zKqAk#9smz<?AF$AQTf2ii{wGjz*>mf+ir+KYnAB5Lh2N=E3MoS^}3x>VwXP!UOzpa
z&;r|HWM@gneU1hbEgSR>O)Je+@_mN;;rCNVhunj2<&k}jyEoP~m<E!C<KhjN2rq7(
z2tCqcxUd`c`_W9>hge1>iCBp<h><vK^rm{=nJaPX6n$aEQ=V|)mN=7)>s<6+f0bTc
z_-gGCQHmwTuH2Org9K0ds<Fi;2V~pkn*-Z=r#o@WZhAXoVU1S%<1B~S>E5CjO1diL
zz~z@=<0NJ)8uzFEiGec&b-y~i{jAspXN8N>sZ4C_D_PcXQ<mlguJkSA$MN3GVmHV|
zP25rssns*KQn1ycXaDnP)4w_b*NOnpCrYN(Vl0I#EFdF1k3slf1eKmkQ)i{Ai_#Md
zTr8BVz4)rK*(~dm%f4}*vq4FtM{N9)yo(^OHuk@rqR+rCT)ke2oXSPNc<%AIx&%JQ
zE3!)_R}5F&^7z;L@h2DReqBhxDyZTT>^<68t8fGXn5Yw}-D%}V`+^XY3jA86Bqp9j
zLZ(2zF3w{>iN^S@QVe(j6`8MVk|Vbk7#6qfCH1u<HG%Vo#><wCupWB;d*SVe<>ePp
zRAQA+tIM6KVZ3sx;$(KsKAoW7Szn?J%23jmRK{uKenUJrNVB+j=OUT?^^bz@TB$b-
zKR)m2>+93_8b)=w^QJACNB2g2e7hFOqO+Go{Wl&E8EsOfj{m_k?0o;ABy|qdN{F-P
zk@JE|DsJD1f>bU3i%?6L2DYQwrB(6lanL^=tSNw=3=OrT7IS;J$NbFGp1Q&@L33jz
z3>d6V6skG)9PG~%sb`bygbgI8mOqEs*NMIND#jN95|L*rZ<5(w(}Ei8TM^cT?ec`y
z)VMyzwMOcKOA3bj#!WpFGEfOLPRHVGDYZM3R~v47U~+ob=-l{kdPrbqY+V?XZbDK;
zgbKdp#-*HxM3=O*2)^#TEqRUI>b3mZ8mpAByNyANd{_MxqX)CkF*1a}hCd?(ZDb8H
zD$k8err><p9y>gNjmfbcllurEv-`X9Thn7wX9zA`hdH^19p!RYp2o_sY4`oiWoNCQ
z;n+?IvtjKm+M`tUoZHOIU<i-8nqhEH7$V=`<;@vS2R-j}(;!wh<A!IYoNFpaRR`9M
z<oEFN2Fc2&PE6;TwlMEG@2hjaIvo$ccLf&Uyg|!4vgUdV;R)pFbm!!~GkVhu230=V
z(y$h-@OXE>MD|Exh}(Fr<bv%AF`SQsxiY%Hi$nhmBc#Uf$T87X;0-VPHbtTxX+|j<
zXB}!x4;ua5h+DegCk?6jsw=>0G)oqN*sw5CA}eK7tRxa{XRwMW%H4gM;b1GdAs)+{
z0sr}4=gwmepOsc$%Y!T<lcrS!qXA^Z+M^0<?TVTR=gC9WfnrPBlvO;;Oj4gA_?MV2
z?8exqoQ!vNFH!jw9YJC+b|=$(d0-cLoxj?|J=ruaPAqT*=f~{Q=pd`ST7?&&qUXIq
zDKp{Av}!d`(Ji%EJD#?ug#-_IrF7T2S)Kw{Ey25D!`>zFywyA~X97RB)d<a_5JmUS
zcQM!2#;t^0?wwUVm7HvvZ=?5QBH7xf7^jG-9W<|+<y$WHjhpKD8D)cz(-3(1Eb*6a
z!hcqdtpmrPL|cu-X`b4|oYtHDp|*YqOV9}gS(LS%m^`8T4;Hf?C+dNk2=8wNT-s{~
zn*r)!o|p9|b_>b8nQ=SG%4bDsSjzsq-@}-b^@&1a1sT71ys-#y+};I=-VE2bO_2T`
zpJrb*)w3DPb)=gEx_NI_L03J9OM+pwx2tPRvhQT8m%tb4m8M^oWhCqF?@PoQ<9W=x
zK4XfCo58K-tXdN3ac{l%CPyV9QcF-i7bU6R+_|iE^slXNjOKV2|6%;#f8cHYvpYwv
zV`5|3hJYJw?x58Kd1vjK$}D2-YC`^B*NO+8{G1%Vc~Fv3U4DYGDLaQChTyd(Xsn>6
zMWhs6pW{}yaWFv&rS_g|cFji%nRg;5W7S!KXf<>Hb=H-0OFlof(nZgOQ)h>22(}v1
zb9#DuyAc9exYDHBa=ZrF@2uMT_dIjpZT~W@e<8%Bk${lRrEKr5*{Cp2dgmzL#Sg(=
zd;9qVzkKoAJI-8JWv>Y?DH%MGVdxH_x9#uI`MywBDEoXghGBC}WuK8}Z@BN?ZT7ln
z!cVn-jj@OcKg~V5;J(#tvHkRx-I5VUQ?zsW$e6^=fKjIV$6jrLsGd<Rh_&ru_M@hN
zYg-9VtG+RI3%@C~GN8CVk%=7J7O)P7$s3=L8z@_spC-wAz&LB6lld&~kj$TXT?+#q
ze%|qQWrkE)jc5R~l`!^~zY)>Dv-fD_<smHhP%Y+&vJ@o3?te!%q*#Li4!wT#U&ng%
z-I+M6y3&xRgQehs%Ts7RStVibU?y#w{7XVwo>M=JV@P&RaX-IOH;^}l8AC6<FCOOx
zaZ*IzME<LK=XEU^!p(37!7NY-$`+!ApBOZpdSuIpVR_T6ae;jGLcyp$BB6kh`C}z#
zMEC<~Usr))4u^R5?G@=p(kACAH*R}hQ&r4c#a3TCc5Q)|!-3<H=gjbl&#w?|eJ0I6
z)z%|w6s`U{?<~2_*`VD9k*we`zhg#qN6G$vK0B*SH=-;uV_QUnWFU$X!wWoMr;=4J
z%eknqeA0XtOX#Ra?f0IxjS%BjoMeLikCgzj4axrwu3h<&<(leyzWYC<&q&qG5KQcX
zfJa5R>LlMKy<eSj80D$v#8H%5+0HYO#}I?ci*NC37CzWXNxsM6K-f9OgR-%Pr>ek|
zCS2=^Twp^7T?f+)Q~{10Wcv&*NnCkw`}Wdxs+Rg?*t#~@m>3@9@5Cfe?c(nmmirex
zXr7m7axeIqq;vSlC{QH%!|%PlpdLOY98adM-gvFtQzB)1X~~@UZ!!8{Luc>uNxPM{
z;4?s*<|mI_T`sz;J0G34Jk!FdNcZk&{z$&>WY=9#0YV3X+}68pk<h564LWviaqI!t
zEcgJF>6t>d7BCj(>HQkc;SrJ5m|qX<C`FaoPp~5K&io&ykB*{<Aq3k@O=Dd>+;pKD
zbpgD)B6<#&c4ESI|E`5;KS>VV)I1wl0@8shmZ}8)vu(J!qhp{7pUd<qMM*BJ7vH@s
z6!+!&qYuE158Qyw^^5n*mux#!jc>Riw~R@wgFtd^RgC>!H^14s7<sU9&oOE?G%5uU
ztQ5ia8LXK$s3Iwf<5B7Cc4)xy2mpae1C?6El|RQrpAlZN8oqR163?9ZpvxD^2IF&O
z4@CEwUG<>c-=gC{)e$L-bO8LESfkSFTVUK5QovsRJ7j=h3rUrnphfmLL@#!8^E!?`
zdj5=y({`AHo?&U~^#|Kg<%{}O&!#VYXQ&;-|7qYVz>lDc8yN`hbL3k0PYL+3{5NdZ
zV3B5=eJ$@|6!(rwU0%6-J9J?iA8!{&4|wJr{dtq0pZZ81>5%L4=Ci1<e7I{%YH+ul
z1tfN*S=d+B>ZNCr{P$Q;tPD}f3wwm(U}t>tx)cgm@#+^&$^H@_(6-pqIk(6#rIGub
zPRgj5ytcO16yU!4pt)ntH)eZ`Dqn$LoIuN%p}Mbsvc>pfpb*8McFn)yx7-cgnCSG%
zMng=RxT~aM@y}9OYZ**bYG>Y;ePRj3su2`(UbD!gn&^G%@ST4o%X)*MOn=G*4n@%M
z^wZy~GL__jJ>M%mGBZbVItjCTxN&C;!3pXVU6SWFP4{msZdtzahcJJ~LV4uYsiWN|
zUN+B|HObtU9d%*!!LK)z(LC&l6ikqN6^398%#=4E$x3MmL`iv|KuQ6jg=ym)8&nU9
z72odXuqNDxRu1W_B;114&t{k%molZnOwZf0SB`hM)z7Y5XRz-iJx1eX_M|L=n>rjm
z>$NO`HZQvnX{W@o0R5rCrEfht#U^*31-9RTmtv(VEP|LAw%-iAr^bRstEcPg*RKjR
zttoRu0-tYqx%a1M&Ay~jo0v_lsevN2Hy=Ds{rz4bOlf)B7R{S@U9WpbaRMqMmV|q(
zfEGW+@&;3Ov7DKsXNSfQm=h&0U-s+-_;2q^p4K@E`@0ZoU-M2^&YkBaM;SH3y}^d^
zfaF_OD+B6`39E=8P#yuS<j<C&^ox<SZ3?lWhX%SAe7}G*=70Zz;5J1aT%(@YSib|#
z8W9G%rUgMi{}0okoOdsBQlsXLA5=t4WI5EAT)R>L@CwU#P&u@>UIUUs;a7~8*Y*I0
z8xUGRK)L}f%Q0)~pcSW9*h>i36gpGkN8<G_DBn0RVPO3p-L|;76W2a>zpbmd=|`kf
zrgw=g=*KNi4mKut-(n#1(`Sv+w7`6@tUfPzwuAbme$Fp+Fo3Jde$iJAL{1gJ<lNz=
z_ZgfzH@F$B*A*P5a^HNaLdg-VSsOF7eR_~`Sb{Nd#LtjYp;=~c7Z-&;;qfmNS2sw+
zg(T;^{AP6hI#m9XvU9;|5slx|$%}Lrj+h(=>d|;DTfEg#>;JaSS?v7U9#mN41t0Sr
zd=|nXNsuFKMBL^s8?qu}yT$UCF?XI<zc-=p_!E}{iOlGE9#Z-8shy=Glbf|bH3S|X
z=sl7<xUw?-nlT7@Gaut3oAq8EGVE@9BX)bsKCbMZisG3D6aBYV{~ak}Iae&D&KXLZ
z3AUJ>J{NwSVTFc@pE}d3vydR`em~74{B=w1hq<vo?p6B|!d~pAEOh79VTO&q16@b|
zNQ0a8zI=~$6%4M!Tp8#-f&Z@#%Lg_Zc*bqC#vFUb6u2B7ES`wGZ@Cm8M|*8jv_8Kb
z_e)lY`l*Hoaee@a!PQ>9sN^-oA`@Z|d*LL|)S-fdLFL^!H_LCCAC;aZm)%Qlw$QH=
zysg8HY<L->YHO;i%i-jdXn4nC<jMKwZ%GoBeiz49#I2Q}<I57pBvm%md88<rH};)6
z2jRX^@kU5~XQx4(>^SWEO?czv6DWH2?vv@PEX2sT%s5^-r>NsCnkhbsVyXN$y(8<n
zIpXbAoLh2j)1)ojqOg!t^qh(a1W{#V(xb_<<>G2guY#-eHW;{*Y)^tAQUVLa=^5xz
zX;tGmTh}{KHYXJvA5cM8@3;k9DB-?-{ptln8v?+-<!at1&NsAv9;0~6c(neLI3t7j
z<jR$RH@(KVU;hpUC(_Q8*vJu|b|7aGUUQ}^KHm?twqx-bDXy^5OU2UEgI2;zz>|8I
znNG&+e55RfS#EgmH-4Ds4Yqah?*PB+s74Y_)Ku1ed_;q|?dwk(B~6FvY4Q6-KRk|w
zrhkZj*j8+|2owvfUmt4bK05^GoN5+z*AS>J=Ucv)$lr@X2Tc+w8TiPal9PY_G4m=L
zNAr}U=rKfvdPUZ7U?t`UQXZtJSU%vF&lA72Opw|dK$Q~s{`MXHv*TDAl>|;njf!7A
z{v;;4X&bh<W<vDQ<~*1pqyLvbiHXobo@V0?kBsb|TNHK%G>L&!$ra#NH;Zi`_G{2K
z3;jXX4=@%w>xs|A<DF97>6Dn*ykA)uN`^g{Ov6Lz`D)!$pGZE#tKWT75+UpYidoMD
zNQM;GeGDgtY!s?B-0;E$7A^ni0saE%7ywBK)-=X*>f&2{@k~QbAAWL$X$sCR!>POi
zh2@edAC>|R{%uew+nTJ$tN;HKA&_^v_^_Txrfh%NxyX1<KKBakJI4#aGyC<R5P))x
z2ZCZ^Zn@zC!4WlGvJE`sNXm3{2~O!~;scr5F^g#LQb)ous+qN>G;}XfeyQieXEW{w
z!qmlvkUN>#Lrr(Y-|A)!l*I|I@cnFjTe92v*IF>ry<&+4FRPPh({`wNN`{M$4iY41
zB=~1co;#|>X6E_>Y=4BtsFVG#HNB>r3N$0e=TkBn7qs7vdhXF*nc<VBryN)H3fI4s
zo<sBnpVg<McXJZ3VUS|}$Gv!ezhQ@@Rv5h)_82K`S;=?_L(k-57|17NB-sH)lm0bq
zQls~tQpw)HuqqUP{!FvQjho?f$<{SimaANOTyUjon+LJU4><0<b4huG6Ry#|Q><q(
z(xocxa~k9ByC>26iZ0X4<uYGs^#o%zo`+e4W5gv~-V2*E^5u_3l?}r*pY{2NvALan
zk-wny3>h3E`oTGIJD9)TOW&aMHB$}RrOE7TMLn!NV+Hq=ExGDckf=JBv@tP9Ir8$F
zI_oJ21GOg<pdwZ6WwAe$(*YtcODo-Vls9v~9=q^A10+o=0mKERjz%?!SXtpoS}iMt
zd=0AX`zC9-gPSmCl!p?7c^2>VDotIv=ONs}R2QmE{9b%d=_%Ha_h&{?2ue|p99e{4
zUN-+fj?O)h>F@vJBMRNN+@&y7bEmmXa#teM+!x9vx4B<($t}6eU2->yn)`HN<Q_7F
zQDb7^Be9TMau2`r{q?W@Fngc(d0n2*$CIeHFS>f5k|KX;s&i(Grg|?AOG&z%$UADk
z3xx_J!3v)3jv<h*P_A7sOO}V<@uS4TzMNzCmjT)fERlp;Lk=Rkuiov8#p->cE(kh$
z<YzZ3)cUmA0TxG@uYjIzt6s}SG5ZG6caGjnZ<`*&7*w!*-JUF6m9S*lx!ExNZ*=1<
zn?B91d2e(}hTv3aI{OIwo4P@r3j+j2>Zaf3RQ!LicNSGaoAq=)$N3QEs2c30Y5}G(
z<i%&E`4C^QyjaN}bbj0lRMEAMh?O=|kLdKSC%pjA2gr0Y&XGDXp!ogwcb(cbd+fhj
z|F+epW;%V@>(z^H73229&1v=P>f}tXv%m&4fH(woYh)WL9-1yk2vC0!Z?CCCKUm@Y
zpEdd4kNtwl)92p5^(rn<3Z6D|KBlSfI#@b=TnMrgfJ({f-`IfZwgh&m7*hipZCE~q
z&=r@x+D{fUY3h{2gqkNGrEUcBrS_DC<bw=Q@97BAxW}GR&j1)^0sk1|H~!?*v0kJR
z$;nX)=1X+%FDavMh8C9Inu~IOumvn70Nd#|x9r8hUwbi>(+~ORN$2dCb5JS(iu4Y?
zt0|_*<6wkW8gj{TB-kT3OhSCyw#+aQ&*?viU>d`ND2igwd6mciYfeeD*$v;rMcKu!
zn-}DAS$Q;0^5}AfcI45-&y}Cyu=sfU?86y}7wn0VE&j{|!)n&3I~`$6_BBg#CBND2
zs>XO@(ect*ufkDt3-YC!3?NtgXp^;-(!bu_*_pIdn603Q8-<zlaj|{4l4^OTfA83l
zi?gLHt9M9cq*|CX{_MfZLzYnfFLn?UqL2Qb7jyCZ@mswKb!Myd`GSL$rGu@ulgHg_
ztRdBkskg;tY@Xe5FZ|O>f`42oMH+2yO)mKsfi#14M>q`TP~(m`SyEJc@;);Qf47PP
zO@vQ@Cq2E|)m4G$C@;ePG@Z>#hg8Cs!r9%e%=$C-*$#g+Z8(qnP}+tGcZvxL7cGH`
z<mc=Li?qA4nr7)B9>Bi6-0bfb6-GnYdm$pZdLx%@o*7EV7M?8TewG1IFA!=AxY+5w
zrGMPbI#{AZ`{sWk7n^H>rV35UdKhIJ3*}oYG`<-@#=xkOk*Og~tP{^UhG`UVtlst<
zdu02<FTV^$V8e;?ew7QT5=;k#Uft!4tm1O~zONv5kc016eN^%nt1Vway^k2|DQoP&
zqjoSJane88R<BRyyLlrE)V|tRO~44l6w!?-GeTG=;b;@|=H^n*_?b}k!W2>F+#VDe
z#jB&M9-}g}s2B~Y@q9!l`l%xKB?zCQJ03Fx!Rfo(x4h}c3Ug8~GM+*yl+Kp~wq+}%
zI2wGxJVpT>LMrOAC!H#oLMmuue5MV9B*E$MZD4W)HUPmm>~x^MYp;hg9i)YxE!)2S
z@tY`w&oMH2C~|#|xACXZ^lE(p{$%uPu14C}8z8XSyawSbOE4U)h<d_d?&z)k(!2c#
ztn(h%Y%;bru4|}~i2H}U({Ad27Q#~GVawGUI4i#=Wj!^kQ&a6*Q{b5wOzWvxj<dgW
zSwa^`3i`z_rt!jmH#yfX`%`)tvvUEdsH>1?lYebpqXUiSLlW=){4nlSSW8tq-R1N6
z@&2-GTja)Q^Dqn6R$83QGJwl9WODR~DqTe<Q&sM<6L^Xi<<5XM7vjuyg)kuAt-|0U
z61rxFNwyW-RzK9*rQYeeBjyf<H15Y0XTG2GI@{2YmUb5O*P*SSa%F(0^f0G=FU#Lc
zkX)_38FJ1p3b8b@C=e{dlm$l8cbk9s<_4s=6H_+AF2rHR_o303wO7NFb3kNE?!0eq
z^1EWUfa0~i&831aKsbvWK!CkUshhc??*|m0q3txvk>Q(&VA5o3TUK@PG`4<n(|TCk
ze7Z*ap4A6?Lnf*}KNDZDh?-R0S$kmDTVQW1HLa0aW_?cWvV9W3BVs)c2Sf2TsVwF-
zR<$txi0rcWJbJ`3*^NQWgE1zzSozKG^MSW*G{1~Kc1OMCMPfVHEkBtF@r~J3%%$Pp
zs$2|jbl6pkZ8f{5T`o&`&-@}Ud!Y9LTU0soT87*$vyNk47g{Gh+u9~4y_@3P5qShT
zC=*7@=?>3vSV(66Mpw^S@tgwiY%YUU@yhycu<HF_0sorL<Z?5O2X<Q#O*HK}qP0u4
z&RZ#A-Orn^xJ_<+8mi}Ol9JSRirGyHId7+Z+sZ27r-O*tjfY9qaBa-DnJV3rc~D3%
z^jw16+bAYPve=!|n%quuZ1*AEFRP@FTca&>IX;D0xYoEe;lxc~ME9(``!ePhQh(<J
ztc*TGS=WPAvoY-J;8)A(&ufgvDVDOQ<^g-W+$r~Rv%u_hccN!_KRhuq6O#{x2<KEA
zRg#WF{{FkH%lOXNza=)-lqjpk62_LV3>Oy9$^0Q_41ZZO2k~Hfw-hR@3%B(!1{|sA
zsH>%$($M#BL*+!_S52rmSH9P7_mIb7J#N7JAobOIzuqli#yi72p{{8{vZKl35$PNn
zgkZib@o^;TosSNpMU!YDQ5b9j!WR)f9pssVPW4R{KY10F|6lAl=GD&#AsYjK89>Qz
zQ-OTq1760dmhh)IK&uNae~s5>Gde#}dH~AF7kd$cWmt6)2ouQM+n;%JWZC}ja`?0n
z6~rRo1`YrVezNu5BLOf|MoZTAB9oIk<h7@_K^3~UweJnw)9CyHve2VzGIO)>+wJ?4
ztLV=rLpOOqu1Nu%{?zVgd{;4ji|+N@!Kor?CTlxqDNx|1qbF6VeP>P<3%W@IL{2~9
z=RoIV{q%ouArVj+RFB1M(omo!0)_5my(c}8zI?p;#eDFE<5&)UZ6x`GBNXjUkEk85
zH@ceM*9h+g?>V5feZyO`>*N1;d(q;<nXb`^b%Qak=ZZAc^RM7c@j3Q^dqRHdv2&_h
z(X7*OK)7-$*qXm6^~IakIaT{_|4%!=@5{$<V_|UwQiwW%sP}T<;z;r#8YoYCD2b2R
z7))<BPu^%9^sD%+%NN&y0ygjhzr(c|o!jFA&EQoM6!ms|8MLAY06Ip<yeK4;hn@DW
z7I!Sip4XHD$&JGuj~cLk6x6GZOUR_~e%V7kN`h9@onxFV?P@=A&au9g8`5j@(XcVd
z`DNgAKsUE~{a4!xcg|4DN?$OkBdrC0`~4@$#oe~*TLJ86qGFD;O9AZ03QHViW%t_i
zWGk{Zqw$!61PXwSAdJ|5sLUBsGYM^hmoU8|z+>jUkG2{Rmt24g?q#N)okN#9f{=}o
z5LQ`&yd2SfZO!!NY`jkLU0bu%9$-ZW)D^OT3o%fp{7Twzx?3~toRROjXDDY#OLJuF
zxJUM&fTpy#k)w=w_mxh8lQPVRo6&m<T|EGi8)2wRwi5gCnZy+$62`CKq)mmwqVR`4
zdsXebXZ2sd&boi_LoQVd_O^Y0{}s3om|}Z{&!Nsb^<s3$n3MXDzwf8d+2oWgT|W4F
zi|@T^Y0?B7#{NsmRQI(74^+&->MwH)TZ}957vWRl*j?LVhfeX9XOODY+AO9gEP4uW
z4>?afh^kJ6-O;z|E#H1?Q7nOa$mRHM{j|?Lf95N=8@d&#FrHDRl5$xN8AB-;d~bSH
zrw=${s7T|k&3wN)vhV|uTtaDX+>$ui<5bA%)9pqs$Sb8A1TLMbSkb8Z$|lj_Ym=xl
zo4<e^94pxqavZ`lQ)wo;UOes*WJDE)Ski{i!roGz%QHI=YXOL<zh>2#VM<|-!9I+_
z7`!7p&4&~bNwmk;1FW3#K}?DwEq-_#oRj8*K``G)J9~CRaC^Ts@DXrtw{A@Fmj$i(
zTtHXA!9}{C($MByv~-(?U7p1yIf$Y#mY;+JHYO7xjniXTsv<f6Qr%;$48yRZTg4^2
zRVW9o!=}EoGza3$kjBr8j)&?<_=9!N#)6q)$id#=Lcrn9!t7iaO1)svc_V2RT($+?
z27QsR&ZA9@f9LH4?)+13Kk7f))7ZC66B3f?GKbxBoK!h=HZ3jDo}3GL&^#Wtx#H=X
zZ!~>kbyn`+@Ne^aC_<cd^;52!V5yq|d1d>tt|_+7hT`P0)?sDpf05iFu(7pF&)C=v
zpvkwmX|sulcs!yXEX$-0ng~thWbPS7QjxTL02LU-i49QI;!1Z{=&L|}K@9+Szwqg6
zLM{Hj`EUI59hm~+JfzvR{BG0o2OrhaSLvniSa{@(y4Hiy+6IFUo5!J@av-+BlXq&e
z4QPm|0Hs~ir8p+&h<6lISSEAkE5($0LVI^1f3#TkN~;DCmzaS14}gQUsmMy{F7u?*
z#(0u>?zxaM@|t5xqs0xe)$>M2z(gL0!?`08un22iq(`Vgnc;^KOgLxt#%qV1&jZQA
zWz>!K{h&07f4}yN8lzm#@H6Y+XOTnS9Oz01rAhwPXpqRTzUUS$8Sv^ujFiqdmmrN`
z{#Kp|`E#<SIZ{{_Y1@<6LdHWxN>++2`UxZ{nI-vW8j**$`c`mO)+1b}nH&sI;xIW~
z&oK~i(W05m3?cK0oy%U@G^ktJS7+SD0dXveC{LLXXMMG_*VAKS0Kyqd-$$066^iW+
zae-=mw4%mbn#*+f>SU;$5&P<+f$8t2?xO)z)o+)m?k9SB+X8;CYo9el^hHn+&2pAo
z#qu!&S;^v`&<4qz{)#c=I|v&CUy73F`FsdGa&zpcFXVDd@X8#fuoiRgS)Qa56C=6y
zG$doY@u#ogTy2$9fakTLjTc8x$085tHyKX@JFHH+enaV^%RIQQ*o?hKemSEpugmnn
zLWm8nIKV3B{`ri}pArZQY~CQQx*HP<O^5WAH)bpHAQi23LUg-~Ej&NHKN0(xJH?vU
z6LRGvtEO<RKGWDaYwdFvMv6;>d))^_nDG)?##}8MB5!~HP<x+A5bxwjhr+BH-mk^l
z^K(4i%VlJeXU;JaVs9NRG*ZNSLTUzR0t!E)x2I%_#>tsBNaN2^>S(=~(|5moMJEOF
zz_QMirB)nASf=`i<sqJxyLFXLTQMLnvpF<T=2*P^XTvuQ4Ud=dkALBFB)=1mZcxg>
zV8uUSP-JjqPoWqIy{hy&&zk}}PK818ppu&p5r}_={lcB^e};)EIF%iO_ji&WGa#!t
z?v?V!_+DgnZ<s_9Ejf7L1{DCyM=dj1joeE|PyDVYjQH}M&O114bEB6oWj?e-AO2l~
znv~68ApgzJJJ(LO9qs;aA`@uqDo~?H7>SA6V@XC3-pav&4f^C|p0^ZaifrRQ@5o)=
zqrHYB*d1lxwaBBhM_ZPYwnR0?znujw-ADVAmfr2|`TyfIyg@Mrox$X%0R34l#aA$<
zgYRuz^jQ|+nEeC$+IFDo7%KBexTR8^n~!}g16m9vvVgYHyFPgkjMR-%G7`1HBBFqi
zyXCQP$xLXT<AC_&^kmfqLwbEfP*C_-gDoATy?7^Ys@KsTLs&|rn1MFzW3bC|1pVGq
zDI3Fl;0oW}j@;hII|{!i8*QkZ)*+5!Tv|o7fu*MgaZ{n0%d}!{bjgzWYYaI(k72z}
z!+%v9q;l=I0jckZz};2Ge-}(fua&`2X=y-?;tO(UyuBQfQJ1j9OTPuM)wLuG>yQrT
zgV-4Ln2Zg%<f(gikMSazziwZD7NUMJZ`V2K>d-U2>UT72$J?Jn@-L4P?=9)-Ze|fw
z!=Vq2%x3UOq*SR}&r~i$|C8_-@is5KaUWS)CA1Qn)Ad8<itH!Fhb~MXJHwA|Kgf&>
z4o%agOw@a3{isst!ASa_F8(6ifN_V}*Tg0J*Sxc^Ps=pzOVTxoF(CTmwX8-_{Yn>K
zj-avJ_9fkAT$<HEB1lzlSl;BB1mSL^v1u~!BEiAvRT3bz_!1GUYa%fq(s9yw<Z0Y1
zk*)!oaSZ(tF}jA9&+Bw|o!v1dFjfCFuL^h~N|!N#_m1CvigC*yt}9eOLUV}HGO#uI
zgEw@i_TBP}UZg2Ahfg2OH$*y~g27+$Iks+7Fj%5Nz+ZP3T@}Lx1GWy;GI#Cp2Bwb!
zO9I&BLDW%g<jzXN>g=N+ACZyI{xY8v`Vd%jOTHo;8bLP5>E@BK<}m!0_&NNwMDYn&
zI@_~Tuu<Nx5)cR<_Vi1IGB!Cv=M`OxHxkT072^fkG}C<T4}K~3^uik*m|Xk5eY|n+
z(qy7MMogi<Atxh&?B>->+mGu#mGcV91Z86qO<(!FLlC*=9GTAL0!#3f5I2`0nTb<-
zGV44VJSCY4JXqp}I*ZdYF!;Uk@n<F)IC&k3ZGvd883#A3q(`y&gig|Yi*79B*F9wu
z)8szYs;RpQN$(!b)|0``VBB-6rLneaPQtc~%U?(LihnhDE@$3{Ep`R=B684s_5S3N
zcl|}lIP66kM&>IKXlY1M)@e7=+0EVo8XvtLA-;KTkW}XhKmZ$5Q>M1S6ZZBMp}u1n
zrnUJkO;>Fnr@NjDqch569NUyNXJ_eZ8Ax|Fme(^M16%;rLQp?$?;kvBFkM5E%U5%7
z(2^h-6hL8vs?(b|_t`PIhvE2nSL0~W@~A7VU7q4f8+jx9qHR;Tqp{v|KIAw|@7-QR
z?ew&bjLT_J83H+1V0HM%LID1<QGMMMN^F26(G93V2Z!@74xdM2B#xFM52gV+IE^L~
z*A>dcx70L+>qVwK7=9e7USuf!KRimqiKXu|KEIk*0#KIOZ-8H8wBSwCAPut`@bZR1
zMY1Cc3pP3E3|n8W`cBO%#@o=?S5S5g@b85#UQ{m}a>4dK^Q3Qqe}{)te5R5;$YU=b
z-B4~msGn|$JURqSKahR6vO>dd4T3_<Ashd^sJmyjmQ+vEjuH6u-2WX-$%wOMNSDOr
zIN;puneSY5DMhb_G@tQTC={%t4p@EoE?@r=Xa7-r`ILX6(Jayb^WNR&H<*y3Qheei
z`)oAZmx&uG*JB&mQ|PJ?eFOuKMIg#D(e(Pg`Kw9!Qi+;<18C^l>Wo+Iyt~9)Sg}r6
zMN^h|#t+Lh4fg;Y17E&4^dbn5ZEa=!&z|px+EOnY<X5%g4c(C$smq)BOjx3YIHC5H
zYA=V}SV*P0N)AwaTg{of_icBNq0RbU_ZApSsNNm@?C!zi=uW?cOewAg;=yxw>GdFp
zMCL=i8%dP|_vc9-KYMa9`m;)^5!Ll)#o3F2zlG7~xirg3p{v@i$s&Z)vO+A+{OcDT
zV)7&{=R@)tAyZxr9gkrY#CdE?(xdH;kb~f4Qj82^04}glmi4=#ecuS7ce80uQ>-Mg
zhC>7=mnH=thw7o#%j{wjxBN$?&iti3VY_-X9z-{Blhi-ODk*!=(|;p*ptH{u8w|q~
zp5|VmNmoE1u7kvvhSJ&&4pxxf*zXN$E~l+5&b)#-jQ|@TEjlXo-+SXfvd&jqay_N1
zB)Ogl^0PnX<8F+8hK@>-V>8RT_R6eJ+}>=s4NFIwAS@xG4p1qGXtA7Z(e~14Yn_oi
znM?5<3)Dac=b8e8Yx_~8PXkKsoe%C$2~othYkS<+ce9piiW+oda9QI}CN>wE#mk@7
z^I1{7BoN#(9|EXlv?eqs+dW2M!(?gr!Te^y8#`-9*HqD@a?xZOyHl?scIkE9L0Q>l
z>*FMOGA?dQIX)Z1VjMz20qrY-@EV8B&Ii_ORagZnl%dVW*x-k3&dp^S5Hsb6ZYkbg
zOq~8ZIabi-xrbR&Ghs;-wG6zRKmX;P_kPL`?&irekiQ5NqdG9YiwV{`cBT59V*ss0
z9`J#>?b!06elnnLZeHG+_c^B7GmDt}H%fnVSfz0^bR-nHu?o>lIT5KGhA&$%o`B{<
zK@~cy683NB?y7;~ZOW}a)>CMQ&yA<LR0jR&O2F5;1Tfkl{mBx0u*=w%L2hrqfvg>_
ztN>@bzzs;@CgycvEqNF;G{||(RM_SgPzO%6K5m#^)^^38B>#dr`1O(G`1>_kp_Ou$
zJgyUz6GAOP6k!49BF9XOahZLKT}=*^N-YC)R5B8zy@Eu;0#7y(<Z0$lTzY)$^D0aa
z)}~G<r#_XD1*V&Lpl8;;V?Q0pXCOnH`MQ$>hfBO;PPhMn8;Pi87f}IY<P(r#(Zev+
z`?9=X`Q-{_$0e5}Q~b=czaV3w0T%VNN1jw<hxJXeFkJ51Qe9uAIq>u*1RAs8lDCQm
z&M{Pmd^i1ogaq26XtC{?KPb&?^AHa=e8{7xSy}Oh8gHAwx`-S!B^th~Ch=jPo*lp&
z=Fg8?k0FV1BP!sF6!{2t+((O}OB=gC@q-ma8xXCX#yN=8zw3)a33@R-NRUYqg+uc&
zyM(L5s*B;8$Ao`W>%p<r*0~lI*REbygkEF(_Ct=8<l!sraq^D<J2yWe-$7d^@zxv~
zylI())QD~Zw0n@@ZQ{?8Keh)53X0{fe7cCEa=~c#6E;>h9lF)(0J2ItNkki)hsv>$
zTv{U{`B!bpHiiIqN0%tH{ZRAjJWlFyCUy62*=@{1KHzvbxe(8+OrhMbU5_MveJX_y
zT(;559yP9HJ(-Eup~eiiDCP(;jtPqh48FLbBi3oSR%)z0@iG03!y@y;gui($ZyB&Y
zK22fWYT+xl>s){6@_m#OwzT+B_Jc~{gJ7Jv^Nkx<#;vYf2b_po3rXg*{`d5W_z*m+
zn9pN45Q^uD%7|qpTC>E$GYs+0Pn|sJ_2F|EiDOoGEDnCyZv9w)_YcqKh%duP;>y`G
zRr$X(MJXGwh<)`gnrCM!C>0SqZebvql$kL)>891`Cqwcxzie)by*prC&6@JlX&ysV
zgk;Ri4>~5Aa;M5G8u5C_O<6OvU2(k}Ye=JN*Gr()A+QVpsgx}*)2m+Kk;Kn~s`7lb
zXjECeNzTj<zzYLT>=2=t-<|&nGiy#l9<LJN!PN@9BLTHW?efM#crCFU8v}XrmF_NO
zqoQWFmU+Amn>-opF>TARtu_=FxCBATD0p)*>xS4h#zehsIedVWnTw{l87F|Bzd?|p
zosSxplnXKp6Tf`_T^P85gz;m=$kv4^5D2@q7_vC?kJ+rocYvc;qFm!=fw&Vnv@`Bz
zAj{0q2DIHrwe~5-kl6xNX8jfm7I{F_`otZ}3N&$N1=Vl1(K{2AVq5>h)JLl*fm
zS+M_Ry0y7IGSI)4_X$2p-s066!#QLb-cQE*Q~D-(zHP~atz#DiN=<sg9^lD92LX0o
zUior%COu`BqZSLB$32;p>64Ue-qm`YZUPC({<L(L85D(<OQ-83F04}>sYozmUs+#w
zOT7va^g2sv0upWY7qitgpX%^zlEpDn>$VX!ApDJeF2x!GO!xt6H1z1hS2WO2a59UK
zQMN-m)(-KSug}?!ZPN06i<0HTr~BbS82!gFFEgUDd`?ZF=6IIQB|&bGdRU-O>OCh)
zxzqI?wtxDs4;UOS>e7{__9Zr#Z(NcmS%|TTyk;R;n18@J`7a4%&x`-C?s<c$s$H4)
z*;tuu4oL;I&BI4i_neo%K{^DMS{umGnqb{A+j_p|){xe+3zOzeeowYKm$M?lLS{Uo
z*t%+e3g1<uR~N^@abkN1%SKBu|1zrCf|CVmstrD?fXQXu^fK5}b4u)$yz|G@xJ%oN
zqslG?1Iq+2lzmEcM$P22mghvh+j*T*TE<C!9RraY3I#^=*!f^JiZ9JVQ128!)+2E}
zKs`yH8iVE#=2L(G;$pH$yK0&1$)QtOd8lJiK{Wzj<$j@)a64IS>HbCY{JQzWDvUT^
zoIlseEOTTh9Nl;>WBVd()QV<xQrKq6@^HN;STx2-+Vu<x^8SXPr6!AAN-dX*selmt
z$XAIJI&u!<EaLE=9xX-UzvJ)YxF)%uh%0x0Q?!g<%sBp*txftFWb||8Rngi%A+A|+
zRW;Te51}yaS#7BIN!RvwG=5kVnp4a%;R>=i{y<LOm^Jwa2R~1EUA4gpXcU(yj6?Jd
zhcaA2SKy(YyA}kbgO~rR&lbOSXLBMkMKW^&nR4lIxus^|B~Uro`1N&L`O#Ee3ZQkS
zS4hxef_T)yjv6_47{iDTtY<f(J~HCO>w!%2K5HnSJUQ3FY~T~7$}QW(#AGpD<IFLL
zip$d=s+^jT&}c2gM+LIs+-zxW;wrFe^AP82(W0%T22$||7RXYCV_MRrOoi0vzg*+)
zd;olqR1qX?F&I$+s%yvFLqjGUz%Bv`V|v4Vx2jYe?q~Sg)S&#Cv8wv2F<=&rL*69$
zc;tgq7rd$*ZkabL2&zx_ip5cGM~4{=xBCZ!bJxocM54C61pDt_|999PwkZq55R68P
zM-=Y{I_S|d1sshKDOA^Fl1$x&fel~_@b9ClsJgVCG<cTJz=3ty`6Z=;IM#V!Y1EHv
zst?~>JOoPE$=~3Yn%Syxi)O>8_Mqe8(ZU2jig>A+<Tc(FZ%<dEqLC}Ufh?c^h=r*`
zz>S`lj#tEeR0R5q!`&IvY0^|){mK7@spg`_(58AH+m?v_vnnI#<{C@Z?S+6;-!my=
z`=-<ybE2<*?b(jAU7HIF?7Cjw-YN0~uE0N|UCNZJ!f|h7?6Wdozn72sfFB|JL}{zO
zV3hwv|LJ(Vnm=!>iziJk7xO;_(?^X|QHX#e<egt%V=`b}#G@;1opzWYuPqi;SN}a~
z4xEaI?iqm(j<S>CJ^e!zZZA)M%T@lpesFMR-nso|r#)YhiAm_`2BWe16nR};K(EPX
z0F=q6_fj==5#<Yg6H24?%aAoc5CWTDDvgrtLC3C1kYos4DY?X*e_??5W{9g*D$Vu(
za<4R#LzrR>uT8CA%eYnSQemA<<qOltRuq{IAfCyC-GLkE){EXs-3Bg3U%`0$UhyTZ
z1^oKdTqc%S`5-otko}pPU1v?#iRhcec~9##SQpexE(5w85M^GdQIZ+*$#*{P7Jb%e
zwxwXc;%73h<3u<MK!*mj=p)#1>~YBn{&+@-^8?v{zj%Los7HO53LGwC{vgXpdbR?$
z;uuitKoobx2NtvE+Yl`C)up5=uAJT~w^_net@Iu4wjKmUzF@t}WWxf1*JSnnx<#Ba
z{d{3`)_Jh_?o75sxh7&k_vOXjrZglOF+QAb&*UruCq;bmP1NE^6^S;hAV5mwd0@H{
zauArsrT10#8pYOI@2{{Sw56r+l?`pAshqux7E$Kv=O79(>Q~Eln~1;4MQh>20Z3w)
z`zFRiMOTdBaa!KeE<DlD_~d%HwWaJs_aZVrDJm1~*EjJFmO-MSmhhI(g<%yE8`9nJ
z52_R7OPZ`IJWqqx10vVMf<wu=|A%>B>lOLoU`}J_gmObX$jz!5Uwzz>v^kZ7j=~Z*
z{Tc^0#LRC7e?7Sp+FE3M8S-o2&hqaSLB>L8>zN$M5qBbMHhyEatB32A7B#S7(2J@h
z{XR3}+}-v`40iOUD8VUNo+eQouZK=;w6MSASX~<(+=l<r)pIGPU@-O6nXbP;wV<YP
zJ+12{<;<)~7Mkp<k08{2bGc3ZR!@|_R2qP~Z(KG@T}O3u<v*g5^u^!^Jr}TI4cbua
z&Sm*%y;M)P@dJRDJYe`KU3-}1j&vk?HYufUMDAN2y=Zu}y|%^|-QL*v#p=z0^V1m8
z2-=VmQuM0U&ZKYj)IMK}cN5KuQTSq3jl#)XZ_Ars>UoXPy0PWlu(W@Xy-?A~Q)X^`
z!P<>R=99l<?2CUcPeF}sUt7YH+Lj)_ZyQ1TMW|n7xaBi^i?RlP1FE6!#~14A>KfNG
zF%J33ZbU`eE(ZHKk2N2PJ9QN-h2>J49_S<ZOH_u)!#=I^c(U)E@mVz!jyYI~pF$s>
zt=Sa|wb#hO4Ir7E@JjSgygHT`gP=%t7Yx@IXU2weiJHs~40w()yIxHl;ASeUr5jBL
z-<PyZUy@7i0!mFC&bTEX{+d}ig(L&MGJ9~`-dGV5j-^kCgONQj$nG~!A8yU;KFKSn
z{}Oy*l6@Bnh~>-lH(UD`AZ9f^3`S2+eCBg~?stop4#dEkc|7_v&J}-m{<XqkAg?(#
zJS;4?^scq`L`>qEp;>26X_Aqv1Kz)aGG4R971M_Tf4XhLNJxI>mFmlHiV6%KieYSo
zQshN~8ULJXLn1=`K#r>cB6R%9$5-;KzNyAS6S5;G4jhyHrET0pc4I4xMVX`O-Mr5z
z?^Tk5l;V^Nbm+k;yBB4i8pmAKJ%CY~D}TSwjI-}WiAE{X1geJhh^!E{TmDUY-_qBo
zm_Div;f9gY{9<CdeqQ5#UnGjbclgZD*f;fUjl}3NURdBU?Yf`;DkM2&>}^jfltu>7
z66A4pXr1-Q4MxM!oA!v2&7n?JLTq~R7W(C1XBJ(XFzdh*Th<ri4Wb-)EcOj_V>Mw3
zEQZnooTqS2#`hj*S(LuwdoP?73p;i7>08CpkwnOGH-wmhl}+gBz*SzSXGnuv1i9mv
z#@@dB$o)D>PR5RkcWAPUi>Q59vtC@l;P_8hB2Pj%X?wKtXBA<<lnbhNCZPZ006D|o
zc*b13*Ol<BTkHvUfG<VjrtkO{pqGrglS?FZJXRQ~J1tv)p1aIL3MSF!F$@&5r4Cn<
z9Ry_i22Dp$m-p&Dn`oTb)Xgv184k(E`!MHc2hhF;`u-H~)~C3vD$0yoVY$7L<Q?cl
zX@&V|lNhXsBYxXPGM&w*0zpBLJyOJ<Ajse(CnfYsK<hH+Ds?18yc(`8{pJH?vr^xk
z^)ewq-rENh!=XRb+yU*=tK0kibAQ&>UM@#53umDS-;pz6>KbZlr)F~<-qC>3bZY8C
z)I)*Gf#DZwESW?;x#iyxW^pN8Rt^G#;k2_Lxa#74((?3~vkBHF=&gFQ2!-14W|XGO
zsluTgXv^2<?nFv6aFk+}kj5nU-f<<endGrNOh<^06C=cH|8VI|`_$?qbCxG9+w@7$
z%(L{NPE*4|)YV^5%BTsfg^J*=3f<Tnd^F7={OoiwJ3#y`&4sqv6gEJ5Np?dF6QOCz
z@82hZClH%ViaR_63+<hm>l{z@@mK(QIuH5s+bHlVe>_$u)-VOJu)EaiJ#hx`KDy-Q
z@qK)-<NPh#PHfKtzEoN=d|ET)A^%k?l0B4wXk(b<IiKO6b}>!1@O_Ce9BuHBf@X!Z
z1S*c9237e)Dy4e1Z4F~6`_%P6f7g)4j-RcA{*3jHV)jRsy}ehx$^aj9#*vtn4f&;T
z#`NKtDMp5R*v=h;ihjK%@3RdYv;~R&$8@CD>wjxqXLB~XRUHc-11>+?3?La(#_)y+
z@OJT}!}y~-k}C+t578g!s4D3f+v<!k)xuJ@`bPt(22aP2SG3D-5I%eqHN%Q4*e<1z
zP~w{rx+S1vfi&3(6@nGq&_*Vj`58Xroy8aiR~|w&q6Q1$&oL1Z1U@*S9O?j`nAg~=
zuv;N>{9tfw-cs(yT#FxaAxw>+!`DPu2m$Q9o1o@EddiWJB3#%;T2D`zFW$jmQO+fq
zdmf!De$9K-+DHP+93@Xq*TPIOLFy>{oo@NowI7n#JN6$ucFRK%#sxUF3pP#(i@R(F
z2GS(zUghMaW<-69dD&Y_p%HkznPDP#2U($peFK7y=b)1bi-A^>lDRG8Ufe`5;(WD#
zG_Mf>)#8%Z4utK9l*fxCFm+{d`<ULMVsv?p0*LyqPPX?&O62QKdB)taj;%Fdt0cLH
za?M0viI#&@ov(zHS7fEWZb?5zy8ShOx?X`-FG}()+u4s<v0skE9WF6_QA99b9m|EM
zlrA(wLpAl;8{ca(A!Hl9FcyY`2tq94n5yE}_hvsH=392*ZQj3qjjxu^7no^Lc8Zb2
zYOreK9(41M)#abK73G@JC-u2TmUQLm!zWZdydw8?IZpIv1X<qj)G~R8t6!1Qv$3|l
zwY!@q8RU6hTTYk4Cp^54$G!Y>FnJwjMBQ0iv+}zEu`e=R1jH0)PDg<<-+br7($;O|
zcp$gA=4gu$7vCI5x)sIy4#)cUH7Z~?yuyW}%{Ve!=wADSv~%zD=`or6Zoky!%X0JY
zC>2E1F1#7*S4jmz#L+5Ckp0ppKycEt<uV@eY5;Yb^h<UT4<%sonYf3-8urDl&Cfp|
z9;_|Q?gq5ksf>LM`hP9SLE*+D>h}H7v7^E1!xPqD&DjmUjISTFJsOogT9C8h4$PbN
zs9tDpTer{eSp;x-0W3o${NG=zlP2;d@=gOYEAv|`hdv)BR~l%I7_7PF?G7$SYoB9a
zDK+`~jQ5m+Fo<m?6&S6CI}&9HS2(=|8JUJe1owz7QE@m(MBJ4QXb_#^J!dt-_a?m$
zFOloU13am}gRXgp6VV3dp(83bX(oWcmV@zOf0<+>$u=KgcZG<HDyMXtyH{|#>p=pq
z-^s2}np`w1RZRs7yk#}Ni*a{{f?|990p1CnWM<C;C?5sn^<eedHRoD+vK2#9j8RSP
z^n1>h`M=U(ciD>`13m=p+R@*gOhb_6%$~Agr+abTt;pFiMGm|yZ&P(+D2;8ag+Vr>
zQMT2j@agjkLutT<EKcsVai61ZtZ%I-?UZ@~kt@ZG@Azip-9JGUiiDqqOA{6${Grit
z|7H1n`Ycvm-KRjGR4VgGW;eq514XVqy{=&S5(qc=TkR!mOUEpceI05<{*vzy^=6=x
ziPHJ&e0tfpkHgVUqbQBkuDQ_Ba2=vzcZ{4sNvUUR5Mu7Lc}fUZ`JMM&W2<e>^B@Aj
zQ2k+MEEMPS!7fFS4weDDj1WAT%k=g;&-_U*Wn<A|phgiv`X>+EW|`U_bY5yP^dCU^
zO7DF+vC8!9l%mLMe2U`ry9WG_4>XkV1>pb37*bfFehCsj{cg0y_k4{^i+d7CBY<-*
zw|prtCM5wD(0bh6{=}^Ql8851yta2L2o_4Z?tQ;n$2L(XkKP>30HRUOSvIjd#lJ{t
zClxOyKqX5)YuJrgR7H7PlWV%U0*0&6_Z99y6-@<By%bhpelwq4+4a@UI8UlXUUGm1
zcbtz$)a@S}4uQe+^)22BFq=uiN+ll2kY5FsxL*}Ua0@05Bh$EFFYmQYAN!;B(4Oha
z+dY*eNOWc<7q24C{ULkDy(LYje3x)e0~vW4q~72)k7kQOHp7YyHK@bbcAWl&6hrAL
z)z~SCv5X{{A+9nK_RX;pH_uKfaTkRDv(O6^gl%k-)OzTliirt?_auZR-Je!3yz}z!
zW!ps%C!w9oKn=PTU9$Zt^Y^oxA7vEtfeQ{o6j!Te0rRAAx^>^Zoj&8Nqw>FW@Z-;*
z(aZ>-53j$6P2OaJR{JAv2z7cjbI$|bL>JUlH>zt;gpi9aDzkP=A&g}DCxcz-lK=U*
zuiK!Xr3v_}-_NfH<Mfo;t=XQm9;%)eC(P=`SpB$n!+yc}>$gW8a`Qk4@@6Qke1Fn7
zO}g-Fb^;SW)Jj4D`XzF^;As75NA|FPP81<dF!R6VduD1IY{Y=&V6dx~5K}+CTn{vV
zlLt#<&66@nV(9Kqo8ENqXb$Xfu1rksRlxdqRNzVX4(z_b2uT$uAdsgtv##D1cWh|(
zB!=Y2g$OJirB!OAH3km;dHR+_yV}|{wSS)Cv)UL~Re2c%<a(VCai{b7Ku&OKYimPw
zMc835E&j|a&$R!G>@C;7%e;Fcn(LV+mmthg-s)5OgD)4~P6BC26Ww;yJV>=md8KuF
zhM$Jnb!^=?^EIekh+yn*fxs*^ED>Z;lFIAU^;UbY#j~Ax`P1f)pNgR1SCumd!)4Gh
zd`^ftE2}i|qR>TJ;Bu2w!Zi*7eAj<^PI)ej@0*)O(}DHnG6ZM|+z&hOeL#}w<F$BP
z+{JIEyxQo|CYb8EZv_8F`B+x1R)3it%ssn}asALzbPx#nwZ|bQ>}|7AjLpTxj~<8P
z<u7%nw+~)cg7^_?n9>v7u`<4MDCXhW4Ywj*3lTA|<(8W44EI*ML_`dqgP53COHC}U
zX@o=*SPv*-xGgl*2*no*Ng?ZL?sslg5zp^M@YS4&_vQ1z0%okAv#u1whV5`>eLX;R
z%4WJHZ1;~S`^WORgW+x4Rh=u19-8tnl#3>uIUD`EJ#F@vEUH|$w<TQYRTtl@$<!iS
zaU#uL&=H7`9@>Jekdq>p4E5mdMDpA2JLMRI;Dvp!$ZzL+-OSL=fa@vWP21Vs4auj3
z4C-g1n;IMKvqP+V?~Vn|egL3NUBf=j+@_Jh!uPL7#xG10xWcZ09gXeH(%zM(n$`G;
zd>@^U8CB2b?Y`U4{T*nHspXlSo#A8WmA}bc;a=~k_37hx_$!ioX-Mu$lBYJr>vn1w
zTfJ`u$yjxK<JzfLU+$-5Md64@o?XI=$Geshvs&D^0oSEhwFC~&mXCwmNAI_rbSpaa
z8jT<DY||^ho`-#UbUlE5^-}+XVxj)gsex3Dj5FrC(r5#wk+tyb!G3FVb}s4B6Qfcn
zO|2^0@$VmNE46=TGBZOGISh{pzZhfwwWq#$V4StFC-uIs{bMvl^BytLwG#wRnkaf(
zn0#{|40)*Zus-8e=3b4-_iwIM#CnS)|4%6*THuBmeXOw(G<AY;VQnkv+RY1uq|MwB
z%m&7QScx`Z-Kaf&_sYhekKLWWDnqN96e2S6+v-OC5H-ZM4DDx~v^*dkuij8!v!pfz
zoDNoFflq^=L@s{6Ga#j-Z8b$gNw$CNegp`EZ&2SKi0@4qye%H*+RSnc+2Sv9BS8P>
zp68@3tp)mpN}W9$xkIPy=O{OHG-k!#^G}spp8D!=id7qajbS@x{XTFys7s~oL#yKB
z)`!yI`RrdbHLx|1_42;o8k*&xaxpDvTWHTq#pZtG)<PMmpTXlbngjmdQqv+_JjYEn
zQljggGE?-)i+4Qvcpk7GL>`PD4MonJJ+Pg+rmcFxq-@q#5P0qf2x>1Vr*`@Zh9Z8H
z#V^rw8f5Q(>l9)_eDa(g_@0}y>tV6-GWv`mr4yX$jnkDGY4wN6OHJx8Ohi6GOAvO0
zs4)@92u2|hSrcgMkHCE*)@L%Uxkn1$(f+o)mD=moK*J!i3f_9unLO13Vx2f9-UVjn
z>_FRjen_%WdXq7eAe^&8A9nmT`L|#<w8fu9vna>|HfJ*+%F6uOB=fzV*zA^~rDqg1
zigDzbiQo&0zYbRgZpcZwQ&X^L5LFg97=O^Q`yamcZ2=dje{*y5y|LIl)3LnT%Z61(
zj>*nNVlOGzmUv*krh9KkEDHUXBp6gwj9BLqCYK*-CiW@UL4e$G!rO3x&y+~0pAbk&
z6dl8G1G6L$F~2Z@JU~e~NJVB-7g}nn8fC1jURPdv@ojm#*}j~3;r~6ULuH&OxF5mE
z<&HFS?KeW7lt{Wig{Ah=1BFD(To+Cyw%UXfY=O38A=Wi--hk`^Er#M7rDLAdEEFe+
z+<Clo6y3N^yQ6v^%X+6jL%E!))@<ao{4|Ji#ujE@qF*rN?(*Y{gfzq9^2p;J!Csnr
z&TC0d16jj_yK%*qv4PgVpH&fTr){jW6n8Im^4xZX3|TbhrMwghV^hy9Y+Sa+R!p4`
zQe#$ubDX~)DVn~J{ns7~R7qZ7qxtAK9RT9LMGOe-#+n@dc+<+-`0c2mStbn0PUm~v
zgm^DqV*we>zx%fGcmY#xLgTL={PlSQ=LK-|o!DP3FqrQ~@y9fnqjTG9_hSunrJlDu
zgs_(&ADxY2IdE5q+Sp-%x<nBzw#DmL+>iOZd@b&x$sdn={iGiwcI<6s)SqnOm|x$s
zCtVSoI5UHexJ!Pl!!!X~8%whhVe1dMP7~D5?-a)K6jS}ovG3(^e1-o1;qx@c6~53)
zzE9+zguzv{wQmZ4-*lV~N_EO#c}AHc0V9-m#Lmy_!y$vxjF2LOJ_uA>lhvSi00s0|
z8@qMYwUxT7wl7<1hzc8*%yr<gY3BqR+uBm)!B9DJv!H!3y~eAi#>;D~IULvJ)vzHV
ztR)i`87bt5<W%6M5TbDP1~H6T)4|59KjnNa?aj@OUZ5d<Z^mcN{VY%#Qm66Tt|d%_
zE$HmFNzZIVdy<Ip#=pNiJHU8g`NnP!+P1e?<(3!McJ!C3EPKDrl@z|ZKdo`m#5M<=
z37t){d=tIBIwgJlVGjDw`}|`fyK9{5qux2&KiPbmPZYcjymLgpztf<#?!DhQz5n-L
zOqAu=n6h%iDM+IHrjEwme*((7k4v-{Gqzz6s}I?#MkY%cgan=3?9G|OE@WZQ{(;=7
z5Y%9qi@EFjKHIStg$5b2Z-e7Jb@w)|dD7L^cB~9z>p7Sd?Oqj&RdnIKX{UvqJ>DYK
zACm9xS-K2~iV$p)$+Af`xygE31R5gt_ZK<?zX8OdG<@MJ@cCdU-Cz5~q$c83CV?DE
zcTEypxU$iS2gW0s%k(3|lEul3?N_S{J!;}dnUlFucz_WN-`DIb=zYJ%Cc9hzcy@I7
z(OSTf&r!5b!z-vmg<8Pm=H}omb+)NS{?Z+N+eCSZW6UgmKYO=z1DfZ*x^sU~2O%zR
z$8UQ)vHqY)FZe?~x^YkA6Ev=P8$dCXWu(vVB!>&NtfKv5l7l0IUKrLdoRIN-_EYfW
zdD+rc)nk8?i?=~c+0Kqi^3x!%!CkdNaGJ*ypZkYTi+*$`)8HUmMH4dbELqnz^Pn~V
zdd!VCk_0x;1^}ZG|5ZI*gTAuah0%WS!CL`6-VA2mpZMV9(hT`Q+zs4MTS`?qPH*L$
zr;X7kX9m9e8-rC+B->`lU0{PPirA+VTjd)!T)&>Zak(?*A~cwDH`;^sgYVU&3xN@w
zMv;fh-lLc?wIp1$pW_rqtqMbPlxHp2cjc`GJ)~!Va_wenth9Q1#BI`~JC=y>+A=v`
z8_fnWs3K&C*b07$3xJ_+PR$7YYvgVJhh2?&BBS8coN29;{nX?8g#)XHgalR1y<EeW
zl})?eA{iUHbrl5SZ(&uxf@}OQ0qYMX8DYc+l`ge57gYTwsO2-Oan6%N@m<uQQ2&Wx
z=}n`1TJ{zE&1c?P`Z%rGVwpcno~#pzDW-4O<era?D!KAXlvg|POQzbrC1DR1N4NkE
zeY;XySKs)<AhnUyWM2Vi(^U7ONyDce^9nNGF1vb8#ZUgmm$cyK8KweUZk}GcKGb$i
zOVj2QV$)IMZ|cxh0MeX^pKgDVN}&(j#fn59L<niS5@G2bFp$|KO`>52d?7M;$!ik0
zU(kD4u;L=iD?TY`?TPKD&gx*fKV_IY{fZyiwmk)IS;<Bk%_Cx4H^dfOQ8xQUWDdPj
zc6LC)$P^oBoeiSr9~QajdZgTM3*CG!+qS<)wbU?Q1ow28ciSQPb6TQP$=>FeE{LuC
zJ1F<pd%Eqy-XRcVg{iha-uJQ{oAi1x-@Jc~$96s;{o=)orEZJp#U6RaclTp4T(c@?
z+Pm6!?IK(Pu9?fZalkR<<fHAeqX~_jp2LfbfBzGPohdYZGV#Ip;T<tvom29g6+?vE
z)Wz0gEPp_n_B_Au^&y#HKV)HQh%Fmph%-lG0Q9q6UEyw|%&(Z{#>R-<KM$>QYD}7C
zF4ojwusv>m`o0bRy5c8<Is4|xf;w+emvU^d*VV7LKRz<nJ!Lw%{761{?N?!eFXpQQ
z`rOm3+uUsf8|wjZ?K_dGg*7!b@e#nKp}t4^7IM?w@qzds@f?eLi%l!=UaXBGFT*zR
z%e6N-G8s4rgQ?w(5KPX)eBQf(l)S;4#L-nXNa8PGD_sxfde&0YuSU;C#r)m*TX*mw
z^Y_9{U2%>nYh@RSFXmciDJP!&Mk{ToGOCXk6t{VXS!2e^Mgk$}hBd@!G?tKg&f%BM
z*x7>)A&>s<l(E3VoN=5FNol^cgwkBI>QDk@_ADr1fn<O8R6o7cG%^9w!4acg_H+y)
zsuILWeEZi@=%39Wpdz@}78x1}V##$7;~2-)LeH_-f>Af~=d_Gy8m4D5wfDNZdfg*j
z-+$?3spB!<grDHr*OJ<IKycc9fgX`hu0!>C+g6Xra*yQ6;2u{LWv)efO>jY4d0FJP
z?@8X}?W^CB?%Q%07wt?bP@4D#uJs8Fsaywyag*KhA;_5Fph9M~xw41Wd&OsEa5&i1
z?JdXhitt}CED-UW6b{K`Qa*f1^s8gNla#x&e(?H?CxNfKN$5DYxtUjT6a{wmNLr(D
zKWnwnj8(a<d0phmxPIK)Y-IYKluai0nt#E4W_K+Da=4r)-5c}0aBRa;>nRhZabK%A
zJa31?EOfocAw0MQSDPR)-ShYBpRFilFU-Y4_!^%o9*a^HV!f`~p9Z^=QwwzvDoL}w
zXTdFh9?As*#^>1;PPKOXE7!*PQ8<=wrd@%}m6D%)fWq81kW4<5=c224FGR)1lnyNf
z>mBw4HS3z^2L_bKoCdzwDK~xFu~mbr${KF9>shkxO-GCU@f%m&mRfupek<~a{l;TS
zTE|4;sy7Q@!9-e19Zgd?C}g#DTT3q+2n#DQAVcKcYSY*oyE1jSCf3F<JR(9S<f^!#
z&d?~^W8!Lt;>5|T@VORz+GSS2WcgoQuXFQr3;GNQuPwC4jCG@?2|nM_Y!hvhr+5GP
zo8``T$JCX6FdLpQ=;P-G9-S7c4Yh`enm1<<Wb)`W@38LwE+o}W&>qTXPkuOW{aWa2
zgvMUM(u<1`e}?`ozPKDOA=atsR2aSN3j}9}LzRn>M?!}^3FYQ?v_Lx^)?eF}@dZm&
zRg&7F;4#~)dXp0py0HL4>F$qWrlzMOgZGU;ET3)8{9)gIvdC<6S$SIRx(3~cj&U2v
zrq@aJ?}i=EvM?MAETi7)xX6q*?s?Bsu*LSDnvr-82#*IeX|$#L?%vWHp_Nr&Fh*Jf
z-&F<q+v&-O2F{t{WJo#u`!^J?gGwO<YZu#S`1Hxu;}>di+Y#*=0~<E?q_a+2Zfm9#
zl+O+MR*ilTshkMBu-Upi@N_$c6>=I~P#Th-?Uv`5SQ)B%0eIY+B6b$Km>^><V4qkx
z#K=5%lD_dA{h1)u|GBgD2`j*(2|E66rl^(tMyb6h4H*=PnN*3$EXMV2j*cer%7O*r
z#(FN6<Sb~mZ!hPZUOBLVHt_xV?8>L~rk_tu<oUB?$PjsiX2oc-BzuKYHHlf63HDOH
zH$}!*{e`p_y(SnYLHL;wI}RHU%-b-~@A@DVbPXOF{9!0Wk{#tT|0po<+v;{{UF}4i
zzVo<av`J(6_fw>M8=<O&es*OcTOyzMxJLIi#VUJKeEaXQm|ut!`Q&-Fk~b;**i^XO
z!<wtwlNQ8#rcqrp=}j?g^FiI6A+}@Y*jos4zmn~B6~+ZF58^V}H|>KAh;;}NI<s$^
z{BAXl!MM2^o`as=1~pRVZ7rt{LhuQs^L*#lse1{EAHM#5#PUf>Lh;j@sLYZ|B2NEa
zZO3ta-5cL!N$^^Kiju1BV)V8Mi}@8&9W7!ZXzGyAe)zUK!>+U9bPeY*ST|c#NqLC@
z9KPuKoP{2GTWfhnnY}*mVTS1JS-QpGZ)72z=fS0mnwrAxgOOi9o{G`?eVR#6B;e;T
z3gRgR5mJV52x~&J-!gM3K1KRkhsWf86pwm|i}JENS({<fHa$95{T?0~TR$ztrWHeE
zzdF1rYRD>kk(cOM*K+!^QYc^4*>_7B<1L>3&E&9KW(o2WJHJe!=2VKP1MV5VcBq_m
z<E5>T<KGl(fw1+b+gQ}(^z|Hk37sjA$F@>rhqylTwnzGx76$7q|G+0bkwRA$^P#sC
z#(}RtvsIyLNS;MVdyG(HhzMk}x_46k%xpT^ec(ei<RcM-D|-7=5m4F&a8L)c2YDYx
zvQAz_LG*P3enQ_F7H%W|UVt9~Qt|X;+P?2mnA*Wwyhd-MAQ-U-q7y;!t1~^sxTWXq
z<*jtKwd!08J*8P)?@liXyv-v3mt$t3@3a3sAeW8rI`j#@5G~YW&$mZY8LNc<Ja*2C
zi>af>r*wJtpT@uGL&DKa<mL+x8@4a!o@l*O(&L3Cu!iZsQ6Fy)-qkjb*y$4b*VDcy
zG#WU*9H%Jc;PaF1!u&(9$5knm-avM9Ip&~~yTCZ`B}f1Jw>UMm-;bKRW?oV>gkD@6
zYmurtuwk+F*BZt2zXzxuMp;=rAi5Uqw>7xHU?ydpu*to9Vh!~BDYI=rqs`z8Ppqit
zakvt~3Rx}B<oHaa%`U{?Xv0$dergx~7Z|TjF#9GJ5@PG0%OEC>bV}Kdq2TAmVYiKc
zn!U^XxAwR0=O}r6(C@U5EiK4<nnJCU+MrTRvBafo+6qJ0buF>1Xx+;tah%_}Dj+|5
zZ8Eb%P$vElw?u4gZM*q~r~zRSQz^6kIkLdeSV5Yc7A*-fHgo*D-(|17V)b)jJs7Ml
z1)ID~U&iw};7T8wjn*uAR@BbhK){3}W&#Z3OMlf#89(Oa8sX`aCr>QyhMf_);+l9X
z8V*;s11?Z`#%0E@up%Yvf^vzU`03WiZ%RPQ?jcCIF7yW_m7-k<Klg+ZoN-&1CrfOg
zhw-YSAmK`m4MyDWr=W12z{G%V@}0x08vC`=M^oNeHts~AT_&t7E`F9cQ4$07kR@f^
zV(5N4@kY4O>E8D`3qxzBmxdLq%>V~A{ztLi=`^>lU{JGL(dz0-TF<ME72tv4`3<bH
z4s0BE+?hU29u99GAWiyPtm;~r{*R;c4y5}3zxcK4hT>)xg{y(=O+sAdOR|-52~l=j
zH+y7frtDc{ucB-3nQ$rGYmba;UtIf!-|O@9XMYrV-}meFJkN6;M+0L7e}Z&FRIBdK
zkmrv7PTThhxWCNu5ew1NSQ?m(tD{JHR~m7<oZ<-(inqwkn#i9e9$6m_$iatuJ^}Ve
zi7j3ij(-2`^S+~U6$XDv0k<kjMkho@lErM9;3ih2t{WOo_g?3DS<Aq^4#n_K3?THG
zlSU)O>8hevQ->y@Y%h<B{TS}$PtG`!hIOKzJT!E=obpAA<$?s;LuEI3nga^lCHU-1
zy4Bs)vAI9*uz)>@dPuYhyged10`NzpoD!s?jV{IeZu37<VTP3$zFbv>E{EQMYf($1
zFNe-W!(Tqz5sfbXqfu6uRZxekF89jH;lp!YDqO!n436W6q_z4tyk(i|nHW<TM44A@
zAH3wwNv#`?6T2NbFo(g}r7?bA2ccgpjm4pUuS`^I;Swz^e>Z7oEKQb}AyMsD7wys7
z-*#-4CN1P(Q(nZO_XH^;%Vs}N6!px`!w)sOKkuuM6*uThh9_6AH+0mtG&gT~EYz0*
zY3+3xny1-!U*Ibu<g=S%&G72p*5-Yi>MQPPNm3QvHQ$l1x8Dt2vrB9whQS~NuVaz(
z-z~)1Bh7#$=mQwjx232_tVdM9pY!SS{qw@&^TmKAF72jQ^Tk&(Io<|8*7MrwEqTUW
zDGV_1-Lpp=0jJNhxwsk+z23dDO<HT>>M=)je_`NRG&5W}S=(OQ?@N$pt@h|88)$Yh
zUkOJ#EUm8cR9u7A3qv4ZlJpz~pdU!PzP@e<!ZByV5Q7X4!j0o@x`MMVRE)kO0t()c
z_A;3TK$!w;(O7GxgL!5*Y#uA5F~C=0u*&HYXMBtaZGAZ%%3*-r^mRJN<3&{{^7;uN
zhgGbf9373NL0T(?i?`iDKq~`(Ej|M!os?Q^hwK9szv1Dk!%oM6I}wo{H-G<a*P1L@
zcDDK5jv2A5_c@PER1>-atC?G5SH+_Vr2jqanU>uO>nZ2~<BWY;fhBv>VTbG$Rboa-
zS=J!vLcDbg-qZgzO%HwxCLHQ?VrdI^>@JJ>z4(~Tof#&x9+{v=-=@$}iWY4yH++a<
zfQ{OulG0+LWJTy8{e3~|`Buw)zt%S!Y5<1&zoB^NZ#C{@KxS67YTx%lhFSS@<;U&q
z?W|}Kc=y3ZeKUVl#9^D9%Yzh57IH0i@Ns(T6Tu~bT)|ee-Fl2+os8$GQbmZ|7kwKG
z>-Evskj;I^t*3$HXK~uMal>~Pr&-B!ULgzv3_cfm)mI$eFrZiV{Trhdz3bBQ&(vGp
zVPOAvKt?(^_A|reGM^GwH-27|eZ9)T#d~$2TJ@poHga}6)PYVqmxKMUfIwek|1hWi
zt*=0iaCUxDu8B}0y+2zv9TPkgejCqbEcVxS_=_NA1g4TVH^KkM`F6-xX!4K{9Ypmv
z7pBGZ9W@#eSdI~5{Pjo?ZZQ0oigIY#xOTHc*vbi^Y)EIJOHo;<rxx`-DQ4$gyGf<E
zgHww9Myr=dF@{eL5~-}F22ITKjD|M~YVxbGIGWObhG42M6#FUpPB|Cf&Vtjp*g5^|
zS5BYllT=y!o}}_Jwo?Pa>cXLKW$5yG?e_mhb#=7+#0^ms<#p1R6hGWiMuP|FY;+x8
zh6C+b6BPO^%I@k6hle1_H#u`;CE%2;fgS(&#_1Xa0$%Gavvee&@RZfm)I59ktY2Yp
zSX@N#s%l|genN&vnRM0S3tLK=y-YfuW6>*q*roaTGs`r@S22)6q|IMxXm;r@7>*BZ
zDwtnB8SP6BFKO}zV)}{ywj%e?s$?c16+B%2$6ahOUM-W;4-{(W>b`sYm?ZYQA8k#O
z!^6XeclanfWV;g>zi@@&vwe4gR0hDym+;dcRH8%785>FG!P!T|Q{p)$z~85|^q%es
zjM@Jk+gWJ9sl~tSf7a)@5cy7bG|C4v>FRK1FLNW#0$fsvenc1l{SK~v(i3RNpISQ@
zY~JXGWyF{D`V@CgsgR@bnwgD5Kv0*|ZX?e%nMZ#F8Ak;zf?Xx1I90nGJ>X!D>uSs)
zTYi3VDCoPpTCgCRq?f<C(FE){h!F;BaE~GqiKYwFqGK*HS|A@yO!k`Z7V9o2nmW`a
zND=0M(zfPTre@HD+Ojfw`QU>Xvxx(i%!bEj`>mVugjkS4Tx@<`*D_r?B|W0r=z&*y
z;sAZSk;B_6*;nSGf6&vTqlOwcq6~|A-3<<&i7)Db;h^EvS?VCBP_#gJTpn0{YTI}C
z?)>v4A7mDVfwq_k91i+DL0m+$*UOldkQsLdx<ohLKzQ<A24@-;WuW-4LAv-19<?}5
z*{i7#_`)I(cl^@^U!})9kt1ZJ=l>7UJq?BAC7=haw$jAlD#GKOC#EH~1&a|GsHq7_
ztv-d>W=~inu^!8ewpVOqtDya~2h*F~?+_g;9(4JWQLVPp^#Xqx9`+4+u;5G`suy)>
znf|lHo4f?D)CryTm)Yy66z;b=yMT*h;wP~N{_)Hez7Gl?1)pH=Pn=lymE8>UXAgf?
z;XF9}W4UpbXdQiFhJEFw#Fpfs7gApXNOC!#(#FAo2WtLf9&BTSue65lJS^sJFIJ8S
zWLGtyMpb$UOK9e4gyfiL2$?B9l@?I>o%j(bdKB&}WM+N(orW61Z1e1l<XB(_oprmU
z*?~9W9d$3o=c3)q><Gl1QwwIA#n$z;Bb^~a^RJ-}Itw~~)ujn`Wlu@1`doK_N~g|1
z!a)Op!Dv0Vg%v?RzZB*u6!=B?N7;Zw#ukrGxXPUxrqBOblID1OUbuT@fcdDW%efdQ
z2ZPO3qCdc`9(6$go&U6ZM?&V+#77|~Zd06;CzlE{jVbj}pL|T+wcj;~7c?N$$}?V%
zSkvdz7<nj0Y<oEnYg5iO!CIyO>A&i0jvtj(IxsOUS-#$EoWo*^#oct~*iB$;<x5<U
zE&d6p6L0rt?kPWUtpQCW;AaO<^|7zCPOawj=H`GWYCShQ-_AUVfvCt=jA~x&{{tym
zqo)-Vk6j>S2b_(+a>=+$?Jmsvx2FI!2Bc3ZDP|SOb%o&2+_>&Vvq%_I8m{vFk6*N+
zTW^cj8<;Jc&c1S-u>Tp@9N6E-kg$);xq{*-K4kE!^^~epvpMKZ<~Tnf=0&X1aGhI#
z4#%3}UF~y^^J^g&GU^sV-N9tQg3G@NhUW8ADG`BEy&FNiB!QRfGq}d(M#?|3TjtH4
zzZs|?3W|!39vR<kRNaf1=JSkLuZ4mAEpB4MWMJIP4EEbA`M+hSbYFHe5mZ7e;%%D?
zLDF#KHk(S>eh0asqWNFt|9rg2RxPtPu7sOt@^_KL0%J_w<3A*!RZwq$QPPM5hLE7&
zyGh|7c5#0|xDZplXr{4QKXczD9*Hqx0WY|9AG=cSjv*f+%N}6K$34zv31PH~P#p3P
z5VZ&(_+mrrkZzR7l2LI;FoV)DlTzK}rs9;}PdR+Gu@5Z1aL=!0mdjoU@uU3O+;ayz
zr(CxbeIFqf*!3EmK$V$=MGs@c$hB>>VkSs!uy=??Bc3}*EYCZ0wi{Bn0+g|XSZ@LP
z%cEw(+%GdM*g{sJB!^zKbyh2}hs|t+$-vr|8FJ(*;EpwexG(oBt!$+$!6lj|&?IX7
zbX;J9_3GRA_X*?#;4uvM=j^eWz7A96b)s^yTkc+=Qd2qV2w03e4YfEUy?qgG2??#r
za5{A~Dr~!GJ&>1TUqY+nkeTnr&fWcb=&$G=2B){S>N^o*jYs!iHqc*Vlo9czaVj%M
z-l#Qv+*vj-{Bk*OMh?zqB{>~5ekWtFKp4H4;mQ`%FrOi@7#Z=lA=7NiXw~Y2<*VP`
zg5PB>J~Y(YUrM*Y)h${5jzUluOPf;_dI!iG&f@|OQjgyNg7_=)(UL1fv_bt;{Ww_b
zvTYJcS>0aO%=9lyD{H6?Rnnx@(ss}K&?B<wTb{x+{W05be9~)9zi0!-{mjeZ6t^s?
zZw22GY4kqSjkgVkK|VwH;sw6Fxi^+p{mblw&qh5X<%d^n(f8aGwRo0GN`mk9^kdSr
zlB>dpeYF3xI9G=L8GkjOLDf2uQs+gePk?Y+a6W*Db#_iczPmb<B3^De^WV0V%hc|x
zub%iI>WM(uzA!@gXe+CQ=;|`7(VX^7R~DOe*sQ+s6niMP9M|->e4X((f(HU7#oEbh
z3ViTxQRf&?LrU<nn+jqOQ-eImkLGvTWN~rb1QM0Ppda1>X;Ji(%s^}gq7rUm$6fsH
zO{M|ksOJvr&+E%2RAWezr#KhCmg<Q-_Mm4hwl?wT0aD}hpx>LDn*}b49N}jt;1_L*
zN#y!B7tRpJP*6jxlT!BH*&R)n$&P8*dQ%(8ANE1;%Xc?GpDZU~FXae`mjppSOCNKc
z*PIod540Q&e*D}^S@YNC>oOPFdZ0(?aPYk9yd>ZxU}tjnd_V+LU4cHM0XYUgz(kZu
zSP5cKlyyN>J;21Ks;YzQ1fU%Ztm7>Lqz0={H{+Ag*G?4kggd$as_^$Ua<_@ntB;u6
z&wi>f82bdmWPp}>H469xZ^1ofP<zn!$ceQk{PYV~IBsp>G}y+HD{ipZ7lk72F5M$`
zRQC5tDk*W{(NavjZdflDD$oevvpcij!Dw9=3WI9EMj3N1Qw$%2Q=Lu30xZWPS7egx
zESH7R;Uy*0&IR!G$`nG8l>1v5C*+7^=v<8V;XZ!WjG9?Iw!xC~u)($E{+PTFHL>v?
z#*^@!w^&M5dg?v2i2+bdhsQssc%@yh(X)3D>Uz0gP{y!mhF2pr_Fmekz)$4cReqUQ
z$$!Cw2JLDo(l<^j`@7pcLh`CJ*odmpzs*3mOjfeo(Fd!_?&+<mj(Hsi{Y2(Z<MkZr
zQ^LlGm()?Qdu{2j-Z5keR~kPqudZJ9_4T)PhNojfiu2ifPMe8j#vCZogPb8zl@{js
zUvtM>&nEp*GjR%@cb4fG9O)SFDTnJuC?R6`1Dy=q6?N728(*4<4nlHFuyJSJJmHYr
zgvS1;Z$PAA4~{JFGqLBt&GdJe6x`(>#^~1qvjErvAKUUo7)&0GjqzhB_ClHp{oIRJ
zvm0@nItu9)+N(y1M{rO)A3GXfeq|$*A7N&el4<i%%9g`2)1z!$s!ZUzFJpXo_(elZ
z?X~jFs-3+hFY1mxYA1!t&;*KUY<yE*Qd#K>EdghrDox{QdwhOG7#%$n%jkxR2#TwJ
zjf$Dj`?Ohne1w;7zL3&?r9b-c@6PMd4|}=A*I6>079obp)D9FZ^UKPON)eIpk5T{m
zWUhn*KR}*=!^XzM;fxoT<KP$~;-@(GfIT&gKH?6X(1g^=`P(n9-s)3%D?zFPojI*`
zcgzHt+*qpFO9Kni%rW~ll~Ps;PJ=H0i$mQjy-JFTCi{I3y>$k{Qz;)H33)>6m4Z=}
z?#P-O0?Tu!lrR1!rms8H>V$048N4*cXFQ`3?JB(CZ>gC5^g^u#5*P%)Y&`9}hYdL1
zPaf<cpoI#+u<|z|W%NyYvu4MTg@whTDuGjLc<-#k#}LX>%*P#V&1Rj9Qp^UEng8Fr
z0c#Djr|+L-H^Re_8{MwTAVns;wA@8Ux+c#RM>=*~ccu}GEQ3GyFMQ_$tu4P-a#h#q
ziSl9-<*`5GaH&PU9Su!x(6}m1y5w*9mGjl}e*s|;=ld;(T*tVY8Nohpg_{pS8`>A5
zel~q4Bg{d;bOHROZxD93@m9RP(M7~?l9#u?w=emC1HPum5XjS69#<WF3qfph9gYb(
z@l~h<DSK6=jM&E#3}DpjD$zFgAzEpC*YvwCJJK=##?Q}BAv44#b$MmMKcH4|!}{}_
zipF%uLdU{Fu9vHyqX$H^bEQcJGi=N@`2>mrR%`IB&U!Rt76Pa<#Z|dOUx3F~+DCG!
zigG+647GPaUaWXidwP49_7jH9Or@$60pRZvEQsL`1OFWWB-r<RrYWRTdUT0SqK7@)
zk~2)Pd&pitaV|K>B<kSk$iSfp6QNQvDknqVv@<;rFP-O}h2@%=rC4*G@*<+JgM&S|
zHUE~w8BBM7RGo+k`fw&avJdwT@uQzq*}t9!`%d&(aAp@&ip|xsZo8<d{*8=JY76g}
zkqc0`d5?Mck|?QOIHi}>F8!YwCP`0Y{I_B^yM9`}9oQF@g_i`QlQ=Zh_EuNDvhFrq
z1y0|gebtDs#mlbGG>&rvHeAS2=VvXmhfTcCCadr5&IELp*q2O=jkq9ZHxC-7Wv?p@
zyp+y2p@OiQ<J3Yb%FBZoVoV)4Ri7Z;XVMvpg-B9Gw3FiFjpHflO;1_JH@iVGIhovm
zHtvP#MZ}vp<gpTikC)b$ZDK2zcwIEI`n$q-#d3K?w`z{R{dH#3!)HD%Xe_to){KRB
z&jtI%2uw{+<3DB|irt<}X<VI59-es<G_XDWRH^XN@4)f<ubxrGtEKS3<zI0ai=F*r
zZ4Y{VGc*<|q9+02R)IegG<+VK;dE73tLx=&p@R~q)=bJmZb-c*Cn;&P;PdA~L1^)X
zr7uAjZ!uJczVApgepx28<#9SB!yoiT6MTxpJ4SbofU(|DvL-;raaNAa`VE$b=y30j
zP}N6AZGMQT#MQ1bIbtIUl7>3?&CtO&D4sbO_@jJW{Kn-i>BjGF*23@&zMz~SzVQmp
z*YVQ7isw*RtQYK|yVxCj1MYkAOUTT@s3t37rq+vt?Ikm{k-5$clB8Dt!X+N}BGSyE
zpM||snH`hQFt|b|+qhl<|KEoTJYkS4sit;IU43k`$6MOmf1X+Vhn*&8XckF@tYmPV
zxT7(^-Pm%vT2P@)7(Q%1G$l`C>uUSgrx&Q?l_8ey*x8w8&(o7n*=MQQ{-?bKlvd4`
z4jJOG<R3N<-`SmB!Q*+l(dPEnw{_t~^mXhMCI-)6k|OUU<Mr$hcw$}Itbg_$XP*%N
zNu2iu%x6n%(;@_2E(%Bdjr&3ieL493Ec?8_#lFDA@cetrdCS4tlBV0<LAuM$n_C*@
z$u_qilc(pc&kBmu%vL{XD{Y29WC4HNsJH$3v-+a15{Mrs{W6UfHu2Uk8I9f~*d1Ws
zweJ7|wWo4UNyq5AwmF#p#I*E#rMl&{0M5U!^6HC$P=`cj#7&mk^?F=q#{H9Wmj+u@
zU2iexZ~vB-rlzeh#m?922GZKpEC3=?zOJW1oN3{8%Wn3PIJh5QgNTfnA%E-xsQMe*
zuIxRogvOG*G~NaeX4U7r-W~yijj8ve7i8(JH8L6ejqzStPP+f;UXH@4#+tpwC-Ym*
zD`Ykq8K5&o@?Y)<q!MVY?2qvDLx<YFhVW&2a@8yD`zg_bdbACRBpbW3JYlKj?t#{t
z=~@YZLK4bz=Ijxg9pC7ED3VryV|3boK;JPsnz)1*3-9YEfz8*0q80xZA{pDL&R8RB
z0S49i(UN4_NhK|UNfc(gPy9aW;$s9g*MjJkY6c0^yk4d2o5n+7PihkmFu;~aT{|D^
zGO;hqJDPLK`)_8PFo|v}?7B@bO2@qCDsnfIcKwcWUJ3icm4RAMqZqRfF3wMId@AK{
zgdM*h2cmo6wdu8Wf^~kcWmFn*)1gc|PCJ6#in<exOilms2i@cqIu<4QOBtWerS+(b
zF=t>4!bp{%#Y!)Rpc6TTMGzWY58R_*jKpA}U{j~Omo{QzpHflf>jISU;<7HrjXJ}5
znVp?=Uu#wk)Ajm?Vj7FlmshrPC&O3-e=xJyK79OwMEBe|(YQPU{-G#^#Xwnk!^Blx
z`aVYqRkayg3F=_-y@2D1C`?BO?jQr}y&yAI?#C=ipP&pgcti4my%q#FVxy$deV%UM
zt+8Z}c$tET{*D1FI)}10dO~mw#;7d4FD}BH2tY)k<E4I?s+!p~T0&KVeR`%$@yDLb
zFlEu}54h!8ys+1F=-1Squ#7%uH|Uj1msA^z`k|L(qf4jtHuhUP=ww*+RLFYTwX5HH
zOIuxCRSBx8M2}(41r@9I2f1Rm5oVTUOF>EE>8B5!3_~(=%Qca+p^Vm%ITN*#&R-4O
z>gwyydb5u%8J=#RsbVGgE+bw1T%-BX(`**L$6d)5Oga%jVkOltCX2Wz0s`c$@4=mr
zA4+C7%AHl{=giDC%bqOuS_|j7Y9e1<?7#nP!1+qA9K>xoMX;ax<!8iNP+0=0@+Lt2
zW@Xh|LKj2E$E?0WbNKQH$81GVOaH_^v7gi)HI_<HD4#DWomQ~>3gyh?qT1rCZm-bz
z|I=mp_o?MPE^Yx_R6yB!{p==q6WKe(T+`7sj_Y}aI#Pq&GFh%0@kXJni|3m)!>F$>
zEW6A5?~f*fO4F1iyOl6)rQ=)sdAMp}pbEjSQubXmPNIQmVC$HpA|y}UgvSxMWkZ<$
ztL2s0pOp57UB<sE830{uVy2}<bq{C)nS(8-n*jMj431&@w5Fh4&DT_4pGL52nZ4YR
zqUWAvU^zv!ngti@G{(WV#E9+6{;9M>OLnrbdwc&B^nc8?|6AzO>H!wGgyelsP#^QA
zwq8T*BWl<yi!kmZb0s%^Hp%8e=F{~`_O}F8Q6nBmCo@8pU~7E{1{m$LMQxl(%h6xD
zPuA&0e<Zcbq&2UoPQu_gBNUc=dIlOj7GzDH*f<En0TFUy)t$OEA;++&?|~*(PMI(7
zIST18-;Kb^(cPR3IbSBTs6txxB6%KD=0F7XOp!P<TJjvJo^?0|mLso9nJS0BYSKyf
zY&c`%d`{$azGcUuWx2W#HYUtA{#N)-7*!{`>UP`9WG;(FJJt7mo!srolaUTW*gDcN
zv$f<LFhe~4smvow+=v~>3z<7?Kt8!qlT;;`m*%NySYLiP!?RE+7$cV_>G`S^1P*4H
z2>;mOfy-uKSooJx_T&CtQ}e<JHlOSal-Mm6KjFc$?>iU#psdaPa0IjO*Rtz&(5#^v
zoMHiCRx$2vDa}iX=`q?}T=CVqD77zAcXA-u`@vLgy4KyBBljqW@^C>kOfJb6uAkjW
zW>AG0Bp40^b!J_o64(8`*UP%yuht5acX4oK3VlcUK|=f%-z8|^giNSOLi%g#oq$&v
zJ?R_>KYzui!(Qwsr_M6bxfDF>^$C9rs9^>+$Yk_D$ci-QjCdN}sN`^Yz@@5CpW61p
zP3^kUGE8GB*FIoqEc}x|LEAQZpx>onMtd)-N`fK{8kFOQ2<W2Q_X#BViYNx2tfvGK
z|07kDA~l3`Q}WU){tST{98@enq29nAiBE*;%REmAlS^DOd$MnH<S!g`_m&%f9!{ft
zptz8sE%XE4%~vr9V03b0{py>J6}npCU<2n0J=xpdTS~Ub%r0P5on=ruzaOv#?$BFN
z$u;EDvCH=%#kA^KzKpGm;U5ASYg0H2HxS{jKW<iw;ic%*lj{@YecBA~@ldEdDdOs8
zq>{<XUp+pq!=2bqiIH5+PbvocKI$>Ej$Ce;@#aCbA{O{4;5mHUe9k3Deb$$`<{ju(
zA2Ls{)*kKjRV&e+F9&QB{RRDLRU(9`-LQJC^ShL1GqZ;t-R%FxzkDA-CU3oK2kV$G
z7R#$04K*b3`@1Cf>hN6pzM-yrU+)M`n7yI8GdkjtdAygLRwwrC0>ns500BJwJnYz9
zhAVIS{P5lFqdd^op@MWa@Lq6vzIe9va1sx4PW?c!&IV1kcsYV_{;9vr*YahSkV7&u
zS>Djg*E=QIu4eLh4$6Pvr;U|M){~T()cbWhttxyE2BOVuBF+=sVS%A@y>(xRe$eI9
z;R5HDN%9<CN=ZR6o>Z$EVkA?MKXk~S=Z^M!vjxOD;dO-p=*mwXJqIZ;)%M?xS^+01
z%D-lk$!mEE-4qeRXzFAOwX*4*h@i`(yz;E=Uk1V*`7F@>Z`rB;EIKnNO;^7xy(a_>
z5$iUZTy-}|>biNUp}<~9?iIuxL>Q}N!}~V@t%1=QEc#Vn#5aVKCrq$3uEm1O`>*JQ
zrITYW8&_VpW8RKAW3+ak(H?%;*zA>xM~g`q=agaJBRR4BuZ2TRs%R|(LhIxn^FOk6
z##{^h-(ptQ&4+F63mhoyy_&U*$Dpu;b}hqssmyS)Bb3o<4e!WfZ$ByEER}5kAQJuV
z;D<$!(<O*Tk=!fM2ZOTL#a)D?;ybjMoF1{h%t%g<Ux!Mc&?`sucAB(Z%q{-%ob%qt
zNT$Q-oqv-A*o#kIzMU3(l&>j^knPg&Q+0H3M4<O2%XcvaueP%<@~ZlWxK=B$ad+n1
z<Mi^6b#+a>t|z^wSJijmxf*%BKvxa&Kbe3JT*jV(y}Li@RT*oqGZtegT6gyPCphcT
znG^;H`+IWw4g&`(a2?acL^XFzbQ$~QJeiwCp*B7Q=>MLFB<7?OR(E#Vas;YB5~G>J
z?1$J){NKy`&(MhyvaF*$bM3jkC_~{#`IhqT6dg!rvT;6XRrcCCzVmj(>z!>sJkQ_C
z@-iu1Eiq6rg_@JrXBtgHl}L4Z%5s*MSCE@4Fj?A!@P@~bBe2G|X!P7UI`UKmbxm2;
zd#7V@3&LULL_ql%?H9%*X)AdF`AO5(|8O2F7l6jJ)Hk0T&+en}_-EK-8JC$x?|pC1
zi>6*i<))wCKH<4T$IH0nm4;gl30x3`v?~l2x6gRFg|C-@!ntGjdY3YseSvUHgX@Zq
z_K2HY_MnCT@zO#^2YCFiTi3P($YwrO|C#siMhk)ApHIs!7vl%>f9Wg_brY{CTCvu;
zZVqgrOc&s*w^JEmn!+<a5&3o(A^*V5$l7r7jeV2C(BxAiC25g><%+Y0>a^+yYqJXe
z$39BrOs@gOTCc1vB2k+l0q<U+ciGT;HGK&za35K)K5n}hEH+4}g5LW~HzPe+CME$?
zyla*iTt6K5q#EaD44792<&CB5<sehA2fpS{!_mkj8-LGQjvmW^u_cD}N~~)3+lF~?
z!jYx8u7sDzF&r`gSvQj~uK1uK7dGTY%;4(wPkoktpqHGH+hGYZOsnqlKzJ}UZg48O
zqcf1_j>PZ#MeiuYPyepI;o2DU5HsqPnt`g3kwMQ2zA-Z}Ez7a;HYZFCG0c!{jKQ2U
z+Y;bnB{F&VV`ABc!jM%Lc8hX(f;Ju)OPl4*Hs9WNIF6z6Qp|q8eR7h&{-?5ZMf5WG
zC$C+xFuid?0l_t1pAM`WAX!aJew2tB({k!pMP%gHxlV3+C?1Jz`qGzSKGMP8`Z3gT
zTeZAX8hBG3<(X>gfMNCNR@_Z7+-LarPH7uVHOZR7)vZ^sc_Y_)+IGFD<XDR^nFUp@
zK~H8cK=`}Z^_^IjyNZER&&Vozy|VBE;Q@ErM|tXfKnHL1nDg7V9OzqhuY4Y8mzH#!
zH)uYuJLCt<gTybxU%{(sCVBZ0zf^K5M2wo6vXxN*mYAXc3ipH7^v9h^!rzRlLRX)*
zgKoI)cKRUZambw7Dy0%mFA}vE<+82x@7FInM`u#9gys)SjuA5ACo9v5HRh5?KF7gq
zrbPdROjUi3=jxSpZjQCF#+1W7=hjs0F(bRMb)G1tar!*RpS%m3xp$Rcu-v+sUVPQS
zP%IGDO4<KQN5fcAu<ar-lVw70Wj6!@^U|){t}GW&xdNy>J`lrrJ;4W3?*!z;&4w92
z_w-z(@_F9XXt-(Q9vcE{RURqB_w>m+Tm-JJwC|pXnzAo7Y_9@yi$NmB&ZOJ5vyThH
z-m|l|X3rj$0VX!UsM3f9F*gM~E(6t986@^t&87Zq?PS6yz}FuLJ<Sb(rL*O?)&R^>
zKI9ytx9tIz=^Yer!l~<kmdm#1cQ8*zp8?4AQvygf1RVdpv)?gq?}75Up?|daNt^zY
z;gA{5e?wpC`YFZG`yZ9Gg_3q~Jxyn`M$)NhiahN@p-bCqZ=f?GeU4K)CFZQOAXoq}
zdBDT)U*nqlM*YZPQFW!@tNE~?Pc&x}^fVEl3pqq0GS1cgnm-r|UB=a_(pHl(kAJ;x
zuDq6aKK4nA?NLNK&2sh3Kc$$OfX;g`m=;4&au>G3pR^wFRojK-L*Ud{z|H$`(;QaT
zw^%tY1GI6nS5|*K(N{j;cqzHB^0^w34A$SNlEC~OCE~-`W4ZDvLR@pnNVBL8@*?_w
zER7@}3o!(cYGwjF0~7~xRMbJ442h(4PI?x8k1Fp{1mp_&^cx4`sxM%bJOBdW{>|b$
z*9;5?6=J_@@)Ia82OU0>dA3li$)EI*;ttJs=wZgQ`6SH?y4EZ7elD9q9_2}H@;BpG
z+!ajn-w7Eof^Wu$%?j$=Qo#{B$oiuU!jJy^Mp65KE-j?as2l^cg)QOwe+T*AYPZY6
zOHEb`c}&-P4~qhuCU`mwJ%IyUqj$?9DmGdSw883$UmZ6#Ok%&x(T}|PTUnjITTe6k
zl1PX=-Cgmuix5xZiGOW#oxi`gw2^4kZ+ImV_h!ClqV;7+RC+oG@w8H~aTINi;;(z2
zGB3Jk`r{@<*MUV9ifx#%F9I%lKv3GpuzbU%k(@!j;(Q*DC(#VswQ*otQrzoPaFt$5
zC@-S`1?4#k3_-4~STT*7NbK<w9B;{RFsVY}>A3btt%jqbN+_I~D6?Qhy-<|TPBG#v
zS7M2q3sA7YdJUINPDZF-YNduaV8e_kAUp{smwLTkDeuhg3Z3JdU%wT7D2y6=bU$R^
z-HnX12>Xb)-}V=HWH#lVH}r7_C4a7r3n9mRdS|L4`A)Pj@+x;--$#ZIh7<`hn4GHY
zSNamN28?4F-4SXpCS40D-%Sd93gUBqtJ6=P!$Y5xV(%20w)trc8a0*n;4Y7+GpDL(
zTH;~WPaeMCaR|5JgjA_6J^?lo?3Lj)imsC4NZPA3M3&E^_S>brZf(z)R@s-xIgzAf
zw|ZSXA|W;0@|$_E@Sk?D!0`l%ia7lbOyx4#n6+>JfBd6*YNKhhQWdkl`}-g9!#j1j
zzkDB1xA9<M6Gb587JtlTe9eBszR*edNUALT9p1c7Zf<BkJBVVd;SXgTo<H#h(g`~b
zt($o2|Lx&(H3UZbQVyU-sxU2e!_`6_CMd+HDjkR=xA*5IHI_rJ`>oU?LTnp=;cd3j
zi%CWFY^+kSuX1X3wkhDe%^$PAe$UNaL6Kl);A#BKInnfVdiDex@Nb`?=3(#0J~W+d
zGM5^~eBP`2IgxrXmQ5})W6j4qY->4cAr^{U)M~_yYiyQ=EUcyhkiTjgcF;gR3V9W7
zgIljod~Qi;(Kj<ec|Jys(5NY}$z;Xa*EOw+SB*?$WrK(HQ=%F%6XlPrN_|&EE939$
zYqm_<AJ`w*+#Gd|%S_K-JyOsvkf1zb1Y8VI`BOHU@JGDDrWdj2c^(nSe*w;U!V~)3
z)CrSlREM!a#<ai42r?<jLrm<xKqlj!r)c3_<v|2`t3FHJ+Fj{%SV=0jV-?f`zjXPW
zD!$>-;Lj;&8j^bH14op75qgDe$$1%Wnq-4CdfZI(A^0I)+QPm~eO}?cnpIGgkO4}J
zGcOCbTg5tIe=o~|ZC(?K8a(JGzbgAZJly;SprRsulQk}y{J>F@533Gp>+e|*BVZEx
z1(6Ytc6MhtsM>dNGD+6kMe7+J?Kve$CZXd$hk^IP*cYn)gTtc_NX{U1HhbB6Taf@}
zXTIWUuuGZeI@WPV9>&Mq#`*L7Yg5__Kg$dt3?gq_c?1>bMOsR=79?0j@p1XiqC18v
z9A*p#;gA>GPs)~-qVP#8H+<hvErDCcIL;c}pRiY94domsUogeV3V)gPp%lM?BBOBV
zSMlquMXJMp9VstLy)_fCdEJa*u0J@wB}@tbb!`rb@+Q0=l4BSWjz-*9n`@FN5|4+y
z?iG&NOt~isgV6t`dK>Y2ZPnr^gkneO#6CBF5}2SFWd`ZvAI-J?5wyJLhUL~t=*##V
zp^TV9#2_wE7i+x!G=|$Nz8Z8pl0UUVUf#|3xaYt=^r6~^49Z`RcGwC(f1Zk<c-Q(^
zr1JY#O~WI@2SKuw09<4x<Gvu<&8T>CwB8*BgYW)`e{aCTm|9mj(YnwydkccVxpA~}
zf|w8}e93>-p-J&rWvbj+8R3X=vxe_3`a*d;$8+8+R@x!%y|5o0{TtTfs{Tb4zTVa-
z&?!5hFQi71PrAm~WBIyFi{z-J`so141|rjr2<GJ?oL5%Qck;%--Gy4En;>s{7jA;v
zKRzx38s8RqYqSK9#PhU{TAugb%?2?d0e}Xe4*7@|`0Kku)_wBAQqL&yMJdh%CA^5j
zAI4HvepR-u3n{)$m?*vtt?2X4!XBj8CtR6M(e(24e7<};><DNl^)KUciv!n&4rkB)
znL1!0^|P0`RNctUhgWckojxpL!%1-rF%N9*{r}jO9`9JZn8ebRzLugDdyZ2r$NCU{
zzE;6yI%PAqU$_U!;Z3|;F>^td$=lKC2LIJUn-3jo_3A+pk;;XL;+3CdMv5lNv+n33
z*XVHBrUy5)>s~VXvfmm2o3+lwpFUhxPs+ZBmSfCf<Fp9{yfig}eG47dLN=U_sdhB@
zN$!+^jAnQbNWT3?o79tL)j*v6wt<FuQeJ}8(F!Bgac$Y5bjgv8`E~pkE{BrgAMB=h
z@`u6kjRK3QnW^SlAjTD8vId#ry}8`V1vIb&eY-&BHBOESAI7ztzR48IV0z5UB}cG#
zsAFEx;4k#5li#iW{hgy<r`E(F+HS?i4f_}7W<Ky4kO(*5=DxPjL?-3A`gnW$de+LU
zj_&X8`?p*S%&4lP9!piozS}W>neMyKHJLEUfPdnWd58!a+N&3c*B*v^O#;TS!A0MV
zoV=QhL4sE*&6}3l^iQqsuD!Z}dL_kaO~bbJICQlLRE%5YVR2dFL=u`JRO;0v#vSR>
z<kj0{u(>Sm{qDA@G5#Y-cFVcEBK+(!gudg&qs*9laN)jotw?4`i(y92I-~W}LBRn<
zUi~hftAOY{dw54dCe9b};=;X6otp9~6K&h?zSlU6`ad&33`Yu&`yC&?#%Wx&ba)a{
z{8NiQ!$8MYrwn2>{T_wY*MSvT4c?8no=>8SVy7T8oQ-UWoDbykL4HaL!JO=ce+%8)
z)hnfr(S&zLcSJcc!*cHGs;RKHmOZ=uT~dPdq^eG%`rcp4S|<~+BiUn$57$ewS`kAp
z?UGp_ZMwtujcoTyi=m4NBh^;s|GK?NZGRV>{?RNfQB6D@_z>O4V5p-nUt;p_hYeSx
znEP1L0nY`~;rEe0jM57uc|TGAlpY(!3Cn#CEQA9?Sn;X0zTcSk-Xo^8{9b`q*g8yN
z-TUIhnef{Fs?~b~k9az3mqAH<eR+s_tbJJ>XitUun^&7fC!EM+`+NJKMi2dZS(r0A
z5>A?lz+t}yzPPr+EWs@l59UX&<J~2YwryyBC48G;1G=z3Zy@&+dFmOho-`ZS(qQwX
zFE|4Haz3MViIcFz)lCnw$>i$Mlh0XG3R(^W7!2B6Su&pwqo}B$P^dVi)Zh1SJbm!Q
zKZW3xc85ml-A%Z;1(NrL@U=_tU!;L!$!u3co%LyXu~yfVN=$*JRKJ`g4TpStAGNJD
zg`>Gt&ax%Li;MJ^1|n=o=@d<n8V>(LHV+0$>c<yy173R1-7)HnS*C!fhjB2<8@(`6
zxur$}&kZ>V6qespM<~j&?M7KZMOM2;M?sXcAVS{IsVvFpdnqszMct@xCh8_a51u^}
z*q!gV$n{S)I@^Eue-1(+veDMtU#bC5L!(zN`LwoNF6y-g68ifITJ39f812-A=w5@X
zprJ~1hb*l?Ar8m5v$_K_Xx<+$=u2psuNnQbWug!J8mK;8*>CX_m?>$hvIYs3?B*?s
zKf2dVa+Wv*`$#*-fM4|gC~dScpqBuXS<uHrMf6Tr6q_HO_fI8AR~4|sT+NegQq?+h
zsL5aMZ!jnxt@{4*+E-)@3cV4fzF><O&%m2#KC!Er4LCa{_|3NX9G-QM8HU||)$Jh<
z|8{N8zxgQ#uGOiqYNx9Hj^=gwdTO}?KVpW~75^GKtQRU~7h6ys#eaab&;XNvjA>3e
z&bH5yE4a9uJ@`-{$s1P>K#Y=u1Up3z32e?$gt}V3U46@ch_&TMvhGcF`WL`kh@^0l
zi?P*fHcv8|7xrEQe+b{mtns;(_hWK1{}LYXMc;sz$zeDZ1VN;5WxoJJ;=8iO(8%v9
z=A^pJa7*Y6ej>WxCthN`vh<G36&}Yv3mQYrAFN<tYfK3)jtX?sF-H^2X$7L8{I@0x
z1Oh!ks9I{4-(WO@4tBMkowq@#r0+}B5zQnrRh|*nXKU+1xSBG)=zp`dXNB|hLCfBB
zz`Dq3YuK$O9ovwQ4<jlyA`>DisDaskUkoT1)tC(}N&QmmlUfW;x%6q*68RfSAq`IW
zuon_(Wo-RS)K0XLCZ)yS{A5@lRn7+7uS~_LCw^TGx=K;k#-Qj~HD}*XYo^JRIOn@$
zq(KvE7HLUk)V<Nu#Tb2One+WviFh)>zWzIVuw@E{^<kmZ<jn7ryQn>C0|~^BsPgOT
z{`}`g)iLB{f){WmQsy}7J83P~m(Tbpmtj&2NkfMn1BSh7Aq~P=+Jl0X9EXS>!X*)3
z*#TF^GC&z2K^7ZnTF>!F`KZJRep6P5yQIMuw4dKT5HDw@raGB5h3gh;Tv`BX;=&hM
zg4KqK2E5$h&c)&@20?Klv1-0hu}7a8g}jUjXwHkvMjGW_hA^Tp)<M{qnKPd53iH;M
z;*(+H=b^{TOHr;Z5*P1}b7qQl%^`J`ChQjk43q4<eSOmkE-|v?eUG4Z8M!%WPr-Mc
z4hCRMJ`s-6lzNOTw-Mj>SdJq6I%sqy%Nc!$G75~_)-Be#Jh0DtRowCrmuoV6uJjiv
zXkcttKT_SIRNjR;ldBQ+JS2oNe?Fu9*Fa@0ZtA<Vh<U<Ur;_d0UJG*7xtlvhyhai3
ztmN0gz<wW_NWCyf-4IB!S7wi~9?hgo7wuty|7$JnH#4OgYSxsI`R{YKd|=vgdJ24*
zdZ@BGF7G6PQtRvkJpR9%td|d%z`cFF&vv6Knva)H?c~$@RO#EoJ<;q8<Oc$=e%VbT
zp{m=08Q@0YJJ(He7oSeK6uj&O)15R=A5z5o%7au9KN_UlBEvfr9T^xR3sJG0T^}I7
zBiQ8#jpfo;L+AR(57yQ|r(#GsF_RP*r#&p$#Xi8joXh;buR>mi@q<d||G-+D2N5Mh
z$HRH$S%+vf4y>I)zUINAeOolJmNCuSrNykg8nH<Kkdd1pukU-G(1_x{Onb6-G-6Ux
zW^AP9Fi_qzVFLXEY$wNpiFgoYwn??Ex+%PFf{L9%IcTj<cRb)3H@^W(O?A-Zzxq4-
zh$-Mu<b1b<%Noi9S6-8U{PI!v)ODU56x;+QRGmwS(ds0gh0Z!Pwi!7vy;XmdP?K>T
z0&Sxsjb{T$BI4_M{ajNft1h}2`Xc`>Pl)T3+~Xuu%M8`~#R3AiVlz<A$YiGn%Z-<K
zMr@7X1ZljNM0<^H+e-Y~6~^gmqAmnNXoTM1MG|pn8fB)M@psPslSPkix!vi1gAI78
zcE)yoFne48OgB09shU5Gd2(Ubq)k=&N;=DOPshT7*e?zu*-ddRL1{9HlzeiAhSa*>
z;?-ov9?NiE*s*q^GE6L1r{(?|`16y#;X5ZwiteYSG$V3s(St#Y?2M46V`@3PJq^*Y
zt38nx<aQE%Po#-XraQ{sLPlgJPw7aWgcToqc#=MItMnQpa5hp7Vp38~9dxz!Y)jSN
z2t<wHr-6f?!8K5Q5Ywr6BB3Q}Dou_rlXBCkTWb7PNZMLnz79$9^7r*Fo-!%=b9v27
zIDNk2a3)X3!N!u<BwAyl&vx*vBg%7y3>N9@lWcx1)<(|09s$XCjN84s4;`-TC;s0s
zUk}^rl&MQi?41X0(agp0ZBA7RI+!=nX<e_XebN?=`?Y!uWMadwBiBJNXidd_YPYhq
zF9^aD#_UmH(WDnhSP+x-@bhyzj8N+WPvWpElCFePyTB&PYFquOAukWNye=`=|EShy
zRO<tSeHp#ERWq7*IPkeP4K$r92K*jB^1>kBUvP`L&4u>+eensrq&D*W>s!mB#Qr$s
zuP)l_$UR|hN;!h)hlM##2&A@qEp?p2**AcCugwSX%$W0zj>@g@?Wlm=mNI*4rnAw2
zb&IRE#A!gmJF;XqxSfBrJdo}ZKp+7qeM7_SzumqHRPXX=5((H*rWspK#|aB(K;m1r
zGhSxknmO+$9^D{MZ@DX=`J24=*Op2qzYsa1w6-xxjkaKZoe_WNTv1`AQL2qGLT$Rv
zz+{y-^w_M&xZd7Q_A-a|>Oy#l@^QE(IJhf+`+4jRfMpfcS-lEu{S|%~6Pm?Lj{l(Z
z!rgCCKR9|4ykR$cT)~p%Xt_#THKOlhvdw_kCi(1*tUFR%TSet%8;<>w@zH-U?Z+^3
z<F`?;n>TR)5s^Sy`X<q+_e>{eCnqL`CJrP?Q)3LB6^aSnt?@=xsgrKW#oX3b`gLSo
zq8_BJHXgyQADK7SA*c1J&EaKJEXAi7R0*k`J<`T|8;yKW2lNDZ0d2$d45z@x6(Njp
zpVje18HUkX?2sCQkd&GfIsL;W_ARzG0)laq(2C;~k9vw=wi9d=zE^WW=|aG`RsZLa
z3{t%<!3@8D;do;FN8y!*iH)Qh55bTZf?mTlDR;Dm=dsnKDl<ukd54l}QyOZxin2|I
zZpLA{#8U?r5v{Mk|CIF`NzEVFmb0peK5q~^rwSkso-3U<op*li5h!+-%VT2=ztI>6
zjWqwskF9gMP|w&3c|murN^$?=t!VYq%J+W^_W@cCu1j<L`bv<%=U;&;VnMtt(k7pr
zxR?=j?`ZEqAiQY-ImMkkCM^3iF3`<-D2c{Ug@#+%9qWGIIh`DAN8!YML0MHJCUGG{
zg3YC&+3_Mh^mk9fRk=!}6fAYKsjM!8m!-OfR^-`ClDqaA$RG)#2GzADZwbTuZTW9l
zHP-oAt^ogGJ~Pncx5)mPZ14b--=-hXlNND|uHYi7z%yT88ZvR{$~nA00CwZgilX@Q
z{;V<hZ=Z^^{If-pYB@m5&pcSbE7C8Fc?556m$9g*)G=nk_#bYNzZlEQ@@d|?9hJ<k
zmklaE@cShqE&m1C8xl0I3(mg#Yk9>uGi=Xr8X+MEh5c9ko{z8#gM*Jhdl=f}zmvA7
z$4xOxL)fhDfd4j8vqaNe^~v}3AFKW?GOBDOB3rHdLup;7|Gi`5+tv#;7?+I4W$~F!
znaBvNcWL_gbk>MnyQD_V{E6YMh>L}AWd7>7dNfdMRi=z+u=lf7g60}4mu-Z~y1vfb
zhqzye%Q+N=%aOYE9__y#;bwVZ#eDMMye^;}J4d!SmJ#&RFnrs9ytjYWfy}OFZx3%^
zAkNOFQVY;{pfLgayG91hAWIJ_PsK%mxJ3XQp2oW7Gpt7o2<0p&eJ9%>lU98dvI|67
z{K*4&;<DE@GS64$Pj?9hDNwbvhXEb6K48Rfu1QZ`I$pg56M}xvPqX5(2F@;wU&7l(
z!KbKzG2Y*&)2EV|Wj<BGx+NxQ!A)1IVjzJTsFG3opdME?-*0hx2737EDYN?o!YW6+
zy^$2?XR+e9$&%R|sVqIVJ7qZAwRq3}gfJd2p`AK0(a{n7zECA%S;Rh;B`lz^wrQGp
z*s^chV#5aq@GVCUd$o3P$Yr!brZ)J%LVgG_Y&`qsqm$;jD|UzA6!-0#PO4$t`<EMU
zzVH$=d1vi2)eNM<-Y9%5ev3S;QX=2?UwY=>C8r$`!@(C)JMJ3O@SFM93|aV&+mG@-
zu9J>{kV#hiwcw>ENg4Rt6<HZ8J&nRr?<bu(@fkHngK45Vgp9}!D6#Ht->iJS5|3u$
zeGrSN!#Miz)38)Jd>HVWtK6Hw8Bpq(VmzIxZe1?Up;PSYuYOng>H>T8UU`o4MBa}=
z(W=O_I*Z1YNdxB68mycsh!;P3>#w%Rb<z+p*m6ejN8kU+*4`HPRt?VaP_>5Awx;{h
z6l6)u<DGh~?i{S<8(fElS1QU8?MmvN;<HK~UwW*=0Og~Vx!3Y&!#x&Zm@1S1RgyZ$
z#F)iQc~_c(S100&D>7M%(6!+$lNKLktLJiy_x@uZY62fPYcv-gMHJgUSc*|mPyc%v
z*a5}nY#E$QR9T$LoXvDN>|qX0k6QFZBWs#mUu@-?BY?rD|6lLZS}i5TAKZTvnnAVD
zg0qKWW)fQJX6O7H*^NUi8KkjqWNVX8n=x+{6kjbb?4;B)>nq&MCU_}sbWf(Ajxq3r
z{Sjv_!f+0fo>?@VuJzR{)RI@TkBcVRYOp>k;UNX9(GdQ;axS&6nBqylkrdN12(@E=
z9s(hH;Ad^b2v12~-I?DxoA;NAee`RL$ROfuA?S6wyA=+qwEd&#C$Lp&C7T}B^uPb~
z6xjI#sKWX7*+1V3FEwSOshE!S^pbMoGI|qNsLs<h*Wnznul<ked>~*|`~2SLPJyqq
z9H~P&!j=y27-2fDJ_|CZtbUwWW*r@wbG8o`<}SZ&jX9)>QB(VQA5qb+$DzrjHnGM5
zznb<j8*&*~S>u!oR|=zgZyu*b@l{Dv=pY{7WQGL}p?mp1Lp{-Lw;@hLU&W!GlA#}b
zcgbg(cOfj-;#w+&*MX6;se9k$G&f*<{>|<FzIt%M1i~u$`}yL2AJLaUBKe*T%pT2v
z18X#0yP~3eARB)_0F08S<c8tZ=&!@>83p_9Stg0iT&mIhYPYV=qm2!o8xm~BiH9QQ
zO(%=P+K1$hWDCFj?d@+Fu`%g)iJ4qsK=-BEp--rCj^+g2YYj%VuJdm4HwO3$N7~DZ
zf*5A5vfYr|W!A6!@~UOl18J`X1p&PJtk|B0MF*$DL*R~R=3+!*uQB^>&v-2?3-y^e
z_<4I%2F?MO;ls{EJrgR@tO~-EF|D;P$R|Y(y*jL|lv!{Lva|h;23M%&HOpegqu~x^
zKwsqAQXJ6>W<ah5N(bY;((*0Y!!z%$g<rXpAu~hbhGRyCk8i<QA5}?p9W1=RPLh8`
zY*IB#QZvjTVdp{dbXg~(RLIc(2odGrP;qh2w=?-u%uOKmhHulgkPm+@1&K9Znn0*M
z|K)1D+;`(L*Zub+XmtZky8+xHR$Q&$2Y>$}MnJPS&r0mQ2ei$^1eg(MsUgT@$Qxre
zNY$P1q}vy0#zVLKR>I9wG=;G~ys)t;BMqrx5oh{9EYm#)L&EeB16Z~7O?n`moQ!QH
zxJ3)fZH(!?9=02eH@<<bwCrS`^PaC|pWd5nr=?(46sCc+ZqQ}&8cGERECi0P?Cm&k
ze$1sad@}m;j!}cbC`{msGun<x$3?(qZ_@WpvP)lM*~CxZkWzRdJP0MNMG+_-%zeSz
z-`~iPdbiEgs@B-@KRbsWJz|Ctsq+Yu{%1FhqSfm$wnTZ}etP>++Z*972*<VJ>jusU
z+C-<HC%R0zzpSodOKJNjj~whZA1|6g43q{@!Zt17N5G?z!ZulV@apdz?vcUg+AN$g
z$!_|xCFjKj_sb2_9j6^>nhPgqTYmR27}{ToVa=MwjkA8hxo6&3>%a3ii8HZ&*~T2Z
z>8UWIq>wpC8rlrlz3aa`S|YRfL5&4u`*kI?xowwi{a+uH!@LaTvhTn(#47z_+w%<U
zj{o`AlegzrS0e&bxsxShts~|YGP$_2pAwkldf=t?%YP$6(DFC5Zi>WbGUP3SV{dRU
z&I~1&&1o6^U3>LJ^g_5$#m_Pl*8fCyOU1pok@&GNLXwp^zP&vtofI}%&N8J?5JUX_
zCFp|bdfd-l%5=W#h(zho%IiVpD>2V1UdO&zyFp7C`DghVP3+HjDxKv=A&Z=!L*^n>
zG~V+;=BOpA7NvM?3D(={ix3K3trt0sskL&1P!39Y_eXRW=-%r<F81vcMe=-W$$fbV
zv;PKKj=||u#dDOt-a(i>!2y>s(W8N#s;U$S_CsF{PWNYz%1ArU&LjfxT;Sxcesgvb
zRWnOIWBdGTtlR$y^2zCn{;TgJ5bhWXezq(n1=02W0jJqGxmP*=Q>#zl)GNt*$m3pD
zN>TvWE69sv(EBC{;k;3{Znc<aLI-(5nmR(#Q!oY}@;pMM<pVm|<fg^Ul-cH?Sna|N
zd!vL#+2!#9lAZZ-ij{EQ;?yA(AHZhXrRlGgmtW}zXPuY1!9GB9t)0BcHJ{xt0Hlq<
z{>ItOdSQ%F6Dje!L{#x(l}?*1Y)?VL-##`x!dqT@e5fiUZzSKb%p2>4Od<^HK39q0
z@ubn>k%o~}mL?D(jUEx^)CSg(g7w0hGSqCq@q$UhrQLmu#oU1iel1?h)T~yXILJ$@
zKVvwLw>XIkm=`g9!0RTEb|Ojdw+4q*z5%`%#dDz+;vvCI1qyjh{+CvDc%?U<)G)o}
zt4Jpk{6K>r`kES0ocb}+%FqaPM~=J7Z?FtlD)rtvOcK5>Y4j~myPzQUqMP%NrPF%c
zgnbF+%#X*u|NHaJRtFIh`awWu^zTnOqAG2ms1}NU{O>Q5lHm3CQ<YC3WBbU6w(EK%
zqY4abG3~GRTjfM=1id0P!IJ9#>1*dJvxnhl|M_Vghx2w04gQa%^Ny$T|KIpAlcb|4
zp>T{GStn$VI3%-<m2u3-$liPA9Gj9I<&$+d2**Ci9*4>}%9e5L6DK6=cYhy`pTGQx
z<9^?-_v^Z@=M}H4mJqofNKt;+^9=kdr@r|nnh|6cck7i<;xxTJJe`3pHvuUMY4ao8
zin|NhSo6H}f!c1*Y2^+S;h_0AlA*7Mh0m6nr{a~>Aml0rT%7C^nODECdo|s02E43I
zoNh99yxsplQJq`&Pz*yDfmn&EeDIczVI$`Uef;cc&)Wr-*>YiV6X|QBqPr=1Q<b}D
zQV~u)%-0661yJ3xDVR+(xSVKg6n%tksBZr^jrrBset-q`9l)ng4!bx(Ki3`!x(amK
z6(Y<_0L;pNHI-Yx-iJ`uCU+7!NzQ;{BFCVV(tV~rYJ59$)_mDOIVkAl*Slx+N~*5L
z^?lhQmv`)CThMtWw?XeIxQA+t%3wLNB|0i^;`9#yZ5&@7e;``?!k7VO+1oU86`8f>
zY*q3TwWuy6z$edb$y3cU{6^H-XQn(oasI}7t?t*<Ykk5dYpbC{r2lHtj#!J1fG!Y&
zV#JLg7(bO`39$ESFcIZel7FP*^UiJhvnGo6kKGW}Z@viVG6xk1{XjTDsEA!)AcUk|
z>xYi_MLz}2_PVkdegF91q9usu=3ZWdI6e2EfH&{oPj})M<w?`I)mMW-wB9eQs6f;!
z;3uki=xd{(Fg6@Mnk5U&CIbVV|B5&)Bpie)s(jug|9f2sfUC%tjGzAx*Ob+kUmDMK
zxcod(L?`TMcNeIhINvDzxBJpV`D`C$Jq6L4M}1l!6d*_Yi1n=JR}0Lr$F5J|BpQRv
zN`q{d=496pMt&U&utw)xyFcBs7>LhIeZ`hb+^+@(9&c3lU8)jic>_{E>M_Yu*K7SG
zyTu_aLb<R%ywkw;=T$gbs1ew#owa8$cC9-hUG2Z)GA!{vNEM!MisIu^q&IG&pDTwc
zlytCl#Qr9*)}A+n%JoV4&b5ejZRhJCO<i+JezsiQx<zO7ZYn6aspASP7BDDQQ32p+
zM=CHm4E=>QA9s`P&c83|l7a=%xwjpEcFm1kmCFvJNempHqUi%yS9@eRw6u1304+~6
zMM6F7h|MNx65e@2cUeRj7PnR5%JKx-n}PT}s=#jqhOoXvU}{dg$mbd?SHH`V8%rvG
zzH9m}%OcUo^DEQ2fHXJ{9&eGpN%f=0%}#mJO2fk5kI}A)TK;izUV<tOoN+#nuiU+r
zUTxS;kV!cfOnC4l`}NNfrP}=1jQs!1?*}KIIWaudR=%5x)Mk`7piCc?>x>5t6$g^5
ztE)p9W~YG|qwZ&Ueqjc18;tXcSRm}C)95jxg0jnfc5?Bp<Iv+=x(SU0sb)_Sq{EHy
zNit=SUkr+L2&bB##nd(KuU7Q@Dr?<3M8j`{PcT{+6}efUcpB!OQqk2`zj1T>%v%g<
z8i{~MJr^5h`b^+>1hRNSUso|NzwZ_-gnM3v_Zh7p6w~DE2_oAw6zRk1WYIqJx37le
zmW*%IS5+MGYQ$JHI6q}$(wE>H>U;Hy$F{-Sf%B2-JrO~P8Ox%b-kj1i#Z?C@^`zy%
z>`?&4qF^jd!^_ef5%_oMTEuhL9mGB_Q?~@O!rEb|VwKLrilHCL-9Rvk_H|H>&(fm3
z+s@uzxenfkD$Q<+*VcX)1*}pee9B$1BRt`6f$BGjejrtVy^MTg4~l&KKJZ!1pSFBS
zp6shzfRWw3@kp6t^F3g23Y!WzylVXEg?&U{*i`XH@<}?~?tFx#K|cUXoNM(c8HdG6
z=#koW2D1;~FhAa2)*D{qxq}M4ZShYYagdnaJc{-d0I~A3#%|Xnm_B{>lHpgbYdOZH
zVXExbqmxvVh9dMfI{P+=Mn20fOP>4F1kr_sAJ&!X)^5wE<wh@n5iU9R9My<q;85i&
zYhLp0b#`To7jdDwBTIh$@p3=d^0(t`B<+_V{)gq2LRP1`-(NjWPm8|79<aPLRdDn5
z*4GyQ3&7a<o|ymoW5#Ao$k|2(Ih=HlSUkU=B2NaI;=G|A^P&D>XL$1ol#q>1`3@jz
zimv*~f2HWcwc1zSP;zz39dx=GVMzcv-{@TiLgkOTiqyRK9iLI$L%v7v@>hQ?EUP!Q
z$R{oTJ`Jr`I1DYw1iETm0E{`XXb8_OWjzH>hmgJf;wS&}vXKF@UN9h;-;nzs$Q4kG
zzFi$xB)z%(Sw~3@c=R0#9{DvcSl<DV>@f_S0n5GxCC(|(UH{gP|6?O0cZ)(8YA$E`
zjC`yo+A+mu`@ponD+0bwsXx3xZ1GX!#ImgxZqhk7ou{`aS=13Q*+O^T95H}0Z#3-f
z;TLxL*t=WS>|Jrjsyt*CTdojek>Xck%kwiopS`XkU!<s<H(m@`pIp21vFrnoo=2Tg
zfqVe`SI0r>`R6eXAHEWkjmJE^I`)tH8C>ELK%VmNjXsf4=J&1I$p0pwPNI{${mF3m
z29BcS;uR6$c?H$$Z+SKx|Gv<%b6o3-Tzvl+e)HjOA+}Zbb=XqSf6oMxy?<uAR+Iu5
zA9-y~kWJ1@+XlVRK{E{k-$KDyYVFN^Ewsq?L}><)ub6a8y5*gc^k%v30(IU)J~NEi
zHGSXeiDfeT3{o9`Fk^DX#xNPVCTZw8>~Pq{orwQiCM0e5S<fSDU82S>0|I_dzn0oW
zilh~`yz*TBR(iBNzXn*9r$&MHk*-$^h*z`ik5ry;J_C!m&K=mLuvGN=cF;pBK{<$k
zUjn=N{n_a^J^;gn=D$lBmge?zPU))fhu-DU=HB&ZeOj6WMw2F-w7=c&muY5H<A`Dh
z^Y>U@6zEiljE%lzxy(yXruuzJ<?qtj8D5Dhv!kG}0K@|ncT{6R>m#=RYpDg8k@K@w
z0Wdl8h}81Z`O+opP``kS50R+u1B3tVU;(qS0$`WVm0F%vNQJ-mN&Y#ulkaeOt*wWB
z<bWrTtX?DV*wqa=WinGM0xhoSHXdY)cLy|lkE)!RHWBP3`#2!lR997l7B@D^aTv@>
zes4Xwx~j!i_0K}+@#>Vy>6hiXlhRfI&#s)&7ECIkK;tBQk-i0Bgm(E>T>Em{8oS*B
z9hsSz+S7#jxnd-Z%?~MCH8H=Gbk-+PATS%r52<~_v2M#K-LeTs?2Oz=&R18B<;Tol
ziDs-ZfPMP*>GE*HZo90HyHPqe9+p{NkC~5>fXLDY3?5{Hb008Vk#SFX(i3GCOPw~j
zRy=9q;-&m|QFM`$<>1#&92!ll8k;hpK#mW)acnD>0p1GlXgj$G@)-042wNu~7tX2*
zFM3I&<r$B5;N=A1n1Dx`c%(EX(DyAi-V(YRT40y3)gt?NxP^#7&B^+AKY3yHItS47
zfWJIqDu*UG>3fj3n5X@JEARF1xzAoZ15k1(TPeEF8eafsDv(RP)!0ZM1LBdhV2)p6
z7q`Z9q1>(3Dz@?f^S3P76&f@$E(Ms;fQpMrbDBu%HxyjM@?L$F8QjHwoLmu2(f=QM
z$F)LcYHHbC<$X!(wId*0ogv?5d3#d-!_g=H;wa4R*5WF1OE{7;6F@Ez2q7i^K7-jj
z?0fE7f#do9Kpc0IblQ~|tv?)%R$r)dsI{ck#Y)#D0iF@4rm=LF)~Kt})U>d9AY0eB
zqdXZNNMbBljR9x!SPRY@$l@3UYTx?To+>a^c+y;WG;h|Lv~Dy(=o4ddv8%FtAoqtC
zwC<~n4cciH>#31;Yp$$~V>i^^YN;G{37Xf`yzc0`{{F@q@Y2HgB5&E%*Kk56lCPQV
zz%7Vzm^7a(nUK2cR#WeCPO4-c4EIN8P~`PU%apoR0LhpxB_4!)WxTYs--|P$oY^fo
zjwt~@nQI<njaOYV2ZSS%Hu3{2;uLjJuy9d&J{wWL_eH!H<xr>b(C&Tiji#b!`fc|>
znB?Tl8aW*(x|Th0OYMHY&egh`o)!M^z`;O4)GY;CX_o8}yCKez7?r--G&A|@JU?kb
z82V14`A+Es0+aLm4aI%tX;0H}VLY}L<N`gcd1txiAUvF_(-2ye{AJxz)EF)EjWX6X
z<FS%&$lEvaC>5Rbu(D=#IlI-^b;RO|`pWZ9I?n8U;9st<!Cim>_yqWjoJX4j$@cYY
z1c0n;cvJ3f;hMt+!sEBS_GJcU)Y-YGme;T%-GBNG@=VKLFRHBno+>On@;#Y5Kd3OM
z_V7rnAySta7^JT8v#QZP4e5eQ(Hl&VEcHt%vMGm*v2t0`)%QX5eHb958*mbyHs{{K
z`S>O<=rCSi1_LDp0kFRDJpj@^fZaJfY3Ke&)wI{LhS*dTwLV`A@#r}Hb-dJl{O84@
zh^f!EDXq{7gAG*R9nes+(W2ZNzH`z}{#H&c^A3acblT2fc}HD-&uhibqf3>nRCmZ*
ziM#A^TO@g_J`?d1lO?obUXhb)VNW;P^yY7hlrL{%!gSL8;u@vF8kXy~aa6&4B1V-}
zxD}_FZy0qrz(5jCf|$EJ;{IqXAoWo|N5C_Z@l)~Sz=*=jwHinQwXfG}nDImObva|m
zt*3M!sIF6w1%&s%ncBxMz~OWzy<g`;w-f-&n2goz{6a?uFzUpUL$lEO<INI$U+9c4
zx%(BAR&C)a>|X2t=G?8#uME!~oK={LtAtVLSr+hNjcc<_$=!=eaqOV_j0aySEo&13
zq<jrI6gx5BO^~zyJ@O1D_>-B#7iwnp{ae6F8(&03gv$G`TF-Ul3iUBR%-LAz)=|oF
zHgPT_N{{h2^*^UKoY69Z|5`-0S`LwogyA-k%;{H}><!UsK)uQhF~A%2Ppmi1=<~!w
zy>Thg<_6;_^ZVBu%&~kpA2Eo-Nbw6m2fFw4M}sv1-qcL~u_5_stG~9(70TP!PVV=5
z>GxfZE&Uqzt{wV+>4#J;P2?-=B*N|wEVl5hx$rF6`oa&C3^XT!c|U!eK-`yd7g1Y@
zxAYwS`(j>>ZPk%jv8{A_b@Ywsu#wvDFc}`i@*0<1WAOmd<Z28x=vnf7K-mApW&*{|
z)Pv#6gKZzE+{<SNqcZMYOdyOZWcH!p%o>jtEunZa-$ysTO>!eIK>xY+Xr#nf(!`ru
ziN<^f`=~mhgrfqmJp+suR7f?|b{$+N@e-s5`zU$#mQ69{Z-Yw8tjhkqi^Yqxg>zT5
z5O)B*Q*S!^5A@(n1K|fUQcF-+L+1%h+=zwkDcsxc+&9nleR7QED;72c4^PDlS{}xm
zMkS1=FdB2c`+XqHCTQmh)n$O<Ho;CU|MCU*i84l*z^I*4Br_P(Xmf|o?MG8RJO0|U
zN5&5F0Eeb<EeP=$S=xM0p>t$g`#s;w;gc({lj}}{!dNf2NzKRZ58s~*i{A%I*Z#JX
z4L1=9DO_(Pzl=yQ!%F<j>^Dn-{9++vM}RT>05IFxHmW!F8d>ZY(4`x>M@|>N*iS5G
zrS|P1<hCr;d4vsvyS#oBSKWIV4l%b7D=sUemhed${H3g6iH^@E46cO&6(2xvt7~|8
zcmWAoJaM!<xm2@esqMG&ul*~%swgGA(eSl{nF!mzr6t?k5};(y$DKA(b7x1!+JJPt
zx3@HxrP35}bnu`ssnhW(O`_C0oDqqZ>b;gjRr_imaOL8>Qo3ky-vMfAR>vcGna>$y
zL`w?ePrbRv8EVX~CM8hzWum!$#oM`(v1o|N75aLn!I@eIv1;zB!rWc;xND<w4d?Ol
z9p`<jU%jn{ekTS?#OX{HDsE%z*cH2}cn5Let&)LDs;|cKYxueriPB#k?v6V(+v!dB
zSrw1SM}8YByTWUB@@Ysv;aeQ6iF{5A@<pnI?acs&t6rcxhWN5Dtfi6^K%NgH%`61%
zZ2~=pDj>{J!`&44dYBdx!oOAmns0ShCixvG&sth9%+BX3ElM!+#!-%a0|K%DfR(r~
zX+s3kIu{rgcE{cbIBaE;kW)9lw>W|Tn06J`%N{bjb2$3FMFhEUEzDZ`wfX=2T0sBA
zG!Mcx8Ojm_QTfAL{O9zvt;*GvEv><r;$Jlj$a%h_T{FahpOLYwhUkxzf?WbHjrT?+
zDYasmQ~wEvWo_q{;jpfsM}~srNdPFF;(!14E#^+b=wNfro-s>uf0Ni&Jz<su2q%5E
z3Irb2xhtU0#EdsopKwNouVvQW)I~oxT65=0A1F$=d=kEH-S>C$+l^-n0npbT^UBY4
zk-rPfG9BoOfR}<_*!wq<+bQoj8y%3q4Cv<d3vW#<#{HUoZXwBi2`Kc^fFlw{6(pZb
zTaq+LLCo`J+n>_OZgR0@^Kxqs4Q`|-!hW22fLK4sk_c49va2FvO_L_-jE;dNpjeym
zCn9dhn~6B7w)7m)Zxxw#R-q-*)Be(ogsa(ua*oafo>xp?tG4>R+G-S#qHa;mTX8_%
z^E@nK^TO>!Lh{ah${T?XV}i}kj34o6@Ygk=M73Uv{d+i(Tv^&>LO~DkXXhnGM@B!W
zO(>$QqX`HW)yatj&X89#D(L8kswlO*WR!P>Tk-b$I>%`-&6btUpQUjOkXdt_jfARq
zmI`Lb5-9LdEHw6&9HaS`?63xA;|iy3)SGbB_8-*{+oZ!g70LK8JcZN3mqcSmLdb)1
zq+f}*#NkWlfTI>z7XUiI6aWicKHNV%1aJ_?qm3Om><8p-*lIt?X#T98ipCkMQx8T(
zCuFbU!N-VP41I^*uXzRU6{YxfMe$@;sM+LA-WTUZ+)<8?pG(d>76S;HD}K_UigAH}
z?qn(>cgtJsuj9GnLmv*h?`=;FfO|<GyvMZKsw4bB*#A3~l9Eb$#L40LZuj0s;KkBT
z_uPzMuCdI5+{s2!!xQ?OJZdJ+(T6oen-%uwG1gfEq}LCvtK+&-z;J3+-)TDZcR$#t
zY{qplL1(PDp|PV<>uk`k9bJSiQs+M5_eJUE*8={*HwXH<S#{Ytt&idEIRAxWa&&Tl
zSmu|9?2Io#Z`2tVj=NM6PWk?J1ub1Uj$~P4i07q?`SJin&n^;nYj$@NGc-g%{~jTX
z7GXl4Qr6WHd;|PVUYLApJ^ub5X5r|B>2zLKcd7wXYB4wW?!;g2y-N6L4WTZ=CiKX!
zJGjh)aGUB295bM;Bm~>nvHX_lozT`hAC(hT7M%_Gwd4s(YL9ca%%%nl$y2!+%H@2d
z5{?91MUO8%t%kFdyPNRyZQu{2&-)B6dP`H)M`0y9clG{^7oPw5dcK_n31g2tJFl71
zQy-<&*6&s)L4M#S{6z7?=QR<OHN6C&S2_cshK^-dlYO6pIb7SZxb_UFt0^oQl$*dB
zvy8~Gpfbi*z1^^ou{26CV3RJ+P-oaOk5kM*w4d!NNRP-RcmF-ynaS{G3T?Jr3n~W`
z_4j$kI%e5HKp$2~SjE}i`0|D~q^o=B_u9^ugISYI(L@U={E;@yZQE@huH>?7I+xUf
zHx8(mO*S>s5k;=}DqD-0!WTP%r87ZJGG_+n$0_6t?N!=W?gT~7=Z`!Bg0(l&9n1hX
zZ^PN!oK}an_3J*!#B0yI@(ddC>vHK9SZCpN%TniGxX)KhQ&l2|0OKg|J8{NUy2MV@
z^DW8h@xp(|Yp4P&mzEwsGbN6RfM=B2vcrtpYLJU4Z*0Z`-na}fEOtHm7RUzjSd_$j
zRW9UZ9|1>Xj{;^I7gzW;=cQYMU?2hplORw44+^Mk$eajP*T4KQUIn8iLl<ewHS%I0
zMtt$k8^q8M>%N2?172G2SCj>|*yM<6YCg<?)lTXqE9B4olpnw;th48E@fe0U%I;jd
z=)ah=zNo%9S9vz04W192^^alwO<&W81$Ox<F~gGJfo@-03;$!DmweUkAbIlY(-xlV
zQAd_{S#LHRcMPwG(R@A=oN{LXPiKN+u+nPV`2S*-jE{i+E*X}(gi8UuR7TFi9Cqy9
zke;lR3;AmN5vyU=yz87(Ph@0VtS%Qqx80E%oF<54+iT8p*LMFfq6cpu^lwnX*x3Xu
zTq;$5LN3k-GUF};Buo-<8W_uuL#cqNX_S0A+40Y`<8=Q@iKn7cmN}?$V$A#<@%=$r
zk=6hg!+#wQlC%g!V|W^m=Ys}w3s-!GK%Wq1t0l9+IY6;oyiaSwjsgDY-x{MxvDU4%
znoQiJCS0x*+_e#Uetw1uDgdNcQ|mC?%tE<YixruBH#A%F>(rK?zsEOO{c0;E&^ThV
zg`g68v~#p@>aAS(zpRjf!IolTLO{QGyZuZOEBz;%Pj+QIJVn#SlLlci)5Q5;8|tUd
zE|8=&w(l)Ps^3^`pN{slvNL_toz0$TNzqM&FE~Jd<2Df?x1K)ZVTN%aoJX<`15hWX
z)hxejTQ&8w#-C|?VU5ppD45D&$a!K+O#%;A@8DQtRkhIDi@uW5J~PsreMU3N%)%xg
z|0u83_dcF9v7epLY1bFD|1GQZBYPt0+3k;4*r{qNb{cINa-A<Nf`4;|QHE7QPy7d#
z6~(`PEd?ZmUlrBur>o-OAB{I_Q=s;JEg9p-D@xXH+byfP_9gUC-M0CH#s>ohM9%EV
zF>R=06F|Jv17vD|-RE%)h0g&}e@Em6>*^^#(IWX2dj^*84qozltzgRy(75dE9p=)l
z)o#=^bkhQNi<8Y=&0G>-Y?<%v<gz71mCjX7+6hh_M{Ay(!J{U8xlqHazs|l$=^b@<
zA9!yRY^w4bgFYm#@S@l=9d@uRh`Y5Y%G@1hG>}G-t?mVhWhF{Z?n}@zD}Vj%`Y3BY
z*z_6b3i~9}2>g9iaVu~zEp(Lif_UH!K;W{(qNdQk6|eP))xrmhsw@Vx++l{Vhd9U-
zS8WCJjYON>N{W)Z7Q}I{@ZI(C)&TOpL<@(Z%zgHieDW@NVkH>gVB%KM`fvqcnjToC
z_K(+AVm6tgdPI<E6I(xPMIt|7iXcc1If4ub)biprLsXx$?*R%8K_pSRn_1&bp7Fx7
zI<5sh{Z3hZ<EkSUB(rR0axyo65=msNx-Zl`gft@g?RA8kiemqTiTBi`OM4D?^T8Rn
z-SnWVKw;-R3I=0Gd|2KnuV@QC`*-mt;&0x?)J1<pD!lQbKKu{8%m+OKC;FNlJky`v
zzf7Rm#SE*6#IP%x3@(mEn(H-8{QNeCJJ=g<$&=rZ|5<%e{P`0^CF3Jpc5RHn)Fb<9
zsqj4b!MMaHFAlU>>x4<z@B7I7k|u0XwqnMIqjo}K8RTv0U|S#|eE<aW90e39t@_MK
zB%S+lty}Y@9;U+wK6H(2#o77ut0>a}XDt}!r+CNl>OruVN`&sx*NbBnphFgh4?lBk
z<TTbW3iXdgLrd9X{P(s9$#<YwBCy!_7@v*FZHjWXssb9b`eC%pws%T5&d=S=qH`Li
zhc<JocxfMMd>@$w+tc<qSP@=v+{^zRj4#lC$*uP(!*GZ`jxg>{3zwv-^Sm;1*l~Jx
z*6l8XBpsigpMaNxn?p~ARYEQ*thLFcaUV0&j_ystI2c@p#^WsBgQt_Jll8lKz(*Hn
zq=w5(Wu>l7kzgkSwcxjO&c%etD7Cfh9CV~QI#%x!f;&z7mMRGH*)yf#h4BZ$n_Uuj
z4)5qP$<pfw<@*y9SFiZVmg3_<w<F(=_6OZoXN;o0eksM#ywmMBGWa_fPV?QrgOASB
zlQ~=4)dpTA^~l(ASi1*CFZp)<bW-Am43UQiHgnh{S@XEO*5#Vg{Xb&5jg(%dPF-_>
zlFq-?+}xDbuif5mID8OgZDa=6@%>gt{MdQ?`TX?sNAOF#sWtx%$29Nxosv1Js4oZJ
zcq?(7^u~2VF7NX}gK)?lfN{TYx@p>-MhZFyT%tgQ`u{))sQioVTM>tU|M)5+?SM2!
z%5s{aB2eX7vb=X3j3>H8bsflmo$6hf2l_w%g*Ko5rn$#_6o8WPu#W#>KC}^9j@vES
zTRQ(>!6ar~eFbb)Jp~*fc>}!H0#HDA9ZijcnI5h@P=p`yzdGdNO2^I2$N&$kV=^LU
zd=<bRVnu2H{o5)`T7K%9mt{7Y{o~H(yUJt5H{J;Mu%p>OoS1~l=rI#vb><aO8q8xS
zxQ4l<ngz0<wsLeLv^T%?53$0Y`5~RGUDBR6zK@+E5tzaODap7XMI5V~H+THGL|8h>
zA0<&xsoWq{z_b$pb#OG1mdVC!9t2l8Y~b_O(<E<Hxq9gx?gBH*Kff$}R`hzPy!+=W
z<<<u_jw$ik!CRQV_2WOUgYq&SphT@eJg>`&kimCrDVOod_Z^T#Oc7vMUuAM!mcTS4
zxI@TeK&jDnd-l@7{p-N%C0C$|*&5V|Z(!Q}JW8|g!9Zd$N8&0NOlyJ4uvEpx>?
zBcB_9-qe4<iSh<<st~a^cR^kV4R-EW^k1NPD&1v`@^`=YwV|JS*7Fib*#;(qlq`3@
zvgZb-q3>Il4{4(757F=~x|A*>nHNS9EN{Fet^6VJpj8X|4oD87VJvSjnE1#1M}pXg
z%)Go)Yq8d?qm-7VS3JdKEkFOZZ9ImkF`fJNI)U(0#1<3|;vR!Av<`f=-<Oqz25HD#
zUw#<Q@5r*J)#=4CA9gtE0|9Uo0pp(-hHL$u^?#Y^m<?l!T%mHs*$@yKD3x@s6#=^G
zNM8)c#@8pG>FmTlWbRJ8wDzGyIM-W#I{^3aah<1KL{9LI^5`ZWS#8SC75C-w#rbod
zrSJ~!epty$#~qNR(}ekzbfwA4qq}N<fxaPNt9u-J{`TVcJzS-_mO-I%+r?T|e`%_n
zAT}P&*L6A<;@l@w!rwXKKs6J;z0T&v`qTr*!mEQ0ol&Dg$5T;FiV3=am^uY>eYB|L
zz%f@r@nv*8)Qnz}X$%VK2wMH+b6fdYnXdtG)@v0~|4AKohZUhqxS9_#4XYgddTV56
zaFfNXp#3)CNz8j0XAXIC65*Rsp0vrL>cAJKNiI+vY|Ra0JLjk87{>z#r?tK0gMbA2
zrITQJQax2lM>XhROMWVJcABltHgdSdvg9PliW#*J*tQqWcNWxoZ8^Z{Py%Itr*R=X
z`1BLzWG;O68m|Tju+<6|pAtRJh}M6vL@cFV{7t<$xHzp2XbPbmm@Z2wh6h)<0;K}D
z7c9pB5yAVe^2zoNfDGFMMrH>y?ehQSBSKuq6w<FSRiLXJl(`MKqR~d_?9n;eSs6tr
z%!(qD+>~I}mb64i5XG^PTVwic4gkzeIY1Y}PtG<%S>Ls`#*~I%0_Xyd8vmevCBZ=q
zx&&o&6UPGo@q*u_))y;)THa<=9dY=w<L~*&5k*lF<(yEx&;}^crtea@RYs;<$(tYk
z5kt(TcK`7CdfEKhvL`F8=A*VR2C6&Tb%vo;4xEeYqBL)5NVZJ8O%L)fv(y}!XglNG
zGg{^?JRtiEX8{+5fJN+FrZDVayuZJ{TTj(mI~llko${#@dYmxG7L)fR9q4P8kc#4)
zYZ5C-TBx(|TtS+rKq1}S8PfH@ySaflE3x}cgSl-PS+4Q#71||13tF1*os}{G6JVpE
zT?lX9UNDHvu+z<9aK-vqxL`T&CV(<%`8}Txw}@59ARx$`riX0F(D}K!)qHMl?p4i6
zF9$Oy9jh8+q>wN%Do03y4>x}~oyK*1T3C{lf2uDMjge0ZYHSrR&IFu=%yJi4pw1hm
zqSC${c)C`Jv58J#*XmTbcHqxQH3(v8$=12NQ&M?PUE&7*9XE~X4}UL|i7Ncv?vvMW
zTA>p6;6v4n9ot}Na)J2Iisv(b_h4{lDymOMi>dh>EDu6f>xjjfn=g`a6F~*<L5pKg
zfxcGWmt@U`6GLq+BketY#b<tExj7#(ZgL{-GEK)@uf~H~t=?uwH?G`02Ko<Po}7u1
zUI`+D8E6YiogvhtOquhp<-;x1KO~X@Q0lrLiDTxe?!z{gS;dpok-V|2YDv$`>oVp6
zf?_gopRPpe?szwUx=B+v2P|pUes$v5SRSp-2b1SIsZqA<E?Adbgtna9Z;@ZlO!fAT
z@^=d@wm>y`ntakVUR;J8z-Arb5nP-deJxbpjoS<S_jjl847EJoKhII*GZ^}CON=pu
z%3|?K$PFvZ)JSZRASmxKf{uwgTfDH5!A1%O#?2SUm%rioGwpBBfdb_q&*F<j?3^6I
z>1e*5PrPJnd~3)_5LW>GbJs#(NHq%vE}Jb}p}aSirSx3yx8{f4BcMU%L_e6$?qxXb
zH(@>2H<0^0msBz6Nz{L9G+unhv=7}}TEtWrabJU-nOQu0!Lni;VPjPT&aHgU)U>py
zf8<qa2xE$J23>@ylkJmYv94xf7vqiOi}j7;iYO`@v#jDuRCmFfR9oui!2m9xbLDXB
zU^-+$ueA}_s~zsn?fG)`KhpzJQ(0T)cU6=o$7H{mn;Z$!bOPtW(Z+cH|BODl{^WUl
z$k{(>P$*?YhBWRBfb@Kg$^sW(>7NTqUSj_8{&^M!u;zsVo63`2Noj?GQEPy}_p%W0
ziY4H|KG0PZclh7Gdnn3keV+eOK!6;Hn461<FVpt0(bUD7JYy-cWnT+&#vaU518p>G
zU&Bc+RfgjY85hDi#j<={!#X-b4glVz=Q@6rqA2cj3xM9|$VlsvIeXAttqJ(XX-mp=
zFnHp%ViFpc;a~>(u1h-Jyj#AQAL64GO}DmN5(tndT0|&*CUt)AL#olh0%1et?CbgH
zLQv?buYAD-K&Ab;6A)!)G$Ij~hG-Y?vlT5qG}msn5(9euEC7eJVtcT({#%k1An36{
zaoQfcfqp?#Qu^JpOmZhs#g&hh#`QjnoqL|Lf2pQgArLwq{v)qe7uAtzO{o;ItKJG5
ztD#h$DwT)xeO~+T^T`qBTS89sa;vuNRC}SoRv-)9wIr5l-bgFv5#NbmSYZ~PY`JMY
z^-Q}e!;5NIHpQLU8QXH&ZD;_tK~8NQAS}+dI;7A3Q)&LEpQy^l$}+*kqN|W&xH1`p
zwwQelm8E55u}SPVqDb7!f^lBYiK=r)%9U~cxb=+yk7U1$w&`X9AUt_I6#t=*<!aNp
zCp-LqES~w)2|nVXw_$O{H!BZF&h{F)-#5558Vg2wG3i&LSwJpskm*)WPj$|>0sg)>
zAi<?=&FKxv;$!{*h%krA#>RHQIhS6tHKK`PL&wjk3yB+0*H2pcOzqf`K+8NXQReh3
z&4;>&eSdCAIrfLeaF9^(zgZ|)l8)^vto)0gRXTM)dndS==&$^K_Ga2xrsn5NW1RTb
zw7Jh1Av{8LmSLoQ=IuO{*8o-e#P<2FZR1Ri%jMNz{Na3N(B9g~NS@5tpFHc+{j;M5
z{Kff+$t~nm|248fYGJUeepgNMXmsqS8tm=({K}P+5j~NWFGz9wCmq%~8+t7+dJP0Q
zIO_x04V>WENrS`|?m#))F+;XT)k*l*12rdlroQm2Pwa<K&O_q`vWhBqT66uQ%s);(
zANrtFDy7%zl}jur&6wl9jjo^SpJ(}|_9w2=52^75<iNZAV$hBneZSTx>gv@<_(&zq
zhr^*FE?HZ8IGrml8w7hGFEbpP&kT>%<qU^q3(!4d-<N%70?&xCQ^m4Xx;94U8wwR8
z#YhXY*BJi({d>C9{g^+P^84RTo(v`z(Gn84jt8{hk*6O&vD<5deqTHP0<<F7=G>wk
zmJ8Ux0ry19j~PDN&}++cbHIz~>syBspn@rXT!LwMzvRdtZ&}>9**cRg6#3uxH0NA@
zxtAN>&G|ZRu~NH-1G6s2r-@&~59=1Rq2gEABYpAp4%z+zM**1YKIX1+*3}N8Cuet5
z%l)oK0clA)^3LDDeZP8tDv9p}2<<5Fr%bzP6mKc3#v?roQBoatYyULVJack9z``wK
zFuoqt*#mVjjX_ptZSrp5-uYqucEXzS?xLNZVX;}$I?h`4v9Hl;47#+8PD4E~hd)8N
zUA7SbZIM$jH;XR&92N05RgEbEn6wMOP91yW0~dGn)Z(~k+-|QIR5FZKBtu6iY`Q)P
zCBm4M*o}<rbzeDa@mmlcyYcyuE4(S=zIZDQ{*tCVf2_<-t~z}*>LldTC1bkZ@qBrg
zB_mUfS-|OP&!^NFyc2Wodha5Q{Bi|ZG%AZT=8JI{M>FZw7mam4uz3886tm6Y>8Viz
zZL7%G<>r7rt6>(CZ105IEbyMNp6$<%BU`=HAk38*UQg|DMR9!NmV-M^fk=G6hDnGz
zCz8n*OC1V1!my7^C71lIYYhnD@RFllfaTs|_VGJY&xF9N*akI-u9Ne|7rrh}g+EGP
zpL}6IyRGVZku(44d5Mc=guwZeMOHP5p4H&@S#oh&GK*mz@3K9n!u+bE-<p~KRLoVD
z_`_zvU^%qe`@>C^!Nvvezo-9g)fUdxr6U^|5Zx$_avR$)<W~cgVKGixa-v(&ky*pN
zrnl_1dsZ&b1$cXY&As>LKnxZKRQ&C7aQ+T^J_HS93{3u5KaXxm&&+MbF?EV%)m@6_
zVT9<nmLHva+iv6evR@L)&@3#MSfIFQx4_4ywGW;R-3Hm|wmNg{FP=A>Z`A^f;Vb|n
z6LxVPmayTitgI5&Qcz&@&#N8Szg2BMF`p0T<zcxj#Np<uX9o(6&zwCU*6EnDWNqxb
z+A#Rb#CkF#(vtE`gY(&~8!_`5x!F#?#mq%PWq-uxbkk|*X%LB;-7m(rS}@U=5reN&
z%RbGc#0u?h;&b2p5>xdQb(bL&FBLl#*v0Gvi=?fMDgT!7{r|GdrFpRf#<oIU-YOq*
z{58}L=<5?BACW7v0T?*!=@t&h>`1IRQvRv6o_XixQ{D%>w{4^dF4Ses2Cz|_^rf?>
zAIxupD31)(C$BB2IO@YP5mN{%k@hRgvP6^pwD$Qhzl+~4W|`#n3uW57eN<oz4};WH
z(y)LhLpnL$ssk8Z6$m<VcfJ;9fN3B^bC;2228@aJ%+CJVS%F->KFI+fDe-)w9B1VC
zXv1F>*m1;Qu)`0&N`hiL$KAQr0o{Q$u<-EOwQG>tDxF(Y-Q@PLh2WFHDV5_LVB9C6
zPRIboD>S^_>edE^aadQ`I+-^(`$MVb!8|}Ug$Ae1028Z~IMgevod9H$K~+<U+QaJZ
zZi$Z{scp4;Jy9QR`!d0jPUt)XSu}3PZ-ZcP$2Xz^@2vGzxrOB$r%;y&iYbO>Q^n$N
z03HDB%XQ>0HdM~J&xbGe$zC5wrT6-Sn~!7BbHreU(%;^A#qfW=flUkKYi-!>sGQEO
z2jZ?~I#4VQg*st3tR6Al$7KdtfjjT8I!>E=V5dy*Q@$f%9RK<5Aj3q~`m6no-+~^E
zz+Y$g*{)+RGlI|qtKNRU5O#u&k`_AUIBHe>!D}UoFN`&6%(V?~$9;L7;`ptVCt8b)
zyh?bNY^4VJfLqJH+#ZzOu%RR=Nyn_(B$9|;$<HYd5W)4Z+H2c@)!5^_L=$D$fxP70
zS$9ML;``17ab%k<JO7^$Qm<G29$N9a!xgG=Y;}u6Q|s+xV#9}pQ5Km&Hl9e2p-^g)
zrexa_&HPWlw~}u|f;XQcg9@q(Qn&8_b$Oz3u(*7m-#q@;Fm70)dkM3C>h}e{$6w5w
zmB!%w!VpHA6_cKk=$%gW>FjTyNUcuCmUv03Q$PQo%sg>m8o*8x_eHRB{y8UQ%;k2e
zGgq8BVAcX<Bh)@=eP^o5y3z}#ZZ!89l6!gdaAKOUsc6lzx3px;@aSCwFWK$u%J`%$
zdocNKbpdJ{!c~npKf2U?Yo~HqvP+^V+KY?+;*XWs%ZMFq(S~ECN5;b~6RXo9Zd-oM
z`7WZA<4O$YjmuPNx@?zqqtaN}4oooNdf{KPRzAA1Zlw<E#jHd)lm<29ho4?03_Sg2
z>w&~Ilt9pZx{Mekd#4dRH=53bk)EF@U*g=JU%}xc1^AJM{oJ>7ON8-Lap)g3TJ}q<
zT`B+N_JN%IbH%HvcS>P>k-8A2T*7oO4bQ{WHs7JX1kRDWX<sI$BG*i7%~u28%t>(g
z#?Y%KOuN!OWp0<p?^QA=eop1>%LY3P$dNubP74QCiP(bJR245!D$N}oq}J}Eg)s7<
zDsW-riu<Y@(KuruTj9(MEd|omhFbs&-LD|E-#!UwQUXJ+SLp-=2iuhZU2Op#KtJ%^
zl+xwZ&_jc5z??q5Y>0-DM1IL>0(r{uT9K2p0)}g-)|y8bgejxQiPrw-ne5$wFRX8_
z{$m}X#A_SbezL2OrG+yAE^|}=Ys0+=hydnr1(?G7HKH&mvsUYx<8}KV28K5tfymS$
zSc0OyebUFH2jT$05PkeNRQdf^-dJ3HZD#(Mtp15oX|{$y+_zv;;}WnTK`tcF5xS))
zE<S0;t{Gs^KvcAGFk5pdm~<|Mi|rxmvJnzMQWq$%qW2GuLnNen5FIuk7rzS6-*)Uz
zUC_Io9D)r$=~vPFM$*k|-Q=ZxW;T?&RVMt3OMQ07UodP}$L+wo9jz_8x;Jik_l@sf
z_w-}&@Ld{epENNn&cUqDHVN8Nsoj$MD2aD%#62)i7#f9jz>D1p=0?GKzUhxdJ$7lx
zD`Ah`idFIis;Z8NhT5<a_M)%tr!Jc>8_*dDW9ik{#RS>C;i&r+dzqL@nF?w43E~G_
zzR!=*tB-gjU4gE98U+QL3TC)0Q@3oA|C=qv7%JZLml5Yn7bn$sCwodW!o@hV6Fwvc
zfGj<96Ewjf*DUN%n{ER5-*6c?lEzKI#bVKe8v&BIA??5bUzv`$SS(|3H#C0wjG<Fk
zIM;3{=(oi{@<x)bEtmI@T<l7rUa1qcT7{cvaV9b35@cu2Lf7Le9=Z34IP={_tVtcd
z(Vp=$ZieeekNYoXxZ{NnI&grGr_4zUiVry&CAgHkKmRD2N$stV!8a1^nRs=_`R&kP
zdxBi!^ypM#gJz`aw<p=i;+S!%1|s0eP3gC1*qdu@WY*S#0?g3`#gn3$mOt~u=3to9
zVkbzmd(n9q)|0o^ce?yCh4ZFarkOzY#&Kv4DN$TrsXH#4Oy(}Jl`Sl=KvwsYmd8tv
zy*IIQx%bq4&9Yj4PxY2FReln8Bc^I%WnL<F?wHqnA8Z);>2Iyt?<&l~A?p^y1PAc*
zqwaDq1SOrJaCnV71};R^oR`kvw^YYrChM*HiWW4+5mYHl9mHCJuJpNNz}lB4B=k{h
z!oPpu1iI3rTl~&RK8i8Nm*t)N7+t;Ta^>{gPz-fJ;geVR_^5Y?tY|0E2YG(!^T-W5
z(cUyfWG7d4c4R}KdGr8}%XPKUpyz89kNot~uT<@C{Mo!`4QOF|f7>#nL9n&flWupe
zV2`i;_gI{91!9!Og|MksaFG^G9q%>$Kz4gwsNS->S@m#j)z(oE@KRtPKwoZ*^^ym|
ztV|UO(A=neN}5hNgAkq%=r#{K{Mnjche}P<_SoEpyae@u4HKaBn)^RH6|KK!CH7aB
zCEV5>|6y_Q=P$cGAhJdrY#N(QvD5hMm+wn3Ez8zZ=)ly5JVUKW?R1*PweyD2D8g^l
zHEzH|1b*7Wt~w-F!Y=gVh2vsj77!o8;pSbOu_*9bkeGvX-PWqPg&vfY-{x`O_1Wv=
zVBCzVVvp>~Ha9oCO+(v7zuf(2$Tqc#Tk-wVVwKZjTn6P>YDe(Wr7VQDXglHojpIJu
z69%U>L*P&9#UOTVxSQV&zHN`spqQJq3Pd!Ob>*P&GLJJAtcT=^Tb;8I`=v^re;VmS
zSvY@)x?_!aGzE<v`1wY=7eR6oS76aU4}3Aj|2osaO53dmj*h_I$c!rV5QLS!cH^a^
zk>|lx*o`!_uQM#xmWu_H-ayF$Go9iD1RpP0FxltF*@6Rna_MQ+xADhqF4H^vX*oZP
ze1>3aou?gVTh<q65&Hnb5YhpD8<oQoiLM=f#qjg#Rcas8V(hKo)TK%MHA~W}1W>Xi
z+lFh1hI$WY;><QZW<fJ)m!1Mx<t*AiXy3LbnGV+$=-+$n8g09hhT`pW(j-KwZx;bl
z4R0TpeL)b^!EZJ{m)D%Eg!ccchHoCe!?<|{nREGkBu9Yu@_!P#%enU4R;tdH-C9Yp
z5?hd3`Ypr&TETcpkPq%nHAuW=I9<=C%S-DhQJ1j~RNyqW9htc9QFE(3HG*@+c{MJ3
zaRHbuJ-0dE&FNwV0YdWZqR)!x-tdGGZJLd`j67dTOoYhfE3SU$T$}NUH}&Ux%a89}
ztl4V8g0Q!}nWGF0fV-%Q7J1=fGmq47FxD%#t4uikY;$*@C#elhg@679Y<)KsyA^F=
zeWg*1^C2YBR7t&mG^AqJk=fwY2IJizD`DglBiDp<Dm@0C*ET#uA0X65tH<)ytUhSD
z;(TvM{wy`#x@VVIAe*=%ZJSuFFVxOb?_xWlEBSJo>Z(S$`%#sZ{CM=h?8>W1Vlf+y
zh~G2gwKT&7xsSZ66@cq7+o`}}G#>40?OKjb;=%P3?MLI;(F5qTH1@01&s7;C;iHFt
z=-H?S`$#(M{SJWX{C{6)n+}z2|79>AwK4#I9j+D8*}AT#C0CkFsIPQ&6YXm%ylcrC
z91ySwX4&?}w?$l>A8$14E~iOSW6Yojy~@vAX`CDZ(Vn6jORpIVYq6X0KZ#G%fg)Fo
zwYJO)3Jc*qRG*`skC?jxig*DjsNl{`0)h88@IJkUib)7PIuDU-$~Wkion4s25wrY~
zxhN8S8GKpLYrOP^K)@wwxy+Fg>W)nr&p=e+xKc^xc^tT1$op{l0+!COh2aO{S4M|*
z;p%tce;yP9g&p&kql7xI8eamz;{1>*p{Je!M~k-vA}o=<X~@sU+S*Ww`{@Z2CI_0z
zr{jeeCkyBJg@A=m$J1=D{)f~$@#6SO@^M8P!$Y5?wih3bq+ztSdT(Be;5t0Xf1`k#
zLMX0Z0wHv9@3sy$3k$MRn`MOLUK&<ls~iq48-D;C4lD`*&y8C#^+XqAHqd^jlLUJl
z;8r?MY7*H3<_v8h;#7On7>o60=$0sR#!hxMbkje*^|O(f8Pu%zTon%HeLZ1(RRqBi
z7YdNipblm-mfkjM9H^!9^La|h`<UVqSYLi^ZnPOv=Wbj^ZNdRjjFkBG^I<EJDP8t2
zjADI;D?B}mI84_0JmD55Q;{>Q*Y>RCjzWVmZ=hVzhrCyV!Zt0p-+i~Z2^0whUzy`D
zn>AOUA+;^<aF>~BJF~}SKzxiQ5!$22ySwf=B!mfwqH*2pso`v#EB+vFCTVsN_P`_h
zc>vYF&i*_ltc7ZWcwCFGb1kn30WDXOfjDf?Fq-8U)^gSgVrTt0SyE(cmW42JJT&d=
zU||OG#rL>_clLDaz;+DK;txfT=@5&|V%%Ne*gu#LHDrujmR;lJJ_K0aL_Yz$YeKwT
z^Cz_gxCk<|PLo0Io;2kX6Xeaj@#JR&=$nh*PlJj{&ZZ1+8rmZEw;i<T0fH8L_Fa|G
zKtQPlw92g(MV0Wes>nqHIa#CVwyS)Sd;&HSl6>OUWgi$gIi>|V;GB_LDlQ$1nT-kJ
zck<4HmHc%~n{4fqG}eCoNGEy^MNvC3MWL0>zc5YIizbS_^S&E5S8{Q+{5GhF3YymX
zSA({mii(p{GcqNTg_jvBLGv+61CsPe<UyH`Hkmb8gcihe2lP$8Og@TZB>pE`4V{T0
z6enqhX8Hzh)iTzsqkc}ssYY}6J1zz7Ih=&E*f~n={(HLd_jDGqiT{VU%@u@b!7}`i
zYd`-paB<7p0)P$-P?93kvQ#omQ1yQ=w7`Shw9`*FHpZpGAL5gFLqkJda7a=PI=fqT
zCCCaSw8~}A_Y3wIa`-!?dk%<@!s;Rb?WeX<Ps2|8<c#XF_*9zDgBbJskl1Xal|VoM
zEp^T(=?N|_bp)|z2DMvN4c8J8;S`w&%FYZ>8$;2n5CB&U9JD<BFz5aXe?#(;6_Jj`
zaH!g<x7%GDSw*p<hpr;>8t>tZvJO9G^gGk0IY6_9UxR^Em49m(`ZOtyZiK+fl^DZ7
zX#+~wyEgJE68Z|<R+9*@Dx6DVua<*qatsroi2h3kY8%ff5gXTn$#~o}2k$d&e1LU4
zFT`gZX{QS_cfA{ByVdfVkDECJ;{D_OLZ~`=S$5#;?8iS%F9DY^<h@QfOv5o#45RmO
zS5|*q=+*|lLYwTZ$bC78U32XN16-@8Bv`&cZ0ZWI&O~`J^jO7}I%BZPKqp-&zP0r$
zw2>H^x1N#7t~O)_f8|&0J^;O3SBC<_NGmK^=j}|qJ#IRfn&jK2ka=L~98)-23It@<
z@vY)Bq2vh{Llz01E@$?1=bWsqgCVM;6D7$;a~V-58|FX%|A6j~3=1c?v)Gj+3@}*e
z-<aO{mE^P;V&KsgzVZ)dU*g8Q7^6TfX-vg01@~v4BGLs(Zs5`wq&1nN47K6kXY?cs
zfP<jMiVwDXc;dx1$r^e2W2W(>8Uy4`Sf8t=7W(^6DF$pm{jj=cS>k~gIm|w{RUDy_
zH<tW(wL_Da2iW~@0kZ)6w-}4+TN~6ayiNb!4#0lH(Xj>0ptQ50atv<|(Z%(Bym64#
znjgRmaKqlPONwGZ_PW#4ZypOWu&b>jPeT|j%985mpI~(;MD0daq)%qM!H;-o?0WsU
zgBd~}(~|kiL)YeR>EPVH@UPw&uszy<TjlhZ!NuS1fCZxYP)j!SdRmZ4UIg1_-{wL>
z@)1edp5^$U>-W@7R<0}S*}=Kj)z+}jD$L^gwH7yCMmF;ARUtUg{?f4VyFZ$}wC3Y3
zsqssH7lsh!9n?V^7@pBtIvSr7Z_`ItfqOzLV~#C+zx>vhwM#}1t|%+$KZ%v1wZBbO
zBh>PiHsf0-EiO;0x`;`W%Hr)j+z`$u@yre6_*BC+D^7^_!*~C;^I{1y$AUsD*CHCO
zJTn??6k0+gR9J#PU~yH(dd3dGS|Pv3m%SgXb_;J+oi)5V*9r*Dz)`;od;?`Z_dFCE
zZpqhwDareZU5$*yhzxuG;I_hthXvf53UbF%#lnDHjsoLH;)9fKzzgNAkHt~;ng~-4
zIpR~sMgReHi)_!tLLi!B{y3POdfZeZ2?HOP#TF&>D(|6K>GEDMprlL^(mWgUy!FF<
zs<ER0ym4#J)&T9O&71%ArS%$M^Z4f}+mjzMH5~L1*ZDbpE&p?{;?wftxaS04_Dd+0
zces50do1{frihb&bI0C5n52p)6?V1zgD^ANFea#=bZ)*o@%u~Jme;1u0;Ama)GM8(
z{RTjX!`Xb{MPilYEDsB9WLXgw7i8wGKVCl!#TExzlU{!$c5%6g784biV<lF;e5u8<
zsS*GYp8-aouz`YIu`-f#`Kpf8XGB4H^L9y}3O6%T$l-@1uW7#>-X2?a|C2j&!k@R|
zsZpvT->y>gH+6K7@r5amK?CaZzK3vFaVIZHF{Qgrq(DMqWd)^tBjBf_cDKYzKH7=B
zR53+P&<?<C1vi&z-vzRbQu@G6JHvc9*!)bbZb_%hKS!|iZd9(hF0>d3Qteo34+gE{
zC(W;>fv3hS-|ktTewn#Av<~$YDCl<F@n4f&hZ~KsBFmdDpPmWZ|1L6>vau61k)W;D
zo6ukHp4o}@UixUA|7av~;Ic!b+`RuSs=ACf)lYRr`;(l+w6E{DqlAjJ$n|%A+YZ=|
zK8YthwB^6y<JhqPQe%QiSd)GR(LrkC9h_pe9vG4#h_qav<v89__Ajr@ZjoOik|Jx&
zzSuEA1vZdwZ>wLacg$y72PO(5NDe9Z<8(!)|5TpnXHs61!aGu7m9EZX>L0-LS@kXY
zmq6xZyfow?4Ko!y^xcXpf56u!lM@Z~RVyRO8^+dbNz3f^LdC(^uhC`wAF8$r&jz_8
zj)txC3%K#Teo=z%<E%5G-kg8O+Ct}k2sBPFf0u8J+TJNWm(FACz3?o&eZ4m4qg%({
z+Iy{MA1$N}arATLV(IZ9h~zUgtT^8;DROo)&F;WGfAMCxqh1~rRdzXx6V25Z@ogS2
zGyx~{8D}T<$^(ij9se7!5Ks2)iV339Q2){d#_=1IYU9G?q6RAL=Zvmc0wH#+H`vu-
z!;j9;u0a(|1-km>HecCBYs8-mLH+<#EW7bYSX2LaT|!(Z{&3jOD$}^wu(SJ|=T|qF
zpAqke+7COK3#h(pb+&!}QvL3<;EQ5*`z&Cr?;i&2Hb_zTjIOZ|$->yyIzv{cAKZsf
zlKHR%Q@gIbN@1o0kcm9B9boujq3~Y+DB!3&O$8wIWk#0>Lm)xZz1cu>uD=EF&N*kl
zlS&F|&uDq@9ry=)lXGY?+>#LeIF?DBSy%2J9Dp|GSNi~P*a|HQkh(ko$oPKAuw*SQ
z{s)KiL_f2s$;{~XL+<|0B=ssRAQ83X7;B4;5V)2k^%Nk!;Rv@mp9Z_6+BrQ9<5skK
z?G!?m)|xB)gc5PFY7o(V85Aqa6r<8f;F61F4O<X|oXjaF|5s*J&3wzW<8N*b=Dh)l
z3k6owhW1^pZmUY{PBhdOqv9!=w=l)xg8y4Pp|(>KPbmKPsQ_(shY>S1YzAjW0ky%Q
zo-$0`Emc_K!uN-qN^C5pUiDMe#<)psbgQ$e%bv>ZOTOok1~Eo2M0+QN+>8H@qw|iZ
zdjI42u}9(@sZPi^Nyv;c65^2Tj+J8*#j$5b93>9PDA}Ww-9h%~7@1|2ajYE4cFc}F
zf1mrifA_eLhr{=KKA-pd^?E+(Anpp0{O&4NIq9|?P1koGx4t@-i4}HBUNO1;bi<xS
zBC~|KzdNwbDsno*JFWEMz11BOaCw$Anw3_iQSn26O&#i!PGnDyBE{-p!Fdcg6`NYi
zDC_ahT>j1#G*mRqkoW9go~Lrg^XJTvraSXTyY*;~f2(@CeoT8_XZ#AAa*OahER-0<
z_#af?#C&~NDr<sxUsoN9)!2x3s=jM^M`g^wP)A(Y?l1g#n%2eBCTDt;;K9<t3wjYY
zq*U5aE%V8uxPCPnw+uQJTGlWfbF$<7{MjjVC%ndhslq|Hsu~0hqQMuU-TK*Pq)d1-
zw^7x$+Y^x6;7e)!eq=rC7JB|Q?DEp$xU57|L&{}w3vF0J%A3M^0tbuu$g9+SA_qh-
z-+TRaypB(!g+#u$``pq@e%yLfdg;Y)6H8}j2^~a<LIi@7lhn~%HYWN<-AWSzIZ+8R
zo}3MoP3SM3kU$3B|NP$kdChVC-(UFCfA!aLYzjW-IATi(uccV!+Z7>nOK&QMiF8mo
zMBOD?WYv9B8sn=!RiKOtMP;{#f2noC6BxT7!v=TC!yq(iX$h%<L&naYkH>mR1@x$w
zVKsCO)IwNl8cfZ8{hA*Typ0e)v$va=U&rWrGY5T|5KrCAKAAd^kY7OkC>P}qA1R%x
zBmxb#S3Q`il#%+Aiwlhn{eo-h5RzrY1`zahb{z!!$4LHremByK{1-%SBo72CX95{|
z0GysSCaWO4HeC>*z!Nw7$nC{($iZ^Ag#RHhI~5Y&poAP&fq9nbyF_4X-P`cU{`WIC
zyYDgW#_wrCv6pRaSr3-}nfl*;>X5lm?0QI}_MYhQFC8z808w`$*lcdBO@?sh6HScg
zxLofdv*S8D^A{!<w4h@v(OnbumnFkF>`QV31xUb8ljcoY-?7?3H^?!`b_j@s#Im)<
zvtP+6D15@xnF!#|C)m`~?3)8Yu40m>znV^KY<99wR-0yEu_sGA5<A(4PcB03qDfx4
z8TKqV2r{yD8vOTPq%<EDJ?IkQ35DsqXDwjXEs;@@AoZ|KA~?VCz~G0{?&Ol@xE=EX
zEgg|M!c=!JGR3IHS{mgR@2JeK2tlrT`4qh>!_~we;#n{69Fi3eWeSqO!vkp^@t=`n
zZ7rKx9RViL3&YL7&67zq4B2L>0Y9aSG@USnca`Gh)n@9CGmU*~(o?`EEfBfzJm4<^
z8)rq4g#4KSqr}d*!^sad7t{=}<rh?-OZXikm-IrS4!$M{ak_pwC~=Z~x@c;{b^7y;
z72@yxo;3wJm79}V32y&btljs<>~q}i`)%HKkxH?_X4x>+<L@ut(%5pjK%254f8!i9
z()I4Slwo9`b@H~C{9&!0ZpM`r>Ch?YKJ|ozEWe|B*;i<(13|Z2#Yl~j-rz$*VUwHr
zs${o3+stBS_WC<C-Z{sP6!g5vuLb(N<^$k|$~+r=lG5X1b5bJ5LEEFsD0~T0o2RD!
zj#0JDb&qJQz=x1xzwuU+!J9AIUAN)4(ly)@XM%r?Wev!n$jv26Bhs81!_*AkIhd6s
z*C!32FGu`p`tD$<Eeuf!LjsM&48_naiEZk5IZ!^`t@d)M1=kM`as^IvvXT6IJ~`{V
zTPOR`Vq_dcO%-cS(R7?bdGd{U^zGBJ(x&F>srmN)Q}1+XI~J*S$)^ZooRlEz{g0O(
zvKu$My>G65Mu^gM(nTgnLT{*E8`Ok2cKvW<N#%!gQU@WgzJV3+UrLSQ{A^5HP;@3U
zo;rtqP-AhZ;OsfNJjiow%n;gaJ`43vztH(&VRzuIvdB%Phz<P{@~JmI*I()dd3zv`
zttT5yKU-TT4|kPCDcm&qaR0;O;M7tI9YpE$h!OyPVv68%DNQLlz%4zQoRGB+J>Hgk
z?P{i%EX^+3&2EU+-rt*^nF}1#Z2$p&k%ieYD%r5oI^K`F5jdUyp1Vj;nzJC+Pw{4?
ze&_#`sF(i{$J+Ymqx%hN?97r3iyDq!D(KnG-q@8|tH%(&9A7;C9bly{`oRa}Sn%f|
za!Sg(laCyW*i#q*VqnuNgq6yioLfnDuo^iZ7HBHq<yD!cX>D#OVCT2<x|H{CY4#oO
z$#s>ULAh~A6%b)w_^6>{PTVjtlzo?6)X!ug?ck`?Mb`$O;XR%RrI<HSf&$JlpfQA+
z3wFr1W^Q|qghW*AeVshdf66AGI#VKSw<>OT%w~d^?s*9l1pb1uJEqvroEZxAKM*W<
z!~Nrvy*V!UfwWdW%i*-XoFXUbi&5)y98DiWVZi#*S1i2t_6n?^T1zv*tfl!WzBpfk
ze!hDft^KTJ!+$9ki{rD=a^k*Mf42_Vwy2)_+VNJuG&|x;YMnwt@V!}%Q&eWjLY}1|
z$m)N;IEae&Ua|9#c6Y9-{_S^o2j-g)V8;EBcwd!Kejx+3j!!E8jJcD2`cMB)_UR9|
z?x!+z6FSXnr&P(Su6Lhv30<p=Vu-S$S9W&(Q*n&nwBwcI#|l#seBASxHT$YiFnce$
z5V5mNqItdqtIRkh>&d)ZpO73_q1+U24Za6-VIo!4oT@%DYU`E?=}YCJHc1gHmKU&|
zjIf@;%9iTO6>d8gp;punyuZJU=CjjzQ16pK6ah)hjJ{(d=XWJi2M~@R%mnI10C&D;
zhMjwCl7N?*=Z3#hUYi@tlp-htAeY}w`Ezhkw+9Mb8`1KB&H#uPlV|AN{~ga~8o2h3
zq~NQnz#PL!ECQ-3IqYCbWfM9ojVg2vQBqdE9H-w;*$2{u#y$Gr`UcN916ic9!c&^F
zZIIwFwMwZERA04@>!(kA^LOk`_lSo3Ji}HUe)B9^qgj=`s1poL>M4~DlG}?}P0yYO
z6^rcQxZIcL`0FQQ!`-EU;jWmWpTUm{aLBeW6*$sZl(FCqMEC|D>^%$)Z54Fz%6^q&
zdo$%M$A@%ZT6)J@M$)SUrl0g>Qm}qU-+pp=CF$1X*IMVy%;ScZT0>7m??tPDFnHl^
z1cbkBq!a`?r}PH}5cz|`9gBBcXXKu3zbysEA4ThUCg`2L*eLn7<LP+R4QxM&aQOFC
ze-zW&I?r-?{){)-(g>6Zmx4udSNs~BWD!u!>*OK?f7zwI1ltb&vR7Y03-2kx3NUPu
zyV)corWj3%VuPu`r>CKz4D@)WbDJj~O4)bj7^0#yKjF!NT<_k;;lK9q+49_goo;N-
zgpAb)4p#QJE;2>x7<K;@=^#}NIWtqoQ=p?Xd*#uUg1NWi`ro(NsAx7Ce3w^%UVwu9
z)rNH3nJE6ihjl@wN5oJ<Uw3&p+#E0FCc<(tYmYA(cQV;@Vyb++n*9zHob^ZUHiDV|
zQ%3dbnTI&4OL$HxPeyWmMt`y=w_ObBT~QI$il3oYUG2T}OQ;Lr1q^9sU?IG6_AdJr
zR;blSBmE8~o~-RY*^JVWqHnSxqNrM2Z>6ACbE@WKa9jFupu&|6;R}zqvy0`|STO#x
z<8$!lJWQ(Nlh@Au8Od0XU<!F&M<q|&*Xr>?37e+1{^!F0BjVU6qXi3C)2t0nTEf-i
zikgP?E{>0l<JjlSH$G0K5KbXjloq2c_dhfrElKxn(azjBn(k)?d)zs!)R!X7b-+Qh
z9HIMdE4u6~N^A5$#lh=l<fTQyE7IQft-2kyj~*4M)LxaQ!7}mDG5?jI5)z)|7}7w7
z)xC(S5Sk=F5z;T5Fuk|0_1}>-R{6u>9n&gF?|5@0{e4)V+Y6Vw4n%Ze)d9i;X>`>?
z@}>aP7{>n>{>JHM8T2oanO%ONCaE5MSJF;H+LhCePk@U5chBKn_2d9`K7<>-MjUUl
z<cd1;byAmD9+;iu*u{ZM(Cm>be|7&;!E$M4l+vwLe5#X;qaELqUTOy3h#iuSSO?q?
zQbF1O?R*9KLd+olbNfg8FimxPocv?=2kI|zb~;_ZwX#d+d=vqEkuq1lr*WBEi(ssw
zZ)GI;eB$%^Ek*&ANf+qE<FV?h#nK5$q=#s^e&%qD)p)6SO|@`C$WYz>zjHN}7*$%s
z4Y`tL4pnXx9F-Pf%k3TgYg4h&XkE$b5rXY_DbF%v^_Mz&RSOAz`TlRQ)It%_F+*xB
zZk8xM8G`Hdt-An+{Bf8%VcPNOIhZ=~rW76YrhxG~5|rxu{ob=@o4>ufSC+W_cxd|I
z>5WT&Jx29;L!bt|GqqEzVD3FiPu%v9t0YD}F(Xd81ybf+dFU0|n`Kt=$a6WDRUgsd
zgaf&=H8YdRi)GHV^FN)_N7{S9c*T&Ilf$bVq9cs>^HmgaCl?71yeR4OR0a-q3@9SX
zlvXiQ&pf_Rfi#Hxl!(Git^3~&2Q+EBrYT_aNFG413f2$h5~&#nW%gpv!oZhq1&HMV
zhlrvQjZKUc?G9~ue*VH>fUSOs6a;(hhL|PKjhllNLk>2WP93QwwX)M-s|ixH2Z|CD
zU-7DX;(g;DBjmHX5@!T3U0<@lvsmoE^vrB%RT^>ZRY$$2sRmi`367XR5p_Qn9klv4
z9W1#Yuk4;2S<CUnk>D@}!zljQ#HhlW<(f)34b&wPrm6#@dUm5{t!DMFTR41BF5(s|
zs5F}3Mw6v0heLFwR{aL$&guj=W?+P0H!xB)yZx5(!oQ9`_At=@0yVBe{qXbSDiFq@
zWq+}Mw7;*BgKcYr=^PFI3c}-~nXgd&jjyQ5(gb}UnOIAKd<!k%Dpq-h*eX+}*U+&0
zpYOZda8kEk(Es3&3gduTlxgrUyV`Qu(;C7y9waR^!xEx$4#VBjpCPEr94sy7GwnBo
zM4j+S3N?ZQHH2bKJtywY)9vDosiST-iMRaH>h6^=Dy$m@O5+`{{px$CsrjGQYg70-
z8Q$>GvsTVFYg1d&=0;~*+?MY){RoG%R1&uXl}1dcMCn;=`KVbl3^l$9enU#oUa+1>
zk549!|FAIm&vr)sS!FJ*LNVBe1D9CVpypY*qs1d%tNh(s2-IFca5CpMQ+&M^SSYwl
zbZ1aB`8D5GC8Fk!-AbLSb_D(Et)TyTRr0<&Y~c)?w>k7=JPy2km-t)_gkTbHS}r*x
zcjx+#t?tM_{xv^F3&GleNm*()n#!fIFVMR#AJ0v%*Aqq)3AZO-?nzO@+^|k6A9Ap|
zI8s6sL~s?gIfTW!P*YQ8vFU*uxfKZHRRP&^O<e}EbDiBR3v;IK>a12<l)x9kWI^36
zr;lu~@6BD8HyAaXRY)Op^BDv2gDGz-`lGs<r8UlT*U{6r#X!3hgwj%c&N-Fb6C^*^
zdEb#k5Ehd3jw_7pn&qEl7Eib~!F|^l59#1gWl*CV6weudde}D^XFs7c%z`x<y0*fs
zfm7M3X$-EJT8I4cQrhS9s+j^)MtaEFI&WYym=mM1PdGb4Ju6#3pZpc))VH~`zZ5L5
zO=in}cI&kuUJRY*^zTdv<~%ejJI->J(odrL!^W$G_$i|sR%M&s0rE{JUP{&_rig=;
zivC8;rDL}B(vdoI-*b=2gsS;2CFv-B=k+kTmrz_7pNFPA|A%O>7c|ruvh6Bi#^7O~
z{>s)+4GKzYWkK5;KLg$IK*#gnJhr=BQCa!i17U}n^tCnChDmTyJK1OjCX#Ec)6*WZ
zF*H)kjtgL<214>wpeFgX#Me?=OISk?F{PSV&RK9Sf9Th5XS}Rgkl7h2sf40ow24>{
zL}jb_;kz|XobO9$pqB80t$=DXWnYnHaQZ~N{i%s-)yY`)(J$Vkxh4hcQqx_zbbb{^
z#s1tRuhI2miFe*S`AiTwNvbq7u7$-edVWly#QG|8nDH3zQ%7p7uUN>d%aQyMqw0Kr
z<yb(8wtMuZ(6H0Rv3(WsFaM|_`dgt1G-?<7k79&bTMe8Y5*rQ=4<io~86;Vd9M7NA
zJ))`*8!=1?;MOo!OY|IvQQVqnUNrOPB|?pT8vC-YJguFu?#mx6-y$|Q+uqD`F2Z(6
zNqU=&5W$K1ezM4*n->38BE;a6`&)(ci1EsGM%>~hDgJ->hB?xKv3N+M?jTYJ#Pl19
z=%9<W?Mw3+I}Mj1J(M~3Q@@kWrbXw7laVWQ*-en~IMLfRx|b9fwWyA=W?S{Qm^IQd
zjIgiu<auDZ$h8qEDr#e-uQC!u&63j3*&azfEwrMaA6CfY@A&h$aqt6rUA)S}vgWO1
zjhM@X+<3^0I{_9=K0LA^rnB}H*R^!y`lr)!3c1<6B+(&N-M5FQmFB;ekOS(XyX4lu
zAEoMhw6ocT(y)XV4N}1uI<eaL5pAvr+q916up5Pp37uL|epHTv>|e&02M0k(!SW0{
zJ4!1`EuN%ZESN*hPmAn%!T-&zhO1LJS3ty3l7%Kk45ZoD|JGf5n0D)d<O*}TVLD^f
zT^mDpY(sf7n@#O6t>&0`4u_AUf4e8xBz|O_j<y72Y^tqKe6zR9<tuoF!6sHDW$F#Y
zO(e)p<qAa5?i*YvKgFA+r$+lNo7BT$<ar}39fT=H*MQDQr9so(8QX(QfEnF3lz}wE
z<woLBEIO0eA%pX0S<d%PGV{^LB+8{mJA*C5t<?!FRGX{dy!HCyvU_#VlhTm&n!Zl^
z)n`^FhIg{`s${}%=neuxcoZ&dCLd&0`}O<#`~Cg>-9u0N&&s!RM360<&E|&>8Eq6Y
z_3&h0tG@iVMWFFGkh@)!lkXQ8eBh(NWB(($8>~K3sM)uOS(t$1gYDL0Q>k22wq4NX
z!XPcW75cR?lPt~?F+J(nB*tDw{u|OT2Id~{U`?#=>e&=m;T`VF!=@F1lD4yaCov-9
z9N*cQjpboAtxM3*gTblMRllicyOa6)*(D<w{Py&8%dQH!K4foW+N+L;uIQWrV5>`E
zRDVFXw`StqxU*Ptw|cuB3*{>mKT|-i2aR57%8HhcNVwe1h`@E?t++&qxZ|jz+)9!f
z<Kg6DuLI*(*IPq6zLrE(lIireobI&G!LY>1ip24?P`IjxnfKTI+d+83tr4O6E?Zvv
z=r5m(5v(&DDZ2RWHLc=w<ac*mI2ON4ynSx&>zrtivqth7^-v;;+^a07*H7_ncwu^U
zyiDK{a*vR{MH4fcJ7Ut{BN!_c0mnVz?woX+LQh>_cNEWHUqVb)v#COF%AUSc4O{f^
z_u#g~aT_vAK~#!;o0P%F?qz7f&|Et)=;x^}3lr-GZYgxMD@ph^_u5WFVPbucDN?`=
z=3tU9NYARLe(_WW#%}mGyxFZw8yKcXP)QELxua*dm~C1m*LFd|!lDm9%+D6OE3tZN
zeLDCy`*xnjEW<AiBvSR%gS;B_!gbn~o|%!cujPgs)-3{%gI=s%qk&NuFvo{3h!%yy
zP76#t(^_O=a3B<{YGZ7kq<$W*ViZiBK>1^H(b{_7<s;bVCH3&QzxT_gm1Dh+l%xII
zn=UwD9B_EPNg*5}YvzQ}?fo6n3W$EjcUu%mbEQXf$}v(Qr{EPo;-KRcI=bLlV|xav
zDcruuUW#T`*;esy#=qs;GBJM^wlv7{t(j(cT&vKrz9e^unI1Ia)gjqQ@q;~4l(Is_
zJC`KpGVBB#i;20VWG$pfy(q??*l_n8l11~8(Be2hbK7Sd^C2M@_f3=Uzq#n<Q}ZPL
zo{^?W4q-1$8H|*j?bT&)m#9GQ{c_pNjc`3vl`iTlZSP`qpY85W2L}vSJ@T67C6wB=
zzw*${rh(RVM$?;z<$Y9|UzwXA7<J~rRc-A5g*jhTujt8Wbu^_@+qrqN!(=LR#lQex
zKm3H|qY9+uF8>Kl7h@`8psC>oG<qr>^=AV#9%tf-Rl}(H{ez<|Bj;Db%8I|&cY`2L
z1A`SZ`%b=p1t-derdQzu*Vpn7jJoUf8ZNK-VWD!^VQ$KU?Wc-?%@eJy&+yy(dlr~y
zN?9p;;Olekq)-00u)68*XD9oQfo<J#zI*ej#e7}Tfye1aE1Pmg?7hzN@Mftz#kR=S
zNtsxw_T!MIiS;8rwE6MYo|C1iJ~=CIdN=gczr*PX9y)Ntx_kUU&d(`qX(m#Cmu*-1
zaBLm=t3ULF;@))ZXO=|X2HBu{Z3~n+FkJ*D_`N+^$e>K@Op2jPZkFBxf4*nh&f(H<
zF@s~;0!H{cgnGHY#PKq00dT{C@+sY~&sGFr1<xQP8JZqN#-lMs<!9%NesV<l4OTWy
z#LX-=uaoGQynm4E`zRKe2CGuqy;r9j`7>^(V;8OWnf^Zh(l25d<M_u+T{su@Bc7t$
zk(-Bq$!RYm>F5<V^m)0xG_icLRE<xNA-T4P&EiGmqS*^ddCF3}_YZJK?yhfiiGqX)
zr#QJ=rh4^#{+*ltfy{qLy8Zr<1Vb(t4f0csxCnBzfjB-?x<duMf7XbDs{RRH44IyM
zmDn5^4<^Wd1a0y&S-q1?K<aDeu`U!ZHhE`~u9*y8H~fzs{iH#MeHRCX@q4`~mnWUO
zQS1Fa5iwAB?wTM%l2<svK7lZ~dpFi})0R8)?wrh9Te-HBp~_c#Z=4!*+_jH&1z%&G
zOYR0dh`j{40i!0xDxCKI)HUXSUfyZ6EQPfUS@d_ev<2H{<kWm-gdiVky?kv;u9oZ(
z4uYr48n1RNzmhtjTC_T2olK2&tAFCix{~LENWKW+<KM5o9bR^y!Q<o2`|A)aN5aoD
zb<26pEKP?MQ)psURbXgS4L*)3pX`uzQChk(ao|=9Er_oExp)D=`b+NTt$zlln;Up8
zDo1+SB@LwT0b`_veHsA?^L=-bi4XPw3Gq>S=l_|oOZ@E>(+iC{tJxb*!$$*oqu)av
z$@lv1u+Gute=*z?5j9#aM&5T=td$2!-1p)by1F+m#(aH_`tn}fEeovqlinT79*W?$
znx_52O?r8?M$$bi!ZlRi#)fbP@+L*y(4JPp=8r|JYf-@?#QPjVR3#CK*IwX})3V7E
zXD*6?Qy<>pv1fcj)$?;CB>KIv(T8*NY6b?>H^w8DYYrUJTX|z(i7vZ)dw{s=$!YW`
zwclY5<?KrL>+y|;n1CZ7uVmahiRA60R3$0s4Z77dWPm*dKdnZ1YY0e-rF>&sSGFig
zHZ3LnUR|e-9RE>YqN6xF4d|^!oT@^(J>AaEZh6!dinAG!Xl>zFtuE;G)nK=(7&S*`
z$~GcBj)dF>XjcGG0cIi+a-MA5OLPFtzr*z=<t)9EwjI(`<FmutO^4<HI2D^fi68b6
zJ`)zdOP!#6ctio5@q=Kc)9=|w--8KnB9&p%ouI(^{|2W>ZU=n#aMDedbp{y*vFWO-
z(#n;jY(CBCkfJ?UE-sx9cow460f&L!t)kK$_JK9W8|F#vbJ|k*i3llbDD@g!&^0G{
zl%)|s^N?lTcMdZrTBjqn9Ogmj&8~~GbfkX*)X1$(N2a9EQ!6w@a&%^MQFj1@F3po)
zwP(6;*)#W48N~_9VjI*{R3rE-k%7XNs2#(&sL|<z5SOCZnLj$}%1gm7tcdSuX5`o(
zxQm5*$Z-Q6OSYEC0DcO&!Ijv|0+sZ@vgpM4T$!INXv-QS2*Y8{#pLoGk_AH;7CXQ#
z{LQV>xeu1rJ%BbfSkisOb^btWmBj>Y@5slIV5Kc=tPbU0_&TxP>tcDqICY`8Xx0BV
zqTZ?$d2@k3cX8oeYD?MicEc7a&D8EZ4b~QZMA*5T8yBc(cDv)J`Sq%LZYHL9813mA
z@7DwO(>ZA@LIfiDX0L?GK;6R}p5m_Kd9DnWGJSUXrAi3mi}jm!sMM!N^ft_0O03&l
z12Q`N^tA9>;~(7nr541hh4Wb;HLFoo)v1QTlDHb)K290Kkr#Y+jI-jxI}dps-Cw%r
z>U}PZ#BPy{s~M>)bZry8{pfUY<Id+`bZ4mMLX=6zFQ?R5@mrL+gQfkt35ZQkJSES^
zi?-WDpdqGpG_aJ8ScScWFqbh|8=b_=Q=Tz+q>lMEMlT;v_c4%ch(T;ppAuDZMdc+J
z@|~9jB7Jrbq;vSR>_q4hIT|NzT;=O2)%P>m($)A_1t;&>h%5cp2(3vDnh2%t!k?-l
zx8I$I^G9C_^m*GDb68_VcKESKIP*I^YvWBrIg<+a0^P*7W-dMtsLWSJ6&J2d_Xj_5
zpXtMfp;hibLy}Vp<mI%2Foa<f?c#y6Q8XBXD(GnYb4H$0=5UA__Z6DEk}4=8iG}42
z^fwMwdeoN)0Y)J=Cj+_(<TdO;O~^s|>A(HGqhM|?yFAd5$&lNa_JBuxNJ7VUI#?O~
z;t=+FXQh4q-C@IR*#186@zHAWA<!J4kUNLZ`u1BR^*d)^(BI6?HZwAL89jfz>Jm{A
zo{(^-nk%tj2D`NHWSJakTDlg|`s&=~@!>9jTJp^|5b{Hf5q&^kT?=qySC`uMPL2;}
z20>?Jy7uMh76~BpLTfMnA{~N;^c`;2kXtt>@f4Z0o-U(?F%WI#;o%9{@3ubq&U<?Q
z$C8J?Y`-+D4zoAY3C`U51noY6j`ifuG?rv};a109MdWlhs!mxaq88%f4v$Yj+D>+T
zZHE$kyfeWI0<_5iX5f?%jcbj!XX#n-w3f<CwEK}NoQUd6uDEjO*DyIb8J60YY*_3Y
zT05b8`zCPofD^pq#$2-X5SmhUJ6qpL)67e(8<fT`>fc%!Ao|*}hRmacr8+zfB9)4l
z(Izw=zMng5CBI-Lf`)RVj^4nA8@BX%gg>~=`l>=0nxeYoSRGBAAN)GD%|<Bz3~e}c
z#CH`RRAhvCodRt!Q?%#A6y|j#M0I(x<bD<xxrUH@GC*`NOQLI5gbpW5=_UP`u_+>>
zdtBQ7J7AoB1Y!n<0F+#Bc)O!Aot0jTv5H7KK%?F7Nba|>0UmEM7|dS#gg&e<!KE|o
z=b39jo)8{PdXyF{hHYoT>>>CXAz-n!(~=r0uO{XC)8MO8oz^Fp#PkX;NBnoSAD$`;
zJvg)R$wlzi%_jSX95xn4NIc!pE2`geZy8m(RJtY_AHC3?93Lxjl&>r6>zP#!qNMy)
z!*2&G1--k9P##oeoROn?o;p$AvJNHJ4-qZS&Bo3j9^Qs*U|mixwz#(b)ljR?ZEe;F
zhZw(Gv$FNDa%22udu3^-bzK90%Qt};o<bMFYJ;VM{EV(7@9)o`qVIW<CZ`Uir%&_9
zUo-R<oWP(I4nb>;T<Y&MEF6I&ZdgkIEl(tWBy{+F63V&GLR=5GZ@VAw{k19dCv=Fe
z_rts*)Xr=KU7ROM7)IIu;Ebon*_?O7R}1n-KtJXS732D!_PX5>W<Z!A497Z}ZRq{D
zx?Eob1uea;zVxS*4CP~w50dbhcu{BfxcHe;Bta_OUD>VDcWdn#rl)#4<@9?!Y?;Ys
zu&T%zGms;cg1h9i_IFdtveKWbyqE6L6C56cuNoO3V5!WK`ErVM<KxGB(T_qd4pK#E
z%JYTgUwj^Q=8`sp-nsmT4T&O}kHQfIV!iLbPpA9tCmjGQ;u(5sKm;b}Bd;!?C@AWW
zi;Lrr04&AG6!qx}f3VKnp`gjWnWYrRRb(67d%a$~n-?@nGbF5!XY+*z!1`KtDwu}R
z2`<t7weAiy&ojqdsQAYP+w;^fn{!1ib2OE+`*Y>K4!3g9>)$35l$Div<MJKE+1Pg1
zl@3Ka<N*2?6jqgav;RA@65Jl#4__xW9)f;!r>B?tlNj5XJnBv;@TJrf2R(P9CxT9E
zPUok%^{<8Qfld!Fb}&HXc=XBj^Z&t(I>0r^E0(Yej%}e*EHAdKw2UduYM=37oh6Y>
zE{@#V+5&^%D4Zs51h?vFx}`+&I9ey+yBP1X*#chOJ+^-VoXUNE1z=Wv6sQ(&NR*IV
z@RlW}Q~k#)kChKMHd}%F+Z@w5_`O2iSquU-<QAG*^O}FHDy*(9)Is{>9m?c}uL7u~
zV>e!Vj&SG|sfCQpl=Es&#<hh3lcW&u;M0wd5JQ;3?i+)tu>!-qE&L9pefJb88<sHU
zy4Wb@QB7GRhES=*NX@Ne>|M<L67&sCZbKr&EUHRp<YHzx>}bQ$m_C(CCJm&*FJP5M
zB$gASyUPp4r#)0u4G9)~IRIlF-K;s|COiA38jZ1Mx!lHLbTAO42_QVlMi2Pyj5iy~
zQ(waC<GDy*!#0)&(ArWk6z)Nw>{Or|rx9la3nc7?zOMRqaAP8hsvP{$WsbgcXe?+T
zyxa5ooN87(eeUPTzmU;l_VU6n?sqIAiikn4U;`1S8js^z1b<wSW(<RUsp7Pc=J%0`
zv%A1x5?DaT|4YN?-?g@jB`2O5Z4(?1EIa<5B9L}=##1J^RNp0FOOKJlp;f@kS5`M{
zjEM<j=01o}!}5{ohLe0-pu%|xtD&0<A@{JvlJ0tw7HEYQ(As5{W-Em#@aPM-^C|^P
z(k$r)Q9czbve;})?yiGh_VMIp?*cqKMUuq^WHVq(`+K4yySpl~7Vg(wEtAScyUQg#
z89!_{iej|vv)2A$BR+89P+c_MKgF2q0{)zVJ-v9|ZWelLKbIgxGPWcxeECtc)7cYZ
z)cr@WVl+fU#W_&=gb<v&Yg%@5Xkx?j_H+095*~xYQ=fR!WqW$u1Xv8!*>5r+5}$|j
zS@y=LEwZze$qMIEQGL-Lxl0vP<6t&YK7>ZsI6f+{Y~NjIYs2AXZ_*hW-i1X7-|!Nk
zr7<>G?R%ikVGyCKR{XqXPO)HleJ<GFf4B5xxJijeKXk_Zs27|yIiU}H(>jhrlI;%m
zmH=6ia-A96E(yxNUyOb2iPMkUnf@pt5Me7LEt%mLsdBXC<9vKEvh^gm(PH819B`XV
zw)G0<CEk#ddK>41UPl~z)Sd~Om)9M1<MhwTDR&Df5{`PJv*dB=5gd55*5IC<6_-%c
z_VD7T=xSoUrfi!nx3R0)Me_WvgY@Np1&$+RCfGV_5OR3{`m6Nye^CgT*8reDK2dC1
zihN`V#LFr6T3-^upWGn)?KOcFzuVH5BXhxS4rYu}b3OblDmtP?l&^}}dSo;xgU$!+
zZ)|?lyjPjg3C5`Y>!iHI63VuZ;^a4&RJef_!3tQMC6X(qT*1FR+GI}v2&rw7+)wa~
z(on!Wf&&qJc+e{my8lYneTrx2kd0UQ&a*(D3|&S~!;TxfB#f($`H$PbZ<Ijq8l|l(
zQvr||VB`h9MJ?lp(HShMsAcl4IY(Ch=jJy$JO$cN#Z#qj0(CrUnt!sN73OlPM1OG&
z5N~;LlY6r!VFmfl^4_CAa@j;o0j7|4Ht6gWQD;U-wg3;BZeSt?*6?X*aZ=}d+9gVq
zY;s74xfu|H`p@JvL(<uxoBVkxhIl$g>X)`DkOcX*jOoDFQmg(!H!~N;pcx+4T6jDi
z2_W$tk=i)ub3;v4JsL#xj~f!?^7#zomnk&Vu*2$pSN1jIMN4PbZuESB$*-mjzY|kl
z`Ue7!5C3*(H&3KZLA!oJwyIBR_tEZxywZG|p#=o#NNA3>O3K5tCkK;3#vvhrUOw2{
z$%?dnVHr6HOjBPndbxV$S~Ne4ynL>@v0KaakP)Q&w~VXvpT5|_tyD#|F)7v-tKxP@
zR<8D^@Hgq@!05ZZOt)h4l?4_hZ_#)dp|&DoCxG*WpYz5!W&aJSZJpL*0dMV+k^TLr
zMfkPd2I3)@d0NtXs#Q1qS^(D`W#z*0eXk|LLPpcx-)`_L=eNbI1}nXu**n-KgEim4
zt#wl;Z2-sORsNbRPEvoD3!o?0=N}PSE@9Q+?C|L4(e-}{hDAie;kPJXD;|wb>0ZPg
z<lDANy>Mk3O+VkmC0K)XjlSDzt<O@Er4wrOdSxec<=!ccO_2n5XL8ocj@b=q&v5=|
zCR%C^gC&cgQbJ@z!MJ)4*8B=H^FvH2o|>A$PqHAox}+No&IQ_d>fQ%0Yl3M7=T-2p
z0`mY5El*SaC6@k)&|h&ShHPxKTZh|(&Y|Yv0&u$lAHeQm_R;LMR!OLBpPJaVPvHEu
z@pV7`j!Bu2qb-2S`7eOTR?6eo8jb^J;6Q=`#|z0>PsX0yp_KAV)x=VaB?gZN`OGo7
zn4(+|tn<HUKd%zvCsjf9_1WP(5c5(LOEItcYfN~R6`vjLCCGmb+I39RxM$7F8tK$k
z5+i~+-dk$z)X&EK^re-myU@paNQ1c9B+0CJhiB>Rdftm2lVXq-2*|Q$5lKVwbXw{q
zd83mYER##Xm-H=_8AeFo7FWlIst8SfB*fhGT1)!vEpsv<9k)fo3Ag_av>bUS-&h}-
z>+h$SSwf*}Ir0e#K0+cJH8J=d5=9c!wcwy`<_Yw5>wXQka6nxsttfIK<vC^e-7dAt
z^9%L2HV4Bs-p(&O3jL<nT36YE9*a<8VLvn)3{2G^D$%fng9t{HX?U@7Vf$5dtK`!F
zihR+8XS`f~46di;=3A*YCvK#VnN~3#&_sFwL56g&`Ska5=9Z%!!p^YM+Ke3EzU9VC
zYnp+%xxv|{XUo0M>`!z7&%g{>^i~P#&C_8^%g2T>c_6u*lgDl+$GryG@=-YVK*!Te
zf)Z@}5~U?<Y?LzQ5Dt->O-QBkwA98Jf8v5fLQw(;8js{PJcF0;H{v09kz7#6S=NVY
z8ve)y9n+PSLFmUF6D=~~CG@v}kY(U{G`8*i1fQM}+O;XvG&CgG;Jh7V*45ER8vB8=
z47>ERwEN__%i}v#-!#7e8zRB4uzwI1iE7n`5FIuLZ|6OtFPYuSG+~?${0dkWmQ+9S
zx)Wmes)&CPJgR^HxQSb=vq$`eP(dK~k#~)>J<&D&srCQ@sWAK#5|~V+{rvR87m3r&
zh=Q}+illlX(cF9dQ)C+iTOSLPs+?18eUgJqQHvn`*f(lW$`rqFWu2sGDNZiW%kXPB
z2JCn#-I;9#ux`(a1uJ>9w%G`YlkX*5`Sj?O6QI6>v7B2?(BYcp`d@jv|Av{X9iD+9
zfsIyrtdnba?(Dea!Kf%TTyo&e_QR2NyK$;(bo6{0CKoTHaqX+4F;!bc2!Foj1Xm>P
zeN97z{~fNb<(-H7Zw4>BSb1)>Q}2DOLyVR-$XGM9B>5d&;f--0Um*tae{<dV_tBcN
zqib{&x5kWZMi90efz^In@9&143hE`woP3!{E*zQ?U;o7QS)r>X1~YkE^1+Q!lizS5
z8P*T}Od`$$OgHc8^|*Yej%UIl-WK*^BtCdOVhLl?{cya*`)nKRRMt<kLQjF}5b*JQ
z2+j54Q>Pn^JtLq;vRbChy|yO&3Di(M6tY_0$F}!m#pzRgVee*^g6n!(<?=EDUCH(A
z10x{tchH-T0lllLo$8(n8l`{n_B#CGa0~Yish4Vc;8<Kq6~Ez?8VW2#VB4ZqczDd3
zK&)b)R-D+K3JvuiJ^k+}`mcC<32412Z$A#EI;9CpH&}?%Ly)<~jI`AF^Scv06br4?
z1q^0;ZLMr#{cBPu$BzNxu!GGEWoc3FE?e+@VJh1BKWw0|c6e-LZk{yO>X+|?p3jwa
zA`FcaeLZ=D`(6ku^6>UwTM**B2f`b@;0rv1S<V_Xz@dijc<tL+Ss}w9^r&`Cv2b}?
zA-gZxBt1=U=09>kY{Rk8<h+C6tmoDkUkCiz>;U>+IB~gLv}aazD&uX9Y{>mrQbj`i
z&=1{PzGpCOtU<;2UW%v)Zq(U)#5XIsxgRKc!1>?PhmnVh_TI5L#l?|aAMFD<Bh>-Q
zZ@|`oro=~j!vFo@;9=!fqmAV870dnF^Vvw|6V%-@EiFspqKe9+E1L1#g@0~4Az8SE
z${r`A!zY!But};Md{YNcYuA>1{;J?h@+n_#+OlR*-_bQpvE!j%P^lj0zTd;GsQEHg
zs}Rem83Nmtxrwwv1lV#HzeB6nr{*Q9g6=H$4J?kZO>|zzmyqWaYKU)6$?kt_Or@h1
zzkbI1`m0F|uHR8ST{?2Z6mg6;+hFd^(m&beE&4HQF0uYIB)BnEMCJ*<#_5kdOO_H!
zaOQ$<RzksUlA)JY16TJXTm?Dk#=zw!xmP`)n<2t($K5G~{wD*v_*5&qxBI<4uDrj}
z1}=gPoV8vLdl@fx<46UURPep4)8fZ8$QskMOAM_1P)~!aOVpq%*8Ip(<Z>Hp!)9gT
zOeZf9b-sY}K@4Q5KD@aqF`&`sJVLA0qZW)z*S-1@Kqse7v0oZd^uvpga$n0cYhiG6
ze=u@4fACAq1)3mafZ0Wvo2~U19*nN8cP>FGta4QD7EILs`ywe?>s2LHqE#1=w6DX=
zbnj@vR`TxdKA&m*>{<|-eu?DF=7Jc)kM2$OTd6i}jbsjWTl{mpaOC^0&H6M+H}a_=
zBF&BT)%C&m#c66Ht@GT?vZ8vCCtPiCzh496)exMxselt9`g2uj)oz~U`|EymYR*4l
z2`{gDTy!gcCP}Trsz@7M5Rb#3ODhtq>ctdN<xF?k+dVybd^!Vw!C>s;-|!6Mo94H>
z%eFvSYK@HB-7Vd1I2LD1-ShCTS$fdorR8qjgfgI|Zw>%uTgRW1n`d-&cN+ZJ{QYgB
zHRapM`zff#rqBj)@{QLexQX>0r!F0%WoLT%&f}AK)FQqZExbL$2ClXC7*|fv6gq5(
zbb_k`HC?e5kJlruuzcGt>kz-ub+9h|P~z}gmyNA7JBw$SkJf5I-H6c!o`3At$zS_C
zpkcN^yn7%$P{q=0#6fIEMjjA)i*aS5*AeVrI{7}6qf@M#0`IWyZ!UdTW#Ok~9ihyD
z;}ibYJ)w4$K{r6ZuLRxDQd8_0en&Xp7d<hZC)gAfq+Xe5LcG!M9TPCV3Moq*4|*lF
zBd#<ty1FB!4kK73kq+07R{T%5l#iCs7~x?^o|6k8zfH2y*ql6%gijGJM=CxsU3zl|
zNo{FxmLThB{mwn4^l5RX2ugc9rblIkt13VL4aLJ>ZY9rholqU^T$NOQL7MQu?L*bh
z)?Q_jZ_UJ{^6!Q<YvZKBa`9KGx`Uc+jg+8EkM`aTdD~ugg2*x(MZ?_p+3XOO)D#(A
zO)e(Qv+=NJGt`ri0Xgc2__?PFCEWu-r+b_B!o?HcgVr-2apwbK#k7aV#M5=xtOc>n
z@YZrsO|Wn2K8r*rm*mJ2YW4r5Z$)TA#UdcsfppGG1{ljWyJi0H6qNhrc=g<;TdJj&
z<K@(_sYLSP+DFo1_jT%8I&Oy1jKa1>l5I`EUoqmD_#h0F8Y=m|A+>?2g;-YvmwJ^f
zC`q%IzRRnZuPt0$^#1z|iEud0ZV4HPY>lpl(>B*WyB3yW1U`V0Vm1L51(m|8_R;?u
zwk@fTvu?h?b=N1JzcuN?_^H17Z?Ne#f44wi#CfNI%W{8ILP9yz&qGq>Y;f`~6C>+O
z2+0|~@M}CMr|2gM-$J<KT{9xc<GlUe4PBc{Ux+BoFCxpviHGc(;7zVb0PK^xrvkJ+
zQQxopG57@VtgEZRiokUV@l7>W=c-2I3VKGw#V6-e@>A|bUy_IHmuyzY(n6$RO{(w8
zEws&;U_NE=yXSwVM5XA~E1xR=b8)S2PVSBhaH<Ov-Q0Q0JMlGsLw%M_;&K|{=+eP}
zyQ_7agrUad_{|!ny^(k(zM}V@>O#M3JusJh_`7rHa|G>3)>ja>{yot;?}|?*62+@N
z3i-8Eo0Y(z_g}tDi03j^eb)yG^R(%s73vX85N6%<v@9Wi{*Y>>w{awKdeya5I-(gd
zggzcF4cRWXentZU$%x&^hN)A+)IzhqzCO6~Z~b;n(7-tqj^wt%Cs$U^rGV>#-k^-$
z%!I7F{Ug`<srC59r{Gigx7iv=Isl+9AoH0HRw@>L$(IwTl<e!k^K6mqq=MfbuTEs9
z9gQ1osBs4dk!K2-#JPjz;p5iU)(}}&TG~&EgOih%jJ9yVya$aB3*Evz9hu2HfLr!@
zX3@Y80`HW{UuhmAHQXyAnrH*cR#i1+$q3{9q)K`)VZ&o|-R}@(!m2nw?WL4DZX3$?
zt$Y99(&n$?WbZI*<==#7yMkVFR@v#Aj4eG0jNB;_3&r;=5D%iGem5~aLIbDDiLp){
z@a%2&l(kN~StgU_PEM2-Dn-e@jVD{zn!pdYt9<$~+uMd#0O;F0Jo)m7fm#ocYB83x
z3jM+zzLu#uMRO_no~k(kSyVAhLF*97*)Uvu+2`D(H-=sjpU7nv>KE*UFp8Pr{u~kv
zq9bl;f$k2LFsXb2!meAZ+?sy_Iar=YzbetGCcR^+Oj$1Rt~yG~SB$=TEZTe<EzT3&
z%fs}=_dJ4{p8MDBnT&gJZ{6@kMbL=X7Cj+i+xR!3v36Sgw1Omn5NK@_ebxP=WW?-K
zYMmyWK-DE2uo8=_6m)KGQ$qh1I$Onu;Mew-<!aFEKNO&a#FJ*^V#BH;7q$A(*C7Wj
z?;Uf#AzM?vI+O(%7RJmZdY4pLNQ{ij{$@r%2je55H0O@18g}}d9_KMa!l)te2jofr
zYmW5cj#;peE-aNx*-_tC1LL?K>fqCPUl!H(c;nmWN@Eo*Wt9;PvZ>Wh0`2fiV?WV=
z^{TaLz%?~;Wy;_=DV_aSxj27Y>on9}F|&WIg#o}$;>UD)!}*{O{+u-;nwuu9+Et~J
zhIDPpm}XTzseLKy@n%W1n{TcvVc{I8oT=T(qUoH3=Vx->;g*<cI-TBZ&8I8{Sa!P~
z&E`iQM#jH^L9ZC{AqPW1xV3Y}A**WJ%7iaX;_>Q$?vL^`r{b#N@gT}6{R`9?1eZ+@
z*euAbtr?PF_f*&%w`lE59SUkSJu^of+{iIFo6A1TE!R9|HCcJ{Pc(mSm1qB}ud-!1
zW^O$H;q=M=ODjJ&iKI}Wkkx~MbTau~6P;W2jb~2%8+pvswVoY}=RaS08g9uqUU{y_
zS&)9feXJ6<s0mS<Ivg&sG@@-QI1`;iJ&wnloEP|a_;p(kl3>G_X!fBKDJX0=YiL&V
z`N{bGQh@P}-HDUJEtUg0I=CkEQ2?c}P5I!Xa^T^7FJO7acg`&BP5Mt!tMt*Fo3kzO
zrJ13INKGmKUiUvt%;@Hv4*>p-UDP;8MeUC@^f&JPeFdD6fB*zS{>QDrB3Eh`lbf`!
zaOl}2O3Byf)z7!@d(mAOw664!t8;TZT^7Wrg!2Ja8jO6xp-QJK-L1A$%g#0PBSB9d
zfW1p{_f;2i!3BF8R@uUr0r1hv^xs|A5F;RHm6=z#8zOI=m8CBdkMg5GE8R=sR(*PW
zFg#HMgc0k=Pus+z#JN%D`@(5s<%y{4umHcyu-7w^tA1)cckVcWl*|eckr_rmFd~*m
zK>fgh_~WIFzI^;M{vWM>o;yi9<Yh1<M~4>#bP;M$NfJt@{}w{Gyf0QuN=xNsutweq
zfQI)c>|H2uMr+AuO9;;$pQzxN|J;+7e%W01xkTrcFm-deDBdJb7v|=e!v!ePDUpiR
zo>s}$$AEq_Q=!QlX`AL3^4Q=V$r*2fxj6{FBjvbnA)VKiBKzeRWbeyT+sOVznIajb
z$|)n&#_)agFQaqsi(b!^rSIe>ju)N#cK`I$<=FJCtCQDNT-{-QwHyLb()HCW)htSt
z>MRFViq+h6{E04ScE^*WhG2Ml0DRow?CU4gcP3wI(40-w@>S!mYtR3^|0gKBmM$IR
zV4PZmh#g&6>7N`yH(nS<!yeFOBR|<-iIQ|iR0%9q^qF5EHh5j(%dC+zn_Z$kIpzYn
zi^g}6lf@MjMKSYxoSb>FdBpMInW=}xA<Ge6@E4HW*?=qpTVO9$vpnYYEqc2smDX1j
zJI@?`@0Q3Cj2a-AfboTn_8pIops8?;Un)1Y`a|EG9x&u@WOmg<Dk5Old8^l`OFPeD
zEfv|ASRm}J<)2>QH05GwAWzq!_iKyPo@aTFU*c1J@_hF!|I+o0yuzd&1mly>zlMG3
zPTiLc)i0m@CCB0&;~i~LgnPIkd9^Cln&V5o$*0>$TzRa;vLm%<;O2ExRl7$>E#a}`
z*6zuWn$^QRj250Emtrx|Tg@geC215>@o;ynaq0kwR=`E9)1rP=sq)zmvzgu6Pu+6C
zM|+#Rr_&=&S<Ygyn#V5ho{Lwt$s>NruwG3@8+Be)=|DfukS%elcWoHkufDd>0ylEK
zDQ2}CXr1`RF5|D_d_b2I(K8YPQ_O*D6Xk_d-4$9UdMaZ_HEV;jN?V6>>PBAG_L#kz
zSMvy?OYQ{6Z0yVIL`gIMrX(l#<B2M2@%y(C4^zIVQri;oV(pwS>B7U3FE9G$nTx!y
z&(y@D=xT3BMnR<|5z=p3M6l<FnVQ(6%IKE9I*HkMzLw>BPOfq+!5RykOk^LAOx>LN
zFHZ(yn+ln+@}MsH?6j3_P^L;3!!pNx_Qwnb;KpU1Y^N&>C#a_-nzhMKZC@TiJ3FZk
z?pkUEq~#gLfSnX#ApF0a6o4Ut1l7XfaY(}?`va~0PF`ifvtR#?R|6&{X2OkIWA)K_
z?pqz`1%TB)-PqU|d%VsB#%iF%y6L0jR&&^y|CsE;rCR6!r-w$#cfutW-in4$Jyo>A
z6fV;yqZGNV!2YjODqO?2rwGN5C`3@tNaiKtzYHqQS$wXV)rr3P-;T+z;f{3Z;VSr1
z=HH~Yv-`T*XR1Rng|06sj!`#%d=IFYaJ`kjHs{fE)f{7nrAs6*w552*6uO$@KetNa
zkq>B9?DJOriixery!D{%ovpri|M_T?4_8{PkB3i7IWz>qjIR4zYHI@Xd@m|vMy}`s
zY!7NP?KkG!h-b@9N?}J-TMmYMJb%G?*=&I&6+R=M41Y`~M#Y!=wBorznfO!QK}BVW
zKF*0=dM01D`1W@7$|m6|K3km!?U9TR>*yB<fa;&w9|fJLzLKWiZ1yU=Nax(^D24&w
z#Oc7yTU_e|FYVk}^vr8<Xc*my*8K$+{NI%f$2U@@92-h<3(gLg89)^ZZtv4~w)JNC
z`{>(rHW>V%%v?sL3fewFl%I&osB#?)hdR#z3d(Qe0KfKvh7yL^6hY(uq%>_^$RdG_
z)G4DYjgfRbrz(Cmuj^WMpOr*%Mg{*lPWRWoq-8xD;kv|nxWU~Q<wC?ty}_bSdHQ<t
zVtUSX@3h0VKiL<Gep{~K+q<{zPWMwG|6ZnVAI?zkQ(`oP$oU$?i{XR&Ua{UqU)ydx
zoM<km+)Hxti0xt|XWgO(okZcQ9S`f6Ull>Uf8d_PJWyNS&V9iMMR<!v-90-pqArBF
z%xUN)h_Dxhjkaz=4m4O^{yA(^pMk+)=8x$N3{V+A$7CnmSpAw-=YJ&E^;~UF%%@y!
z-}h<+PwmX+ocvi+8$XDEv54fW_d_i5bMlTCcuHQNB5;~SG}-cr<zW5K>f^?x8Y08c
zvah!Q@?o_{ZeqrOVqSny+8GusU*Dx0Hc4{M)~GfuZN?vHg3qX`fa3>Ks$&H4XX#0=
z!A!M!^-0Qaz7|s!Pls{0rjT<hQ`L0S+fy<aPFh;C=nh$rT;sid4E(id*Q~C56t?+u
zQ=>o)<Hs~br>}4t>MNKFxk_RmH9ZBATdG+<)Vp$ht}+c5e$272eDb}?C9Q12HF6rw
ztUoBHYdE#rS2yWbypQkEd1S$;B;&2+D*3?)DmC}UzAf>f*RNuNk>YXr?;9D3K#Fx~
z>*VC|M2-76RO|Z)c^f=(8(vSw=RwZjb)p?^7-QTv+Wz9t@u7cTUF-DK4gu;uvxTFJ
z*`X)B`c3?5k;$##VQ%EMdRXf;?g<BLg3O8uU2Cm0dm}EM1r2-r@>1s}%FJbBmYYvU
zDv2!FZ-{pxJWdqpF|M)N0EKeOcHkwA1scwLODFN%Px7>({xuVB{v+f$ueu`jz!5)E
z(0~l1>uX&?=+h@@^K*0m8JM0dRscfj=?NjkP5!Z~udx6x;AVEDbiMZ`9mb>j<fq21
zlG41j<mE99jq2FMd30)*kLBU!<`|j+D5tyWuf4NyDaB=#t{0zYFFgGZ4c3QIq7!<W
zYi_o@dPlcHiYT9tik0hh;@1AL{M#OHe2PbTi5<KVC_ZxkViSoL89@bnk6}mv4_#iN
zYg(RDTJ?4x|A!%G0PKei{I#ehnTWt2U`pt+E5ZNi_SYVr<ptJHf#E-L4q5UC$K*pp
zPk*{s2X={M>mw10{Q+r?cevGQlL^7XS4Yrv58W#=DlVWW&yU<z==kKkmO)wa8EC#Q
z79N=CJE&s%9@q6yyW@#2HRS5{>o2+!L^PmHYZ~)>z(M(vnIRU}nG-WEwaHKTS^jcx
z>}`}!cFGFRU9Zs{e5j4=;Cu~vy)0vLJ=CG;_*STT!HEv8xl|Mf`G;b0w)C4PUT1rr
z?K8&6_}(+`)r(H&ZGj?T3njWrqLg-qVP8P3aEIB$p0vY);r83UBV|OmN>RD!-KpnO
zN_Pz2Rq)+-^&z+M%-0We-3U4e72^%av|rQlGP(41Rb=u)e&;oDbrsP!@@y|Q_3)X5
zUEPJBa(5nnzin{+QDNJdmT+C)jAHE4jfK|Dqko%BDGPG?Bbe9<wTIHMqMLb|!s^W4
z(O(id?d`<BVIO_=CNl6UOzkwTS~7l^c$s&FoWQdD=0#2s3pKjYSx+k8$nLW|ky{<h
z=v_{fXz0=SA4OLl&xHHON5~n<og9m5a_5>PF(piydxkXk%q>T*$}wliZG_ynH1}09
z_fnYH56Q72Ns=P`p6|<_Ugn=Y&*%BPPqSiJ15Of`neOa^UY`q<+|Fcqn0&7%`L1Sv
z1a$LJ<{t;GP2cH8`KZ;ASK%lQej_QH-$T|A4}Qn{cL{uuR2u{&%@Jv8Cvr*0#zrN%
z{tU0qiAp24k#V+-B7@SH=EpiQ<|B$o!8oZ$tV#Y>k1>C7$KQjsfFswgK@c~u#oe)A
z&-9WIEDkcDm8hD{y}ef`un6a*0xW23iOYVr_kD+3JW2K49q}9Yz=%YxhMub*<=UFQ
z`G$d@^q_eb`gm_a=qy$=_A-a3TDP~YUOB$Aeq#+Tc8Wl#Za5JIJU^5Fek^lG9y~wZ
zSy~-<=$bU%-uOd<b20qjoDo;@@e}W!+08w$j>B%^zG=N{chUvo=wdc-`J?vBzlLI2
zY>mF|+mrAxL0w(FxaHUItu78E)+WHT$_ISGx4=Jx+yiNjA~pyTH(BHmqap^0{|$)k
zy}hVEej4Vcy9Fnz7EZd_jSM{WCMvWmy1ow*B$5N%Bda|wV>w+}`qH2IQ-v?Sft6tr
zf{wz3kG{|&zdVUFaL;=Y>NL8KuNkugei?=AM9DK@5Rf$u!qJCxs`5ZpbCN;j^S1gt
z-t?dcd9KPU1>BOJYG=oJl&F<6&3PJMA!xzYLv})@FmK@58O>W!BLxyg7bMOqdwh9e
zpddWCca1^x>B)k!p`wmqNUEx(&Uux~tQjwThGp-1Sm0y!>~kz;9h1s6XNc<|ZKwJT
z478&!Fl`W3Sp@fZ%r??(P=?dRj-B(+&mdB^T(Ybb#}AhdOyXmhH{aIoOV-wzcX)+g
zWO$>6tQOB*V`Cx)At#}IU&voWZLVurLC#2RvT&C8n|O?q@-qC;e(TIq<w97`{FaTm
zay+Z)$&tyw#QU{lv18>M4BKb-mK(~sod{8n1P89jp>WO$cc&p9irl)gT_}S_Yd!`M
zJ=6evQsuRfncwP&!I#=IkgWoLiqMAxX*y>gIj7^J@YkJ3K~(sYyrSag&@|uC2QG&0
zst#++Cnr8setj~=cu%KJFN=Sdc7r`n<>sqdAi>={@;ZK|&>I{Q5%O<oP`vl}wRYFT
ztMA_L$G+$E7sY;;iQ`F9b;EZccqKQIINZgCK8n)+YRab(Y=+w22tOkFzqP#Umx8hQ
zjc&KBN@UA{G?F7c7K|tN%W?boNxYp~vUk{|&`fGmBO+Dy%66#76PYhYDWnVa`fZFx
za#aF@126o^Xl#M1T0=k0&mnR>vFViQ^HYvSv*?$IU_67D%G^kCH<aJ+Vd+e`vW*?5
zcMN_zT9<{&U?T8HTh=*b(}5JlThE*!m``N>78Y|=@aPnk&TL^LzbvU>zyV>Uqt}RW
za7{g`waSjpy?3bJtKeA~psDE!Jf;_wZsO)a)#P8G?JImo7mxh+F%Ucw&+D=JL<Y_5
z9Y#b7>JpU1PE~{?P(Xtrzzdf`tJ(wb%lQ}R0qt)`|0{K4a2Fa3Nukwqz21mKfOlLw
z#zT%e*Rz#3vCa&>>+&GV41?n;{(8{jK9}*q=Ul{L{$kkid=Pz-fo-AZ;Q?re0yS_m
zcit%&XdoxQd+l62{`P$LaLuZBAB3&Y)%U#fTCR!R(c+}EcOG;;n7n>nyDUR@@{5`v
zL<M>GY+z<oHby6kz*+_T;yczC5if758;)zpv<TOr-^r1gW_cu8q&|<7t$<1EC<Kzc
z4dZceQTvp=gU-WxeEtLz!<W{FSfYz3^ghg!OVX4gojo>I%qCHE*<s--mK7mmRZp5q
zaGPe|SY4Hl{`BO*VOQ+O5v_5g?f0$kEkL%le}alJFDQ0Koe*MpHU=SR2d|bHRxNy%
znXo%b<g{s&soqrYUURaKE5_i4({i&}I9>~1j>T2M%Y`RwUF&X?^61h?b~HI!7QzC?
zgC#bz^60*B#lKgfbwoe?vR82Vov)>Du_D7u!r0C~zu>P2VvzcNiLo&Y1Zrebg`6>D
zE&U<DUIj6_jvU5tWLK-Q@iJVBzCd2-z5}aX+7H#k-MXxq@`ZR3LdmJ$AT~*AoF%3R
zGJg6vpuv&aOhNZ&CX>8ot*xPK`u+<!{uHv13sw3ec1*(r@sU(OYrml4(NO!gcVFR3
zb0Fos@Jo@WDXB+p2Y<Gn??H;rDp!9&3Gh(waq^myR{W;i%O)#rl^1fnM;4=d_9(4(
zwm5$KXrYmr(~nXHMwibpDzv4glo}}^7<F9gnGqA~8aHAEqutrAQL)JDz7~vEu6Q8E
zK5QoX8?7)VTI)Jpg2q;^gtgl<fw0N&qoo~nZgAfQJ$ShH;AD<}A<8oG%4_o|5ve=(
z>c*Q=@8&IM7>%rImU(oMkWYTf2kdz-4!CQcd%7K}p>5J+BCVj!?9!xl17fZI&0=)R
z(=9Jz=67c~Ej9v+xvP<;@_mVX*yo|&Nt}9<i5o}N_%K-JX=bOlWk@4Zc#>_zm;@Nz
z<&4%RJAKfXf9>5CwpH&8Kz(3uNq~O}yz_VO*v2yTE+U{$cPc)?`?AOz9o!dUPx(k~
z3T4CLlApo8vOWDxOz>p5ifI#{g%ku~8dycKZ|iZeXU@unXDXAw$%VN3kjoc)fv_<Z
z^epVv<89aSP+qj(caDGE=>?r?k$>VMlav_c4|EX0nhrq93+n0VLjEIV%2#Z$SYprd
zQKxSk8rxaVpXw{h9oIj)G+^yYxCz0H)o%rLg59%%yQI(^Nn1D28sEu)vajDq*1hcQ
zY^73BgTB=y9-jA~o`AF$Zo8R(pWj7n`yAh66_)z*;~9{C<Uc=TApF~PINl8cZEQc*
zUWeRq`h9Q>`ZfmuHtQ~(tLQuQNtNlA&8Zi~EB}j*O0-1gm7BKS%$(G4o%zIFNN=~a
z+^!*kK^|bY_2lFu)MaQU#t7E$bFF}a2QN6(h$c*1TbqQ(CFx1wZ)Y9{QBDx{2`2y5
zvf!$~0^)h*@tnF4&WO@F<^Xv%=~<h*)=tgyuPUqDc-m-<%NQN2GM@Zj3vyD;H-+@l
z;|=OIjQ&!>qAWwXA7eP%M-_T8(#m&UBA8C{prVW%H!A}k^=xS3>VoRoZY0xHv3IT@
z%=3z;E&T^NdbT>e;%_>=r(1~Eg2;CYWe%OqdCK`#t*T}OuOy+aeE(DKfXzFyAJGWb
zL|$I{RIs<;P@k<a$gj<dP@%^1)b3ht+QtLeL<M%@Eebab6|mei2YcaXqDAYMjP!_K
z`wZ@Fj}|!WX1>-pv1Xgg2u>waP?lz4sN!Dq?^UAA6&qX(&RGQ!-C<H_IJ|tXjv*~Q
zv2h9|^nUYy4_fu#q6~*g38f(>FgRW9Fl#r=<?ilP_t>m;5eDmE-kymPQ3=GjHH>sA
zq$(lj4*Uw^H}!B@i_5GKjJ?xi_k$j+qW132<KMokcbNP<7&@xACF#0lC>{QWmp+E@
zZ0TB4=KjiqsYH50<!c2IJ&&J8rK64#ef-g*AFfGcpvdk{RaJ?2x1Rm|%Zb#g=dnr;
zw&kIQ?TjwiaIu|0A?r^*=h*MFh?IXjkh69Q;C$QXzXUIvTn*K<D_=6l-GpKT7=+e;
zZmx77&WpU156FMexuqV|GyicSH82`MPFE^wo+&Bixk2i;p3~*K!JAxb=hBRBUqQ3a
zc%-r>tCD##rKkPewJhZgorM}l6utcUYePyoOCH2Ran&PuYWgmBegDiw^x<OAGqirH
z&+>5awp#Q+aZtgW7g}FH@u$1{TW4VxUPlj$^AlyEDiHos*`eYrC!QsXV1V3vBun3!
z{R2Ory7<YZ^PE|G?+jbf3aX`3PLiwNrw7Bwa0h0;Ubcd{+$)+gGpbrvX{$i(m|)WF
z!ZeJ1tFwr36Rkk4Ya~Zck?`_m8~g9Tm_7V2%9C=?H>Tu_t1~cx#S_SgGJrIo&+5AU
zdIG3VB?{SE_XYiu#Ma*P?#N^9GV}JZ?WLWDo}+#4qfgxOGusNsKNTVmHx>iJd*>Gh
z_#DU4qOk#*&oQjx+Q(YtV66hfs?a#u)LMh24zE>lu>~)+0(rFX6Y?jG_5Er|<qmN!
ziOzZZIMdxeIp+k}465P3{gQ!eQ(DDH=XMS@G&EF+%BsgNTz}&9x==$VA~}!Gb!Vaf
z<=_*ga!n+O4_6SBB217WTrSJfx*S#!sxPj~dLx|9BSE(IkPqa5f}-q{xg+<Chx)Rl
za$>;g4?oM}UhegjV*KQ#73c-^Z+)b(2t;mm{l=7ys;jC3Bv8MJnvrKmx~Ez`3!WT6
z>S#?z8MA(Q?7zS@BU!EiL@3}YQ_oLUy~*U^%<#TZhfZv|P^Pk4Mg3fR9y@jS1)HN2
z1uj@QxcvU1IudM%pg+N`b>6eHMb$n$Zun|kW2~<P8pj|-XOGfsrlR#V=x;pwnk>zD
zC#Kz@wfs3aP**?S<4)sYoT=XapG<we$NSTTMrz<y9j_{Ijom<Q)@GyYY$pBu%Lm0`
z?bPbrc!Ru}l3`WE6bW?X5zFIaE(_A_z<wTwN>EaxYbA#`S}+{{8~?g^__2E7?o*L_
zH>Hn!UTayb%MwJ&EduM<c_0z*|HTePMO8nr;Y9J1QuUc^So!oy*Dj>5PF{XNaqYWn
zbi2(5N)DCgXy=Qy@^|IEp{qL6oy_iJ@b!*!L%gvwVCuLBr5z!#U<bpdY4EaJ%3BL*
zp8pupdF!p^-UIz19!X3?_6G4(#nZ*Z@3b97_S^&}Cw4ujkMkDbw@_Zir!%8p1FoG@
z@E_5yn`XFu_X#scvln_N9gd{NmD3*NcRpxtuD`L7YJhaQaVwF=)@5)owRMgstI@E{
z7YQkHH-%@ThT5jZKcp9<Rj>VhrrfMsbBy~_oDy8~=<~}_Qqnze$X@YSmEJKu<%m)2
zJ9{~>=6<i$`D{(rH|U!&yEV;x`iOSNSZFCU6ah_Vz94`&!Duwb$k141snBI7j#^5B
zH0n50GR<U~$DO!1`-kpuZ!H{r@)(En+s#RrfPHTBZ<rdHipCl$yi)*LVGO{{ovEuM
zJ^vw*&Gnn*A(~>KwncsKpw)XpbBnog>p$ZwT8PyS*11{#HI(EsR2IAi=4>2cQYAZ`
zH*H9x;x|KvKLg}#o<BH^^qt(KS$q(Q>MKOvo>K!XHSgz!$io_6k;v_e$fMQl>mfi@
zJlgwj1JD;U?dDSc+RxhsGG!`7Xa_uD1Q=)pgm!dxZp>Ns^1LK&`Liwo8xYtO*}qk8
zqv`_lE|z?6ez$OyZe|Dw;p`2ICn;W9*RPyao>Vz^YLM8h#MiZFcZ*zU)7gBq>QhTv
z8-Z5#^u3N7ndUj2D{EtO2AO?#I?&{6c|RwzHP8R#R#%S;*Yg0Xuu=Ye&(>AQrLs~X
zUYv<@+bO3@tL8Wxw?JGr(ps-0BnOqAk!oy86U3^~@JaqKQr3EbVCqDrw}Si_S6UDS
z-fB%^^mtw`;gkT+zCXm{NNe#1$u>JbE|jYv&E#hv-pMkQi_3mOdeBR&VQ0B<-x#K^
zGf{V|n-m!HNfX0c#m1GkB-3F~!bSl(WM`n-T3VN^tC1ZtUzsIVIpGSf(pMnn^`W?I
zj8kAv=5*g1^t0yoA&E9gO8Cy(u>XP)P9VZHO)W|TVT0~@Qy<L~(nw?p27+dM<yHwR
zPx_7+N}lAVrYo1p%5_%Pqoc@{XFrteTTAlT=RU0_`3h)E`S02ej9~a*=<4VpSdO_P
zosPYaot1|5tRMG4*i_bx=EJVTEd#IJU}Eiiczw{Oq>`zLTAdehZ5#3W8pK*x))D{3
zD*pV3&~irqz?CFbJWX@MLyq*a$ip^Ra@ErFwx#4#)1|m(N~kgm;=Z9sR7JzBoMh!(
zkAIR$7zuw^yI&KYmLAC2$7x11gC+S0P44>T-VYPOChcn4nN>w)RJXa<`jnGPd>QZY
z8uIHfsyx2rHubG#=sqRasGiZ87bYRR5q4${**bgBa!Wa?G>=(IFOi+u(SoO^n#PbC
zpqfN|>RsI0<{=s;h2yHmSB(zv^4AMHb~{0Eyt-j>cAASym!Nxhs^{=Wr)_pc8Y-9H
ze0hUsHcs5!oK|=yv>@ipA76X<(ItuKA<eJvFDu7yaQ_qYjF)2M(9j`dX8##?7AvK-
z`qyPy>-$v=Q!_YF#$h3}P1&?oib}*AxL}G>^0JQ8XHk8r=uhEa_K%(f?s0QV`~=m(
zYGi985=9lc^Lw5al1KK<S!=Q!mL3EwqLJaS2#yssyzp?N_Whq5h`=RINe%j3i^Z+n
z3{W>wh8Z!4hL{`XR=`$Qi1YZ?J?<bBqRh9mbCWKv-HTD^SNu2d{P1tSx_9L6)8jwK
zhoik(h4pqb{~^6tIc8o?6l>3-C!jg&0^6w-6&2`>&Y-Op8h&QiLo^ir+z19}RZv6x
ztY<4gypme&cNPJ(%iuhMpAJ7JNIReXThCM@xppD!+F#96&zj2Jb$e%l#-E|E!bWa4
ziKIl|(8vM@mx_i=RTtHTQfvmlxflLo{WL;p|8Km1|Nb4csDYWRG{Xgulb!mNbpQHC
z%u*m+`75veDZZ7E6m1Qh=SLNYMA6x3i@F#-u_PF@NNfTjWd8GBa{XO$29cC3QaM^u
z@na$Grk(uq=7I`2VI?Dxa4GX6h{4$<QFs2P`=MslS*Oq`aeDG0&0oP}Y%0%slgSUM
z#X#OdDU)MRBx@e8R?4K7NqFzBq#KH-Cg5@f{3dR!zdyg#d-3b#u1CT#qgqG~`|;ZI
zCRH)hS9JcI(TQ=k350S*GNhq4PP5T#1-=Qr*fVh#!{4Kah*UILUeF_a+&k-vuYy;+
zl`YGV!CbyrPZioa3*}HsnM^Vb=e?#o&TMQvA{B3t1R})I*!&hS&MCBf0o$Bm>oiVC
z_zWF%o598nzH}e-{`1Z|-f$?m0f{&NlfXEd!2hVRGSP5V*MQ|wrdPnJ0DFWv1Mle&
zdO(eBn|vc5t7IlBq&>jsTduzVB%pixD!sr;@or$X5*;OTA_GNQmV7IdNKgH3L4|k!
zj7?H<DlmJie}((T$4aAT#SzV4J6hyN`*cnjD6V=wG8!YYzpKK4b*b?Fhbf@pewy86
z2^Xc95#v3XA0D^d(HG@_o=HPnNTls9wB2ntx9~BiZg+XmUQC{ZqUm`TUKQ-#*cG|F
zOcJC1k(r-}obh0UwtlJ$w8AmWM76JEnE7c7-6Eqmx^F9;mrR8$3fPg3p?xIhdjKBF
z=GS+0tg5NJ`|BWd<A&Ftr5byS-XNT0w0^uKjtx_@eR`6!^>WiM1zfF6ZHYvO%`Juf
zx2sJ)$woT7P$^5A;`@egXsF;v0jn=XmY44OS1GwNebJr4tF^E_RvGlua#$AKVQkDn
z$Hau>8LesnUEUrr#^vtRFM9SIeDBQeFJ4JEV@%6fPA3q4{P~G7x+eB?StX9gql)!D
zCmhEU&6;ZyBqr!>ruX{f#+a7t(i@`D@V-q3N~e@-X=!}kt94hKa|TX-xv75q?dfrF
z>!yRf{m~EC>wA*_29JNK_x@W|h;#h-`=PE`^NDEkL7ckx_un8v)T?zdbc?}M2PH!?
z7Z=&O2B~KG5F$8tFPS;@srYq(4G#)+@8M@c1o@UcJ0G;A#ATaNgmr-M;RJ7$@P^8%
zGIRbp`~&bYINiHTt6JkQ_reY&m@QailDS^_mZo)OwhV&&u?1h(JU77I|E242zosE#
ze;im|M|%&ycZxrt9R+Juo&T}xr$h~~6MU@awEN<UQ*x71nQu;o21h^CKQ2bWo{Z3Q
zGWM}AKy;K}d;YG&7_679oTj9#8$&|$p_(rHJHm9F9HtDXo{wA%l)HkPs}ox#+)r}n
zVr!OY`_zf#W_qWfu;6~-B2V96@C-#xUXr!AN*Pkp-#T|%!!?PAkddBgP>pG&)dgv1
z12KsI%${Laai7*5n_?PqSyaEsJ#XHN0|Ez7S#?C)%*M1PmTN$xB4>h!AW;-3U!5G)
z6xCo?(kS%grkNGz8fI`?WbS6sspY;gs;Tia#i+)J9!f}G5`n#j-DsGX!NP3BC|y|y
zH=OE-<+3o6#^LoKCzzq~`+9QE)%)re10bjA*0^yq)3o<oAu`fW`*S{gtMw&oyzoM}
z@t8mR7s@g}hl4;&c@hzjFA8ecA+bG+1Oxd@o`RhO6r*KXU_g|v9wk2`drWONXQp{v
z#3TP6Sj&nClF;bdilb29I;D<pNn;*`jf^LlY91*-h2Cs9cYzW~K?IY)tH+3*UGC@I
zY@qh6&47WaPsp+9>C@|DWhNGF8T5WQKWMX?@ikBqr(`c_W{gZ~Wy+BM%pYFGYQcm2
zZ`kWT7!sk^d0Pur^SeZ$P%;*VOUwe+r|$hM-bK3B#_>zrew6y>#9~w2V^w<hI^)!B
zy~7orYRi)#<|V^J{!7<_9~J(L5U&%k#BDX_DVc5FF4fw{&tn}FKjZO=)0*^B=`Dxq
z?<)4ly4(Bsw7~DW*WhUeUZ;mhZ)H&tZ=^r_RoQcFx0>M40^TfE8#roQ@Ht=_N@ei5
z{b_1Q6JwtMoqZKN+oVdi!Estqs!EAuU%+oTCGAO;^U+x$PMkBEoY;iz>&KgmEp}Fh
z_&n-dBEfoNEhID|q6Z@n(%<lNsG0{o`N-p=LswYY&ev~5f2w`l#$Rh@L^idKwEl)C
z+>>Tmz||E5N<FEX7J+-nixLJ!!d3_}eD8<L0f2IYx76}CzXl&U9Q7+i?$8{bz2+A0
z`S|KAJ&ZLu6oXf@<xd00ANj1LZ%4EsP-!)!P$7F%1sQs+PL<rJ$l0$bgay+XYUn5p
z$*BPFuCGuRxA%h9w+CQ=8&>#H$c-;m!Asg*Ae)KF+^i>kROQ_pi)mwh!#ayIP61X8
z)au_>NE(wkIm9FHcg6L?eeQ_g<3Ol;cs+7|I=lDacMxbgWMgBahy14W(t026tR+LZ
z`<Wq0s?@d~8Ya$B<RX?pbl0t6iB%%Q$s*IDL(_nAB)0<eZvm9JPyLkR6)y4Ml&DTj
zP;#o(ck#BPJs?r76Uyh5fe^QtjZg%G#k!>th4c{GCj+-9;xCeC9}4P9JVD&l*<+6;
z7*6-SBbt4%ea$0Rze%>nnhT-qHaq0Y%neguGIAB_`K=~(=BNblJjc^Qw{kWVF?4t6
z2MEK9jwBR^^16-18tU>@SCXp=?(_Y+=}+m<aS|iJ^g`rpyp&6d5*iCmdg0laI*&VK
zUt&3%Db_IH>WJbMQ|2hN2l0jUp+vAv;GL?|xwcWD1nywdrG_D4QbTitpqLGnt-jF^
z6G(PWfk`|1E}Yf#PY6qk+~NAy8)<aeDY|1h3Nd%B<}pQfTwcceB!R@&lEv4Beyb6U
zcoSIAn0@XgDf1R2fWj*or9tmh`>6kiQgWEf3Q3oAtyIS9qDW~5lPVne7L^~*HDXyg
z`+eH&%iTW!-ZbsC$ed>@@`doIOL0(^3byI^M?Uagfrn4>WI%gT`_zU?1qv6VM3yO{
z!XV@jdYBL8?8_RmcUoUsAT?xJ<iz9Y6EmEqBk4;FHIgi8O_F~m+9o7~yOW|CJYvr3
zlf>R^3?BpdtS^*$p6zFe-K}~|PU6pKN(fyqr6&v1&RCsLU>&^LZ(S1&jl=y7k-J^9
zast8Rzzlz$>Z<PEv%j}p(VGIradq%_sJ`0$&5RC@7K}n`OYUhG&wBmkL_c9LWrfrf
zJ%(*4@Xi+++B?SWqN)$)I$|ub#>*OFrvP(z*=iGxO?v6}BWzMl{#%BZFGXfoX3QY?
z962NfQF-HQ*~*Bj$~dqJ{aAW}wmgevSP9S^aHrmRwSkt(D<6K?Ivaqb5r$Om6mS1$
z5iMU_K@p56RTyiTPcsbZ%)|w4Evg}}ASZ#}Wv_EF-0Jf2fWyCm{8nyiyOyAyOCf#d
zFUk;1-o?n{!<|u}!&(fa!s+3(Kmh)9VdB6<OlTabM^#H(ZT-)y)d+T_`9c*=B6zYz
z@(&6}X<z)M&2;!BzH;Mg%s^fXn`HC_!cd@4x$l&NN~}lSjN`-wSjA%ke&&17Lxk4z
zcR+l@kq!O<Zs^<I!GGy4{c+sKAawY8*e*DOhnwC;a^W*)(#Y9B5DdC9cgo#Cu!YLa
z=`P+}-<<9}yTx3$96_4|Rw1m;+mj8qixFfrjZ_eN-mYP0bk(CkUrL-&B8!NV6YDCu
zFy^UFe(2zli=A3zjz%YhHhVZZ%MXC6E^~%LmjH&S#<%I#Y-brDl?kQr-bQ8FH$LZ2
z{M3x8=?R<kW<ppwbnk^P(uw}VBgYT7cqbl4w%b>P$duX3XZJ5n1a4}ilsS#>;Ao({
zH08yJG2~Nd_oZU+D$T5se5@*vwTpMHGxalPA_f+3J2kH8j^2NoE|m9XEH+8<g^gvR
zQ;Kk@8ghd;aDm5^a@*Sp%8qQLCO}wE=sK^Bl$qO@pGdquiHsICGat*15&RlgNAhu2
zi``#vkGX)VY0pFLHy=zc62A);=g7YQRn(U%sluXP_K`r)aU=M_l*&HcJ}YP^)*Aa(
zb|7=cZdd<wBc&F4j}tsn0{v_o?A{fjal`tlS*qNQfl21*&CGRI2uCy`HriPzE+j8u
z4kH1{L*r;c!2GO_*nNhvQQ>`rPCWd*ydA7{XD2}0fu57jE+(9jZF(RLv9>Y96;xzi
z=uYNwN(xN&<`6oMDYewOQehn!hn|i^IN>KBAUm6=&<`@heYS8W9!z`r%&qL1Dm~l$
zOc~oFKT}R>^^vzzE%Q%Fx<+=o^@+T)1lil(>OLsvf~e=pmZQJ7@gEs^?Cj05%VW6A
z=DzJ_|Kb%A;J>;)QKl(jKg9O$)lpr-(BkFEjYM?ey<Hw_$gNci-rJBQiI);`&WCDs
z4Fh+m%4{57?2I6`>p$m1@2u6@Qm8Z*tY`Tb{)gh%`Skbi->@Eu`)70)gFOCMa;;Xi
zCF4UN=mYas6>VSnOzbQ>N7>2occ)n<Uw|~a=Y9H5&<=jfu<fH^Jn2-oKi(Wcg<oh^
zov$B_v7iLC6ZvTRQLS5I+F^gf@-C3Z|NOk@5Zj_BwYVQbjM$?AX~wmUd`f%mI}W^{
zjMe!I6n~=Y#x+)xozjI^iYDnVsG(r2KK~AmTsAipNbA|^-hsF!&w7Co`k<gA>AS}H
z$0nYyPcj)x+fnG*{Tz%QmV`jdS;LwOHv{2={=@&3P@L2~4O<3@iih)uaSA=b5n&5n
zEnwpunB2ims#RY&2@wy2k#JXewa&d8oOA~OG49Ocol6qbSr$EK?dS5KyHDd%G+Bu8
zk@br;jK&P%$MP~jKtQB5KxwEpm&eE<(1b}*dS$s#UYrv0{Apo5%4|8CcCcga+U*^M
zSH)7gF7LyLBhv>HEp-WN$^NlP+^=mOcAwfICMmr>si#J-PDIXmyE_$qsQeG|JZqUN
zotVIHQIWz*n`e1M_I*WpV0YC$_2dq?x`HAjd*{76>(4`R1xn_^fvXguE1b)_q2sWi
z@LnejEcATI?-3*RANPeMJ7S+Q4gR(E=%#trHj8FGX|v>ISY6=t^?g2~-&P&ZnT2V*
zq^al9G*CklQMPN9JmH_T40>&nn<%sCJ9ojkxahYbd*z`Z-;VV&x`C7}JxK|0uEzXo
zPCR$9b!{2W*InT(D^y|JmI5?$KuD+Wc>Vw`osBYqsUmT;t0OWn*|N&=I%Rssuh%HU
znvWj2L|wPqz`PR;)=JfLsbwo#gVr*Mve6*N$TgdCF8lAgS~44O`i0)OmrI#i>lMM}
z*TYDx1d+dzP!0o_wCh2>ccdtp7oxCi?y=gD#1sVy{X7b_e@RXAC2LAGcgWQJ<Eo1H
zJxRSGhq-d)3|x<;Eg^w**m~yGp<2K>5sp{H#;DQy9BmN~msxa&Ha|~E@j#-hjhQu*
z(dc~!!W46xI@!_-<z{3;a$1t<BEVjWeW9|-`mtdyic0mBc<26ohB^t6;2ia8PC?DF
z)NVJalh%=XO~!%GM-;U!TxEBYyt%nK?+Fl1vy7Tk?vFC_AnqsM)RBau+>b_b<fi%T
z|2Cg(Cm21yLC`Q+-u7uBcRf4?K7I(RFgAI!gQ9sb>-zlK_3POv-DO(6G?pG>kMMK)
zwebC*hx+s1FVEW9t-WZO;N{7$vW!%XA36TBUg|7atoN`<J4b-U7+iMGE8kG0=5_Dj
z&wK{1_iN3N7jHv)PU#uKq9vppBL4h)7rA}W;K$}x1V<G0kE19MzMg*%$V-=$uU}Vx
z1O(6i&Y!=D<9uNE5CAYg)_OAhdN4#%RIp}Yh5JqHl%>v#_%0=cTG^BeHLeJHz=Oug
z33p)X*P~i{X&a|oL$}s1>Ad>$GLFY@m}Jgi>T~;|BGZ%v$m-KTO4Hp7^_^0Di8q;~
zuqq9<w1O7*I#T3bP^9|1X4OL0sc11CLgk9GnF@V$H58;Q`4hR)xL>1N=K=qKr_9ds
z&`U2P$8^Y-@_RQPh{T0!KYvGmM$n)SPk3~3%%MD6f-oD3DP!NdX9#pu!iQkmxBz7C
z!}|Bv$xKu^ahzO<Vl(%D#hO=xy6xj9fZgks4*g$*H`w=*lw5xm#@~G#71hL&mRhs+
zUI@3}!%++?i_)pmCw(oWE@@Wom<<Qcp0%Z%G<BHq72P{HbZN%t-OLt;3>-v=k|)2G
zwrd#-VG4j)JO^qephspJ*PTqZgEq^r-LIR~?A$duvnFH3pGNgClBdFB&SZX3k7X*h
zd|-kyoK&esIbN$xX6Fd>@4L%fgleNE=}^wf6N@2l>zkv?GxDV3Ce)A@p;QqpX7ZK#
zk{E<lci3i%l^w!xRQY#lYwg|cvQjL>4asIpIn!KdvTnfz+0mkl)+tp3f|I!OSu{~f
zJyS<cB)@*LWzT+=isoQc%#)dx$U@r`Wf-!8d~8nUfXs!D16raCSwz_<hQc0VKfzcx
z|8Q5cKJ*owKBXkeTFidsNj=J9N#-|gE8-G~acIQgz3!~6-j_|t1&qhXi+?XPioPcx
z$XD->t$Egns3bQ|krSReG7=$+ziHvUvwtT}3qP7E<tymAYuT@3!x?YY#U*g=QmPWm
zumo>d`ZDRW9|UD1BO$UJ=B8%=AJ1^HV`6BBX@tO}8u-pa>-htice8Rs&h`74zVB5o
zN&HeMn9XH>(P`SH_TCpmr_brUztOoCc}+j@FIg35^chXfe>@Yce8XR0%Vv6A>E@=I
z6B`ZFDae0%eySb;`2d)dgSZTU!*|XDO0OQTm$bdQTJ-GogHA5w&Mf`;YDO9>FWq!N
z5U5eFiE@W6Wqbz7TZdj-*Y`3(rm0NC8A!b}oiFU8XDAL|h?QRr1K&qi=1GgFnU(_X
zs|X*evcIvf&;`XmKrTM6etqXPJ@26Y;lTqs=i#4zZlIoVu-(`@=vMU-y~XfvF!Imz
zs^1Y9$aR6r{?6`QkvnTZ)dkYFFQ`NmUfk%C?p)jwqYY;F^TrRQ`u9-tP)SO*xLSX5
z9=aw<R?@btdhb_B_J+DgU8&wpN^Yv|nKtTbA{zt}J9c;&KG`CJHfx9b12w4o+`+*^
z+~%$yQmMK|Oi~1!2|oa+%<+==@oRX6F$$)SyEs#gJ(&iJsqhtx-B9F2V?XoVu;#k@
zndlYr#3QuSb`<)pdq>;)>ehp+s=3n8r}Y_ms1Ju{^rl+sOp8lNc8izqUr9Otgy)0e
zDS}YvmnW@cHRKsxp?WJ`ohO`cNQ|HQfu2|0yS|^4Ardz6>`Q1jXa7WC-J0hL@lM4~
zM6at$7Q0)wI=-`sTuSsCDWhs8!(F>aSxqah(LIwuYi+te;7Dt)Rgkh+3w9-VupaqT
zxRs&lGIn+IenRvVI5H&{8%QZJ=Wju_7kIy#H?F8>rZX3<)e2xJaC<5DvWz#GW`?rg
zh<lWpMD{Bu?wQ=Sadde=!NntCPnRtm`*^gP$96a&9YWf7{72fU9tG0MsURVoY3qr6
zgz)C`TGF~EOB7i)8k6Kpj776Qw#MqQGXA*uYgr*~k3;y*y#R#3&0Q)#!%as4fq<#K
z?x+>w6IWcIkyFShWsb;ky2?JvCh5k6(;1A~r6fs)%CXGLj3v@W<O3Zilv?`_MH5I#
zqG&#{!TW{){syv2;xkZBs*L-~mJx~?&8svue!gUb>?_~1%M#>usUm%ZtvVOk(jsg=
z7Bd&4b7as`Kgb$VAAUlM0-1PUTc3=Lmh+QiC2KBs1>Ud@Nadb)Y3}syth4#1_9Zr}
zn*#qi+<R%M*-Py5u0G53A7fvP4V%jQ^~NJ5Dr1Q^TJR!vu*6ZT^<~%N_SPj4UV#Z7
zDr}t`IG^5sk6G;oggQrRyCWx2F!WO@;mN{1J(St@fWaB>W5|Xv%LZqRy3EFNos@=G
zSv@)!!+%^c7;)sR_Mxk?6PFN{b;4#VOenTVSsjlnAW^%Izcst(r3_B)?D(66M7DyG
z1#TqGd@XNuX%$fWH%d1;P>EuZtUlcsHT=Q%O<E4n;xgUO`(#)}c<gpxR6psb&cR{5
z@^x)(u~Ttq8_;{kL<g&y#Y-`rHTd@jEnnw44^E0}m4qGrJ&22o8~qzX*?B4+85W`a
zE$9F}do{8hQ&`ANvSd(1*v;6`16m5PQX)x8QOd|=t?{|CS9XiW*reOMZa6Mslt(pN
zvRe8=h|XOsGY-GFC!+Wo!~0rDw?uO=14U~AI;dsEEY#H1>t#%=$Vr~hyMAByd$t}5
z<W|(GZRSV3u&g7v@uW?laYMhu#j*dk2gSWbKPYlm>{P9Mi2l@dG8$t@tjJJUzt%MC
zg_BzdasJuSbE|J7FgPS6-q(Jnur#CA;XP_(bd-a4v!(MvuU@IfyJeMCE#zc01Ub1v
z9Q6I4r&?pnll`*Y*B~y;pL>t{;IPMBEUtL7qugCR{CYBW$xt&)>GJX=Mw0oKCrwQ@
z-R!=L9nS*__`ICYtA2&$?;kd#ok1(UN&^z4X_asW^&DPPqsQcojFM<O--Q}yW7sqt
z0t}1;HQHIiX)-&jemCFrfIVW&UrLW+gQVwF2<y}aE?i~yG#i19UzX-_QjB`H!G=pt
zhEY+&sXd%_!KZm}_>o-P`?HptI)N5>*{wS4aHwvbR?>Oay~7q<XH7R{(iq2=D2ZZu
zfjdhIwsijL()|9d_?kzObNzqG!>RYB_>wmz7`O0-KX-l{E)H@-LJMU@d*uo1<r~){
zBl+Duc<Hsx@)blBRJm*=$Avi%`*}dja_6F;q46GLCgS{7*dxq(Qlb|Aw!>x8RP~>X
zFX>i4UO#+}O4u2LTBy%?yml^Am_Z36oOmNH_Y6qnWzDv=ywqf>VsLeSgs!n`s&{5e
z&)i1;P=wy)7R42CP~pnI*z4+qi;6NNAgK&wbTUN;Qw^1%{Pk+yYCV{E;MC>8s&Ba=
zwaMWn&FTG-rWpW{$$D%s@`zV*k@Xse4Qj*Z<RmJxp{q=>-Dobjyv%r|XJ2IGas76O
z=Y=AdY;{eKrB<qD0&8mmDdQu_>F1lvFSBgzv$_R@L{^^7UqNb|aITor@Ob}v8Baw&
zGQ(fjN3`}rqAgudA4*2kR$r4wmg{mPqY8P+_h`Gc`R{)NcEJ%SuzU7K|1Ekn^J&Ek
zs1*Kx#*;&`3qaI)HT+5D2FPAHK5E7<asdm5rVt1N@=g-UM}0VG-_X<r>70t5`#pHT
zWEjCfm3aM@x+D`Xto;(3W1T(^Z#Bkj#Ay@kOye!bYZVUOYKMzQn~TA5b*;UB$F~3j
z5|^F*f|P<|)@6A`6$bh^RYX*de_qWSjkPfC3YDkJ8sHYz*8yC&zvlCD-}Qa@>~EMx
zYRo<JaT$r&q#b$C1B)GT@0?ddaxhM6JT`C>VUl=RmKBm`!ES2Ue#*+#&JdRUzrFlV
z;j43|TSPF<N%Cs?$VoTnyyc{551w02e09@XKC7YK_O4vfDIEizc`Z%y5_x=AAS!k=
zH92T0F$SJ(%-cEVR|nI-c60jgeXz++f-j&(EKf&$qSE@`tDJiO(QSc$XV6wSq)i=4
ziqg#O*g2anjg)D_G#~*}?KiB5L{0)YM`>p};6q_K)sS4u*I-3}F+PUa;*4%4-~TU!
z#Jbv_a@E0b-b-&XdIWe(){WU1GjYQMnzW#E3Ad%`sJe(JIEzcS2TqOMuk~F>^f6)M
z)#ZU`h;k>KPF~JIIjvoYMvaKtM8{7YC=Oc3If~w*htKSdJr>B-zvkZ^<nhXS^t27e
z&vya7#pG1|Oe~&teOT(|_~4Dy3C%cCbz-D(JTs9b$>cgON(b&5C3?yCyvHB^kcFCO
zJ*_n=Fzx8ZeX;I6S>==`q?Qd$RkwQ5=|@VjHmHQY=^EUy#+u5RG>K4pl7G%!M4P$J
zz?0~qFYM)oh_NKMgbyx+vK66XYPvrg^8A<?vD>?r#4kX*c&#_UdlY@kAW^z2u;a{H
zH_LpJroASPKa@ENh3h!gaXNw3qde^D7i;WS{i=*zg_w|S)7nsFfBvOaJx2!JmmDc!
zPsmiS!l<m~!n-{2wL1$My_XbzT=+a8^1??#?qJ?6iM!`30y4yEG)c;z4|mgiDZu8;
zFyS@LuATPj^>z68^<0D0q#Kx@*Z``IwfhYO#7TbL%)KtCXItA?gcKE9CO=lduAHUb
zXrN)KK%!Zs-Z8BH#D&I8U~R6O*k~wK75UG~%uXB!g&+FmQ+_>7V6jm612gU%v&6OA
zy45jXwEMi|!106e%>NL?1}h_hVbOF+)h21&Tzw>6MCV(FC1Onj;EsF9NrJtNAD(tq
z+%JKgGhfdwDTIyP_^}NhhP#^phuYf8&E*`BjQF>`Jb$RIA~_HYcR-m)3+k=;^>m4X
zt`q9@E<jcsfq635a`0oR|3T+fu{*%t;wY-`lvmz)(Cz;_^c?pf)_8lI)*AWqV&uPy
z-aq<DlMcrRy$L)21S5~$Mf_U=fAB;}-j!u(+@-m9@x<p<%`6nBu=qc%chDcdHv!eZ
zuwJ+|`1JZ02`@{%%uoI0i@);6RVag#u*}Z|TL(Rw7Z90+?|j87Oq+OBS)9^#{)#>?
zL?sow)5-%E286`LuM148bDYJh<=6RJx@VTV<LdeYbFc_L=~D_bqZ-MOd&z;6s+p<f
z<rf}=L0{rB(U0X(3;y>(_XW*RdTme|N`CgTz?be*H<uN;`kmW=y;7&dSE#`M;Lwf|
zB6dOVCbNr^;-4b;(6WZf4iKptQ4Gs8TrEo26I`sk_8WUUvD0osepE|(1t5UrQM2IG
z3Oo5wwf=X#GMK)-tyHEmn41gyYKDH4hjga5@Uw5c^1xD4*N0M=X(l9r#IjDj!z}xg
zX3Mh%a`31xvok1QHCEw5(meN(i{2Lg^loThF1$i02FS@R4dwg%XM$MV3te!=x7$}b
z=JCmrb;A2Gx#ino+P<q3;FhTuFw4!=uSRQ^=J*Z6ZZdtixT$`v>UovU>Hxb35^W0i
zQbIsVUjE4d@R*>8glL6Q1t!7Iplqe~=sN~LT+S#X8H<nT^Q=yle*CHP4@RM#^YYQ4
z9q#GiaxtEU?7a;K&@d4Nxc(i*6inhCRpb^0O(zx~ui=g&56OxRE7zl{30&!t;G7dG
zGt7;eVvT23GwvsZek3Sz$k^6iQIbDm+t7OP`hR4%d!c7U^i_$?nnFv-e;FoXK4mnc
zeU|e6kiN4&V!fpOwC%L21c)NnxMjEKt$x<;X3mug8oJ3+qth~K<~m`NL7~;-F##d3
zq8&WI?R?GBF}?^*VjN?wObR!%jnaot?B;tl2Wm^8F*QtGN8AyiZ4FO<A21REttDS*
z?3Vm`$p0h0QbTVvQ4(9=udN}T9!NaXFmn5x1tAr8*lqQ)ZMflyc*&5+vu1%A{n9@&
z*Sp@MTN6swP{9xL^G-?pKv#OtoOLiY4{uvN?w)?fY=?<_oaO6V-0>^0h>@8<D0;D%
z&R;&&<n`A2gbpyowVqQ1U_(YaK-)o7Gu)YX)>GY%wu<qiASb~!R&H}X1^8tw!Y=*%
z^9P)~h#``Yvd)jmL#u;%ipW(;^K0i5WcXz$U>4@VH(GKByj?-6$mv{nQT=;)<$PvI
zX)*aN?U;rc$3Nei_m2NX9ylC-ei!N6PCTX`zdQc78Pt1_ef-lwV@p1J2dHNr&<@hq
zK@{|+`h0oksb#GSUm?<@FQpsQ9|5G6l7kvi<eV8T%Mh7!>7;G$0eRg+-r=%uEw)?#
zM6o@msIx7&T<=GS+D8`_9_qzr@o+7A26Zr3q8_?tN3$Y;{HNi!iCLY#Z$drZ2v+=F
z$CU#DTccOFGJH8mrU)wyEPvIw)BG3<cQute{m8$56TPKX@8@yOmab7F+P)QvrWyq2
zm5*?CArii{Xh_8J@P(qJyr%yQgyi{;P-?^tO(#zzPjDh3kd6OEw2s0z)DgvuxPp<F
zkRnS;VkAD7?4IXZ@HsO<q3jhNoBST%SnfVo&Kwnz7x3m1i7=!!&iE5NCpKMZuaG~)
zc&4MJqksHZ<ARh54X3JZ@!5=*tz<Ct2~Dab1CMeBVghdDByiRk0l8afAxBG=-0D&y
zYjQwDI?Cl7XaGmPiA>!s@}qpEgKdu@%oOeudYy`Dc9Iq`uNoTtQlwLyy8fvG%t6U{
z#y-rT($i(Pp<sHzV>YnVO1wdbr1+0iO;tyL%Z*T}d=^Lh&opz+N-ay5m7w^}-w-z7
zRPf%ZQG}jRGUdTj=9k7PO9ofO-|_?RW8zWgu_7bn#EBC;Hdw;g@li!!Z)CQA55|PB
zLey6|bH9$CwSbh#5e4$2hvXk8*2<>=LbEip9M73nzsk#7TbuL!9t3(?=CKUp=z?t>
z&fMLO$jqP`-ni05v^7$meM#J%o^d$(`uun{qWAfICtDA)Ocy3n0P$+0oHjSqUz15y
zE|W-aD*hVgM)+3pGg#W^-nqgoG?d^}a0VhFfeY6&z|X{#tLOP%l;{`>(1UuT;%%9`
zGffU3F`xf;G4imQG_ekYVsLf(Oi@E<?{J?UU+pPA-ZP9s$`UnauArvl1H&0=Jnm3s
z{2uyiLX>J##2}EN+Qv;r^^whi!^v13adh_ZT?VUyzAu6Gwree!oVF_#>UK+^Y1z;Y
zt<oQcF>-w>GF=AbA{XKrxnT)P=9|+mKom2*fAq%HBJg~ar@tXBT-WOBNWOZf`P8CF
ztZf_cC1@6~`Vh+<^QMT<)sHQkz(v*G+q+m3SumvaF1WmZ+Zu=O?AbtL@pa#bk-<AH
ziYNL4+n_olaE`%@-hb<(k%tY()8MampZ4xg1N(gM!4ha{0DTG|UE>Eh=9}KVU|r?j
zS`2TquUEP$Ar!Y1pgE0hJvav`#Z*z3LP=zvPbzmR`j>*WtT&QKZ+`sxC0lc1VJSex
zt+U==@)e%e+C8`YMKFbsH{sS_ekq2uc60=Uay%=8PoEC<zrWhB)nxg5;j6Eup)<H<
zBt9zw){5`hsRuhmYipg3`pDq6>(Axg?jrc!PMTOwHbQiibPne<Hgek8A8!5fF@~`P
zEnMZ<rc!xcv3%{~2ODe}Zr)4H{>&(6CTe=u)}NIwly-NPkb}^X`62q^>_s}G)v4s?
z^jX>4;=}b87$F|OR7s41&GZ&i63k?o5EV<XFu(a~-V4M5XuaFf0&P-zKuKyR-#&#h
z(w~>$vk$PTE>L7q$&E1>ABrEFKeCUCbNobH34C`t!1a7|s{f5>qM+-7vjr~xl>;^6
z)m7m*)A=N!+!&*}xyV;YTpf&*z{;+<Q50n|J@*_$%pVk$>g!uxIWO;-oW@S#(?i5+
zwQC^J&W^*d7T*+uk3n{efzZ2t0;w(TNlHp&l(50>_WfGHETLygHz)CH?qYE@`w^QD
zY<U{Vz&Zir&sn>DZj6l~+2s0yrr=!5mb!d2`;e@`dUy;O?V`(~%ZfD11>^E>7$Xy|
zcBdyR52Zr2(W^}JWDFVBRwJKo?nsc;{85Y2=j9!SpFD$*s2IcON8zNUF0imLz(^Tr
zi|lvK+plI=IC$5qS8rm48u36id-AQix7g7@?|ydv{F*Yxe>gMq<3%i=wWC0j#ATnr
z7wObzm@FlpVQX=;I9hpweW%Vo9GYQ!DpX33U@|Mb40QGwRwu674n0h9lhynQM#aA5
zJ{zMIO;OklQkO~lVY=+@CsHZMmt-vRrNohAK4G41BFL4UGmJwF4a~83*U=q3#r1)1
zk}vp(2tMRZ`-JzG{|-p3oBWYUyVZ943Uyst7cs`~faLh9mf`+d_g|^zQQ;AfRp9uh
zHd)`TIY$CCHop;IWaatVc!vs=g^8Vd551dUbB?a_7Vet*eer*1u_hH9?<Z6qC4~gH
z9e=$ODH68R2?Xv3;yvPn8k~BpuOA+-&Nwtrjjk$V;(%|kb!))}2P{eDTggiG+nt>S
zxGu@&`OOOV(69)XzP+tQ?q!X^wqBaI+WeA_0m=Fa@K#+v5CkLe6@Et@n239hw-01~
zfL5S?-&`X<{`qbN6cO3r^A;x`hU|SG2dIulp~)m)MF_Pd$0&Knlfcus&A>}{AAzBj
zsX2K@Frv{lQIldeCu2$`S30Q>wBD8vGrK0g6zZt!aB5acB%QjEQn9Rol<?6mu)qV|
ztV)WLTkE3cR#z+jVZ-mBxPG(SY>=cT8j@S5`l>`J7_kWU*LP|Tajjpwwt*9I<Dex&
z)<E)=ugTb$?1=yQJ^;;VC@1>d_O(m`c~EBT3$6G#Rw^25X6QcGf<LQTbOIybzTuIh
zTT1T9@Q;VNhkvjLM|^l#!N$eAidus`u%p{e<kSybWxtmf+rLzX?^IKX^{b-_nc;kE
z3`8J)COazJ-{)HD{0n_7f1}Sna8#mNyLJbKw!*h+Rp&0k>7Dg~6d2arZ+k%!KvV=F
zta-waP`;Tk`G&e`U9*83QTmCj?4=F{=ve=%X7aNR&(aJApWa0hp1YCtGM|@es^Xxz
zVbjfKc)&AGPE+~Jn)1pQStY1n7l%{DV>eWiorVsIWL}rEb;QgE75LZtB~K<1$N;F8
zZlb2TTWZ?{{1s&4A=t!jOrKU=$V3%><}>@=K?&i8ENh)k=Sd4<?f52pA^PyBhvqpU
z<d_3y?l14&*T>u9S^e)tq@qD%JtM?;GOYWQZ_3z)Vzk@PFDh@I42iwc{Hb6S^B3*s
zO<9}?1_`jTFSV2YJPaX^7}$83as;+c#Ooy~DO)j_2l!g}CI-f~o9tjXt~^%k4d46z
z?&#Y);Nj_QqB1<@j>rKD(UJ7j>9>O@k9d+mr)UFff^MBnUvSM$9coPZ*Zqlr>hHEm
zpp0aXy+Mwji}&s|qtsNU8-Y&(&U}>X=;4pCy_q^g54%S7K11~@Z=jMI))O*2Pu!5;
z71-TyVqsEkV=o>Z&vAY3YQ{gR$9&J!@_lBZJ^Dz%(qrI+l~R0^Y{J-`;_21o$?DjX
z5FU*Cn=*JXvs>izC0iC*Oxj)R0g=(O81_xBn(6!dw;LKyQAG5bwBC0(%FNM9-XnZC
zB$hSyCn!oloRQJY-pBL_(1PMUe{^*U#l#!>Q~k4u749OjO)Klf|CYnuP`KDr!k^!p
zo%3G6C4n*q=7rH!ns@KP{ze1c{b=4hvUT1|Ol;xXUQlO&W?uqHjSe28i`u<fbiV)C
zj5T~+{nTgOo#yAq@83oIp41q0I2t_u7_iy9KXY8YbpXP$?)?M#@haobyC2>$0C<mg
z>cqh(y@$ed;)bG$Kha2D;^nL-ai>h6O0~?b+naU@;pdDc0^ZNeP~MjUPwY#tPjsr=
zmJGn36}e;35oqgIP!6Yi7PQkA*@m`*nH~XuZcCW}9iv>XeBy<Y$4CmPd~~%m$C##O
zM}xf4I0bigbZ<S}4)&o~X1cPqBG2TJ&)RylN49}hrM<yr04<~5=jHwL_;EV1G$To=
z6F9iO|Ek=kXwtiDI_G;f(YkY<)g>O)mqy_=v2xLk7eYopJoHWWHH0xn`2$2(HK377
z9obRw2Hmh;6+um23h@S$OuPm*JBp?+Eb_|XqTJ519*bgTAf+gOoGL`y>kdV^<>ifx
zncEjf2)xo^xzyM0RR1kJD#NhKuHik+#-UV+B`wJ&I*$;H+GZ^Yq^9QHXG}$Vd@DJP
zUKb75E?Cw&(Wio3*HMr_vL1AH0w}u$Z$!y7Vl^6;EI|S0j?!N4{!8BSqQmquY$ns6
zkxB1@G(jnW;IB=d1=5!pt#buNKmmN{g5`{#%J8c*Hjv<F(sQ$gPX7;Q=UOG5iAro)
z-ccbHyXR9iP)@<Ev|Q(Q#Wcx!>)gx+hQ9g@-ATq0MO=QmnIZg;_PqW0prKDh8X<72
z2=g+pFB{5a?g48zuO*!D$LFb8yC(5gmYLu9RUAk`Z^-x$R|)7BmQH;pTCLY-ynHLi
z;RYWrtEBeI=GN#q5sX($?$!~$vptb``QxyEK61zVXk6jw-#cQEgkO6hHh}`a=9?^S
zWbOx3Jzvtq!uo%X&OM%~|BvG%B=>u+u@xcYo?8+lVeXW<6mq|t`#s5>klY$d=DxX(
z+{<Nh&0QuWF<Oj9?)=X8_ov6hKifI`eBPJW>-oy%e!rh*n><@p&vIXWwq>z-OEJhw
zRLjz<M^9Zcj;DKwdO!Q7WA<B<{xLX1P8&8rpyk5WPc?#5@eBOVY_EVN$1aXJ>3eeT
zv2FuxP1^m2vCDJvQSL5;LFJzeiA@~WbTpqC8};)Mnl0($`co`t=fo-+K3Nx1Ym7d4
z5NAEsv&9P9ld|>r`d5+lkw!qMneS9Wg#W{weE<A)Z*7lOZd!L@60HWE#&VS2g~gdp
zQp{e5kU?v<+7({dvpK6|25M4!*DWl!)>+w|+3$;6`NV-&M*j)6lXkYZ&$qYJU6Tx^
zzu-+n+82CUR2>;uQ>VW9C<5fFSFeo3-T(mxER>BPY4E?uVD<m}#=-5j%Ln-SNb;yr
z^xv6B2R=*c_wK6SRqu=hj2|a}1g_JgpxFn%I=owZeoi8Ub}f)Hch=U7^^SJ>uP(Jf
zGV~}uvA}(*$w4iPfwuHEJbe|b*9(xZWy;*F1#JL#lnYkCf$I}JxH1YqDksP1ft;c@
zgz8UwE;I!PhifFtF!g5W&o+JRB2nBxeGfvLtoe;76Pms5MI6@6dn&&5k_0fbyAhp`
zOso`c@Z=qbevymSwiN@6U86&8Nf`zk2C_+KtOQvCTtsmjNRNdvZmnndW--)b;)&rK
zWW`TsB67&cf2@tF%%U!P4P8mor#vSw34s#}%C5Yhurc#+aRh8tdR9R#`ZBYBf~>jf
zaaB{W=KRF6P{8F(R0ZiOzd>fk<j;I=zQV`@DS=nb<_Ra%NbS8(+>oeV@R^c-gaOLR
z$eR+hAzeuQSx&c?0UE>vNE7d48~xE%Ho$1YpMV5Dk;UEWqNrKZ<j^8u!~~~|WN!oH
z{9%lW!=iZ<pxHR-YY9NhI8l~zKf4aw?AD`{UUkhJ){VDk*-yR{>{T8A<2VVVv3vj=
z8u>O8kLht7_CgJj-IZ9J65orZKrj^sSSY(-BsR2k!#3y%{dIz@_5Q-r)v?zvefph3
zr3)RSl;jP7(k{(BL+C;GqR(p;FV1e?jG>H~Qlf+)6lRr3z(MCx=GVb=Nsf8-aVs0o
z4fD&4`$B+f!ARj&EC^-L0X6&mm%r+Yp%V>7ps{0)@|a%J?2L)Q{(;9k+6H_%y>yD-
zHuJ>>;oik+ar+N{DXgU<`tn0%)#us*j&W9Xv(_<HGi*1i4Nu>=XlmTJ?@`DMyNVxE
z>aHt{shQ|ac{Krd&H)_LKA9}`XTKxq<a^Nkpe444IafT<PfNJ4A=N>N)dYr;3KHw>
z+vWijyxPCw|6Bp_dPsx|7EH_Is3(P8&xEb|Z^jiU*GZUkw<RQ{_Z-4U_FKIvlZ930
zB}%eT+?xk(_RLQCyXMRFc;zlblNNw5*lZ^lt*uR+>HY5r^bu}`iDhn6;182=)t1b?
zaIC*VW1##(Q&bt4i+sIz@#Dp#&F4WqGJu4`m0>@U5IH{6)4WwsHZg7ml|Ug4fqaGI
zu8W<cgtJBU^1ai)dxyI+@+*yB@3jM2fMlm40)_<4R{($PdNyw2;O(dy4Dc<rZ%LH{
zdG+1rn{$+Z526o>qYuv$cEd)WMBIl0_9@#ro)W>Xi*0{ppO&}H9;^Hd7{_(Lf)`KO
z@%PSG?|HWaK`I?;cfoZB*J2!zWp2)xrBKZp6dX7y=s%Zde_?s?NsVBtFUDY?n|RG~
z&v`56T}~BEEqxTEUwQPR1}wj%oJRDr2Fgl+GL=toO$AEgMWd*{>@QoT=28;BFW$AO
zy*kAk*lxT@Rg>1SE#1#CnyNefZ?Pee?{#9u-FKjD8`Ck}0QJIG2_NF|*+|D!fFbkg
z_-CsnkRY0um%GPhmYg1d`*s#JC-3DN8{9Q3%YbqF>;I3nVBbh@(^sj<_NoH?x3-}6
zd_sr-M&Hl9NA0Uvx&6mv-9f9j+i>}RS8#rliaB#f8h2wUG=ej|xe~(!a&C)CJf*xe
zz->f$4Y03nm83V77M1{5pdb&l0H!yPTh(1(5|o9|jDyo5I_++wvH_uxda=70C3goS
zSm|oV0a*TXxTEBw#y|OQ;H^1q781!Gzpcekq!XZ{;B)-XYXiU4b^O(V7SL9RFKR=@
zr0!cnXD=@|r^$F9S%Q;qkb3c75XkF1e{qK_$i^Q+xt6P1g$Fj(OUnoYvb>5%=YeuL
za#D%0wY1+VQ~OD(or|Nv5J1PKxUZj{*q0rq9XWpbh%{hdGwk=-p@w3G+iZwBvj!~v
zNg|gRimWMWlYv90V^q>O=8Ry~9}P>7g(aMJFC<%CV_Ov$+<)EIg>vP*LC;<HNcySs
zZTsJosQp0Kg*?->0)6ctL#aZ~Xm8DA>q+T5-(~}))LC@+`$4S6)b7$>VL|Vz%d5+e
zT|Ud;y0EevJ6-d=h^w<Y54PZuaF4_Kqh-^uh7qRITvok$sp!F=J2|l6v1f%|8VGL<
zDC2Lp(e|8v?Tie-Y*eVH99y5xAv#VFx}niElWp|)=bEAB)AP<NS4>h`zElOneY7b-
z_m8Mn+Zi7hho#oMr1O^e*(My+d)=><<73Z@+7FoXHm1jP3iRLA>((A}8JH*Th~${z
z8{JfV89{+w?SO@_oe|QnRI5=8?g8@Pfrt^2xb+2}=&O!+@BdsAsM+3?cRX||PU}cO
z;q^OHiZJ}@UR15_m#FRwAkVV6xCmft0B<ICdj&{4Bq48MJ@4>&q}4xzP3;54Y2*pu
zFDjOrd%p_?xXo#?e=kSxm~n(eMV%iWuD(sU1M+RPVH(h~7b(q4KWRT%Jl{4t-!lS2
zHdFrFMJ`{=bic(tXUV&N$1#9c8b>R{YvutMfSen_j_dy|F4WcUE_c1uf0u-n5xMIc
z5c<S}Mjwcf?Op{K0`S@{)ZB`A%-b#(d#BcRj3kz+_i%Uh1GdbP>&NQu5YTlAJ5qh-
zp$FkM@@`ab>U1H;P<=o;s02!aLUgu5`85lVpl0f(f2Thh|E_|&<_cIMXdVjH1^Pjm
zb)=ag4CJxZFqepf0|h&YL~6qn6OMdC>SyD}aY`QFU8m%e<$vSZA$4`@S*3t^^FPR>
z;#<30CC>B}dD{C=h3M@))BKi*irKTA82V-?)Yp3bvaYhNRL8t$8^*;MdsR<tu&oB}
z!D;_YgdpGALJId146JRQX8X|O^>o`m2oj+#7{5|Nu`?zARi8NSy6FxG4&fvL`r+d@
zvtXl~!ub-NeW475<y0DZvnGhsYaux?-Q$izD?(+<$}u&qd3I7BFeF`nW)1^Zi}**C
zv7g~Y8&VC$r)*QqONN&i57xukuk^m?o}V4$C4&4wXwksK59?5>j0o2n3+}HCx`VCL
z4z#SL+5D@2z}d))(K~-H4j=rin)nDxwDB;MS<6E-&&G?mKB#HO#boL-a-Z&olnS>7
zB#oN)yg~V+=@cG0pFW#!offAQU*@!mb8b^DHIWEgdXKwz2^8z2*}ZVXw&NBjNDpnA
z3~{e3jdvJq_#;Cvly&xN8ux}S7WMprW6dtGa>z*4k@^XdO0YJ`S~JdB03C?yJYgm~
z)rGu)OI%>9`qnH4g(AW;jOIUR8~QtT+8G7R=nK8b*!c0_w@vJ`)I`5AC8Mw)%zc63
zJ`_as9bGkov$qP*Qs`5P$y?GpTG4|^7VJ9-O|AN@9qsHpT1RPAw~DwxHiA?!xet~T
zzkBjv9ZMd42?(8hdGGOGH*d141p6M*;>q*{R3$hIdltlpM+>Q3Wm#!9Npz3IeGlCF
zFC*kXkd$Hh#b4&=djk(u!yx<v4;M%F+UY?fzLV1k%<R!3W`d!VW3{*U1@_<CT83Ge
zaz_)a{kx>{00$x!Rhm5{&)7N*tp5OInz4V9SeDG#e?fUnxl_vhi81BFt<!qWyBE|T
z$ni>oVASF6hwZ@l@wgwDxm!Y>!Pa+0OvNSGBE|<BvOQ}MW8R7j%4I7m8~MW{x}rV8
zk2uc(O0%qs{4Cxt4^R^<h?HyMgiO-wb7JGf%DK(ZfK)HYwpzWMtoCZkH$wUPWbx_G
zSF6AkJ*AY^WBRTv8+rrv3rWowL0!KD3~?{E!#xV!WeUn7L&)*8Wq!#rU&((u=s#c5
z7}BnNVc8y9ga&vv85tfkkqtwE3KA8+-B&t%@MT2#9ChYi3u@L?rL@&ABk{(}YZ!#>
z9Po+Px1(OszS>P{9U~(re{vO@TE4$f8^!)r``UK(OU*x@XXwBsAH}!G*P4l|O8!n1
z0NE0@+A+1m>LE!B9A!v^@jVV3R%sAPEo`4z6IKF-EL#77TNrUf=&DQJvXY?$&8gTV
z40U)8@v#@suqq%IRD%Iw?RGFk!h)O2)e!wKPZ>vBSJ!-OKLbD&LsSH?*X02^ndhp-
z)P{}mlhN#T4i~K)2IV;w7i@6LFLNeCUD_T_?_X_Y<L^xP<Z4*Ij61iJm&&v09|8V1
zF=bM7XCo#@ZLaD&Cn!4`0i^x#dY~2X^D%X^A#1Xa4;Da%<*#E-`>k~#F8q);rv5`J
zLKVfdLbIm+hD3L{jhmQ-D6uv8K8;O<zJSq;gSCmn;~cuYTn8OUiAMf=;Yy9iv=3WY
z<;|y*hlS3{F6IT#KYMqlkSnO3$M}wkYJU8qT4}>s7nSTdKL==Lu3>IuD3WHgWS}~M
zlpY7iNpUM?=w3AD`}8tpL&0D4C7g=(U~&<9Btv~`E7;ydC!=WNW7S<Q)}J?3O^MTq
zi4{zq!%x_<{8Zw2sxcfjI-MVx9_fDqb&Nfw*J<fWUN}S{czd!eg*ETU8vnAKn6aRu
zent`T2=6WV2vk7Zj$gRFrWc*8pg}$7AfjL0Km-3Bp3OPgare1@T%FBr_!k*R?;jc0
zp0%G9%B%x$U#U56LKg`TabnlQIyYQo(pP<=PZkCSZemi6^~IJ{Nw#Rc>DzpAanA}D
zJnx9#x|}_=3A_<2Aq?Pbn?wVrsL-h0rD~m^bibnVY&ODg<RS@hngE31d4iW~(UT(T
z$F_O{ZV^#Y*W&FLd{VJpZiB{yP6am|8F<1~u0@}{eezWX3u|8nP>YB$%ut~5?L}nc
z5bafpVDJcvsm3zI*`+2h@tMSmzv^=zUi(2|Bl0xX*0rF_OFlE_3rPI6Nk$#9&Bour
zgyJ#j%>6uv0FVLl{PEs*%BPObx1(AXfP}O&=6e9Fiu*&b;e;*(sv%oOR82F3iCr&X
zF<Ac3Moh^`Wh&LSG97eRZcfFr&pNoay%;tH`Z>J$_w-Z(0$T_XY-@JP{P=MI?jj)c
ztArriWNQt`%B4`*LxDvP?~b)M6Wa~<>2Y}&5B<u-m(#U<8Iz7a*OpQN%=x1k6?dYo
z`y{#{w5Z%KRcsx}>WdTsxh>=z8_xAOq{cl%UjdhLd?hrJR{&Xp*MeOFkO&(_kgqCx
zHmYSW={Y5j%GTOxptIh4rBJrj_9?Z%QjS4J2VP@m{QLpLf9wMV$rC-&GuGy&m(aTc
zFywIDnj0tQ09+Q8U0!}O?P*zm>T>Oa0wody&l)VP9?qaw*+a>dh`eSZZq8actE^ce
zT&6_GhL&DQV`Q=jfZ?o&$nJKzS-3{P((eR^v|D08(kmPx8CrznccJo~HIM;p`t<Uj
zY!F;utfB=#*=T<AR(R|5Jd1)?3Qnk|f&p;s-+!Gs2tX9`{R_(TTj7hv8XQ6j^Ap|?
zo!@9oo)cewoRis|iKm=+Yz<_zZtOp)cj$P?LzM$-gVV7_oy{1XZ@h}K^XJ8-I@kNG
zBd^>S?!mgdU*9k(J90>&GkH4Ga{I?OO;iRL@}fyLFn)I8;`MQ$84>N|VejJ|omQNU
z-H)4VIDnr~L0|n*2zpFkRikKj%-*m;$PIm|?W+Bq#c#sIJ>YrU)#`o8_woH!S5vVT
zu=HEY$$MMcgPL;U26PQ}Y5VWh#j09lX<9%|i(Y1NSDi+gdjlJ6{H)qc;`FC{+w~Z3
zonbljJ8Vt+jQYhT#Dz6qKGc4ipR{X9yAO`xnrT9)Q!O{a98-aSW3_9qK=0a8&(00g
zf{lRV-tgdntzi6s4}<lx_9#sJWdnWMvJkb~e|3(SVG1o(8X%P4gnW?kr1Jt0^dn=m
zrlIM({sJ0J3qMU|=fYaLf!U|l6qQC^nH%qbA3x~*?sJB<0_KEEtX_(I%kl+op(@1=
z$;WR3$2{Allt@}ar5qow4ZXh-_p9T&kh8CM>q3nJP6d>=E++hM%w&p2Fp(jKLkh*Q
z=ID+_SrYE@$PHLQJMO@yxE;MQ0Yj1=5!bExG^d&v>8r__gKGimz^;)S=N1(fb`A_-
z0V(t`?Beh+M6KxKEYb{}8nYC-T7ys|l!EO(MwH(8B)0BWFknB8UFN>dszM5IdJs2b
z^)I5!_4ess|3tqfPes^=V%`l_AaIF716Eh&$HoFvNk>dGifc>6{s-jqS?*Q<q-KbR
zSW7}g@+Bc9G!<@xO%Ajd`NRXz!qL_(FN1q_X;p><UjeqJJ*^g;IM+LRG+C`tT27UO
z$IySyh0lIqPI3h-lAGUy06T^PwB#!Oq|B`y({3Y$<tm9!Z2Be2rjHXX7lx4KpB%*|
z+Js8Ufs?=z!*sj|0URr|sdSWl141WR#R}vb0y%~Gd=sdYqggnkkOJ<P4OJgEvGXd0
zqrpP6P>tpT;lAN4Pch^a5}3{&{{5?F_*%iad)X&g)kl(t$hXsV3k3`}#wBReY3EcJ
z=RyH3yf8ujE6SN=!g##^AvTlnqi}Uddnk6C_XRB~8&GK|k7edYKnb0XoT)&0+KuxL
zz!V%JJ5p1H)Mk9fDAAWHiSm_vHqyZ2oOA2Ckoo?^V@3=Ijd31T=ChZ#t_*Td_k}x-
zK-8|XvSLh8yri}+pLRunj|a{)^*|fYV*h%?r*y<d5RXQJ@-KOFK6uRScEu}xe;W0O
z^;E@Da%y4}Fd)5Hxw)X%{iMY%+R}m%Qas`~Y}I{jzPo%p{EYmt(}q~OJ;TcDVY6x@
z{S3PyPFrzKJF7x`QJ=1#8K}_(D{N4|8M||Qcngt0f7VPZ<12gs%6WN9qo+UlciC{v
zn?6&x)4cE0->7PX;*myo$!N#%oP6Wj&*ID7oONwK6jBbUD00$dtunN1lKCXdoWOBG
z?F+$?Ps9OT3sG7g{O929S=1{4xFkcL>9juTOr8P4ACBj`yO6hMTiegN+zQIe30pzr
zgy=)^qW8hU#Ttp#ulv8HxFEm<yad<<=dkKj7THO<$Pac9^fjn_y5m}j@ABd{;7?0m
z<#&{Fx^@%t=fsd&rFYGNLBZ>!-KSe&wkr7<cRU9HAeVYO7V-`F<%zJGTn%y*r6P6C
zom87+Ml_zy<>12gah4b{0^CZ>@1<k-axNU9S55|KxMFMLA&XN6GOYK<P{c4!h%5rg
zctc@G7JOOyGmf|VL%M8cS`B;G(F(1<fVQx`cX;@$w^JsleIDpGUa&~KH{)%t4t5*U
zrif!1+6+;DVwlh?a}%rdGqmWWEBavl^J8GlFkoL{=7brT+FjeBgg{FnMjn!lfCoSQ
zg(2rj`&~-Aglds2ZbxS%u(OwDfYbpFs?jnc{%mn6{4GL^^PS#W0fNh(<-@teNX?nm
z8gB<P7&z1l0gVV+M3-Up-R+Q~n~tg5IN;>bqG3gn0p5;mVowtKK(U-^7!rpDe6nqh
z+_G(SAW(AUG+g0)wQNKLVg(J()1Tn$QwDxETsnpbz_aFB?*ocK5;#(L4cmSwz8wW{
zP1rk7iB{03-rmlxU_uSsy`^xEz#VF+ueg1n-CGyO8efx(^sH5-uf+SeY2TfIP}x~O
zZHCp1=7^y!K+?cM**=ZE9(i}C!{7!MpaLTk?2))iw*uI5ZP1zor3L7U*3JT;F%#+k
z(kdRhf(`^r<Y&a#E`OaW<u4Cxo~3(-6OPlxX6KdkG}bgfF1b3c>_|a%B?<YH=XjGH
z!rj=kf2Wlv27}_O``E`!Hz(1c_a#k<pdm!)DbeYqWFY>wm(2W10OVDp-l`Kw2dOqj
zM$sO;v=uS$mIUS988+IvHeHw<XQIa~w4JHmA&<x4xYtv-C?t%tVK@x3oYC-h<bF=7
z-s9bF>~Vx%_9m&;(5U(ssW`*7fk|hWITMT>1QToOaiK)(tGuhH4g+03LU4eD#Sty{
z>=xT=ippbuyM5u>yqiNmZLiez1h-se6n_~Ox#|9UU(B|gnt86m0R5QuV};10!s^-b
zM-%J*upJ9GafzDDyiY_dm_k&dmG-)%+^u2Fyvkp`=v);g0P(PJJpRs)FdWED!#cm$
ziNFw52&bD7D#eDgz&lfjwcA=`F@9vGkB(eU7zeb%EnZ~O#o5K*h2``0mGk?O4BOj+
z?qXmnLd}79<Gj4>!CA}lz3z@LY%Wq@Dv}zjrFS6~f4iKc+^uLmlpC-((1HL+SUpfv
z1LgqJR|&nlAvYS{^8`i?@eU~u01%USzmGNvraXaOUq3-Y5K;uBnfS}J>!cS7(9*JM
z)HLt|)!>f@geaf0Hm|~h*8K7kMdO?oDd3L<b8gZ4JIPK1R`lTaZg>%HN1}8ol7XkN
zI8lO9oG;vZR}>2*7gT)5f@@HeUrO>=22rlKte=xYN=(2;aMic7EIU&)<g&(lw>IF4
z;q%B_;P_B~Bb9DQ0cJdzuI)Ynu1(ZkbC*zn0rxaFrhK?EC#<z7N!Ru=;J#~b@u@-J
zsjC)hqH@LOknEr=lkA$B(kk!`$(G3Oi5zMWklu_$^eVBq^<CLKckA9n*T0#;%>jFS
zvW<Iwc_ml%&v$tauL08a_BN@Ya^mPO@DPk#pf@xHV;jBO7n)%oo|v%a6Rco&SbVQo
zQttuEbj%#lSKuiH++)qXhVgaq*vmREYa&t$GJ1L94ys>EqvPPcTg*#TRMvz9b~^j@
z_R-6_M0^};QVlA!YBi+O24f>9M5m8_+4_xoy|nC#slFa!Y%^@kG-bL+;mM*~(duRh
zfR%@tp(Q%t+mtcWiXY3aU}piGj#bQf-j7Ql_vG8vR&iutd`xt=cS!+HHG7R7t%XR>
z*4&j5XMKux;$*^HpwnW<3UD>o=pi`<nKp=Zo4pFK7HVTID9wTc#KI_MZ=uZ+3!)ou
zU@`>aOaVFjp8%FQ!){IGj*iov-UE{MZ(hyT$Md!7J>p#2k5Rnwpm!ZA?3kLbVKsEp
z8(-W~%^np!TEX26Q<h}^$ZK@^`rdHjE^o(6;-8N~i5JlCyUDLRO`Sc?!a5CH(b!q~
zt2*zf!5$GCH9lWd2rvACBDv!NX%SM~1hLT(C#P4_92YeD80c(?+Y=URD~=sdL&dkv
zKpZ?Ob?&{vEN`u_`&oih1sokkF&pV(#oh<yc-?Wf?f;x(!@QP4LsUY0@x*c_MV-Dz
zQ0h}G=sJ(1EiG&Z);Mpu0-(l_a<rZ;?OWdTkmbk&FTKH&aUVrq_~Gek&q&+Oov$+e
z<sxwaJ-2!*2)K@-4^ICg5l4r+yK5mDZ|<<YF+J|uy7Sg_m^>g@?)ODTT*?J`DHGX1
zZYEh%A#uCXX$xD;ZoD^YdZy$l{t})<oUbpwRT>yx^H&j|pZlsF;fvt`0vj<XOAjxe
z^-!rd!1+o)XKHGQdwi2CkjVHhTNd`ZzS*mwi&Q$D#}4XuYqJ|L^`C0$0i^ZG*G-cF
z@)FS^|85$4%m!=6I~iq!6R2*?yE$_C04ks6DEX4wU!2UpA8!t8H0>7~>osOUdBL@E
z@_7OxWI!4E^|Ra_r5vaMdlO$7{X$0`%pCkEr(-+_m}COx{n5XH1R4r0THEFQ$d-A}
zn)=%bxy1b;EM`J-oGQ9y8R%+(@QCt<t8c2NM_MhH_THX(U)6kVR%N^%I@zNGjZfu0
zjl)?x=7lzHabAIP+|U3KQUhr~>!A|Zcp%s0(Xuw7<G`+P5yZ6(>y!@QZ1Zom`8sSV
zseyjPAsv4lrKOuobmA?CSCI|F-?3;K0%zciX_F9#Hg%!16VM83O@TC!gz-3$!)yn=
zI9(tQVDgvT-OKG=TAzTnvCNdtDg`>G%C2eB334!WOK;`!TVg#T?(>?_#Nk?5cnLKM
zV=9eh1EYl1&7(~}-Up0l1vr_}ObIMZ3xq3R;eSNnpD`V9n+_Zy5FWS`=6r45O>1eL
z_T#4RvvMhP%*lQYx^7<ybe(xA<j>sJ2wwkR;?H2fM28|tPg0kKx7Z(p9LygryXtJO
zh+EGZ@N+G2ZeOI;tVCt|4?fg-LYtZDY|A67^0!_^hKJGXs-f7Gk{yQu)Jkzl&S^}d
z>)@Ayu?cxb+ES$12sZgA1_3x7@v1^44HM<Po22u9q%tvpV6Hh;aXD_AOV0JuXIw^O
zRR4&Eb=o7&pJ-8Z!6gkLS;aV9<=qikkxw_j<yO&q69<S|ktN0*pI@}_(tuL%M&1n5
z<x_d5MZNKqIdybu=8hXL=+ksLn}II?-!IxneSbC5v8Mfgq<gY+P2-p~p%MF7?#u7?
z`lsczDIxp&U-7ZuYsk5#?@8%VwQU%)mEU%USCXh(R4%oQ@Uof;ido&CvtGf?tpuLk
zYPE$pi%GQluW;U@`e+NaDQvTRepy*CVk*8&ZgzRE7F0neWvl%6zx*U%Z2#HnU~g~l
z@K96;#Q~CNl_t0q5R{%s0k0rl9=4fM^d9EgR%Y=z3p=Zt%j(Pmszgm^6_SOoa?S5W
zO&tKs;?kS|6#@St(o*Nb)<0aReOLR!#VO%laoOg*(}atiV8B=2E4_C%d$JxTkUn6h
z*XqX0Dw-e2DU#a&r3B^5h67`@=A{z_3*|!=&7Gbs{YuntSz!S6l&3sapG_%B?@yVB
zUI$MEwO9n<FZ(d|=cQr8t_LpxYIf*|vJpVtv9So1`I?!JRsg~gNG5!tm5JY)(+;}4
z;DQl$k_-FSyV9?Z%1PSB>@NXKX1@^`N|WUSU7a}W`-rVX){p*T!fQ}n*v>)l2>>S^
z5njvAs}3}u7os5mifBNlxiAVuI1z00`ju<F17Wmd^T77nO<dU8r7##awb3G)TO-o}
zf{ZD<EHrEPD2-C~V8x{|j~tOAFT>13nm?*S8s3fB`=t|xl?e_E&tqh$K0T{(Z<pr)
z7Wg5`)24&GivUDoy{mJMDZ903?!Ntudt(EqL`@b%)4`r_SI(ntph2+>lUDtxJj^A8
z<TtY@b2X;SZQ+N2(Q80ACQAu<tR*TvKnu!Ag}d@PA_H@1H868in<2?$HKs>i9>(JL
z)5ykpn-MOORvtgq8td2++zJBkfy>SHa{v{U-GqsJ+_O|Y?#I)5?LdgEXQ3kohzR+7
zTs8~Or~a(8uphb5G!|}ix=qF6pI+|QgoA+C6ucg|nhqXU_Ge?}hJkb-HTt@1uW!Mc
ze3@fFC_36*+4q0jN@^NEBGOYO#mEho7Wc>DlajPhS(-QUP84sn<Q@$TNoO%3G&Tw^
z#jLJ~YUB=MN#V;>jtp$P9@+J>x>#9_Aly)!emZUOmyIXGgDTrXa6-hC5{cCf<ro4+
znAg?w_Y~s?YiEc!?#~zafaa<lu7|z1w5U9kzLmfG{x@S-dJRy~fLQH`HVgl3yZzp4
zJ5qc}&^d`+$WMpjl!zVmOb;(do9?bK5;7MMLmsvn3PJjuH;xhA7tTHoshkqjHkHFp
z6`D3)5fmE2Z}cD9l)m(MX)S)o{h{?Y^~gV{@dge1hi`6iv&bA*O6ork2nRtRva_M2
zw=8fzN|0Uc3-5Y!XCebjlh?<D_U5hQu2<w!*$y>z^{P%vHmsLiI=coiPPsM!WQJii
zecp=TILyqGBthntQ)*W<=9&RmsPG(y<Ujf2VDjS*uq+6Sv~IxTHN}Kcl+{C6w*}1d
z7~&}5;%{~DiZtx?!Im0o6%ccz<M3PF3voK3o7QuU##3Vy8gqd#S1TpLvzo>D7+tZl
zf)qu@S>@;E*eCX^F)Xo{ji5`a5XZu`o)LvO;LlIZ9{4f}jnUDz=YPadeg#qlgkEL#
z@6KgU#jpqak;{R$$cz8t1H*63XDZSbDD^CAfpkrhpWV5qa8*_H;=*TS`;Qtq%{{2`
zVa^vQCF{y>(Z){c15i~k8JyBT5|^maSMOMB<FmR^cyhS=o~W|GZLW}uH1nQJOv)!L
z7{n3CFNBpTHrhm9I|^yeF~`ramiYOgvG8OkPQBOg-%Dv2HyCyXKz%lC6T5<JgH;MO
zeam8#>Z`Hq*%7p~U;0L}P3@-K<wW9PyxumMgY52-|NPtLqUGIkj1p<Ze|Pwx%L0Yh
zXA53x3ou<4X~>mi4JWYr@jvh&H0Tvr3KRH|!AlBa;u?t~r(vz%ihl6}qMak|5z>U7
z0{<Xe=^txKkVww#A07<4EPLd^b6pu7h*Wq{PW=ls*2&6%>34^RX<_il*ubZB4vPHg
zQy4z6kn)!e%ebl_<z-U7v{Lz`_taKUXA}@Q$#97*|0bKo?+x4|sa};|fU+vq$;>KE
zLu#gh&vXbo6U9AZ;zD>;(Io!X^Re(7_sV%^-|aT;c5iOOu>=to2XktN-)-sNO5fC7
z3g1B95z(fKZAsPpwe?^SJ?qDhrlCMTn{Xb&+;DJkxFHC=abK2y<`a0nv)TKWyQcF7
zb77TyC74EY1j`L6Kv)QAc_is_*v_BMjV!(NdKylm*f=7RmC_>bF(iPgmKi%=O{g+~
z-)T{yM<IR_vaZiRbuwPnre3s=PK+B-$ANL_Eey+Bf^7;+%faaviGtf~4ES1g(z$0Y
z@6v;*sG?;6YnS15@HOcl6c|HCf!u<$)7}#kwC?fS((2H(vLy2S<9M}IqB}CyNZ-el
zIA2LuAYd58nZ5l|JW-d*nG@e9-e!{Z@j=0MqyF;VhlO9^EhyFrCYplr$uH8e?eC}+
zJ^fsmTgajMjEZm1BKsUnzQ77%e|QgtH#YSyrh4-g9e{&6UjbN<lbsOt$ip3Ty|W`i
zu|>g62U5Ef+oTF?xhpa}DoQbh?Rg&CXBmo5n*m|RJ2xO?HNHGD(WkpiL78OZyL9Aa
z#UdU51Bv*<l+dd-`q}{1#PX9sQoq=n0)iw~hUb>0YOj(OHv@{wUuM)yF16>G9uM4$
zYF4$HV}#)0VqA7NO+E7ASWTDkFaNSbND1#>$-r@wYOjXdt}zdM3na_ex-qKvvHah&
zLf#7ULMsy-Y0IYo!PIvR|0x7v*8MeC<la&XzN<4LMJ7zJK%R7o9RQE!=H@!tS~Ov4
zXn_fdQS|Zas*Wx@egy3`7QJfd;o)A4#&S`y4`7!=ITN+j_RD*pn$mA3d=9UDM>`!@
zD8P<-{Hn1+TBr`e;H+X^l9rB_`y*Cv<8C|U?%_P9ZQe;g*6L(cR(ZC|9Q`fsWh=;4
zY#yTn)C@v64ACKRO!f7>8e$e=^Q4Q|=46F*97s<@lC2uVA0II_>Wfjk;kdmpj&3C?
zMsg6Qvrn0bAy_HtWCiF`J+J`?$X*t?z*33<1k0Yo5>*BmXwizQ{EmqRsdOn)V8BJ2
zd;1CKX=8L<wWa^4XX7XvB(HTEAG~<P;a&|6fv8=(%ubm-8E@~q#QGj_CJ^WYgPV~w
zy}Sy_;MDiuRvb}Mydyd66tQ>=F549qcdNfz9o5}M`ilV)%)F;&MqswEAg&Q31K?n>
zF*B@~4OlzrYWexb$!gXKOW_<B?-x#1Ynjc+XfC_aH`?X@uinF25X~klI-RyGtY5Nq
za=&19R$|V?+ycInnE?lGypdJWNzd<m`j_cM?9EKpwz7lLDN2&8{Kp`s`t?i(&?ekn
zf8Q{EO(AMGQ|cdon6QIe%@Ss8p97_YT^vlQgO;AEI}-{KZy$ZOM)kIp!tn*O+^4N~
zgG$D!@#%`j)X#^WQ8UB^EPvXTezSPtC86qbs@VB@`19DYXZ7H<%5xgpuIpLbInO$c
zDj7@7Bbb@+f<K~Km)7{}STz@J-hgTYgpEI_D-93&0BZ7`xNr3c+%-I=jrjq~B9j=Q
zgZv!n@=t?yCcaMx`g~$Lk%WR&`N(<7IWCr8&HHA$FID;7c_4D{=lSR@wvvsLhm4@`
z;AA(>gHP!vjZIir?;oX<`0Y1uE${!hgY&qcSERG(+cW3a`g5!v=NeS8Iv!kqedse{
z%*~4Mp452<EL-&TCKOpV*u&h8Ir(>L;}IDW5f%}#(FP170oGqxc30F{SZJrvO7GE1
zfcl3)Q*KG`57WtsfATwt9LK%B-0^<=od-+Q7J=%g<l^pE_11*7<%=^PVnN5jl>8#*
zj-nCWN?Xlvo57U1;tEKvTl?_sYrH#n%bhdUC5YUxSP$*fFyzEyzAFI9vh&IirlkSQ
z6ihb_gon$CJ=bR2i!{=$EQ7<vK1kX%8{5^kgxaO{2aW~XHc9k#5rLK0?#g7r!I(<L
z--5Yp#r76pM=kJb;6F?uEhWZ{8l>e>xS$#4$ud2=t<L#cQ;Ae#M_gF}|5?aFco&ps
z%gKe3P)VF^)o(Cw{^U^%QqejKfgQAH^8iP`qu<CzE2cF&R<~-FI~NoqN=Zt-z1~H1
z7mHttaTjyi04SfcP6|gftVzjEH1LvlvFi+DM*x^~GYgUQ85k)yVRZ03hTRgwK)P}V
zt8Q!RDv&I`BK`O{{y?43Au-$_B_{0EGlsPkP!6!Z8W(9oB~)Us6vyo@_O4C>!R`m*
zY9`upAAb(l@fSm7A*wBqjTn5Ez99|0s>9wdmU?2FR6PE7Nc;Ke@NII42QVZt=FCeH
zqWbEZ%V7LwZf*`<J;Nn8wT4ag&K4oImF%x8xzt*Ox8HFSY0<jD8g3yf+Q)r|8-`Lq
zXDv2dGkj$`r!bM#7{exWok){_pBe|WWIDA^A3dLS-}!vGc<OjRl;5W%OW(_DnS9U{
zo)Q?UU#qPF(Gl^U^dBpU$@%d9ZSH{bO=IuXTcOVqVkV)VV&A<`Ju#!b`~FF8#~4{D
zUBSgzQCN$%=q|e)=s7i2N-`ZSx3wq7MH#)!Q+5`P(kn4+84G{fo3>0B+?c9j{@A2;
z9{8yWk~qOZ8pMUL#=>qN7c!Uiq2OHYnI{fS>%5@5bpGYvhAQq~(5%n6fK2`JU*fmR
zB22b>U=^49z3+C`XWDC}Xf;OuTxyN^W1Bc$QAx@9sbh5egnJ2>oBYMr{0d*j&3D%7
zBwLw_^B1Hs%AS^2@=tjW0F&ya7ll6?tPFiG%%I^wh&Z(|XR^B#P{fh{v3_1x`*ikV
z`x*F%#o_7;l0)K>^^9?qLM>6HHpS7!X;g;F>>Xn-7$-$lY^qXd$V_NVWEkjO32^?I
zn|rmryt|_#BJzA_<lgz=>SjO{TF*m0#LZe?MJ^?oc^5S8VCG80wr*O~9eMJ1|45nv
zKy{uS?bc@}H(jC7+&LQ;jDNwr9oAaV)1w@$Qp_Ul!<QtSdFip=^!8UD?3&Fn;4QaC
zdn1e$L9Xce@8ZgLGJB;H6P1%epO#v4au(o>VD2^ta20T62B?e<H%V1jVcfKD+|uf~
zOLGvS#<J2-lWxlZCUjti3)m`R6a4sVbFC%Uc6V7dJUnbu%csN$fQzTC46e5{j;7!M
z$Zcw0K|q~~ce`(($=0s4_kx_BXVLfXDa2;I_17PyssMB3@+}SY?Drs;ho!Ngs$;7E
zAPtwa=El!5DaehIk~geyk0~XdSiSJ}<hwV)I`iN1^nkGjeGMC@>p0ZYet+rGJKakF
zHhjRZNjHRa*UYJGbypNZmd(vd3fVaMan@GTszB<r70a{IF5J8dZ(l0Qo?7j_q<Yoy
z!H5{@cK3Pn2*5FY5d7?CwpmdMOlkQ2Q%K6u<8@_JYTO@Wa@{JorbpHkJl19VeyYBf
zhP;W58}$`O!1oQiTq|z7tomSe>-g{Azgr7r6{C1l*^&yOSQkZ=UgcbUyBySdN7;=A
zmiGsrv+hHM(N{RX)gL^+GGFiZfb{TA&bMdryUfMc@<MGTZ2Q9ntA00>5x02Tz8ze0
zU|~P0yIx|+mmyU0W$bz#J-Es8K5VUYxD7L0BQC-_DWVa__lp)*pv)?nAkduB+2r-v
zOQfVZi$YV2ca-u`Qqy|@S5Z?B8e^S6B(R_GjZzEJszDpB(#C<~zwPU0c+jwl=x7om
z(KE@5qjr>UO(Rl*>$2^QT`kP1{z)%09>>81Y2#<}C)hH#KX_;{GJ18$DIJX5{4hfz
zkeg$?F0g|d2h^Sy*YPcv7p*?Fb6+g(^+v{nRb9kYm)=Zu<o?<=y&oev#Wf1Ae|Yi*
zm~td)^Jes8l)k;W9@0`fxIjY_>^A%l$}1vH68UQVip-brPvpb5qD8L{J;AC|UB%wr
zE$puUTom61Tp;OA_T2GQd)L0T*Lsp>mq*!Fkjy}J_L1eA3^=ADxXX95ZTr8Z=@l8=
zmtP>0(uZ0f?DfRHKIw+rbTYr64dS^?R-2jClS<@vI6YiU=pvcv5gSUgO<|J-6qM?5
z3tK8M1X}WNoZ4sj^H$gykmn~4qupugzBnV20U9&i&Q8d-!ySI6b(&GJ=)osNMc1N_
zHvtd(A)w%jHW0h`O9YaQTB6_PVp9a<ZV#(`MR)`vad5;k^MVaUQ{Yq^Sc3RGNgv41
zG7`MUG_cwx0{QLGal02X88~tBVxMjn&+>UmGsbnP!IIDO1~;7UP?|p-@ZCs<kT^7;
zIe}uRW;_&+J2se7`#Pqq8Sq&U`JgX*s;!0#{IT<xRV&fW)+;8Iln1OCM07-z7v!`I
zCj0o6o7YOQ-!YG<LzJ$i>0Mc>_2{x2Xj+Hg%4?YpIcRpvG}D9fYsoA3su`gb*<UVv
z2oF(WXQYuX;0s=+ndVjkOzx%Gnu!TO@it2Ckn*TTlK>%%UQ!<1dtOh@!ZEeR)A&u&
zwgUrL$N5_PeT&hX524B~8Ug9;U6*4Pydt_HZFBpD%%EW*A)^0%N42X;`?fxTGszjI
zlo~BA*%j_Xn?dcN1paVNv~^mo#Bc<V?{qu&m7sdS;=6aj8Vvu4N^-$k33TE;=|gSj
z|Ng3_t$KAd+c4PqpksgAEA43&BMimZ>*>3uA&@^d2^GnxZ1`||U2+qiUv6tl@||@z
z+4eFGUxsKR-+&2<HM7+!V^L{TSvci68if%HC21K$RrqOU(AOr5sGy<>mtvSP>wk$p
zUvuZ?vbjJS_;+(#lwRkOnF8NH&ZZRP-o+;&|8l@W_@O?&WOH=tJ4Rl4_B|`#%R*JI
zQkzF>z)NGxAF<a(?=`w1iu+N*_cX6ed96cB?;BjQ^@bsM;PHpJr_Wbqp!nz+cZG&g
zugjgh9@HXpmZmpk%|M{XE{C={A5*r1I4L)xrkCn#1?W169H0%%Xh<(x%RiUztRLGI
z9p<JwBTY{<n(heGLR35$oE@Ca0Om#Pq~S->YRD?_%kd?LSK{#Zo@k1WyEj4?J~bzY
z-b!1nl=_yb)4sT2nbeuazsO=j-9En6Im>tC)XV92=bBFt<#(QjV8z`|noz+G2Ygx;
z>LCU{4O#a(!$Y%tJpQEb7D$T3ZH(S{FrF+bDi?illrS*xz}X#UE)KHs0ZNJCwvmM1
zoxMYOLo>Zwas*^<Hh?H8zBnrGKG{BrJ{s;W%}yV<4v9SZ=LM^tw$#th<uSijRHUwU
zSM1Z=ZMTjzZ~#k89+MV<HrFj^?5-M8K-I!_E-<(4X(d1$di>H9{s?-ycSMdpCf^fe
zN+1yCOoumpfm&2EBu!?FTwtadBNYVk%6Fy=iwJpITE>QS>~n$*tCeRn_4+DZORx`7
zDi*vZXh5g-bqak$wE<Px#MLeplA9u+z(b2_?jkasvHY~67!3{Tv>}8l3i8DiNZrbX
zK!zKBA(O`2(D|WOx!IOlKfGq->H{kk;zQJ~nGIo^@yS)x75d8ubqiD!O*sySsbnS=
z2^)$lg~8Zq;u>7f8~ZPk;WOWc5S>_gD6g3wS|P|YD?i(GX!F`H_a{A*k?z&B8Tyu>
zAnzgqv47}BA?0J=k{KJBM&>Ss`xTI$KHFu$m{irX6=pHc?U?vHF|RPuK)OnteJjJY
zk-~W<U<s0Danki5F4PXTkdz3crhwIfXwlf*+($$rz5|9{%Pw=Hw6HGA{9?Yiu%`+A
z#1hNHa{qoQYi8%bgsFjbyRFYyXb{6Be9#@XT_+hUqvN1i63QEWb_6_B7ZG>il%|p`
z@BaHellX=*)8to6^^Fmg+>xn%R?e9Prr7tiuB>#i70^#3db$QeqC%53{TW+#zn3oJ
zSXuP}(v2H;Isj`^97J{5NnM<iF7O90obp&>l^pzS=aO{cb7civTxNK$jv(M!`xTTC
zYM+?F-+nodHNmgFB&%JDxe}6OY!IX4T3701Qsp|-n(bik4=I>Up-(luTaT!*c&-Q@
ziBhkGz_V;5M-zON1oS&aHG=;XtE+(lnB0HoctHcN!Vhi`3viOoQwz3nO9sU$zGSK+
z$7i&jE%`ku_FVdA<C}8|h8&b>Yg!ZjeMu5%$3qu`6$YZJy0ne{KsvX4mbDXt2NS48
z7AL-VDTEpEzc+D81dv!vw;QPFzuyQibNKdBM3KsqT07`h$18#Ny787ojOwysB({lN
zzXop;(0y`ydo_5<uLF!Zxy~Wu8ptk)pEeyIec6IK<2}@isiN%r$oO5AHxc(rELN2E
zDeFU+>6<pUc-kwBtxJ!rzqP=%AwhsrvNL$85urO>I%}DWs{7C@X6aV>3rSX!Tn!12
z>;!&>n3QOhqMy<M2>>5>xWix=*e1;??Dy{M9PaG)K46D*+1WjL;tzlAcrD&rX%B08
zwSyv92KTl2o~oeU82FHUBmtRQu;4t7;vR6bW0RAuuc&bw6{{UihL%rN5|04`=bQ`i
zSw7)tb-<|jUbIdhoT!v8wr0<LO|V<FFdg>msO9!>TQgkpk$LGFGB<E{j7LO{0obM^
z%QyG^_di>Q4lGH1i)99I)42<k?u{TPh}Ksdy(2|KVp{>3mo;*4<WC`cW$v(pfFqC?
z6cnVZ*x4)l6+8W&S!-F)=z;#}B`S|6m-^k{vw@OWmR&}0`J4QE(dVA6eCUU0PV?t=
zGm2?bK*!I^p1|#nR$jwbU2mF3P3`JJTm0G<y=!wlswLxoXbOQ|TTYvDW`A3`HtR4a
zWV9WkQmHpIAJnkib;_l6db&vHwP!K0AY0ubb?WL9-MPh7o6GJKzWZ`JT$S{_g`#p+
zH5M-UroC{HI5?TdB8|GyhL@=K>D%dbvVxX2xu3ZJ#zU>;gUD#U2d@X+*i&+ZoaZMi
zG86x-CxhH<+qoT6)el)-B<9T@$*ASPwYA^wMY_=Of^<Xg_B%2Z{z-4jqGw}s^qQfl
z9sax4^5k~zRqMMf(_t@%xJ1|}>VJJNZEz&K=;sNxHG;~-A@4X<-mlDb5bN(zPj!R1
zJmT;%N<rKFQ*Wrd$7;e#iku6k^$i7H5GjS+K-<X%T-U|n=IY8gK=iC`cE2Q%>)&o$
zm58`z?mjrzG(wy#RT(ICwNe*{Er<FrjLTY7>6+D(-$+y=UKdk<N)pAJRloK1(Hi{d
zj|-1gEq1sm%?=jjavT4>xw5ync^&eihq%@ce|DnPUFI2sT08<ZO|v<0Grd57<2MvR
z#RdzU9l_~~XYB)ww=ZGwGR_d$sVlNV^{9N-_l}oG7OIW5-_ENyM_t@LxecV`g$G^U
zmJR>joM?;ly*IM#yyVS7!@kEi98?RA>J|6vz5b7as_n8hI1>SmGt@|?o9Or)&Lv*~
zaeioGqV3p0(Ja9b5}qA?eXf1wx$5)Npo^)y$>*@b+V0hle_Gtw!@s$J=f#o1KN+C)
z6**O?`*QUA-d@YKwC%^e?o=!k8Tpa4-V2?y6z*QDO14tD+`U_hGjRP%H(SKRmW4#<
zK(DV_p}yGd+qES?1K=OpDM186iatdH@m_Yf=bUVrfYhRfnv~E=GkD@L;AUGHD2|d)
zxL6gux9shBkAc?ptnAe*z#SKTvi68-FCBqF>v{FJ>Kat!%_0$Z`O}NlBgd4D03m%@
zmZ&oL=QuN2ZO#<9G+9~M1OSxQ3b#YTJ0n6uarC(|!RBq>2CMX14V|ERlqc4z`Hy^h
zJ&_4$D?^(4r%&o6$ZEHXIuD|d-q5{O=6iioQD=YG))uM{KJQUab^OZmwDKE^Jl@Nk
z2L_f-wxiaJF(~slD6Dhv?dtI6tGEVCjPr#-i7bOw^?q3ElK1LXkX*IDRKJ`uLGYU4
zm@+58C7G5y)~(Aibsuci_29TO|2{b{w`^wipp$$K@bU@Kr>9$MsM(+2<dajOBIOaC
z9grW=0b@tAztz6d(1nuPU9Hl2CiL<bI!2a@y7O!4Yg8Q{e4A}5%kB&o6<Js)VI3V1
zpEO|g{)&)PAoPTL+jBPyRXEwE8WJ6vI8s?CS<9Qoh9Tl;1rc{F7$UTev_^lv4DtmY
z5*Y0vDNN`<ol<}_(UJsNm|O*Kb7fU?GI&MaFlSbQpQ#UNHiRl&;Jp8paQQVRFME08
zA;HF^u-vTxXKK1<hkZ^52G9~(@)Sx!An=`XkXX5st$<gO^psNiD&y1F9Bi3uU5_5Q
zuB!73J-$acT2WiV*-n<w8SB?83F(X|U;HKCb4L$%tixGkf@}dzrgi18tZEQ?HZVUo
z)Cy=uRAARx?y9NFvd6-EWLa1m%%x3uJl)jC@j-8EQHHWW+;274rlYAqFCRtoM}5#!
zXJ@F0uq*bxmiN22P~9YcN|5}#vc!ie_gnh7A|8!dCm0a|1E`Dj(Z@Rh?Mv?1c`+~z
z>2U!+6VUvi6oURBTPmTDII0T6iw{ALs+ptTEUo*qC$4yI<^SgYU~!7CG>C~^_`Tm8
zwW5wz6exiSYAkZSYEix%^A2}?`!$G#Wzd<zGuqxIWJjPeF67>4(i5$~gj0?hPQ;U9
z5XHQ*pyo?Bx$3~j9l_I!bKbNRZ;lSNh`7qo9sPUsM%}CEw=sP05>AxnuVs2x&ycp3
z_$J-{q0wl>i+jLEo3FruU4k(=2K~g+bkN(Qwd3~qfxUZ+2+4L%Td-rsBKCL3$Z%W5
zZ`s=6hR1ngs=+D(g$L26y@KaE3HJ)<j>6j0fvJO0uXIGnvCyQQT^I1v?OpNZW&+|%
zq#4jyKmKy!M=9bsbM)y{aeUVaS9jV}4FuIJ!WIFPiBNc@WM7_{Izw;q#p%~7-`0-Q
z)jWjX*j;tCYeqMnFB`+XWCC&`ixU0EhShAJbczG3!)wf1f5Mz~<n3h!z7-l0h+Aw(
zM0!k}qN@c(pK`Nt7kB%A&0>fO>qNOxal{TN4-o}~C3XCFAr~~gC!u+6pwvbY>?Y%Z
zwg@pUN$@1_-zWFZHjDpLU%z!?P4o<QbwlU46U@heBFH2>e#&qP3PsNl_uEFuTZ&lA
zgmd}5%+AJ15eGdFw*SyOd{Z=usC5P5%F!dB0M4@=%TFrFe1_fYt&<pja&Zh8osQNN
z)1hMxRtTxS(Qvytg*7TV^$=3$!>?P^Adralqc^~ix2pyA$Q%e)h@0;s=?$W>EUF++
zm_+KgR$eWZ%w0wCjVQUDSIn=R5W_vwB67}eOx>%4XbTDx^YinPUQ!MI9pKS0(-XdG
ztnt(4@9?m`o^pAOIR1r;PGhw4)pl=sZO_rr*G<%fkJzAAJB$6*0cJ%gtk_iliVW{V
zox~9)e<|h2qrk3?E_}tmG1ja>6<V=ewX~K%+oZaOTE-6Q`<;lT#}HPSaAg(Zz90R8
zmV;NFl-Sx#oOE#Pty~9xZdXRu>sj<;RXQC9*y+%GEahY6%E5V1VkrN>!hYFg#+T6U
zLQ8Z5f~<xi|3Thy_^Y^N;Pats9N#dqUSyMN(bSK*^g$V9Fzuc2I<an_4qJxR^*||r
z7T<9ANay|0b_Kc1!8F`oFIq8ysGfE3vdD@yS5=h-X4t^R*A!s}OcW;6ET)EMzK1L2
zg6fgSw8`};1HHSuyDJe<H`ZA$u}cvP^fjSWz&Tj>Pj+o%q2B%Qji0_QK;BY8&{sVL
z->M77b{6&qe@W+fX1)H8-DX(>w>2lQJLv`bysp?xNMu<%zjMGyvx$Lq^kyAkx4hW#
zbsb)RH%MJV1&wKa$}HWKLy<KUJ-F~Q@3-ceCxqnyu37fZG=<2P5DHLUsn^8^$<C|G
zwsaa4LEg;4bsWoUmlh()W8F!b?5>m+_c;`u=)l$wVz8M!Cpx7fvk#s^TqMKCeB!+|
z9kx&GxtYrFrc}e4)Fx11K4SDrFnvI43~1S!v-1yu`Qf!!K(w%)?|Fne;0X<~y%P{Q
zBvw~l`E56#J?&!P;=CKU73zs6y>-Co-Vf>iy1aZ~e4m^(mz|rP@5UXr2*G}Wrvi-|
z=xG5Cc<LJ=Y5|c0BgD4S?97TJyV^UPgJP%z;7HR-{ql^{f!xq+nxKwMzx&k8D?Q}!
zbaC?(%D)~O6&k8sXn3*KeOeA*sC^Q3a+Dh#VC(Y0tMX`V?XCR(I6Cims{TKYU$T;L
zGt0iAC`v{)aY=SoxgvXA*GTrP#5IfTtYmX#-77m;ak<9j8kZ}PtFDo3H~h}`=Wh=W
zmvcU!_jtWt&ye77CYdt%r*^mC`pl23<sJ$FrFI3w0B#$20pIjE9PVX3wd`)O7mKXA
z^6Gv<#3GxvkN6_5_OuS)JCwp=P-*6LRu-&!{5*2)6(6erxVF0Da~fWB;=wc&P-T3F
zk(ru4=qJ&ndP+!+sO(3$?b)A>6`GWjeSocB7tKUZPn@)_>)jNm%GcS{TK^OpC*+6r
z*P9o-!RqYPHC2HhgLNO%)2eFV481UTlEAwg=<A#h(J5+SV6uhXT=i@wN;$1!*Y=Gd
zjf3^7S%aQ=Mw-n#aUIv(w$7`)>bf8%(t+)60S$1|q^E_5LNx8nZzP93GPvw>HG1Ec
zr~NK57}r8+ZK_3cQx?*hE$KBo*r4fVk*E8~Z)<1)AA=G8>u~v4{SNb_g#m{>&&#e_
z?B4Njfv|T|jsD##WG;{V0hh}*D68_9y*&=M!Ou{%NR*}KvlNSZ!0_JVpr|~6my_~v
zFzIgd$jx*)F9Wf!?BjQW<m#*IQHriXY^4NH+3lszuo6>N0xBL1(+GG@i-u9IsXtDM
zLs&L$CTpFLS0WGPjjgqM1ea2h!loH;$@nW*d4CrM)0$f*$`HLlR*!5f9^^{qdTLuH
zx3tFvUIL&PD%Yyan4>*k#ER`dG<_jZBuU#aP=>TBnOL_M7uGTLP`Rd4EGY@x07Cy4
zMa6zRC}>s;nSc7u^X5W>G+ORqhB%Y3u(yU?5IgIJs`LFP@a}+n&w5KMbD8^;Xn8mN
z<0lQ9W3Jn*F177M6bc_aISRVW%@=v-ZA}yta=tL4Ub{b7a@HkF$$#!}*4r?kbrShx
z&P=yACE}H(1wh&unF#fYrdbmQnOV;Mc2lS?RZF_Kh+b+cH6#hGp4N@At_5Yi%pW0k
zam{NhW9WU)u9qOH%^!l@FGA5)2-;%4e})8#QOwrVzr)iuP1j9FpI_&ZbdqA_>1p~l
znN78cPCRX_G=u=aAwB_QI4qN&p300~;3!}%2WF7cJ9NzA<cRQaIqRdn(38FCr*IUd
zmGzaBHXSGn`<g%E@US~13pj^iGhs0NG|-rLh6mu-v<nRgeov-%|Ncc3kY@s@Q+&sr
zQK7)dB{+|K6x#CG!ysw=$0W-`z;V6z_jewfjNwiV^YCW4vU)`D;qLZ!@LIMEd1&1m
zuWWP!^f;RYTnW-Yfy(e)yr7`qhJbr3dU=~SyaHW|Z72~%_^-ms!gmE}IDh)`8k6yg
zI4c8JFN~PD;%I+yd@pcR<P9~{KB-1LEdJ~Gk|&j2YrU9j)CPI)#;hbAs{bn!o0B&H
zd+zC0QF;F5FI&#SZI)s0oUEDG(t?I<!HO*VFE@Y+3CQ4@{PF7SG;Rx+=u<|%A<e4C
z!>T-wI3qNA*;&%dP$?6mShHzbzuR9kGR}{)7(#dd9Inj*Ib|b227mMZOlz%y2db1^
z29oWDD@rHSz|WyxG0?QK+rX8`@+hX?fF6O`=`-Oefr?MaMqmAzPADZfisfKd1Y$oV
zejqf|8ldPcpnO*o_4OanwMx&itu^0z>c}+QUV9@tw2_U$#JzGXo+-zcrGC|`j#*ab
zs@^+>mUz7dVF>${HqS>R)(dGBD4LgUZbLKb&kHrewTeCX;S$|#c~f%gR!k&M4z7{~
z?-E_YTXbT^6Mbn(h>^vWOLyx0S?38bA`Gu~XHoPo2<DFo+#;0Fy|FjCjzrq%r=+Ip
z*$<~CNQ@7AbtLws1HUe9F`<OO_0}DGjutLKOTTRudo%NG+mb;h>qm&>IzMAMP(29O
zw1|LGWnqsFoQ@(<JY4i*tA1C2Vxw!{cKQUbA^UK!57xVRO%$9zN_OB~?;yZ#pld0A
zao^g(CO)#otCP9xN}^SznFOu%-n!EhYF$nvA_;SGoxDwLq#FH<XkPshp*frON6=y(
z(XxM8A#(Fnev$BrdVf9(n3l@hDtoJBytdYFJeq^n1UFwc>5OuQm;Jbnn|N$2{Bdv5
zu@rOk?CfuqV+iko17^XSw2`a(em21^Dh8DBJgRl$^4%u?vo{IROiXXTW4<5`zpn<L
zqr!aNjTcx!d^}ReSIUc+cN%WupLiO>F^`;K1lvDV(YK`<#pndzFg~oTl*WcC3OcT5
zx2*gND98$txoRM-)PIOKiK?45)n^!Uv5q`FBEu-az85I(2GX!upGMD)ea>2IE7SFR
zDY@(tP;_gY*>BaZn>){x3y(X6j)9Al!$F_&^B7oAFah`JeuR%R{7*rCo1OHsVkXQf
z$?MC*T{OlImg(Yw2FlTt9Ry5M=JAdDwLnjbLgSDAEt_c6!tb>Cq7b~HTPKM2ksZiH
z_#btKtsYnVXC!tE%zrwdxWUT<>}}2AN;f4{Q(*;V>298r!anO*STC46h=2zeNtE`c
z_~X#rTiP1C;xykX>W2FjtS?WsiqZ*-53qu!fz_tt!4?4qA8M)^a!{lbvZ)3pm%=I2
z^_Wca$cTuG>bnt9s)B8`jdwIvo+2wAXggD8T#9G<c+X`&v5A?@!OnzU1&8E{*AZKn
z{rE3hZ<AjA-Uf(Dtdv$`ytCZncbMTFwX5|MW&HZh9j{qF(2S=|fEa{)vY=C3B@%O$
z-#`K=qz3mhZ=k*XL~LrfKXC1?`~7-Pu|zxgcXwyXLTsnQ$R#-mY>t3B=%bDX+wJw=
za)AYxZKVR1O4Dl^0&23%iZTO6><4#wQC6d*Aciq;L;iCJDsy6Bb9vxR%;E8IXq(-|
z3%1kfmGzDbGF6^ZTr@d~vGzPqS~)I8OO(r_B*bQi`&jrwbSHAJl!+BXTIAbUM9i4R
z?JUq3*6-WDB!aujnjVnAsdb6<7C_Xn+c@`CWA_%kpkbo>1dB)2Z+;(ZSt<=7cSfXd
zJ$D7>>7onaa(6s276z4a`$F}V9_hTD^EJR+b**WT5BnwkyG$yZ*`PA&gz|J+?B^3;
zTP$Q;ezgq*Dr=>sqpv~VZWF(xZNFt9nsq4tW2U<)xwk(2W#$E$YDb*)y?m!B2786q
z3lbfLKilS(^6xnH?EF#Rz<?{?KlPyg+Q7TJ4TE-VzxTCWbX{D*EPk#YacCQBSHUiw
zoHZs~c~}VvoWhzV-s1<2J%7VC%e8ZG|Iey1QE;WR-uu-9Qp{^2Q^<Y)**lEBZmst3
z5A~8l*)p}>+Yz03I?gz_2HCAOHEHkcgBPP-3%D(8vaDr0t~<^^6hQYp+eo!|RaT&R
zQ;hx!)E`NY=+p{K-jE0Izn&|y?5LjY6SH8%pL@kc5sQV!Zunb|a~6Jvw!9ha+!7MR
zarbWKUFgoyZ6NdyPh-W%6P{8n%4EkRWK%61wRb%MX|)Q0$n>%oPo+MwyS2J-4|?^s
z7E@8KLA-8#jQzlirhbwQ(?~918rv6zE_~^yCzfHj?g{Uv4>X469n@1MauHxg;jLG4
zfX-3+LRjG1OSS=gd_3m3T0BF~GJcBs*1W~8HZeebN-Ji#U^bqa;HmNY$xu7q!01W^
z!3w3(9eF>a)f=8rh0cW0d})7*CU^k!#QMazWp?ej-dmq;QF5C{-Ya<gVyE`pzCC`M
z1(+qmjV13GU4X>$b^wu@BH8#G<iM8Uofus{1=Fd#WPAcCOzYV&v;EzVW|W4TVvocz
z?)SE`bJRXm9Ru(&*VROkm^G6Db3_JGv-=Vuf{7q|`K8;BJb!)XHFFdSt)*wnV<c8x
z#U1SoD(m}s^TK7G8<K=7uRt*;#_0fceug$%MKXMFHQ{Q*7?&3cMbYXRJ5byB8>+=w
zZ1%gghU~8>V6xxwz`!%^8Svv=(9YjH1$q@kll<vt%w#>QCra+gvI~N#A2b96p?iTX
zD|u9hZ%W<FO-kMQP2$DtGdK=!uV!b^_0lSH$elZ1Go7I6;>hneL0Ni1mT!&(Y~cMu
z)pG=1kZ=rp`+dkVVE2A-?hCKV7spR>vuw&cA8Z1ciyP;>fhel%^2VlcHablx77u}j
z(PIdn9?>?(KWt@8;r`53W1fwJ!c`#0Tul*~NHC29hYT&G5)=k0rtBRro;9DH0<sPw
zkh}^ExARSak@IWz^-Wf$k(x;)`aHPRsG`we4#Ni)rqzS%m=hlh)Sx8n`Ivum!>QsH
zi&rH!Z_MvMh<t<kMLb)7>UY-gS%7kIHL{R>{|J-+yW_N*p`W-In{R3M`BCtAZ6)W!
z#)NXKYK$4MBWi+`S~hwwY^!y!-G+k1j!&Ep*%rT$VrZ%64s2mIRE+#Hs$lcTB0CHU
z_h$fs8Z;#=r#1?8zb!b4Y{8|3!R+0q&)3u>qJ&&Q(azH2nqj4OFplftz}Ww}az+}U
z6UqGhyW^@BJ{6EczF~M(@q62s_qY=siv$#u4HhMkFN!H3dz?8qA|zx`iKrl(a+l}d
zQTV=kiG0-2->J5-gA?G&Ld=ea*iQF_u~no*xEV?o)2P_k12pQJ5>kX@gC?#Sq3}P3
zMz9<=N-ZbXmYU2BGZfm=nyfa!%}K*g4HLAM=#x=db+Mkv4gWJNO?S4`1879ZWGIQ~
z!RNT%CY{w#HkGzG(E#~zpb`GNHQV2M8bDC>IxtlQLhkl>LPhC(zR5!L;7UAB?bJ@U
zf%h`!KBPk@18kusd9vUz6I&j0{e6~$6HqjIYG<jgMnLwJqzr>lN?cFR@~a?ea?H2x
zu-O|1$Ld!Sc%9a%BvlYT+y)qU`=ds_zC2g)>oe$sJU17}6~iOClI|}}V6`!jZ_74%
z$_FbuH64cy$@Ktx4nd)8U~-UdhDy%%6eII~+1e8Cd3n=G%d;AtuBHa0nfKfV8q0IT
zwSAP<xs)X6I6_z!EU~6K0{AQdf{`KI6D{e^GOiK_#f7|L%yLN<5lp!cuJ7d*hlq2G
zTRZCxDUml=?+ptVdup*f$L*Z}PjW*=)rg+y4Cpbz#co|{FvVhb?^p_zgKV3rce-9w
zr1yPxOEQY)wn)=CH#L82sQzFTZeP~Y#Zj~ZC2@V6`#!H0lgMKe(`uAAY5<^wpC99w
zx9}`jy$rf}Z`_JilhdQpqCQf6eB?`lJ+i(VRRxi(AYw!zj;V5vC-0(*Q0S_O60j_U
zuy6}870*0`jP|-EZrQY`Tm6=s#$QFohfxLcM}f}b19~+ZI`=d5WMJ{k&i}l~ZoNi+
ztVyAIRPeEHsA8dUdF_@H^0SP;1)t*Q8(lPCnus;rV7F2iPwIv1%5Nm?ALo7RWZF={
z6IW&#KAfB<-DueV7**(f(9#mI9#MP}Y3lVuE4*u-W__;v1iM*LWt0)2gh#k#5^64@
zojpox8k=h76gVAw)=3w&h+b<QzimlT=Z>x3zYKzGN!1E&DeHX^&|Kg=V=|<BAHfH{
z2x{tQ<fI>um=e_z<J5$>+jr84m$lss^*7wY7AD&^)JX|x`*MHzpot{!oTo8^m3!z*
zXsD_N+BV-bhnJ%DV|G+Wb|w!1va9xD0O${Z6achaTTk9e2@+E>!t#YhV!s1~@P0qo
zv=}+O<w@RwgqZ1L0JkG9^keK`J$v?I9MBv0@|q19_5S!iRExYI)61(7aqKQ~H!$#V
zkp;iw;8e}$aVop^9~lM{=*6b=n-H`-SDlLt-87qkGP61z&jy2(B<ewGu7UUu_LoT5
z2{Uaic4%0SN%8T}@7CIpD*FznAB4FEap-#3h5Lb^)>^ofR2B=!Hc14ScE7{aKG4bA
zS>?YEAeqI1uCZx#LhH}K@}QE!Hw&($=|Acfz(ZZ)b%7sTV0;Z9Z0-E>*Ymu78(g)2
z_OnY<<SZX-gCg0Y-kVuY1h9gCYOda}DGQNN9r#<Ug&c0%?}(SP_RlS|wv0d$&sTpn
zWE*}ZbnGDPN{&DT^vG(~tlc<#>tFwyKk-5(8mY&$ecE}yiJMDWsoX56OW@A7oDq3P
ziLUErWxC_xwOJw*LoB)Y(3QCxfNZ(gPCNIs>GK_n^s}BHhTepr(CKE2L6NsEHN<Uc
zYko~95H}QV$-c~GGKKQD)$V9!o9o8ZCicsMUiJZ$aIss&zoL??N{6QJYbcP5THobS
zxw%O8;^mP=1_+Sr^V**ja^c~q1dSFdpi8mZo90JC*qiH`S0UuQ<P8_QRhRnm=BYUl
z61a;6&kuUBC|tS7%>c29UH^L%(yHdHU|v^cmH1%SxT-?u1xLu10cw5zgmtk5VM{JW
z{5#37>mC^gtPut3k&h=DN-VdGOUjr>9^ajbnQtiN@g~xGwZb{I8lvmBk4n*>O1}P!
zI^=Dw4GRxHjyg3t!$%!`I+O25BIj=E9&Xf4hsB6Q>sOb0|6~ok64qd9BE!j=p6nOW
z#lNw_vniETw{7PrUUAo%wR^1b0M021LfBmUx@FV9zu<}OTCmEC`0*=Zl7=KS9>L&!
z|LdKs_KL|oLRuq++0|sGY+k{~>$MitLQ`ul@|{A)z=<0sVw$h0-Qd?<iSH5Rm^cG!
zaM{USQkiT+f&y3+Q^r>Rh_3j{3CH|LWnZ57wG<!kuC|C#(5eF_#N*tP$+c}}kc{rt
zBd+JUa)4*6)$UUkw(oZ*NFH!gODhqj6v;`@GOavvh;=5AqodwC7BC(*V7Q}t85lUh
z;lEWI{Y9Cp)(?Xfg=xNMGcEl$mvZma13yrD_4^1&0Tt5nC51trOb74}<Tc70sYXSJ
zyIR_=y0{7+Zw9L>YWXIuC&_3zzFqAAnkwJ*xDSl9roJor2UWRdLmu)*w>2?YF8X4n
zq7&J(VLVQMaP>PfF=BM4rm3}wZ%qChKMuGU84d@c7y@t^PmDC9)C+S$3V5)_P^hbJ
z42(}k=uMn-;Le>K2ZE9hAJUtg=_=cC@f>ZFRVg~%**0_e=y1tI<7axmA4_-C@xKpK
zxjQg;?Z~+=KSM(zB3z!8P7*Ndz0;2-+e8J%aS{V50%2QY;+`xZ(^6D(^#aAxnN3mG
z^a|@a(0Yakuq60HDwcyGJZzKta>_h@qS3CG<MFCtTVFE~*AEu3m-+xReVTcyYYhAO
z^XxgVXImS6nr#-@TuG8GkH3ECuyssQ?{%>r@bE%~kkN@d4Xb%^%iTw&&%j*Jrp+x_
zm2xr=mK)V!>pz4UnXhg3<F|p!91oNaz=kNzK4PCNkuQ2D1au54iNf~vV_GU9hBC<L
z`J8-g`rVg$N@AJ;!y>(A<HDbea_=#LhSxE*cV3CTM>l5+&%L$laav!tnB)(uTaJY4
z^<s8;6U88<oOr7lob!yUekEF8uf5NL*=xzg%G!<b;^UzTC%1sGD*YxAcki%OBSd}v
zHb}v3qrtfNqD88QqG06lvJj5EGo9sAYKgK>qYYLWaL4m+8sSvZn(||9<b)r>3?97h
zh9uc^8sQOn<wquGyC&SNQu}{i<X4Q<T8wSfDG+KgR;8Bnd*`c0YQoKI`o0?HET~0B
zEC+8gQWb1`H5~R9eBQS^nv}Gf12mP{EGdK1#VN7R52(Ib{le?g^><e{jOx7$d!!Xm
zXseyvOo{UIP*1y3@r+pUG}BoP_Z~Grs)AuNE0ew}_;A4@d}lNjE!5%g#%Hn7wisb*
zbB%VrRHZn<-OP*b%R)@7)s`1~1vvYCn#)z&MXOmv9bY#5Z<yoU^6I|djpDZk4=9dC
zHK0Goz)S!DUInW@c()iy1q>BtY5q;M)mAMF-t8gg`qKxx)*?l{3M{WAF2y-8ImlW7
zQ8uMhglxsY`95H?L+DlV5v7q`&GTudG}J)duOBtk4g)VzTc=Hyl9*NuU{lI@nNuoX
zU1tV@+cY;<JMlQNgYeh+78$B+ktV=QUTCy@r<?KY-@iL=61jzo#T_LBUCmy@_%vae
z0k-DT0Rs+ev7KAWbm>;8+zFXa@MokP1Fv?HXC)^xK6{}R=c+YVQoy7TBUo5~yk71s
zXcssi1Z0pyzq;gXO7tl4M5~S_N*RKRic|9a_2@|V+oDy<mOo5?00hG(CU1ZYjh*>`
zhxb@}9BhFBCE3hVdv@{sW1`i!;#Q$7X3!N!7WJ9@FOP2+<iPNrxA+He(X&>3p<2ah
z@6>$I7+}waE6U{md~ctyQ4GSAK+^PVr=c-8;-)c41+J~D8R(3Jqr}Re*l2s`qG-MV
z1uzuAzQgrz>~xbhNwgalG#Jf~Aq76uL|_w6;5e_-x3WwMtC@-s+Skhp$he1!vN!rQ
zsZfpfPswpiikN;haCcxi`#lo}^16(NM5z^*scGAvDJyV2#`BXVy%ahL@9w0_1qD{>
zUq8&qM00>q1ZRR~^G^na{!}0`tGf7(tnxhCiNWV>G`ra=*SN63KJQEJ^o5QoV<Rt0
zieEO|G+>J4Ri?v<-EaqxlLI&#yE<ClCo~|zhc7f!1#CPYEJ$i+3{nJBCQHC+3#goY
z9r)ZdH<!6A)7r1#!=1u2d{CjrBM+R4>!O0@`4@UR3O`;ceEo*=@1O{Cf$WKXIq`St
znVCOaCFe<WCePi5F=^5hrGLyc{h^X$rA8POxM)GQU%u3rP$Ks-lTv~k2j_yysh}_U
z@y7zg&0FTg>!Ie_?I+YI5Yt`2Nv;$UR;04?QRQXJ#V&+*`+HeT(*r3FTe8jIg$lh9
zvqb%c%`Q-&&^0H%?nw^&fg$ct8vTRk3kPj;y4AFoe!Z95P8PgcMzv~F<9)wWgq$?=
z3@-l;dhwQ1(*X-1iZ^$+^J#nZ4427%p>{=6Da7Y*16xe#xIA{`ZeeTYT5Go)Ev9&Z
z;n+zGQ!_DGIp0xEK15$kO;wfOq}&QFu~!>uBvJ1#qdTxk+-U~Ha9dr}Bs!((&aL>6
zky))IU|!As>*r4ZKFHF;UhI}q$|(wnfwusWBPer1I%_(>7LJJBdQZThT?H(PTgxV4
z8#)#r()V`{75QVO4}1K4Q?gtnN^if_rW+7BEN6RTVgkhEwRUd3Ec_w&MphW)&C@`j
z8!QCW9dV5ZOMwEnvJ992n_Pes0cKfJpD3bC=1g07l?hZs=yb!FV$Zh{;Jpa9Px|Is
zLa`qx)NO^$*#>^VzyY->07S!jioN$7%zaXpXXaL0>{iA40n*M)P)CNPJ{;#)ji$8j
z9IfZuWYL3LJ58v!w*EONX5%<g+kH}OSLb2c;@H?nQ_I`#EzL^TILgjx0={hLCp^WL
zeGIZ1C3&gY=bh?Ijo3g6UpR$P9y#S^cf#D5P#=7e?!XY~sc0`CmIV71xITPM$gbF;
zy1YnBoYl#cM~fo?=A2U!(p2ADY7%1&KJHwO=={H>=<iR;^b_W*y9)$6B2dGfD%Mbd
zzo2JhbBvC0mf4pCt!$m9tuUdlt%m(=Sx-FQnN^Dn8d`Xsb6z%Psu_Wfa3F|`;Okxs
z^Ao*Cpw?vn!1<CZk?x79Ft%%mhdkYk3u!OY-+D#U3z_L##ayi7&&hzpL75r!tDPuR
zF;lUO`4_^axQb}cx4J{FM~(`{sjZ#<xbi0;d$=>hh0@)1CeW+qXTM!<MLwjj09_9o
z^F22%aLi%(@cYSkXzzsNCBc+9jni$m`oz(!N3Q$QkN2*prA6OK;N@TdMaPI%yuWQj
z6VGmRB_X<1r2;zHUYki>gu?NJ$tyOw%vG(64hZUB5eUk1w$tCV`s5O@_fL~R8Amgz
z1qZ8yH#Z8zW7?A=9?jjyORG#qSZrWpI=3JDn0vXcaDDmWJD>2CMTMyGV_}-4u`ckk
zqPR}R9u6b7;tYk&N?dPe97hgc1%Y##v6y9ql)QEWE$q3-_4Ss&#NE)l1)!({VN5B`
z`PIR2Br#VZoXt70i&5BOf&O(&No?CCCSjTdkDj2X3>SUoqhY4tId-P`yzS!Bm9)mp
zSdiFH3gvad+D`fCn<{1?a~fE8uRU=`0Yp8ID+ZC=slkeYGFt0QetBi-Qv1Fj&;{+#
z)BRywWfQxaY+;I9pdBXxBUypWY#@2xbyd$d8CdQPn+}lf=WWB)%v1q{@Y^KsuG4)R
zRG*v^w`&E+sOf=rul*5NdJ@JZN2W6aPstGGJS({a50;kk07CA770g+(Di03}4Htgi
z?AN~Fjh6%Cf$Zah1>mHM{O4S@{PYOhKO8pN&f}s)X0n{SGrA)O=xt%VX||&^-#-&r
zK*Ke(%<N9~5KD(ZN<(B=$e={Nu^Z$GUODBXf1a6EOpmW7P)MzY7<-`5R}in5v=@)q
zV><z-ls<Y+@cv4+jmOj4r$Dl+l3*+i_*3%@2vi6;L#>;hd*QEs*s-r3<C4>*9wo0_
zlHO38D9K*`5NedW=p-BQ2V;2?D_P~*zSmvI+5_~M>#g5Af~2#|SR8RmSvNi8u`iIk
zviAT4GumoHH_E6I2tvqn8C5AW)0gU#;n;D#kB~aICN~9ZKM2HnPb8tPmp-=z2|Hp6
zAL`FieU$}r;z4jYM!YO9j}0lu0@!6K4bLOC^7+bt5qD%|3P;O3x7v79Zy%Zt{UTzs
zo<^SLAT4Zo+AK5E%*4L_9($fsggH?mdm*?|=Mubgux*3)OVW!oeM-jkQ<;lR^{u8b
z4;^c&di4yU&Z*r$N@~&f403h)eh(#GGY9GESOa)07QP1!`66W*pqZ+lA=Pd3`#xLw
z_wWpwA}#q6k4R0ne0qI2uU|@8^Dv@P70Ne<{jF&(h+PHO;?6O*q}(RvbI74-UWpRX
z{PTh0C@le~J(Opn*|;pVo*8nDeOIGAxYy(*dEbzUQ~UXFslx2U9w^7+qMtRj$G%f`
z{<p<GnZa};!v2$n4YB)={yh&=hM57u<$Sb^u4&z8@!+Qn7;Mjh(a#kAP7gH}o1W%b
zbhm@b<{`t|mxvn}&dc_pg{BAh-0&`pN2FvOr)H>UrM$r)%30XbV@z`W{pi+p6yk%7
z-*D|9^o!9<Yfin_$sP8ck?ws3g=R}%A@9&L?Wtz^hg+GqK({-67Zo>_WE?o4wCssR
zi{3B$*}R*CzjX4vM2kF*s;){0peHD2E~Uaz(zc=zDnsrTnse)<wQQQ~cCNR~1Asb!
zFAhKO5VH`X6;8n;!p<t@>MwxMXn!6Zv7di`Egw;ZB7tP3>Gsy&?1R}WGky49;fnkM
zZS4P!{_#4^hhEmq)#J%y&A(;_Hkd}CR@2ELy0jF=U(^qY;I{z1q#P(0<Kk-h!oOQ#
zbEV<>5sL%0{-i8epvY4%l!%{{ROu7_%dty%vPsmxWudmtyK!UK=B!UoyE2~u3V(rE
z(q~)$!&HA%f(J8F&%$3C>3$M}PER+014}a4`zlp+RYkHV0@z~l<68wN-2I=WFR18#
zaahy-v(uivjWxTh(<|)56Fz-Fn{7Zt|7PBlzD_=$y#4cMDdzZaw-;xV5>;u54kdeq
zf+0%#xJLh?u|3=MR?5_WK&s`v(DNdX??1p<8_2@Oso(i+gp8^Im!y(JMgF2;2{T;Z
zC<d7l=Op1`H|^Q|{5ycFjLCJe{OIJypSTdM&D4yQB8RIMu=dz@l!!~q(*(VCeAI4a
zWLk=LK72th0^6yjH$}QjEA{?4JY0)dYmEj`a)<8Nu+-+O5yTb(hw&jiPEV%M8MGh~
zI!NmiyJWwX1zq`M#kR4%(sUPFxIV7>s4a&b1iOLrV;SExEaGhw1Q|$2U(#|%&$d08
zx-L>@z7R4!tlC|}Y?OXwKES4T7^3&s02KiE^q!^Y;rt|Yx=CksHxxFj7&*DbMgKAp
z9_R+HvpO(|)HzX%?9BDy`jx`8kUpuqm%uj*;{;j=xqYt?2OAexM9KMgDkU|Y0%z*d
zTW{I%4S$?qDJ=BH|A(F)HDN2Bv&fzeDTFWlQM~h5e^S+rSu*)8O%UA$iP+n_ert?$
z>99->LRs<^T17HvQB^H`2>l#%u<%^S-p8B)r0^k@#x!&kM2B#V<aQJ?Dk<v9y1eOj
zBwkg&*gD|zz3mg4!%`tFFv06lT6?m}!mCX`s$5K0SHi!uBlHD^=24FtTW_=;c8(L;
z6t6%ZL+xpLBMMz`{6%xOd{N=T#W1n(t3B=?o01~Zs9B^J6V#jN_(LlVsX85jgmuT`
zwu(#Oi;<%2VsyPm*?I?@e$mbGZ5j`l18oj)GC7A5)5=EguKgti#dVYTmV<3FkFZ&b
zxbk<BhO$4S{#6n7tEwcG+{B+JVOv$B!n~`nfY#8{)5*a4qHU5bO9W6}e_m|%8MTs#
zrK|e8aTH?o>px^mq@Kq}$jNsh_A%Aqx!fW}SJRw<?ZAi*TWP%Nqle@f8G*P+#of52
zv4f61@h9Gopy8pRj%7=?cjG!&p7WJk{O-YD1&?BtZ&jb?aaw;?Pst1zp01^6ubBM?
zG<(fvJE}%I?KKoyW<W#65%H({7IwG*Knky9-XX&xn2LQnQc4x*Skxao=}1kPtpdTt
z%C5#rL>bsK2l9*`N8|W)P%rx^Sr(F+hPpNMTiXV{12|-8INvJtyz{K)dfRbOL>nX0
zf9h)*z2xgvoQ2&B?GP#Q*zP#M1%NWUjL>L;sJneJP?7rY5`CU}H&d2;*IK>#2Vl>T
z<Jk(XkMym*6W5i+LH3Y9Q4|XPfHHI!fS1{ow`9(Frt0dgsPYXvvY47v;w4L0EC9&W
z%-(|B2nsE&OI8CblFyF^UJ2}c;50y@H+XpGbvB?RO7Oe_7b$kJC0p{#f@8l+X}B1?
zF!gs#e+!syiZpv=o|ke-=;)w&=BTCy-dvxi;l`DeVlL;!+U8qb|NiqoxFCu}0PjR~
zyViZoJ$4xm%L-hJ9!)+L-oO)mzywV&)!)X&v2%WTj!hVY<^@}#TI3%xxEJ?+lLj$0
z-q~rbDb_Nq$~AW-gT5Ec(As-H<$9<co;~})iHt#6pyQihG~2$atPv(qLcx5loEM*n
zrwa&#bY8z%&+!6gP;KN~ZRMB3RpgQ3Iu=21DtZb0s)*AfBk-5dInYOA{O+fyBWw@F
zJc>6OnFqJZdS0I5`?1&tq4#flCrIPpA0Xhc>x8;k*cIslA%EU#V-ph8^leQ}s3Xr_
zm9122gL%w-11mTS)A&(WOv9J)NqKFJPjzRI7%PME0LxuTgwqRCoyDI5kollb8>$Ld
ze5zHuBic_`Pl%e`&GPQ=o_bMUGj~$q3WEwE0}*B$)k#3g+h=nd0}CCPrGzAx<_Gz4
z&!x6i{~qefy{4w4h|6=4QDOc*>6d<#B|Mr-ze-GEWY>(mTNISBqA-G)(&w~j_H_n<
zM7c6f2(JD9P^<YGFQ{ni_S2v=zeM7FD4jSfNyO2f5h79ABwY;HwziOOAH;uOTH<{9
zH1j-I2ex?7f|;JarSJtLOef@|D#$jx33OSZ9XUQ96=?c^J_#TVM|wii%v#&+ELNBw
ziSn|>MXu?r93B1K+tV5b+(w&RgUgE3uoBBx*1+bYm$J9Nb#~ee46Grcss-%zm_Sw5
z#fZ~z#odTSmHkhjrf)rsdQVpBXH5l`Q|7s4GUa=LeA?^bf6&e#cL){;kOH5xXY#Yf
ztD5XTV^0k!*f{A)X;yrs`K-OB`ek-C4=>Ud-cS}``zY`qgOJEq?wJH`T^A$Xp4&6k
zF*)bQl>|9qaHb+jXSL^~_Z2=T7|fh!wQ(N(SPsjxt@q-V?r6a2|DW%lKep3SK2<QR
zxfaOzqWpv~%P@#?43H$QHyHHirfq1;^e->f6uzB(@wK4_uLz#~U?~0-y=xa{lOPF}
zdAPU-s=Jw}#~0hJ5Z@%O6F_<Z7q4(=YH0O*YJ1Q6CaV!X*9PI$=TutfB<^V@u(Jdd
zu4ekUQo_6>yV-g%__tAR6~>B1^gfuZrh*|x3oDV|pLaH}2D(enfzWD1!L^4nn!MTO
zb|YoIP?AzcV252YJYMFK8S>sebGw0%hAigPkbryz50pgJ!Tesnlcy1r$J)6|3jn6L
z%<P72^)N`dDkXoxHEEBVYe03haY!6*?@FGLRarGlhz}=+NTYA7EYuRr@VOHT7!<wv
zL1R*kp^VSLLrh+nI2Qt!bFTNBahtI`7%M&N$pRknVMG?0!Da7U0N-^`14A%Y_)JCy
zZtVH6q)*!OwXJ0zXAUo!jeF!lpBu%048qYMMt;RwoCjUMWEQj*dAwbI_OI{^8?+b|
zS5c@QdHVEB<BSsA0c_*KRDy3j-%DZ2eOr|PYCI?Jbp9$xwz5htfKJ7-lo=@*rnFsq
z;WITE3ahq`pXCwV#ffbG8_9n)$0b0um^r*GpFNqrO%?L7AlRxQxB7|d3id)N#ELuT
z3nPjV-%cVqh?Qkhkwaf^z|rvBs+Y`qjmn;3lT2TFMQ4gcviDf&7sba^lKaF<W_}bY
z47~WN`=&cZX+pPTHmJ$R#W8QAjA#B6<7c?GSE{Zi`>TwD7mW6Xd8L1qb)&7`f5`gQ
zbniB3;tJ5aKJdtK0~})(1`HYaZEd(zxjYoe&+hpZ42_{6TL0|_v@ZZv-?Fb_FqyM%
z@^^$HKdT;c9lEkSF#2EafQiQG=NfF+KWrDTAtR`i0K0Ho!d@bvpxng>*YaO;J6t=d
zd<);xlaoORK*NhMWwcf1(m%02PXW38)SMPme^I&x!R7Km{mQ>RioZm_6|#q*d8JXz
z?CeTKDP&^F*Hkod$|39*<5PH}_n?EKqMKf<2Jj9DCG}8&{=ZfFWE-%-LkKYMX1{`8
z+uP$xw;-qtd9q(_t{e)cFN1~QX8#5tXJSo&v4}`N&Pt6clTcCFn<h~ua87qijW$6J
zbYTaOmT%Th)>7J}uNW-C8g-8XY{@RLR+0UahGFIzk6SfJU**CaTbx9BI>C}*yZZsW
z3_IF4@fzc;U@0YbE_$!}=M&>{bEfV*I|d@_$@jdc(|xo&*#|S4#pxWfVPu-72Z=IO
z6~W*Uw3SEOWxuE#TFA_G%Mqmk-u8oD(c<r>I#(QbmwXk09<}5_hB!b6`OJ+;x(-4Z
ztry%vJ%m7>lWrjWlfLT@0{$Gp=;OF1!MitJk%5^UVPawEO{9(9A;sA@5@17w5aS(~
zO+?5k=4SQ6HAAX}=id50Uk5T;I25o?S_9LomPyZX<CY5ll=x$Zwd2k0rTQ(ZAOBOk
zuAKHryUY(;IkO@SJGd~{+EU(YPLGOD?)$vYoHwEmWIBl8(Yl>(t5fYwke(Sl&u8GQ
zT3hw7fNADH0D^{P3}yzjl|6oK$5l^!<BwC+t&0qNji_I^MDCs+LVk;ptlG!V-c`n$
zR$7g<mlZ+AX{E;HZX(%%9s>wki8wr^0*g?=`~ymBKYP^i_8VJ%zzIq@d$M|-6OI7Q
z_{c{#G9zDKzXN&3lz+8T)=DJ!+LmMlFe%l?BktA-39PHccp@@vbhIvHi<E>tYu~Ke
zZcF;nX}a+}XEPYmx$Qu`)gG0Rzc&}lU5vixJu|0}^Ssa@`0OyKtUftRR)0ic$f{bO
zo(o|q#W{XTHz}udF&f&<&xS;9nT3MyNqLbj!8h@S$*G~{E8qLwCZvfoLi3sG#mtEJ
zcAwRtC2h{*AT3=6^EdO24HlprWd5?$L3jN5`wrD`M<T6;s+xKpt{4b_xZ?O{n4)_0
zJ-d<sSn7^zx4vF8eE>h*-=j1G+tOT1PIjsT`&7I1j%xBkub!o(LIAE0xV=7XoC3-~
zV96I89L?x%WM-U&1@=&oo~6U19rBWIMd`fvGT`UDla&YlPf*XsC?0hAQx{;~3hkKZ
z@G?LEey2Z%Xmr=f#umzw9vx{DcGph&kw<8xVx#+z5}gLiIpKQ$DxliEm|dGF^SsOe
zc+<;Bs`)q}jmY&@Z|g^iU4KRRSs5Wf=)84SfYGN<pP0V@JVNYVU|TX%G;R<xUR%mp
ze0SlU{C*X%Sk`UMj=87PpeLn6)5=>~sa1=%tL5RkzIs`K{FI~dSDxT#IbMdWnsOJ9
zLh`R$<W8fT6!>FLQrxgzlQ0F#mCW%zlqDUfB~5A>z{*pWGW$ljc_GAc0O>x)5e;Ah
zDkdH8z!N!P_^PU5er+$5ozD$0z>~TNNGLk0O5`)r!EZ5tOw7QJT2p3x5Li-!6XWN~
zy_W~;s$X3X?tSg?V-lJ7HXm$Rd4j3CBA+tA`)aI@ce&HhH;psgG}?$@!c@#1`gM3<
z4E&yjCVMP2q;%4(I>?)sb@;P9tn$jTiV1-2UQJ84+UPH<GOK<W7~$ra;>zL%kIpvm
zMnQmuEBeanOA<}AoQ&T(qCF!I4p0Ed=^Z=j_Pap-0*;EgPrT$fy(ifqFM#joc#I=#
zdh7sU)ZrYUDYHayaP+s7S8Sd;b|q7-339V{%X#zICUTd~K=xYm1k5o<qsf<o>DPK_
z#FD|jqe4e+^5glY7n)2B=d4sXms0iUGTTAjJ<C1YOH6cP(;?RJ`FfPN&92j|vwy#K
zuDL$dx0zn)xKIC|X3<0#(3b9JS;>mrzIt6Sh4Ji?raqnl1TliXWCNDcMb}EgxFONk
zi?0Z@Rq{1EMBB@~vCyKO`X0_S&YAa_(R8L_<}i=?*KhQ2tT|`jShMTFP|(py)FJwo
z#AhFmuJ_juP-`bCO(M0epJG}KB$M&7kcXJxLx^>DS!Dd`1Iy;Y*PmtW!}>FhsF+Wm
zr}XB|;qyU_XFk7EFG}~eNz+eWVoesDR9lV29?`eq*k9-v<@AjmDrlMM4yf*)+}<rV
zHM6obFxxs#i8gGL-&G5{OX|n&(MG4RYo-i1Ng4H}t@##+o7!fD<P7AETzODRAj$>S
z`;+SZThc>-WN$y55Zl21T;$2#qc;33*mIU~Ut3=Z7)P2>mV1_$m$jz@(7-nWaEo%8
z!0p>CwmdZU)|WV>@OM&{RX@FX!zQE=E<6B;2(uufpj?BNl)IocYNR5t_!8<kxb-Q^
zBX3=iRffG@&p;_|?57v%p+vvCAbO)$re}GdbPn3S&_6v50EWA7)?M~8>MNH3+Nu6w
zAgJSwT#-b~Tf&sFrqD}?h#X*TDq*za{NowquShR1?44>#aj3GOb`oB(%2Pr<>+3Mu
z3ad~f#la0k3X+r_>E{EG&a7I>$PVRX`Sz|7S$t%hA@?%>BTsp}CCj*=twak!mx+*$
zEV$WWXW3SJRkZOhC|X83#+l9N;B--Z#9|ev5=Ls@sjF<}qS2T3u<gtBou%K>RQNnx
zn_Y~^anln0Q7g-N<3@!-eHAD6_Aaikc84?|UDkvlv>!-hc;tEBfQ-}_AcZBFG@%2_
zKt;{xQz|NM49_jA%=C=4qZjEX+~iQ;co#%c!ic7SZ*O?#;sGgj-U=1B!zUm~e4it#
z(F|JxjZ4O)mN}Sybv|x*I0;@Acz>HTJ+ZF&2`%!$pOpqV^Xvf-F9w6_Xboikv55B)
zyrHswYz!^)u*)iqztV4vvwr)TOMx~zHL&fDjz12<-BfL_a0ja8>`0FwCOZ&sA*!oU
z^g@nI#rmb+GS{rU;u^h-M4cN@26_GmeA!M`<clJ5L);*Q>$x!o^$%Qf?`y#DcamJ7
z_j*Z7J7g1?<8<KU&Z7L(Hv~$=e}A@LAy?a@Az?y}nvOPsU1v*i4r>G3&q5;XGB1PG
zBaeeaIwEtr8vKhq!F20+C&L@$AIJq9?FTNSP5r+IMLC<-AqkRKkJ(#!J@D&WLorl-
zBJ{G)s};masK%wBwQhnKm8#1(?*{ft<)8-rjr88oW^ImPJEyr<D?VFZDPvDCjYE7C
zrq@i8kp&sw<hi`LDC||iFP!6p`x+d;sI@p56&&|sF%psfQ(<K5%mUpe(s`0E5c+dL
z)r5(`;>O$S^^hWVStX6#g`Ojk7*qJYhvdU>fL~cYvnK^SoO7`E(|wQPm~7W?O~YxF
z_^CTRN5E7gGy<g*_3zKGaMfYT9UQj7c}ojfwkekQ_HgrG{_J^bB-3eG5t<;5+6uFA
zPip-p_GVcstEy^Q(az^;MPPkb`xEI;pH6P8s;Mb$v6lZgnFq23<xlp1Hv24gPMR4E
zDP;+04($9V;sCyGWhE{o#yKDXn=GKQ04Roe2gsFtw*hU9=Mg;ZCLRc*q<|ZJ_Vw->
zcU0(CL+n=R*<&4$_1M|o<U`-cI`prGPTFWafAtu4k?7Y3^*dyLx~W=VE%$1hj+;ya
zS}oP-4Y0jg@Yr%=<f#FmXG}N4wQPlfbIK&LynGnB#O`ym`7DVS{q7*Y=m-LKyP+*G
z20AbaTR$wCB`Vw4@*)0cEgNYth?ko-tBFKYek}R^Apwk#XMiVK9Qio2kHyl;>+|KD
zkPGRwGoLfl%~{6lOkFXbxgS=RUXgO9;h)uQ`^fuB!oX7WXZkZqrD?MlsR|-LI-=u8
z&ffa7b}MU`{~=8R;ae|qQpGK;U5Ax^+0o2<itK7)8){;2lVMXYCNZzg*H-_bEsN4&
zlnUmVt2gdyl5N+EZHeYXlm=K`L&yq^Z_a9ybOeu7^w6`rNc%=1nw&~)Pw&#qC!CXn
zktZ+-dT_RYtaS?vKH?>7RpY|TBKvN~)wGM+?bFP+X{K(8f()-eQRQa^**A+lM3sUx
z^#|48lvZ%mTYeOIC{ZeHX}O_=d2lXA3yHFV{*}p%`1<cu=~`H;Dx*OHO@vBMGUX1B
zZ69I(_rtcm_o8^Mt2Q3GdBVm1q6re8tnS^90m5Q@)rCeGwE*+bzoY4=nIWgU+vh|w
zVU|<e*F0VdUjC+H0{t&P#8IOcSU%i;FdKGe5_Jmr_pm#6byb9w=-4vb`hRDeE=hk}
z#7&96q*6zW?QDKGzstEE!|T)}OK-nuLUM3XX>PLqjFAl$(ji_rx0q^IhrPe)%)jpV
z#^`a^87?BUlElSQcQyHQ)I!j^C3SlYD*=POx<wR_dGSr%1=E#8G&;AHPNWWcT)^*}
zB9^u45(iI<o<Bm&H-9?8uM;7;!mkZy-S_N}$o6=8_7~>|<7WI>Xmt!Lm*P6%Bd>~`
zoJ-6k*8s>DqM*EI=^JUGe_Z`&x##?O4OR8Mg+HMow&b7N;rk+GkkW66yj(|(U8Cq6
zePia|`OP~A!HTf@lD!Mlur~wC+rc$cp0BffMmc2n;y(5zK`zN;rjY~G1laz|J<SZM
zgk()mZwJb!bgbMNk>yY#<H-K&jvk1Dtp6aoJ#pn^GMR{YPD*`!O?m4T<rsI{FRWvh
zmM&TyN=o!O?riR>BNR(^Q+t-a7?!ToCJL#C*R7S|kDP@%YpNI8sGYK0nfK^jJLKMT
z%F@t+6FIpY$n8NO;7`qT|ElD=4W!sif3OAPde%g6nvpZsM<UojZ8rJH_;{c^Y8Mtp
zD6JWB4KpS)dHG$!POzO*DVKzDhHHMEN*8$lMzLVjjbdC4vFr_t;$_9l*-G<(W^}z!
zfnoCIw;`8}uS*8px_v=rKSD^mDKEvssw|n<Jl#_SDDbEkt^Rb^Uxn1PWb4t;kV77F
zk-z=n*g99Nb{#1Kq%cn9VMG|_=06+i4RdO_LnKg0-RU+!6e$gLo-kqul!JLvzXwSf
z6ENwFoYfHLRd!m4@|M0d>8O+9b$O$W#OR|@<C1FzyZBK-;dud1BEte92*dXUW<U$J
z;KZ)zV5rYoQU$KQs&cs}Ty!aIkT&P-EN#}^)ah373+W<v?()j|!$*$YYf`X3ZY!W)
z`=c%B-O;rTwuJveKr$O$)cldX?P8h`$HfCf&bWIPWS5pFwcJ`4RV-umdc)tu(`xd(
z+z|vr_qiqUYO*gRuIEiJ>@%Z^y(0zbB_J_h42@C+B2PE*Y#OJ&>V*K}cqmCD^2`?a
z+;a+0O?u6@-@@EFNLa;ci><+b2+J5GaBYQw{w)#$I+N9*`y{aJhoIW?wAZ`djc++7
z8~wYsm6;vk2?Cih)l=e{^(8-+JU1sUw}JO{A9M&D5CzduTEte8m#t4|xYN{r_pUhl
zey^Hw2-|4Kr$CT>h!ci?Lw0yx6eqh~#??+Sw>XGdZBEA9MNwC}1@1L%?WjvPi-^%i
zH+_utjWhKVvQt_7d`53M)6q|CmmT9nA~)fMwCC<Lut^81Tf9!E0QTY@Qv>kpfjCAE
z$8=b7@+TuL7E&A`*U5F2{iFW-%%iNQQTKW^PM6r!12xW$i90)uP5VG<{`QW!q&yk8
z-1L7r8b|-9l^XTO9ld3XbqyA3Kd^G`s-<iM-azwSv@?(>tytUsOcR70s{&F;{|j1M
z3T$ED{uTa=q^7Q(+Y6>S?tBz2PFwY~uq%HQaG@mWwQ!T$CFML!MIUWC3TfYL@j90C
z5H&P}5p%7=CJfy*W0c4q1%O`)-Lqsb4cqDI=N&N|VkyUeSqI$Fr~tc6?9WX9*Qh?9
z<tcw-yrPI!^#%|+8ml<6^YgE#%x99)df2&LeF<^LO-Cr`COtbGzWOw^c{mp}v|sQM
zrVoVQT62?Vr>!kYm_W*aq?RYlGMn6z4wND~5H1O&TNKzcj6Rsm|AlaEq9h{?-|DSw
zH~+Sn_o|AZ&qpq%bk9ql6PVT`Y@gv?=KwfgAR<w26!L?=yac6S?@Q`qzvyX9`K^6>
z(%sJ6b-MAu1!vtGvgO1SvG|&oAV0EW7(+TT^VF|)a?2eAP6%Bmq7Huuy=Y*}V^4?T
zJXsp+TivbF^;^Sg;$zP5ni1y@%7!TM8`qoh9|;Ms*T1Xh8=s4=STS?t5opt$@-jok
z+xGbbUGZF+rvYjqtS0J9Nf2~h;Hu1a<TG0KG@4y#35aUOw4Ud0layJdJOg3(V2i0n
zuN9O#z3lPh_X~3T`NS|{$5>Y+ML>4i3JZHSWQn<RrCQN^3P=YT+QZobRa(nxUdMsM
zQzc&wzwG=S9J{#a{TK!Xb{S3iJpSfh<Ibpe%$jp+Qfqn8-uq`K<!o@+s>|uW1>oNe
zLSh?|ns0b{9NXRVl!6*Yuh+d<@KvSTvJlIf_jHLvqlp1ErMJ!3i-S%F3OmpCvI+;b
zt@Ih_JVxAa(%Or<J^OfKm=z&A!!-xOSijrT7s*-pBozA0TP3ymr`LPCw#;Sqzx&IA
zo1EVxl-Kv8+jf^Vd~FUEJ9*OQ55AuG%ny{FN;fla`ZLh5BRejLh?4D1#&teNAwF|N
z63dBS<s1sr!qoduZ9DwDLWP=C5U%Fy2z_E<cf_RkP(?=t(Mp=$SqsleTdzPLn2_=$
zsfT84DjkMcMhw9fer}E(6#Y5Em-lohFK+}ZvkrWq6w4FNMK@|`B+l!q?!NLr#qKnB
z{aZZKf|BtbYj1$rdFP810Eh+TBCbE-(Ms9|NELVbE*h|Ohmmz$+LM8>?)O<o0I1KH
zoYy!g)LIL~$$u2fKwFu^@{d1kS=m1A3OA3&|0<Cr1@sKij9joBQXR$SW`<Z<ZUx)o
z$Z-RC1P|Md{?;<Nk>%x{qn)5dl}K|s_jd_8e=#G%AA(MORRd-0d;p16mWzd@IFK&5
z)~ZfcO@8O%5C#C$1Au}SfCSO9L!TQ?Q>4lRIFJe)m9*gfI3SO|xAtlodxUf(KmN|G
zUjCO`j^$O$a#Gl@h9pK=1p7NZIM>B@m8HV7!&<Jcu8H3qe=Kk78KNiUQGwU9W4<Z5
zRhAVyjy6`3+d32L8}Y!WjAb{}ubsQuUi5(Pzo#tM{#oJ<P1C5y-5{<Jr8mof^eOz0
zSb0c}_MfO@JkUlQ9_}5@(*BiVJNV}th_j{vtd)d$Xq}Z7i-@om#~=pQ=gw~kf8A<t
zQ5w{lxWHVUadXidd*@cns@WwFij*<uSqs;vv|o<~Vdk<0{G3(@l0~UPHU-D3qdTfw
z6OQ!s!azMb>{=i@@oi1bHFNH`bL%+LHp`0ATEi@C<SA8}nU^Lt)=ZQlzhoj&m;LG%
zCaGg{2za^}Q@3nm6ADTP&R`l=;(3W$m1-8(ViIgmy-U!lEEef^SOfc6XTsnTv(ioN
z#FpPFb3e^sa8gU{*7<sQioaE0O_f;IoTQhU=h}SPQ*=qS2Y*(Gc=3}2=3aK-vWpRE
z7$mH-$dQUCA*EGfnA}vS>FnEMCvvhg*tr`A>rW6y+DRrtLUBEsiEkeX1uO2Z%r}76
zGGF?ryxssA#c<^vbTiUbqN);hj4f9^B0Jg@M}hEFGC>}^9d$e!RZxC(gl)`FPSmTm
z84wil0bk{-3)IL6vi?wY2@ymmp0Y5;#@Hs-F@2@;;ydc@T+W9@tn%)uM(|lwX7F|j
zV_VdPL+Pa77mJ7|4#f5ub;izz=2Y$3bh#AZytrRoKO6KVNF!r0VtD@r_u8{URqBDs
z^Rv@+e=lpSjV`$@2ma-EEW>YUi`h@~`)%UWz9PDHfv)?SNt$c4e(qmdrsjs^Ms`#=
z8r{hepqOcj%gFijg@Qt!2w)5s9v&VX(eZDe=wp<Tn)VIA-^vkS+Q{;EZBJ^mh#A_d
zyV@EoU6B!;Mp!nD$oqgN=qBYD&_nU3tRs<zA8b9XvqX(LJ|PuHu^S4?WY(VgDJ%Vv
zmTXp!f@#--yZ@u;+~b-4|0uq>CD$#*T#AWm?uy(>E>kXF%AL92FLQ~x-$N{yM50_u
za%bc+_fZiQ=00*6a#>m|VRHX{e*4#h2ea+{dcV#&&y$&Dio;VGZU8`Wj${Y{n>PNk
z+f7up5a8}sHo4i8#1*~nQ-D?e^8V6N#G?TFz^|oE+xt7~+vTum#Ku7cBM5+q0TmPM
zKL-X<v4CBJqn_$~4>^8IdfS`NoOZMo1Ym`{^88xzF-QOYK{{&jd6+8PZZs|a|6tT9
zYUv7)aBj{st;9p&Hw67cZVA0skAfX&nuFl~&S`1Hx!Kv^#DPr-RwyIcNg3NTusVjq
zQ{V5AS^gtyg+4I5n2D*?1NtnJEgs{atWm=4-F-zxJH^*0ucQF?L_DdM>XToCSLx3=
zH;NiidD5hat50o!p5#joJj|9BxZqjs#CZP26`;NbS4$PpI$G}say++=j%l&o^L`f3
z%?q@dD2xn1UPE^h<&BP>Qlk*zLP=myjg5xptN8}09JYK@_carnG$_P(_90Pv+a&OH
z89*z{Ce_kVzK9pu4Tcdn5-w+Y3?o5u_ZzuihiR{KDs}XqYhUy+Z*VE~dx>Ny0xUFL
zZ(63}l+Ii_2YstkgEz6Mtjx}Ctm5c#Ow8HqQgpgPnI;C^RoFkY-(CZ0>O^tmgz85U
z8qQ(^5r%oox4*#ZHE~$eY6euGyzmRX)O6?U&=$-Ez}xchu0A4n%$vvj3JVj1D=mtZ
zd`9iXO@@mA0K8y3&!l>?Rb29psd4_$A%~|kf8;D1w(N)Ih{b1jIBnj}g5-Q(>KgpX
z&GT^<NMr8gE^0d}wP~#&e{Fp?B3${7s=weRIi8L}r%}#gvkP8jUwHd#ina^Zg3kgx
zH|(F)RY2GQ5GhOj!p=W|CFK68op)qJM1zxm7cdaaW;v-hUVPdy@M}|9Q1q?h31`A~
z%=pvUrBg21gwXDrOn)!ogWjahIyZ95&)<|%TVGgy^qw;%y5@Ixwf`Ho4-K<>pMR>)
zo*((d^r~?C%Jo3{?O%saBHrA6NU3$Ssa`m~7;QsPBAX(i9|B+hcl-Wc*zrbN!jNu~
zXJ*nY-wfLi2sjhgnb89sfkKnXeK3__^_!V)dTrS;us~w;LU@Rwj(M(^@rZ?l+R0Wq
z?aie(5U*F1+SJ7J9Y)Bp_Pdm)$BPM!m9h(fEj4-Z_V{i@W$BMIg!x+hq}OY%{-v$;
zk3SBj=J9%wC-hr}&*#B|?CUSvA-`XonW@=k0Qt34-+rz_$vP(@XF63Vq-Ly#pcMEB
z0to%4E=N&`Hhnbf!94fE@lO96V4?#8AuN;QTYv7H+&Sq#UFtsO#!sG}ESw%XtaX<(
z8I2o)BP3%@err+Y%a~#1Uw6K}SIKq)KvuiW#2adW<uRGG5^@o+py2_Locy@^c-O!C
z=r7>l&_<g3j0MV6EfS~^&owSxdP9^?yfSHPk#qlTksw`Jc8!JFzPYs}Pw5;?-EwWJ
z4y++e*c(rd&Dm6!qT+L3c}KIZy&(TRUvp$Rd3Sw;n{LrW>IZ6G+$sS~2a45@xLCk9
znr&cIISJ4`{T9dQYwykenKWhlsNaMGXY;?K{pa2;bYdF-$(b6E<cSpat{iAsLutGy
z#QcW9S8K+SY(J>>)aJ@0)l#Ea;Y=4c0zLd;qQo{i_8XEKX&>v+_)FYZFT5-p9X#vM
zeG3TMOb1sjhEeBcU?TnUOjl4-owaBbpMk!@mr;5DUS_8>=-^($2oef@Qow!q_AS!$
zHK%CcpxvKA&FS-@Lo7F-ob#C%@e-oa<-NThCQwvvZC05&J>Tx;*Uv3LntA^mH|1yI
z_stWw6R+VOc~+u%aHa1ta3=5&)mxGeD#-AgaKo{zi6#EZy{2(IJ}1yPvmuoI-{Mdj
z4R`({7ZV6vba8|G?RHEh4qU7dKvbcj5zT?}BNk7du(G)>#$jqFZM}V^S8dUfrTkK>
z32KGh$8d6AfY4R&?n?fAtyzIb&@~D3syNZ5PAR}1;?Q+^a>y;Lez53`j%PERcfVtN
zg^S4ZP&)nMb+uV)!MnG#_)y#$uuO~IKTu3$Wn%y!XveNk`vAy<7fe;mUG&Ou6nFTX
zZIA!+WutMQHIyiy^Btk~+HD(6#yLyGTnJaRk{kUI-Op{liLc$}*5<Tm@$I(R+_}AS
zUyV=sd;4~a>C#QJ|C~e@ebW=7<+YIv*JZN5WX>~TOW<)<pLs=DMSSQ@3r}`iM!TY!
z1+{gN?piIL7eGy^aDb)k8yNcgd}svsqn2Hv2WKy;-bFgr$t0f41#8FNO7epNh`lLs
zzAwJOUj0&Xfe99B%nu;X8rFK&Cm#D5uJ@E@+&TIO2p-zfyDbY{R(d90z$vw~T+R*b
z248o_6y|;<bEoqghV!~0+Tydb>ZS}3FD~}O%z~}mFI3_uE&N+%ugl_^+0r9(NI7no
z4wjbIm!een?CvL4PGkcuIxcAWyR`YvnkU~AK?4o?`M-mjKdnydh<4i5m6{oge@aev
z5k3SqPq9V_;Q1AV#{lkO{3PrUMO`?dw+yTTd&~vMXt+f%P$3iCdn||jLS4mEBd+8D
zaQlYRGS&X4X<8bi7F5|gr=N&$=d`U3n)>BG-6z!U<Js<G8$|kdxO4Rd){Oq=x7jE_
zZJ{<W(qQ!5dqs+umtJzM@9tih2p1-L&<bpFR)JDpJ6`DhewV2|8)fD=yQgUMe~F2u
z+2OCyzB#=0a2AQ)L84kn)wia~M1XR^U@Hr7sgC2kaW{=ev|?=Ra+smoe++{Fzm}w^
z8YNJ_EN`34gUUF#>jLwiKmz}OliAqAI$$?<0K|BQPl0V_J=!qGO;3eK1t=qGzQW87
z%0YR5UF%OV3^Y<{%eIJ|r5^)nw4IMxGi3p`<3Db5_dv9eHKneI9|fVr*&aW<ZJkng
zSr-lbeo`O)a&)$M>o(9_L7p_pyTIJ!{-i#!0_pRa`C8FAZ4cddIOH0+!>%qV3BdKq
zu+VNbMclerc)2p%dJSd3N>-u0MEjC5Qn0_8DjBi~DLH27S{*Z`T@~YZGbP0Fw2nL3
z5HwEZsyX<>6^4d%NQVZ-QIbC0Dk(Pp8!Mp}^3mlvitSmvcjX1}mwkw6(YoL}%w_~Y
z?9~6Ar=Z3OFtg$_8LW@uT>5p5%zOzpWtHanw#6V~hjhG26DdX1P>eE>myHw5`|$5p
z{0Js==a=eg`+ueGU~w-2n0hoI2MBi$sC3)%cI@2PQYNo?5!-G>hlyIa_pi|B<6xse
zGU<1xsaI(esu2s}k<4h>-CboU{;1-f!D_gA_@;|n<{zY!y-GvZ(d7KJ8FW@?l(VKc
z`q4Dg>0m7l(4E304-!HfSBEr@t_;m8;SNuNuFg!b6<%<~=q@$KN%|_vb=!RSy>Row
z8HNhKj}j6ys0#@+xiE`9yWjPXo;W)efwhh-${suNCaDdcq0yxod1<M9zfE&ul>A%t
zmB=KS86n>=OGAcEy{*w77MWH)o)O%=Y^*?W%l*?Qg`q@YA|QxG=b9YcIi;PVZGkAW
zDe-v8^Meu(&TXiTNw{y8^E}XXo&4tHj|*bsbob7w?Dj1%fMJ%uu+X#Yf<EWMG?Ja7
z3%I1W!v1X@(vrrvwtB_~sMOt6!0&-cdz(5>{XeiHG`dnJZ^EK(N>cO$XiGZ!w*+W)
z%bkO<hqDV1;9HWD$AMF*Z6&BpX299vYUAy5KY*_T`7F?4%R)@PJ{^Qq*g0MbdN4}e
z_5K=GPd%tryTlD#y}qfOZqQ=2i-6BXdr*$sMmrbigud5(e3bDfUjd}^DMtAv#1B>G
zX6Gy3Ft*p^7f?&DrCtlMyT!q%Yh!AplACoH1DF|T?SK?k^=m!oTd69&c_$I}-B+bb
z6cuQc$oh-m;_fI_3K{R&2To>NHYisQprqS??<p<~0>v)ud2hwo;*i01b_*&yInYT|
zV~zP0ADew<OCF1C=k&yAz9$d!C?;$-z>c`moS708N=GdkUyFvyFAl601j=YHB+FO{
z2c-?|JWCQ91bBslNeT6S?hJ=O2^}C%HPsF%7_|h<@z!O%!UDi3F)dICSz!dn47HwG
zDZF~K#hN9~BsFx3`@Zl&j9fDx)EkF)r-pxrhT_#)<5elF@0xPZc59Do#&nt;I$yl5
z-m<6L`;2{a-guf1`_{x=^&|Sd29N8_f|QFlmKvmHZl2s)wdD+8-f;0OYxv$9EFwX_
z{GLb4+!$Z!X<qGHI6i~Y7QH8=rBPN;1YoX|cvMyCx-Vx?-rr@Z@z|X6j77kQRWj23
zHW6_-E)G>K)IqK{{YqAYjOU&`<L`AHcr8@%O9Q{^<v`r$aN35^!}~=#uIydTK`X(r
z4AQMct?ZhEa}yBim_(BW265_8ds1VWBW2CqJLr?Xc?lY+s?DW1-@f9O8Y<Qyt{HcY
z6~aWIF8pePYeYD*GfWQ+7+9E-vR_dOZhv0fL@SIZg#VTbCYW(u-GeOE`+QfDxJr@o
zY`=_Recmz}^fjPC=8_RW?Yvbs(f+5OTlnO-`>?ztxq}vS+5wd4+;~V~Nl!CGoGx+4
zD$iXC884A=8<|sg_4uj^G%58}so`2kHXe@d_Bs2zyQR%lsH-nlCQ!I4cVrr`nVmEv
zz1{x9?#cpLJ+UQ5jzI+NE7TVtP#8R`(f-$MO8Y=Hj4U*CN7^X=P`^3w>rA9+ZZXky
zKC5%jztF;8Kchvnd8n3*_7(9~^v?@jB;05^JF->oA68hNg7EZ`wLe4Kow9gMy_((^
z9As}yG&HG@g@L?Tmdx>QUDy`vHKujdy5jQ=90cH6qIb1KHP4~JdTHQ5Zdx}A5XfwE
z&gZOd(&?7n2Y5VCv{8?bQSD+_ge)s3!9K-wJCNLBP||t9b}^^(G(f|%cd%deUSWs}
zrCkCR9##hXOT!C5_svt6YFr@DU*9tO*+~kpDPd;P@xe<(QT$~1Ym7c|ACjPz>H2fN
z17O@Z!IIs8?Ig*{k3gblzs>;Ga8c?GIR%Y-m*k;N{?f|+g>Fm7PIZ|=@IEi0uOs8*
zrqYvVD4q9^qBpoYg7vyjdqQ#eH^;wcVaxl0mH0kbt{ZTn1qebj!gM7iC3zNi^*8r(
z2j;wi^rrl#l`>9`KQjHTb}xiLw>x_SC78Y?4;l4q;d?avru$ZB<iEYsp3`TfRqpQ7
z<%~Nq-$clV^L~~#c2iQHRIf~;HVe+mXWOosB)WA2a7e&fLp5myMvIvn44_{DUgjIc
z5z6tHn@P_wyg71Vo6&Z>1aeLR;KLSC^>|f}_bW^~@EHbJco^bjxUiNSvapRr$}h7x
zX!)<39B#)4l|zyvwabF&A0I=`3%e{G`^72?q0p4=S86sC&{nc=%+Urdrg|CEHHe^W
zd}m=|fBnU3Jm_pudinbA95*5&eHlRC7S7&sh@iSwkPdcsR*v~I6?O_)bH!U`0*&Ok
zSmRhSjjwuEOW&mxl)6v3HKs6zQH0{g%l2e!s#8+VUPzl1wyO0bnRt7=25QFpl{tPD
zQ*mnLmBI?c=&#lqFH86Q&YFs%AH3}(wF_Kr{^<1|jnFJ1LWWJc^YBAPx7=})YuvQk
zY`Pu_MHd<bgfg%n&ukFR-*ss@cO4`zCLoJDV?Z^O{Z>O>A{u@(*Qt@r_T#shb$F3`
z4odq!w=YY-dC=AHVIRyJ)%w^Gr*qCl>f8&IH4|iVQ$ttdAJ3&EQP3R9KizeCJSvTw
zcIS6U&%pYVx6^U|U0{9Y7%HeQoU`MTzW-gedw(i+dny3iKfWz|f)&w7Qb0>{Y10TL
zgFPdH&y9uiuYa6X(8)n{bgc~%T5?QJcGmmL%Vm*QnEY|@1cK$~8#VGoH4O!cJ3xo#
zAq!@G)f_A**1mZqQJ2&%m$t0n<sHXr>kCrta^+!@h^6ywVg3IGgbCgOimjVe3^Ssx
zyQb}OsY1`Cbhh!tm`t+A@%raxp=<L7tZaPo;3D0Ld#AGZMrM<+t@t<z5Y|R_0zaOr
z^ggH&=i7jMB>64{WV6U80@&VaETyIFx5lVH?{&sM&wGph3@a>jyd{E#i?Cxat}mSY
zUMR@9C5x8Czb3v>ze8SFh&~)@S(b}=!v^}jcL+NpeAHeu?x*y$r@+Dk#|l6`K-ch}
zVQ7G2w$X9$SvKZ-#NlS>0->vGGo+*Mb_)<97b+(m0Cj&RU+3Bb=$j`?Cv=O%T({`x
zXfOz!2LFX;ruI3F-0Hc1Cnn-{SD*(TB%Ja8rtdaepvtCG%;Q*@lYq~4)6AZoo$WYO
zLv|L(;LItX6b$8=M53aj+Xnd{j!D^LkavxKg}K8SZz$pFb3!OeKc#%Y%|d%?J?P|U
z|AxrZ=m+-$a1x@9=1MP$19)^cWQ*cm61iy$-I1p!fhu-HOl9zE4@&1ZaZ>QcC*0)r
z{`vSl_|I50&X*8LONN;mv5DEBorUx((Gf%hdiqQwY&pp(h*)`8j_1r<#|3YC*)cO^
z7BsRdKYd4XT`|et6zH#?sOsrUjss9II``!tAD&-&P6nE3B?kY){*Z#EFTZSVr__kv
z)t<ob;uK34CNd<GP?9R>XUShx0DzK?xHL|YTA=9Oy}<ql$hT`CQ{+D>-k9-R^r@5J
z$M=ELbYEh=A<)fl?bRs!JKqJ9<(caErcF}z?C<<#J6J7%KC`ZYi!LX~wv}XaWcs{?
z&8bbH61YiOAS^c6Pw3v}(ahnkVQdizOH~w5WfxIFb0=$02-!fi#=e~OOY@B4!0n+L
ze&}(0E3BLyfsE2gTRC-aX*vjI@E6gy7k=8QrdC8bKJQ$*GP<n}coXWbO}A|;&i;{r
zFdBZ=NmLAOO`5Gt5ARUA?6;7KRl89M)=tAS5IkN{fHtZ)NW_E2B4dwdFOOsV3vH&%
z{r_ByQ)4Xp&YF=fEB5cJ2aXYWU#klR1yB|So`c6d$9GOTVq0TdPq#A`j+ZjJD?3H&
zA*rk=%&Ff3imF47Zhb?4Cv%mc1iA~S6wD|mE7urk-SU1P$2|(Up6XM5zQy9~M^)L8
zNdLJ*DixR?8{a?EG;BB7xdb5vvyU{81pUokC;q~ouoq>3kO|9r=&)yL_3FJ|Gxs|Z
zwCdZ-?I{B`Rbv+(n5mrPb3N=?X>B2Sf2HP)nb<@>kAheWGr&;>W9fbh4Q(Sg1B%s6
z@YzjLIPWrVX_cv&G<=^k<xyLH%k4iBU&1ca|5Qy)=iHAv#LSu=Uh$<=>-_evyUQG6
z9uzcRU7JeTMf>xZc-_zvNpS6j-Ml-(0EbDEK+hm&;f+DP!m&s3t~)yz;$zsI*a)e%
zn4@3S+`?TE(YKS7t<Zpv8ii15Q6fj){<re$46I!5rGtZgTc~i)(9#I%xJNkVbh%u(
z+{txqdp`}S=As&k!<jCh+YN~gjm2Piw8NqS(vHD%?Kl%jQ5H%dQTF}-b!#`U?I=HX
z+y7);_|(ff2WXBv1ZMgJz@Z!Ak2K0>0gp!h-<#nC`vJ%aQR{;}?o-fJ!`)V;zQjcr
zp*1(4<x#ZPt002<raQ_Su#1wMV1zwH>3mZoF;iZ4SeDu#!jL+*Ft0|F$+gDJxulKz
z^%w5?L%*t)`zCjIS?V7Hi$W9qh}S;@j{dCz<|2NL2scqS`1^<c!9vN$e(^jd*&!WY
zRnD-MP7JITm&H@cDFqD-nj*z`4;2b}B{7(UC!we$Krls(q6wIa;9;ETk{kz71)WB=
zlFe<y^J>p+?DdW0*s@kx+6MB1<UT7?YA`c7&rO<1^jhVV11IZ`q8r(oV$fP#APpBR
z@%U9f%PZA0aXoSsKR2}+<qjbe*N;L@fR2^m$s;bNL2=1z?Yj;UQ;`E`>nSsRJm~`~
zF>-Tel?>6${Vctu=^~bhcG+|j0v6z7?Gq@k?XOyt*CwJR5W@9LflFnNUscsAlvLdY
zw#o%+7*;1>_<=>Uf=tlN8t6jSh8l(+gkhCB_RDN=7!6gu-A+e&`|5k4Kg|hlI;Vj@
zkkr_$*hwOA?<_H|FRDfGafn{M@KUXEG_C!kpvwZ2hpw(h)yYphA~i{{y=lH`iLLWo
zX555DCTj`fyPf7Tb7bds;soj*lY%RbQ~l|9NlemA9`R;+1didUb@bk+xx-Ub8=bG3
zJqs^Z6mTO(sOw}ce~-4%9}osSBKEfwu7Y_TANN^>wWQ)caRc(^=3b~Uxm&nAKqz^<
z^5YASU7`&&@GD2OKqzq^_Gx3D5TvU@ZNcY2(zJ$9FYOJSc|)b^6>}U7K4%j<*8o5X
zX4Kk4s-kqz52Xh-=6fH<TJ>7X#w9I9iJlxV=SAIY4mo*DVlZnCeR0c&W7>QgE=p*z
zH?Yrj>ndpqte(=;@J5Uech~a*bH4Pohs&*zb@-ZrPMOA=3UqtSH@qT6TPS{~j{sfD
zS!}Wft)z>2tb5Nt7%fR6+;A}bhWs`P_j$4}I9Ym_Ek6UT)U?U>TTLeewD`MXvtAn?
z1vFi$FzNsAjRyp7b-S6K1x20wE#_ABz1?hL5E&l6s~)wz(K6Q-7z+^W<T!E%_8+$L
zmv6moF2H>Ie68Czjcqc*9}7e$_5)QeWsLi^2w&CMkr$!^FAtPIvF(9&Ww2qjob2||
z@tad2t%<F#gE%~AwI;Hc9BGKaQ9F`DW4D!Jmk9t>nG_~A^M}*pB7fuOxpfP`z@Yy%
zk{hd1TJRiyrSD}RX@!{<&5;l+gVPff()agwY8_VqaG8)|jJ>u-ttCr^`Hk4ddKRP!
zDv}ClOh>?xE<a4oxN`y+|B9!kF2|i;E&D<z4rJ+9jw`%C*?0k~BqYqmOysiw8mAeD
zl7y2N=5OEXG*CJIbIcvFZ4MebGyIKzr>hK4>LW-BmH%nod(E*s2LVZWLij+Q7lLu(
zl`lD46*;xD%Cd|!3aR9_Y@mxLlRwF6Nat}l;wrAhdGxQttB?*dQ28PHqu?Hrm!$YP
z@;*x>Bl<sY&Jr5;ndns6ZJi}WT8|u;2*jH@2d&(b?zvRk5{Q(`;qZDL2p2^c%AbqB
zR}yY-RPG|f4Azk=&%()g)adf1R&dM<EGJkVIn=$t1=HYM0d_h%l>DZWJ!b=~eR@+;
z4QrV2=B6;y!&e^Y!e1%oI;&^<132U>uC_E^sYw^KeF+}5&ERwK8jpU0)xen(cnHBP
znJp6*aa2$Oi9J-a*F>}0WJhA7(==EdlS{7plYez59dyRY?#UMc1747gi_^_MpO3Sa
zDVp88NukjFpONUnbIu~@L>-P^rwJ^-h^xC@9mH6Jn421BWk6Cp)4YZXoJ`CkO)EEp
zaKWW1iqDu`F*Y6-mpI^wFcuBhXKzk_<(|_0Pf5umk<y69_I&VS`*+X}xu;a|)2)nd
zphFheBgX0=XY7i8r^@%Kup?FfplI;Cj*jFvr@tu(N_ja2)(>I8#yJur^qfV3I2=rK
z+M1fsLRvDv{ZWb}B0?v7`;zb7E{>h~@MlAee2$TWAC2Q)-h9|vNIxUY=r@-A=GpC;
z-pG8RXJCzii_ooh?M482Z7&17A1AV2&F=nkl9EjDwW&7h3}>%=U|95CuB8gTPEi;p
zm^*FaNGJuCCpKu@%%<Yi)f7>CC*%dMsz5W>#<%{}u<NH{g=fs)fgvIl%P`}4m-ay4
zZrICi$FXMPa&<<{by6d{tMof3^h@0@AP_Yv^f*x@F1+t}UYfrVASO4J?L^#C_1N6p
zgt@|!9FBMEfDuq>Bjoh2!-)aFxN}S!-K%RVgQei<ODBFutN+%wbIbet-Cmm#NqKuT
zvSAM_eu1{vBUge_%(STEr)H6Yy$G=<N8eR2k&%(Hr~e=K_M>X$Ir>J{cOtrM^=tcP
zAfNUY07&S<(b{;q>~_O7ZVn9~oUyFD5pqx|7LS^3FM~t@<!C_3`^(3#FbFTZ!=$WC
zyu~t7^TC;NxmHKKrCax%l58sl*s~p!o%!kA2~<8s=lhjRGRn})3&$qME~w!SEE$Xt
z>jv|AeNJQT?w|(;d9W|4$cg6j^ExPW|1uN;fNpCVx5sM`EP29Tq;Mf?*M|*QHP>&q
z<(=6%GW2S^1~-w@G~k1dhI^zpr{|%E-2JDl%qh2=Db#&$9J?mHBa<04)F9(!C~^Fd
zh!}foI;3Whh!0GY=c>=nB8f>^EK5JbVk4)X8#QhxaJwtWBVPno$Itmh$murFiP#Se
zuAYAqifvHgH+n0AvN1+4?^Cu{qegYbGFd>x_5cAFp`CulYY{x9)dr!ngAsg?|2*p?
z%3Fpqo_SXif>Vs4IFul31VxLrlvn$1cl=a};#|u230ZVHBYchA_U_}nUwIaBk$?}e
z%OR&U?9Gn#%$5FfoArCg6`QvZCo+eI8n?@nP@}LRwnegyZ<|G{Q4P`NL+2GZmQ`Sd
z_@Y{l^$W|koWDXkc|Ue$|AT-_jix9si*}r6!3D5~9i=SaZCQr!*-1kine+%+e|-<d
zAwadG*1QSQ!NuV#3U~5>r$4f8X0C@J4rq({(*I}kIJx_<qx<w1wJwr|wpVC1b2{Bv
zv21?)7=n|(6LYc?p}JXqyU_~W9tT&*(T(M)-nzvED)p#VOap;IAU&j}S<b-qmYJG>
zBvz=6WkhvD5OU`uH*kd1<JW3#xIRfsx+{mp(>FImy6=EosJK19!W*77^}IXLGa>9l
zi-CQrl!U#2m-LSO*?^hM0M=H;1+dQ)Yq;q!yThtXd)C=!d5Yc$uUEPIorAdrYh9BI
z>y&|W{N`_u9>1`96IZ=lJJ|^cY}Ui&1(xT<kG#>GBXNmjG#WqYoOUTC_G2eFE=Wf<
z(*ILhEPpt2M5zYU#AtiNHwI{se5eJIzM6=u(WKd86TJh?;!$7da_{S%Ii9a0<HN+i
zpG=)b?rl2dg}DBQ+YJI5&7vNLwkXB!g~moI|3>3lLib27PmkW5ZVwDlm)t$l0M-VI
z0^k;I87;@ObaZz|?wt@qyF%UlyzKVIbb#FNyY4o^JS`O7O8{)=;YWY1&_j6X0XB}l
z(ajSXmRL;d_l?jhR*@ODSIAz0sfs-dG;a9*a9#g9eHZZJVYnB;C{Z*@{UQd@SCnJ2
zrYN0c)(o`yG!$Tr0GR2pAR4q3rM_C8g;^pVEIxI4Pei%Y1m_O8z2<|kxPZh;YYs~<
z3CXv@lF`$hB~6`|93E?00a0QkQWs$Z)O**sCG|{1-ft6$t1ru*^PXIhMswC%Ppujb
zyK`K0u$L(x-;V-o42`tg%Jfao`z24U7-=UGcgbF<vzersPAR)pEYWM?ZQKrNDmWdU
z1S!Da@Y}1!(FwYq4{!~^c6_1*ea4&vAS<-f63SZ!2#5sx&t<&UT%1?_@Ae;nXyIp>
zdUF}L2I42l{Fx~KZMQD{V&r{MufqCVlV=YE&0qFf4Mg4kfge!2>*AxYw<k7z?qlTy
z&}dG$+$isy?uAsD@)D&)(KwY5;DD*&0Mj(8{;ME&dS2ZlOY64ah%r2A=em(=Vet0#
zI%*ZA0~-W&_9JLAf7aU7mSD)UuARz8Fu9zyQfLiqp<-WJ`jpbih05rgy!!KeAeJW1
zKQq^LErEJf!b;~7@GhlPky+96=R#gww~V^i?d}h990@6macz=LZ&QL^w-LL*=<nr#
z6g7!VaIaA$-W3}O>?$y*eFeg%opV8Y7F>2w6nI_AsNqY{qmnY=$O)6*q$h=*FP}Qk
zMAY8g*hiX{@Vj&!(DRc+X~JZ};r?*!$)u$)_#-MQy<?q~2}os)D`iN6MmOJc_gERU
zN!@^ZDX2cv5fhOkG0_y}s#`*$qBjg#zb04IsepD+C>&PfTH6FUAMQLOuxA#fk;LjK
zP=1(v2um8L>Y%l<KQ~3t7N34I7~?Jer45S`>H*e1F)iI&0U|EVOZfF05D-%fRuB51
z54Tq<nFl|2WmCG;t*c;azhG=ee+;al3rc!4Azp<Tkl>wIy8MYSX<=YwfXa1KUk%>G
zRm61d$;{Q&zar!mUySaxGrNU|pP4e%AF9l$;S4N#wv%&QVN(YvTn{dTZT0HkWG5zm
zY2ZXpdiq`0i!-EYCAv-@{JM<(9nYrJOdO!5(+>E$)uTFOy!0^ADW@S~`}FF%)5Ft~
z1wdFi8CcF8-+yfM%wQN#o)hx(^SA0zl*!G;e0w4_FGLquY9BN}02J0)LwOcy+dXo+
zya>YIP^GkAHluANc&K#33FZ$}2DP-vGjZOm(V0|_w&p#)l~g^UoU|CN{F@u$f&S0q
z`v$i}`+#^t+?v?BopRlJkm`*A<);VCz}?UlPgO~5mD*k&A72gBLohmey?6}0I_8mE
z0l3WIU`l6wb0}vq4|Iu$4T_9R&>fU6-CbMzb25LZSZb&@YGaS7G=F4|PTPCC``W}}
zsEScU0Xe6ZYPR`1jj_n2O0M6jwS(Y=e|?{lZ6aY_%bY-JhcSL&aV{{rcUxvMP4($=
zd##9b+UN&%-dY@6sy)m_NQ)x^V8L2&@R+1&%K*Omg2vh)tS(D#@GA;_QbNT0UNA|8
zR)fbio1|i4#d2Zb5kMOhOhpP8Y-1+`%d-`fzK74+42#>i6sr2VNv12frpdQo^Vjlu
zhVHm*mmvMEy_OJ)PH*ILegIA0OK7GB^8!p6K*;*ybMvy+^)jsu+*eYuLEgvSnG$BX
zM2NHNaEhoz;588vj7E^>g4+12@b%WFzaw`n#`pcM`m58?zoRmDi{lTy(?{?R(>qLU
zuhT1F^r9BXTLsqQI(NswAk3uB1H^gFXI$<b3PCOe`6LJ$vEBJlI_QEO54<R{LlnaA
z0v*9R-w}|;I;I)_{JQT*FoozgkCk_g@*qk4XkV;+-&<0%{o25O&E*>$+F0;2nAPn`
zhBVz2nP#%Wf8EVxS94i|+!<C#9q-{<Vk4(aqtC)mfuaC<>|Xcj_qrH<Ed^xU2wDf^
z@0<1!WrJ}a;gHbo!-5<cCz?nf<O_@_WUIO^uG(*b1{$b0j_G)OQ+O9gj{<DD({hgC
zF?a>gN`oQ{6%koZxLq>8i)Jm|mg#!7X?p>slQ*4mMU?5mmg@Oy*U~D)8?}Y{<T&75
zPA{E>qSFK}RP`(Qx3htm#x#QGu(c-W@z)j*moMi?sGI?}wgL<aZLg@t1agh!WX{x0
z=*JNbJ|L}7IDkr<+exHG>20@7N)gW9FvulVK;s7qlVmh4y)EGxw(S$+Cqix(#vY?j
zun04?{+M*k@3J!{ZPUR16~Dw&xAKe57H<r?2ySK1YKyN0mefhnQZnxY(*7kGgZ$6+
zSBnMOeCA5-85r2!R}iEYU=DkNLPLQgb1(GH;ddq3x?#XCDs2Dyjn3l9Nl#46xTLG}
z>c5By=MN|w6rh@JjZ<V1r6~Px)qB7qR;CwL97=)|vGO>3)Z(ryfr*HOFMfG`etsbx
zh*Gn_*J0&9N7{jt33&Te!+t-++%Qyg`O;reoR1pF%lh@6TPfKhWd;V)pnVxMxp^C_
zBTAb;KJGt`x!;#<8)#H|-uzWR?8zWdnLcLG_vLq$9s>HRfIUUX&!0?)ee?e=JCX>u
z%Wr^QZ`N*<f4MKf8EWY3kK64d?9O~E)3V2S)ZzuueK0(v?xq1!TPUPs>-`;(AL~)7
zZ-(KCq1@b{xV#<i$FIOhX$OIGb=uXymPfoeK$RolK|o5qMKIZwgU-cZh^poUnV`TW
zEi$5gFXx+{8zXyRk+%vJYSnr@vHD-5CG+nS<oBK?eq^~`GJ27GcyS|w)@Ma2#K*vj
zu~GxYrEWme!}6mu$cd2JPt2f@!}+15By5gbc=T=iC7$!GB-r}6Px>c|s{4r29a4~<
zuk;I)=Z$I=X<eq#6hu$qRMQNgnX;9@i+5|M%Ieod4ev1fm*n0{>v1eJwa&q2+Zs`B
zSU1kLNowipU$8;Pb9c-eWgKQd=XH3bFND*)&M;VLuJ!4YF3eZ1PvF~3&`2aa5v9jm
ztm^Oj9CK~wDl(Dbh;K<9<-YI=us7$FPD&6Eq$;!A>yo%o!lRa?J`nV5Y9NneS|jmX
zGYVa8*IrNbdEvrh_GhC2FVnj;-_O0Q+y@!UYu$6R?v6T87-k1epwffmKYYSlHvGDG
z0rb1D&Q*Pf%<1!w7aC6aG^s@tAq0auV?$550Wj{u(UFHd7sCV83nHrzD_Rm~j(0pa
zGbVX1@C;d1mN_xfv^E>t8&0*=m}s3svoSG1Q7)P9TdsVHuv{cR{w%^@Jih*Kb$~Ch
z<ZX{Yd>RWr|2zotKKt*%L6!-jBT|Wj$ItGXCytvC>~ryKK}L_<(~fy%L-<n}BrRcE
zQ&w^IV5?c4-E$WEZ5!oV>gs#!v>Era##aQ`jI3snN?#1lX=+yiK;s9jqp-ikWaVj`
zmsjJ@+|Dw$uU=oNoidaxJ}<lOPdP!4OPFlx45K-01;;HCz#vfq-Xu_;nrG^b|InA4
zWos!98VVn4S-g60y4md5m18_sH%{aO%h^-sLI#nW&HE$1aa0<}^37cS*WVk%mo=(`
z?`#HE!Fzi)mVhpp|C@IX)@JL>C88rShet=B3b*sTeRKDH(l0!db?9m$^(h-I%apnn
z-Yzt{fB(KpDLm1?>!d;e(V>U5wmN_9YP|A)!rdnan=<8!8+#pq91?6DbF?E~y0d8~
z+;z!1%x>&DkaXK`=;iJ>OiqSH>7zZoq$<UbLe941zX-uVWiYbSHfMS3+8{NnU=L_e
zi1;~YLp`YX$JLdmy4Ma5{5{%9Ol)asQQsl8o=p#UNKRjFnx&xt<6V(P<;MPMFzlYW
zkTPY#er9{$3_b4CzTJ7W*3w;FIyhh(s5vI%m?J?g(5LD%K$n!$KdO0p2Qz3fELGo}
zLb*gzfjuuPB-zP+sxKZX?E%F?AxlwRp4=-WnWVn}rE&Ys@gPjK&<OF$_IhXt*{jdI
z``{zk(au7eaanp}<R1)0rR>i|ntI%41!qDG5&L1Pbps4D|NLsm&s9Uu{Uvkv6#agW
zr$3xdhD7=!aCrrJfzG@`*Ux|pcanQFtqMYWl)1DW{jd)Vc%WKAWS-11yFxXE2m4*(
zE`<r#<P%N2QNS&mQ$V;A3;V@v#W;wq*tM5shoU@!zdT+n#H33yL{P{UoT9Fe%p@L+
z-IzOctTA6~S&2}6jou)B2i(VMXjj}P1Ga(ZaL0;LI&ry4JCPJ==?`U>;{k(%z)UWz
z(^b_immeb9@b2lMH$f0j%<ueKJ3I*Vi9oNqB=~9V_O87-yP?RjK;yk6SnspZ8V#`1
z5C`>#6r{M37y7sjqpiuCRF;O5?_*1V-$44;P(e<4<k;<C?3RLq`fc^{?9R{*uh6hC
z%hiA`TakMCN#^9IRxbsuQgx>vgWal~x0-QJU}|x3VbsSU+&Zo@>a|VA;TG)_SnQt&
zpM30hLi^r@crp~^>~hEc8w!p6XZY<2_^TDu?-$k`kN7!Q-^E+G;+Q@919sFy$U@Hu
z!RO}0Xe5{mR1box{=lIDPh8yCKLR)_ePQUrJyN)fG{2cSwrX7(zGn~2y#BNW&hd{D
zm$06M4D0I;g8Mkf{bE~Rg&Zn#k?&CnSL6tfTN!m}bKc8DtY^)I^AJD6WP{#{;=S(z
zmq~JaO+XGFh4^GR(HZq3_S(wIGNfOgV|4#LzDcgdWDG#9m6qU3+E!lJOx`ttkbYfc
z0vmTV2h`zmHlt~G-`+L@)vsik=uyp80w4bc1*Dd)P@5A$3_%z6{Cx-8-M-EWnf`t-
zc{iV3;2Cbisp2L!C(2J(wOigHe>w7S=1=OnByE0U!?0GdcWINpNv-QTIH1<`d0j^W
z`W}-%SBuIj5Fk;1Go@%ZX-j{9CyX!f>w(^-Bqv^L9ewG61|f$GZklkA-IePaV`?q6
zCcpe}+)BngS&1sRf$I*>JYk}lZMOEON5^|AlmMa7<b>eOz_x6kVM^mCfCAu^2P9F<
z%G_QJmz!1+CPLH;bKLlo;9SP7f;=^t+o4cm+tHt%JukyT@uWq<8;VM<v(##V7PjNK
zw-EYvrE^BHui~XIt0o9lO&PaFR5(p+UsUwBlx1T66=l7n?}pnYrar8;%V&(_b0R!R
z4Q?(wLRI5$Cc?Bt5`nbyde|)ibOajV20l4zV{oJp325xSG$*?$-|~gAZExHzo&E>7
zKJ6XqA!`&fGKS|mwqu0)Y#Z|EyJr}uwcN|S{;QlIF%{KS|8)fh7ST>P*Ry1}M9);j
zl$bZ+vmiS0^RxQCz=cnU0#oJ`>m1+4(WG39YC1DdCU;EttXEO)6%aOT*$1l$vZ<yF
ze72?}$d+Q??{($Opc@76>GVxo1WK1;-jpH!IpT&?>bxeoQ?V26;uop^G{pWQ6D6|T
zd9_O(E2m)O*!B5(QQkMdCcoFq{8|CzA^<dldB3pB4w;AFchGp{K{Pc&aGDlkchG)i
zv(OB!F&()v2YyKPr*gy-_7qpiDc&75jd|~7Mj~k-|IvFQ`FvhH4i)D(R9i?9=%OsG
z4r4{p2R5^=<3)W=E(Q0%OSPXO#H0=uzR?%HDe^yJ1y^ABu9H-Ot8{^h{Wwa$lF*>b
z;WYjK?uu2h!sso(KQhYctBOtV9uAG9g3vitOI~+TZ-AmlaZ^NojMNlyu@YJBEJKnQ
z(Sug^G(3F4Di`kq5$<Jubg4q{7(@NT+2SjSptDbs^zwPcR?llG^o_oFVa5+;5LqhQ
z)nNrjc>)ISy8BPfW0!N@<oN@~?eTMG8QzN9<SWyh)8Ju0ouOwxCMlEpkVE}pt!=Xl
zbASZ2;iQ|WAt+li=TF0tvJ7bR6}iVV<<G_%WPlVFik<bneBt|+9N^DcBSYN=Qe$i$
zemMX>@6(nmGGr@00^gDwtS-CV_F_W+N1hw|Gjni75u^whDS}Ckf3m8#U8T<nXh9y}
z?#ZmT$AI~@MT(#5m>2)W+dRKH&hK~UVwbPMLC3%vhYMK8&w5lF{!2fa|B%J_TkwVn
z<8pA8FGem2=6vfnofN5|`g^Z0FWegB)-?#=ZTyG(G5P>=+gycXzNUMho?oXPvwLuG
zu)ocR7j_cYSRujz!!vMq2-&Pre2+IjCsqRM0d&l8bvt6;Gw|Oo&Mo@<Vd%2of_J+1
ztxFCkN9A{N3;g{3xs_V_Mu4iM*H6iLC^nVQHmQO3CNk7G_3zq#Ru*Zt1?Y|rjqI4E
zK_0y_MH$GzStxV29G;dxKiNcO22az{<;RtQ-r!zD9U-_Fs8N<8M_#1QXfm_L6=`cr
zs|hZSM}9xopfk=+*v4jKNGKy@AgRjG19p#QyL^R1++&jfyZoW*(%4;bNB|j6#tSDt
zmd#~v45Q(Hy~gT4BIV&X=Q{v{qx#(3oOj0yck{GmIgW8<E{UfaL7*b}hQS&qqdY!l
z5J~0wc^6N-G@6K*vM{f9#%+-UrvC7nn8VqhiwgsuV0{@u`1$w{F^R;1K~=&R*oLl|
zuaQh!OD#a*_VyyG-az5bBTBPPR717<RD6If9YjwpekxeH)C!$09~Au<49J&w)P+%V
zt)w)3PE$@A>!geY+LT4tbv6*vvL;ZP7xaNAE7$^2p_FF4ds*V(!Mh*?9$$uYdA~3v
z4btPe@&U!(M<<!+<MF5}kqAKrp^J&ks(U0(RS%m=^+TS5v4(7tSx?{KMK!kd(K}Bs
z7aJd6Vvu7^RoKjbN_x3LsWaq!n5X)vBD!A`74FT!Z!(UD#DB1m`0c}|smb894ipmN
zcYzc9)o4+0Z7QtTL;|!)kMuhROj1?@Y3V{q(q&x6dXP5WgG30coOC6gSiHY1C2ClL
z2Z}*LctlRNGGg0f!%pi?4jk0@vZq>ZLjFRM0_NaWpyocsT0*^u&l59j!{dBYKHe)J
z8KAZUJYq$~X)7?b4o#5xE6-LPFVi2QzvP&pa0K}Zlb&lTXTo;-d^H4%BqVV4Er-z@
zTrV#=SFcB@SW_9k|GvB#l6*XLeDaW0zS#Jh^{!{lz}XwQwxsO9Z=l6#e&jQI8#{_P
zZo6FuftwQ)eIjpeS(Ih1{VQ9YL35<dHw^;^W=(mHc1KZNe_RQ0p&bvb4EGh^wP;gv
zEMyTBlm7G}&U^3&N&jY_1qIhAu?|LV5++SdBoMn~R!zxkup<A;6TzDDd|{tSw?BM;
zIU(S9-$z;Ka5fR|2Um){NG)<fl!aacO9D$*^hZh16VbnW3n!s?fIQ)pr*0%HyZtvJ
zDmp3#=utoQOdHLQsmq5%3EiUzi5iV>@^$|!#n3Z=i!AV14Rl&_64JZVnkIV7!D8Oh
zq>B7~d!I=JRUM%s`=%>O2G2zErz`a#wXWxA#hH8?Uj{C#`d(dagjXS^l&&=~hK~6a
zoQ{}qLcJ_mO#>J(fx~*}uDr=JN1VBbA5o_jT!f)N#Jns+l&=)b()K`}SdPr3TpM&j
zs0!akk72m^1Ea)pFD;Nup+4Nih|n1Fw1!$Gm94(;D`+&O%&S0)qc9S;5eU5dIY9DE
z6e%nWO^yxgUZCkXTCe3&atET;)0vr|eNH?1sFKMUv&@Q*26JurvN4dG7EqB+N_lUg
zS`onvo@%<jeSFNwkIGe~C?;00w3<}xmC=@O4M$*G$diHnqIp9w7rAuowXi#B3H57H
z?JHTqs|<1Wssgw;(HFxz7B&}KUWbY2M0oIWO8zBg$-@)p6?W03<)N=%<+>RnTFB<T
z&}4At%~whFGc|9^2t7-8G3@v_v!SynoEn!)TK$TNjoXqvIq$J_s7&d5Z168FO^|el
z$y0vlBYsF`<b6aX>;e=Y-)q(LVMG&Ig9nYFhODXP(m!&%40MUtZ{%iW^jp5*U`>_t
ze@!MwbC8)m2zcI`)||P)m5QmlFwxX7VFQ_@3)-ed8HM65Qjf%4P!v>3iZx`^ibu{T
zfKaHEshWxL#AzafnX2Q@f<LGl_Cr3|N?P+I!=gLeI)Hj%vULZsk~D_mduA(<P(Y;g
z6>6r*_o+g`CUxh!PXC!>dvb^07<?s$yKsbh&85>LGvNTd;XP4sxR8fUBu>*A!s~~B
z`?kaD-=E1Thv?(o{tO2j>P<<=IS|If<G6p%;rP$`A!Ydn#CjQ|5NkXt-74W`WeJ`>
zj|O^0<Zm8$7Q)3e7RiEU?jpaE=VQ_sMSyGrLdpl{Ej}W^K4k%ODG+}*+nCZ7=|Ao_
zN29aEMY_ky-Z|NSb4odN{mSe@VDDMlj0tUh(&Je0Tr#LE`8r#pZna`Luq#BdWokmS
z5T8^i!LKl=Kz;m`bg#^(-t}u;kA6>iMrtXz)@+fJVLyIGzb96$-Ny@h6$0zw@iBA`
zIZn{$LcS$map(Gqnl5$sTlLg~R6Hr(4g7mfUs}AJRQ2$T79oq*9T`8gSmXXrY4Nb!
z{A|mSwltz=W7hxQ<#q2lA4fjdMo}N=CBdzVA!5;M<!s5n-)*LduK~XmH@O2xtDPL*
zH#69Ra|GC~twyzb&i$fNtAQI=t98z5lsaJf`#}=Njw@X9<04JFy#jbwk!a#H>EM>E
zswaA=t)##JP1wEU%&HznL*oFetuTPOcWbMuy4u;+*4EKY`$TPDD`?%zPW&Oq!6-UU
zp1VXp<FYX@Ft8tGe4(cHh|Dbdi?Ck3Rzv8H{YTIDPm_;5IT`L>mK6QDM^JqP(R>*x
zW4?StqZjfP|844ph29-0(fBM<Qw<uSbDWk3D!JF(>{q#pG4!&_akDnNzf}7OTID=#
zxU38WFqt9>W$^;q-i1#slX>DedWvjPS?>AvnO~PEb6j(KZG+b3%dF*xv|pP*X<<QG
z7;hZ(1qsWl&5Gmiw2)&*ziM~cY;#3&kkx2)UTxAGCre8)Fi}twrEku_DVod)WtG}4
z!0<w8g`kQl8yq5TB+~Rnx;km*7F@Jg+%YH82}%n1JT;eYV&Dt(GNpglSFd4yG__p4
zM^=$#u=z`v{bp(vX3c9(mzcKFM+`XQv=dDNW%$7%hlO}9u-abNRwmLFVoeF9UaXAZ
zh}dNG+UE7mVEbjCh%*QMBv_-t@wU4$@B^nb2@Ub<bNE%Zf*YjIUU@Z`#OaP23__9d
zJ?iXz95XF*1SZh;(e;}Odz}1FLJJXbU}+;;3Ig%pOVXG=1VPLi%DyhKHTqpLMKbvi
zc=&UYbO#F`ofUg}xVI1!85v56hB>E=Pq(tOKMTe)U_l6Dp!eI>#5g!v1XL_OFld0n
zAUb+tUiNM8H<jV>#W1lSyQSi$ftnEQzVIm<DxP2Tyhxl;pGeyHre*iZZufCVcL#x>
z-BjcxRDqDY2<j|#IQna;^X+u!bU&ki`*YYm7VCETyN_k^zb8HB*oSDA>|Wut5-SZ-
z7V6xac`X$$75|Bt1UmyBM2{_A(9Vk`<Xb|t&^UMpf}^Ddbvr@@d57asq}^1i&VS;u
z2Y|QmaHHF5xpT`2+jFvZlk44<{T-GmmHN^@BtUNIDi{U?SyEvr5;SHbU2Wo^f=9I~
zAstH$U59PjaJJDebo*e*Bx}mD^1iY%4bl?ym0Xpw*0TQDuvL#Hq2GGf(s%H=exem$
zY_Gk6WSjK>9K@7<x1w#L_|63s8rO)`AnlNAhtwO)lYX2E8<wgkP=6$X;y$G@O=X>5
zE(m!0{e!CpsP*IBSI2cy4*mfi0G;JlRAC@PFkq>QZnY%>wN8z7T_^w6$0<N&MIQKI
z$puWI@VkW7k8&LN99q{QF)|c0twuG`7Y$Sv3Q4{QN1?ONvZ{Bg-vJ62AwT5q5~Ct+
z<5XHmfVQ(N$FR5_<&E#^yanuAt}(cNO@hRYM4Z8|>v^OVmKAp2>|1A6p)`aWqisl&
zhd*mvym+IrqiB^Z`DMAzOBo`??jp2f<^QUFJV#^Mq4V?eGTQ)iusmx(VS|2nv9ZAD
z>UtUrvz4B@Q+PxKQH6T@0}O+~sBt7zwb;;n0TsrS^o)He09HPK-US5GR^V7xNC!O(
z=HCWHU_x7TM1(%#J5|p&JsmFc7H?tI2`o>EsJ*{`O#-11bWJc9H1MkmA8l?mYX3NN
z!A^!-zVH7*aiuakk(w8T74<`(0RbpwJuB;=!43!`8-oB^$11a)B*6-x^f-Dtr3O0X
z8d2|9&lZ^OYWgZBI^w>sC1dg??zx%?Wv~+4;CFqY1GjC^;J2YydS_mRcZk}W3g(5r
zRs(vbBCZxM_Rxn39!n%aXk~^KQxhI^6ROf6XE$Hznij--9>}B%A1Q1ADfCie;oya^
z8zpmZ1tuxsT^$|lA6v>?E`UC$lwV;>A0!o);WwPsIWk$JIZ{$X+|kChMvCX<`d}V~
zjY<D~y-*}cvb13aaVAvY13f5=qU+H*)OtugkR2H+ZASG2Fy(1)Ut7&YH9_>m%g5Ne
zJ@a+0+RHM9>2L;vibh7}b@~ZFSKSgmhREiY=KNh<-)}02q>4DFgEhd@hR*|C5~m|+
zuUbZbf<S@Nb8RH|C#Vg=-i&JM&q!}3A%1tT7y<avzJERx!{0|-_L=)SGPMwO{O1oy
zw^Tt?d(1lWT12(DM`C{dyro0*Hr3(H$$0FEgTA!of&vW0^pkM58rE-x*6dXomI4_v
z0b^XNc52C*oTd|<cz@g#w}CUDeJb9Ya)Pz{ID)(%CTdx+a&Qcmz;Pnmfen2-Caq<#
zb^nvKx|&_5?(Qc&kfe_hbz^FeEov`3pYt{x=q$KDu$mRf0MhbeC1`zBDJ`+#`g7M5
z{N#PDRFokhu6vG9!%gCZKZSGkDGy!9;k>4+f+JW$Sk|fw?lav?<kOgtEtX?q)VKig
zBsm+Ato;}t^6hdo&qTO58T47Zv7@xLkUD8g68^FIx29~)f3z^ZVhKG1eH-yJJy?co
z=rb4U>-Z!$?tI?S=c(^2f>T38RAkicOBr>fm;aBObj)@*!2sFC8O!>G;ME|+z}e$i
zqsRjIv1!Nq`@pa`Z;9J&xd88jMZGMW?*^O`5lS2S(x(Hl2X9UX?wkTHnzQ6d`$Eie
z6NZ>Ez7l`tkp)KI`o0zKSs4-01fh(pF9OiS&kBY_l||_*MJrhbF)`t{-xz@&#>NUK
zKN`X6QgwLB)b<+A(`<<VQjZ-gF;alR<CoW6u}J|OWJayDkC6*DZQUJeg+}BKuNmml
z(t8hH;I(UrTmuR<CL%&Zry?DubPlB+AnGjuWP&C0Do|^C9{7y_laq_S-oT|$HQTAx
z{qSJo)8!J87kgV0`R<})7Y#IW+z1jE50ts~_Cj$a*$juBRRPBxVmK|*Lrkg^SZQSd
zWw0smCL*YH)=*n$=Ti>G2(9iw1S$IseMXf(AaGYz0ps1f*_S`XyA)HA1Dz6<2V2Yc
zM62*J1I3)zMQehH?PXbwydF&V$&u*)5N#7<kyo`m3a>D&6#&-XDwpgp&jN^s%O!fO
z2=Vp&*8fp--qBS5e;mJrWL_#=qi~CFgk0HVQxTOF*Y4UkWMz*K;+ol`Wsgh7wJ#Z6
z6|StSjElrg*GShEe(&Es{&gJpGv2T9d_JD7O>ItA)*mkuV7d}=^C;R5390|U<_!It
zx31AN@^?e8D8ge6hEFornFAFCw3yp^JYD}AFx&1&h83T-Mzj2*ROtiT**5h47;Ath
zHtSLR*HnkE()jQs%I+I(MIBUpu9cesh>@GIGID$6f?hqEEkuZxZ2Ku!eC&xUzsn8h
z)<kD&*3$pd`g)<8@m3*tW}W55uZ5hl$%yz-^EPEP<h0g22gH#b)CIM*x6}$dRn>-Q
zjF9d(57?Q#T#i<J?kE|!2$^x8L7a+u8jbMvDj<!AP{?$;kq_iD;@QMmr5)qj8_6g(
zm9BitmTgJ-lva|tK~NCD8v0Ldggu$jMF(WaDGyU<D}f!Wmo9mR@6cY~#H*g{t(Yuz
z)&;}i`M|7#smmkPvdL<B;=og|;rTKKy_@rWZFiD6YLtH$nX3{YiY=oXzpaM)^<^^t
zz~K3Jm0K~b#p)3UK1yn7TZonwX$wk+e|R2XZi{E!iHz!*e)S1Ek)8#ry<nTk%#i<3
zA-!-L&4qWe=h4ysvA$WTqW2n;e*B+edY#&yw1r?8tRRJ;Y`XL|ffFq$vTe&132k<T
zev=S~zdM|HIyQ1QbU8Un3(Er^Lx1f=FjCKLT?Uo8;Jbj%e~tWStZ&*rInw)kfpvV_
z&gOKQ{(2gw&NIh8M;t{BJEeE0t<s=|CB#sV@OoA53S>+cF`#PK;)@)k9D~R=pDzP#
z8P-5RIM;RZXK4V)5PT`?9ytU+H0s<v-rF-pn;lZ5*$$7$7T0D_%o}8UB4Yii%8}gh
z?EkHQ)A@7%cDlk2Hci+z7as~1(h5bge^mPtE%OTeiQmcYM1RW*8jU|=9UycXd;~-+
zGbDRp@Su{ukz``jYKjIE=(X^!e9FF`GLXsyRNgOd6?)ecn2Hb%)B#Nhoxa%8@-h>b
zN&{aHsDy%m!6o8eW7-0<bTU8%z`?L8B_(B`K6$JhZ;fqEPjqK#yb>iFGL$3Sm!IFO
zFc*4*t#@(pGH)Lffp0e4PtW$$+2ZYo_A7%PZf=H=_DQ10W{fk$|I}V3_e%dVas1Rc
z<qC)YC06~hmE@bT&oo9`X3LdqZ7%lDdiD=e+iZyqdKyaw0qupmM&}7w^Gph?ZqPU(
zJTxb@`XhO7fO4H)#t^KsDt6g>Mwx@Kd*UVK$2*~&8OG;7KUfpyN^d3x89M$UV+Y=C
zD#TcXczgxR@3<7f7r*xR_Yc)0?EJ0rT(qXvo_fdUeI^WIo6Jb$5Pdfo6IRV)EYFzR
znK~nD`1?Ic@`@kns)OV#PnsNd%}=Wjs*SdZVf^;i<nbTuw&u+cVj#Le7=8ErcbTg5
zardVKH}E)H{fn`h6hkp3ktQHfmz(I!{KBVu`%_Q0oLj4t5j7N)M_!(uvEJoM(T_C@
zs%|n!O1b4xS>qYToldFhrk40VG|7e;01&;Iw%(RY(NK~IzdA!t7<CtdI#ZYNZ}0D@
zCXr%Nk%>XrbAj0>nhSU3A*RkXW~ogmfG38dn$C#cSY?+)-Wm;_VU6dS*gVCIkex_j
zAJe&Ami@5Mp|BmX415{)PBzs`8;^(P{^hr~hftysBqg*2VxYbtJR&qac;6>?!}<@n
z1<%G%DA}U}g!e4a?^5g<Y?gBSk$Mb}!NJK@I_L^X34VhhM!0YAz(?a=Ao`+^yyxxs
z*rrZHE{G_@h^kYh?Ddq_U03TpQVFwC${nCkJXuNHMsPa_Fv;*>kbJdw_<N>m!juY)
z`X~K6$0hd+U?qN!u_WV=>pmU@)1q8LU-HB0YE#qd$SN1K1*!$L1VWi(2lK5eo6XS+
zL}vR>pPpX)3g0+hu9_;p+d*<ofWF-#*U8u8CnT*cVxIH%@u9WYh_S+Odeath8{M+>
z8W5&?=H#tf)O$JG&NXKp(jjct@3y0fxJ?H2_jKFS!pQjYkgL{~X!ybRt@9A_>?~KV
zxsQ~Y&mDt<v8BW8Me5PonnRahtT50VvvPB2JFeSZ5{%|V#KKU(MP2!n7mb0s)$er-
zz7|vmhP3_AI!Dag-Hl70JBPE!dOEhY$s&}oeQWe~_sMo`_;&5_gdL?IrS`hPm$EUj
zY^1&z4vT*El%%9^9ccQqNYAWX*6E3dZUX%({${%X*tfT5;X@YWAmq&hA=t;IDfcg8
zg!j{HTmKIJF6ka&g<}8&5QEG_lsK{zeie3)1`|sPuZvvZYHe>{8w3jD{r!^N^@53O
zO=`~Z^V)xeU00vCb&q>(P4uTkY**P$RLTP^(GuAJTPbpb;5znAI$0^Z(OJh6pIPs8
zq2dFl&L^0|9WhB7w-$`hZ5f~_@c#lZJmQ<Wx;ikQErlwpcd1^BYWCp1{OrRL@WLw1
zrhakWo5;x_wILu=j3yu*iWQw4?lECmCTSN_F3!ePUh>m;{0$z+z6i7>Hi*>@!y|og
z|4sUze<n--re7od6t139H<@42{m6;gk&xFEVDbaA>8t5)M`}1ZIT1QnOj)1cNp`rY
zWhe7b!2$$E9<!3f_y(fNoMpSw{Tsvnlcr+`$WII&=Wp}kD&bo&7S~j1nq_WH&pWCR
z<>Y%&ks`nPO2ZBg*Xu*FHVi3c8V8IO%k{Hd=R|Pl0+I}EWw0j>j7`7x*skDgjVJ^A
z83xAh=0lZshcc=Pcp6Dgi=wGkRoHPPoAK5&I!s;ttiflC_e>CI{FZ<lp6ea6jNR@5
z(eV?qcJdKL;?0O_qp|+tdPj;dC|+4xwzo7iieXq`aUAn_4f*$))^!^l%K3D?TAK6b
z$X9bAhm{YG;{+Dv)qulj5!%*Xh6x!e(Fd%GC%Y%KlkMq~gSqwfOY0l!a}nE{U<|r+
z1a5@*?qHYXG7yg_lI$H1GW|}mc6D^{)R164&Q|}W0$xTd8F@jd#{qW6kXVw%T|FUz
ztk;Q!q%y`lhdj?MsR)$??amo(p$U9+xrJ#AzGPoU&@KhP3p`_O?OiF|{J*ZQ0Z-Ll
za1%?~%ev_U#_9dex=w6aqb;#)AE1VxNvCr$C5`{a17em@Y?8_B^7-SE;efLR$``Z*
zv;?pzWKQBib0nwc%1pZ-<i5qchvmf2<+VTbsa2Kq3IzQaUS|1-!ee^zEIL_28x+$F
z($p^QW<?yd<$v*MV6nbwcRN2JJGJR;^%M;CHBg-`Jg0KIRTd&0sa(zrjU+>RslP}i
z7A}&+foi?89<eim9B?hdtK?cI!@$JfCG1kFsxvB-us=Yh1dbCph>rk;G?>fOtqUIO
zKeHxM=8le=Q|{fvk@iD(h{?Ty-!?YZJ<XsC_$nNS_FW|#DO&v?5Ir0og~NKBEa$@Z
zEc5|d?cnb|ZOMlzO%XFtKbZYdXv-W^2Oj^@S=)^$A?fRh;#;L|^@N`pka_3k=CQr4
zEerJi()9=~tv%4)b_@cyxF9(qG$;r};fy1R*{12XuL~PCE`j-%-KE#}%K9f4U!*=A
zTL*O?XmM#EkJTHtO<oIw1*kv~BxSS#cA&oet~Iukm^-lauYK-FuSmJ$;i$@F#u>4L
zRG(zT@j>nJ@133XI6<|em6V7p8v@EWfk28W|2ujpb><?uvg%=y>xO!$G^sd4Un*ui
zm=mZv&a*TS)&r7tvx3w*FEQ%03T#X;EI@QF*H`<;=vfL{pM6ta9nIzz97YU!b)D2=
z?1Po0tm47+g<Y*ULyE!*=U9|BG$HcR$zF!NAb?2D&&oxaQuOcoOGp~j^>zFPwyYrh
z-)8?J)!Q<gH09Y02I_oA{%Ur7vLaQPvu>lN@wIn!$>6KLr^EaUzkzRf?=xucA&J|y
zwPII;X**(kFWpxOt;CsM?3|)SIkq$)n9eZjY?^VoVQoeXWL+_1Z0I@?R2L;{Y?I@}
zcZ#pb=~*yd6wVVK76g`{oSgg1J9;s`Vl<hmXZN}~5zaYGXd5w{?ODvZ0ZE+CHRkzh
z6Pdfh=IGh;YGYbGQ?*)NHWueKA7iP!s1FQ+jla)ELLT7liu~2|{1v>i6w{{oS<gb@
zFtcYx!mD2ne^;D-zZ6}LLbIC9saEIj2m8KV5nLIBmt+ZZ9Om;w0)xUw{-TX<UZgFi
zynfZCdd%|~d3d}S=M}!Q1}@iyvG7JFcF*N^Q<b>&<8^XsP*9km7+vC*Ajns~)BMW}
zK(2`z5&^2E&@)6gZ#sD0<E7%;@$b;w34iiP!1$TQtR8-p=?jkBP!RN<4}r3%&2%09
zuJVfYcI+<Tk%kt~yx+@P6d~>3HLq#A#EJ4}rB5z}wib|_ct~IFIF&XDz!+AlcYabw
zacgqZKc|kPJ}G;JeT@;P6!||h6@rx83q_WP8Fn4W&D);P_4v;`zLxfHiSwHNe@N#8
zw`w|WM!_djyczS=!V+RHmJxQDM+quk9%5GE<2*5<2Q;YdS$K3}uk^(PMC&*ypH5gi
z={fK1b5j4@o>s^(I}RlLTRKLgJgT<0w<khEe0{&I5s>Z79U;4FffVhlI2_rx#wuL%
zJ)!)ivq97*`8qA#m<5qxP=Q-cN5x}(w$dUK2AIeBR44%Lu<18497O!yFWsk^b3Ge0
zt!HPqq<Y>}KAXq;u7|1(gAwTfkmGSJ^&ZA~bsaA+uS>=3Q8oa{xL@!LJ3Xmh&L01Q
zngw`dg9FCsqam;GD%lBWT-<B*KH)w&S7D<K0hC4z>C1k4#AbsRcr62&K=!`G%n_JG
z5ml~14rkRaV5_{S^?fi;4h1}pn3T)@Ly?WdyMB=YT_u4h1$Pb}&ieWKT7c<zTQ!=6
z{1L``rECJJk~}uFwwCg`Q-b=t0}J4SKMGZ5=8^#P^Wt?%Yd^Ew#LG8X;_%jtlY{Fg
zf8#d7hkK}bYN7w64T})xw9lnIl@qUf@4o}(z;657Xfe0X9tU4QWy<{yM`5wJ8YQ$8
z{7cryk^y#MrHvL=_+2o?T2#Cn-zbXJ@wsf&D-F5rvVz@*61-Js1Rh~Pc_C%&q}s4W
zAn_R@DoS|Ku@0;F*TnyvleJX^JWJgMOH|cwQqobF!(Jf4HTW#jOg0@B6+)Wz4ypwv
znHy`I;qv){Z7!%+X3+==|9)VPuDFmp*{DnbqvK$_3o9fVrU9wOV%^LFqNZ{~C{vZb
zJo41?Gu`fkqV(~9s4K9jVWoie(4TI3QM<s)Ym)_1vkU&VNf5rO4&$4r___AML=qI_
zyEFU{DH*%vTwM%~6il}obGy9axzL*|IOIKHhrC4+kEg%#$r01$AF@#*&!nm0Vj~a>
z?GS20TJ}X2r1U!o>?$;Cku8cEtJd!tbqE(-wq7VtYPZ+?F!b~E)eg-JjFn`(IOjs%
zc%|@-U{ClvO?a_W?u|6s@lijgo6Ws`lh5Cglj!qVw(VO;#EIMfdOx_AHsjP!ypDIe
zXd~_Oj4h-uLN?#ppZJDQJG!i;sPzEt@!^-;=}0KP0ilDQ^{t>S)${)c`C6|5f$&gf
zpr;vD`3}2hia}PCAh<7|gzS-!sHRXghjTLay2|P^>Z?2L-Y|m-ky_k2^%!Vuety7`
z^#q@xmw2=xy!2FcbE6b(DSd1z<d%cx=nB11o5U$dMW1zK4Ubq0-1AM4Nn=_+X@4HY
zJkny1)s~r7X~^ZNwSd#kUvJ}C`yi(sP*hAFkD88yJTrB#qYH;8SjRyNZt!$N^}Tq_
z&2D+*!?~?(Q2EdHD=SC332csEVN|?=!;|o<7~Y`Ix_-LaLeNK}p`X^CwI4&hPQf5P
z#8TYA(%q{80nNJs3Pj5Vt#TR5FF2cJu=w@SOQ$W#Vjk#tdT|CbYi*dV=-D$d#bk_a
zrnuP46H4V)R-7fmwkP_f_WlMVv$q!(-ew&9J*eEZyP{l3#BMuTa>m_^cbzJ`A)BmC
z4a=nffrrrE6U9}qY~6_av$V;Yru5oqK=rgB!Mk5GctiYGJxOgwZFT3ctiOzGcfPTX
zbdjTX;r+vtEdLm$JM+ss1laDu0buN}R?}tV_jW6ByX|r+EI#>WcnI|4pv3+hqK<hf
z-Nns|f+)iSH#Br!KCukeenR>PX3<B0AOKK!=~Dkk7VnTdcGVOGPx<a5iVHdqLDKcB
zOD9`p8^hT*c;S~96+0pV_x6sD6q7dAH<q2kZeEJm+obLLh?NL^1@O~SH=D760hjfg
z9Eh^V@@Z|}ByXdVCQ>)w4Gxq*Y<dG(Yv=H0d<N1$OeZRmF@fcW*>Ht%F|jS<ePkcZ
ziz9B<(+g99{q2)#K#F0<t^O=v<BgPjtfiCXUxDR%M0w@Qp|(XtFlhLZFXyJy>Z>IL
zP+PVTVUB32!q%U((3%oH)YHQw*`(%XP)cTJXFJ*8l<OP$qAozfQmc8+Ch9M!1c)h>
z6;OBbD?gEHsYnwMxR&vWt3b_WcMn^~(K?)fJnz0<T9`ss1b(|`+%NSvD$;V9tGDCd
zOAIW9+F~9w(w->xL4&c_=A&>5zTJA6(tm5h7)klm;;nAfZWFKB*mqiEx=IU!ZOUlq
z3@bGIY*@|~8(Dg6sXhAO`SSGLQ!<1VO4>8V4@&nOqXL9EbjmBP2}$6~2L5fWoE0_-
zP+%S{ITcmM#F(q_BEdH~B!Kk$S<7-Yra<{UFT=BVQ@a{^DEodp`Bk%&y{1;t?Le_S
z=t4mvSrC+p8|!}BjEI>t7SN;{B-rl{!*-V<0FWqRpK2%k$m=3k@M#`Lo-1cjek3N!
zq`#q?9D!rx)T95d$c%(AJjP*Z;$JLfv|U^nQww~%Pp{D1z}h1lkw8Sf^>-`K*H_xn
z^kyUj|9g>n{M?3~psG+qBbnuGAf-hc(~kW3V>-RD)xSCZ3SPW416(kXYeSbIg_bn&
zpDe^<*@K#23;tY}AzE|{KrSX2#YtjxIc0fE8Q(N}YhR5-f8NukhOuIDfw9d?oa7>J
zmni;Efy40Z2Z(HC=h-x04sV_H7@-+==jEE!D1FIKk9jy|TJE3YLKU*Ux_zC)6;nrO
zgM|rvCQOcr+?hi@gfjLCA>KS@IW%OCnij1HQUA9#c~aT1MT+Jmw$6HTcaOQ*X2}j4
zpeL00wH>pZhTt+B-KSMJqH*SscPV(>;$IeeL^9!k5N4T0cFJRNu@*DC_;pbcqJ`ex
zU((G1W^y1AoCAV-O%tq~S?2@^$ljy&=0B%Ov)Bu~>&ze?0`~=*(BDN)^^DORC4y#t
zUT!PtN5L6kSLF-5f60uL4qsBvt{|X8{dtyZYYT|ON-jFT3sLc~I57K2T9zn**W-vV
z-;jvIKTp&#he(lZfRTCi>UEYA5J4?l+oRNlDcq=z*6%k4WQOIkK|J1be8<UN_u-xr
z5F#k%lg?`t_@dZQY=UI<;)4Hc>GyZ%!XD~7#d^cA3WrBq+Yjy50!+NZIwklUF)=!j
z;?}36saa2jB$bKc7^shtyWsyxN2kdwut)&wC8V|`BGKYgKdF7XGg{a3xNYE$9vk>7
zLFNys-lTLyVMx5D*LgmfJdzn>`4oJix@hPgp;}mOP#ADM(Nz>$P`=b)$+<Nd;ZZ-0
zUxP%;u!Nv%(%Ne*_bnG!{J2MQ{)}0bXU6JzYV|eWe)!;`(sNkB?$nJ()p&fnb|kWr
z+f@HPvjO0>XfykHNZ<c)sfkmRo0l$mgdb%V@UTk$il6h-8bFxP&LoVNM#Yv2<OsJ7
z`g*NN%KoHpEIl7GIV9JoXm}R>(|YTW52b}6?RR`<rW}PUuwUkjW}a~uE16`1U%jtC
z!D#BWL4av|wZ55#g&W+l<hmm!Cut^(g20|ZfPR&AbD{V2g)m}?_XeIHReb}(5Q8;c
z7@9fzDDV$w9zuTj2CLnE(wY#%yx|S}v&!kNbaopCI$@m&dVV1x$5ik5IkvgGV}VM;
zw3QhoD`|22m`Fc7eBCbWa9eSWST;${g+!*My<mHZ&>8dG0;=^l+!4;6VQ7dpok6Vp
z&CB!4c7G8}Iua!beA^Z}S0t&~!*<F24V6!@IJ|<)_;cMnglwOpk;=yha^FcRA*}^=
zK<2kqrXCJNOCRYUbLmfVm3lguXFKT|*p89veQmH7*RDlOSu7Y*{xtEe42b)%S2U3h
z=C%B~Up6=nPVBKiS!&kNh~~5?9QY%jUt=H)hr`-D?XP^8RMiq8FhkyWU$EEO0{9UP
z9k1puzs2%U%v|h3^LhcJO-8&_{I2j-2?#T=px|lIDVO<96JEcH7$>$F{!5J4fB%D^
zbh4#y{o;e&$wVzl?#3t8>-SQI?K)NIK5YixbrV6MnDGEGI_iEsv;SAXqBQ$hcyj~s
z)o}K}(u@_SuxIP}lAnup8Qeu(UF{07wsy%xOPR&P-3<()w)Q$M>vb4y+jDA&CX($J
z5kf6k!}6l5>)_Dn3#qjIQKU^)CYzgvrOV1pE=X;8p}z*e8KE9#!+BGD#+`WlY*M?i
zftV$O?JK)iuU}d29UX#FAfPQ8i0QHgc)uulyD+vu>oQ~vUm|vv_6?9@(tS|irYjpz
z?z3U`QL;mNfKzCL+y#yZa1ueIa4$(B!x!AaV96}Fq<?cI&~|bd%&(EEH#s(+1iE4)
zrFqBO6c^SDPu(N6c^448CtH&jBQ?a9$W!$a;2&OEvfxi?Uzzse9FTdP{2%-=t};}R
z0g0BFaW5=A!^Q3qBhSsW@#2?Z%WXpZpUjI~Z`QKPu04;3xp38y%=P6RQ&XS&ube^r
zwtqr$e@&532u?&WM8qSqkXG33N@H)}DZ_q`Z=0G@z3!f@r?YV+y;|T;Z(O?{^Tfjl
z5et#8`dcrdJh2=^RI-Z`k=f0d%9N^4An<DwEV$m2AoDm5viQwX(FC$e12jDq`hc3m
zr^{dH8+?BGT9FCb&}|K0`m?%$jNG^hFF)515RRH}JoOP))uy9ej94WqfsB^OIPY^B
zvN;&HnXo@BF-=p9eHMwaY{pgomgW#<VIaD7I)h+(XAYpjOQu$+_4g}d_ZbRs%&vG3
ztnFW2!9Xm^%qHd7^ou8-w$PjJUGOKZU}7?Mi4!t!BcxrKxmHLgp!bvb8}hjXhKkzl
z4l*chB6vBO)z{evnhnkK<oq$g$pUnY@A`Xnp!BuogUhZntt_tt%-8HvXz;<`4r`uy
zPd5K$vNAkr(a$$z%*rk+Hy0~MFEOfF9H4cDDsOeAp^;P|yip*ReC?FI2cG1N^_5aI
zYqKlzJWu6jU{ZQT1QQcdhI~p!Bw^<9{EbIq+bf%v(UtegOpHFK=U&fN2Lq7oG2WK%
z&8^S#e`#niSb($vAh>TN&sP>{&p66^)t>A8yz9#E7I=QBVf21%wZ9a|7KkqjNW<u1
zQdx+#tlyfH#Ck=4L)zCLw%{-78*_1Yf{c@uX}Qfi0~by0JgOhCqoO(38Dh+e9Fk<2
z#xF4>z4D30K!XK)pxW`Sn0GJQRMPz=z9IPu3v)OBt)1+49lJp7gIJBnAKb;fnJwXq
ziiFeujX2spIn97zDq-F2l%i2r96vP?Z9V|vyWFdh!<PS0(CmHVF9l=WL6a){D1!HW
zshZu+<pLiPCx+(9_aA7Zn#i#Z%-wPGNhJZC8^C{Xjc$K44iE`HGFs^uikbypFSKk2
z?<ug%V@Dt_xMQ(EW{rCi=gkMi#*H-M5UBrHA&n$i{*dPBLSk8ZB4-V$&6)@8(Q)R@
zjw(HxJ?4GsHfW_s8>e|?N-_ra-yMlYehWVP%nhdETeW>O9DzuR^O;-T=Z?Mq*F2-d
z)r|NZy0)wGT*X+w_36e<I($cmt(0%Ya|J<8u4fH|_V;sZ7$5{$EnNxiWLqRE-@2`o
zErL}l8QUzbV+8v?Io+=u-tpe^<V_~C&a3=VNw{+|F)_li-hA6zBQchv+=_HZ_(p1a
z{1;=3tyQTb<O?)rlgyRAqsHkCi!Bc<WVX^KD<8qo=OpGymA!@i-t5AE%rkm*91DGc
zp}_+X$B8yEX<w0NI}}KP6uB-_^<{B|uY&htu4h|v32$6X6Duh&>PP1LN?jGAOQo65
zRiEgbZbgt_W5|h@ChS4+<(5W~8u6HN;ke+I8@|Yw$Bev1RTHs`uAa66@pr4RlW@3~
zIeVLE27NrjIsO!Uh&EU#S5ZZZx=n7+Xwbi^{YT<1!NLU(gj~8IrajMZL&lqhGToEp
zNKHs$nJ+)n0umrkUsAEhe5$tt!#*{M)Hl3w{WHA;s+-BFoOaPXXc}<5rmYvS3J+%M
z<|OSRl65XI>`nP?6rdhyG?M(B4R11P%E-pJR`bRol238rklSlS_a|c8Ys|~!XJ7fI
zOEyJYzR^F`Wc}TG3T{dJb9^L?c^$qd`}#WH(jWtPbIuqpcFq}~C6~@an!@k_eM7G%
zbRjVq@1M<U0`I+1S?OrWHDwcjI<CxH+p&YyC8D&pSHugb8CfG?9Sx!Ao{JDE3Js`r
zeQe#a@skV^4D_xg$`qg{Iqp_!%p3Q<=ujCyVv&_c5-AzCil#m-A}~B~A!ISvo?SK^
z+`qt(`G}Qd36?^r0_#7HxT|^g10D?iSE(-#ORt0$U1?U2o_^}2P<Dm&UvbDkw*#47
zkC{q`mkg==T^`wdXWdc{2xqKhVn10@tNk6ZoDMmnl%rUom9<YuCf8QB-aqTJDkw~+
z@RvN^J`RRY%u#zL5}E(5+;EdfYIVC*$0`+D-m{nS5)%F70&jq~0;b)YKE??T{XFDw
z^6y`;nhn8wi)cY%Q1(TO%+9Q@dv4H`Z#%Qi&IrW+2~^&;HTkXNLYWz$q6-M1G!^jV
zY|SDJ^)DR7?Y6st&h@o=ggB0g8thflBn_lXm3p@)tUaHM$@+M)(0vqkuc6jUiItR5
z%H(RXPa(37eP4Rp<MR2jQL18hh)L*Lnv*TSZyzBnP$3)o_m>K=MH#+Sj~PQC@)~Me
zc7RzhXpsEFg2L1ahf7RnRVJMnBVnlcZeDb}PVw$I8rWP-^bbfoq;pnOEN@H_?E2Eq
zjyz$Zx$DqIGB?%O(#|d=>3sG2OxurPocu=b(5#J-66FWpm$+>0^TjH1HN5t8&6fa5
z5Kv3Sy%{&QeTDVM<6w&mPo5AbrjTg8XdZ}3MTwaX29dnjGJOx?iuK6vjL`UyL6aFK
z@yXTwI@XNs@V(~XVW_F=%#_yYPfE=jH5HDEL2&5ba=RVDOjq+IkZXm1xqI+U$S8oA
zDyzjD9v%XK8;JVpL^)T;q5q@+8`6Bi<h|F)FOde=30NX?LR0x_Nijgq(2ps86mBAs
zoSYnVzvZ-&L`xo5ydYJU<<}szD%V!=&E_vK_1nJ<rAm<K>V_uZp|5k!EA3slk2a)E
zog)j=>j2TxtD2^9QhOso0!jaZ{$fj0a%{owRr?PR3p0K9n!pMV703H)bcB%L_TA`9
zX9-s=kTtW#5|83rd>7pB?vhm~@6V?4YGsg(`Wxz;lRn2q>bSD;YQfi#9$2vxofsWG
zhOKvcB<9;#GZ7oyL^o&`=G?mgY(?g(ZY*;=Sy{4~1#uN}_jaTImqg?#Mu(ZT@Ez6w
z<yW?^?I=!6(ST8zvKZs21@qdtWK@=IE%S`|F_}iZsLAO0t6F#Tm&iFf3Sgb;h-Qyk
z{&G*86UA>Cv!WYre3Hh1G`>bM{^ZXHvVZ1z2o-G0s{fBBx+8LfInwdAg}Wqz@cQVV
zOV_{s;mLu|o$N2FOcQcjoAXHiHlyZZTTj<UrW3!=&h41-Z~+k%<HCPiCI+tycM%*#
zf=ta~oGZV%UMhG^@;vUf`M957Xc*3?nlSDb=)T0Eexy2$7PqQZNEendzmB#-o@*fT
zXIfW2(-y(<RW4}D0vtjZMwrbiSpd$P=-3i)>$Xt0Rdl@sYqVa&<8?ZE=uwGdmF&Z%
zuFe|rp>abN>dccrH5?<#7no9FN`8{78XTc252*;klM`w3?8T;uX_)3Yw?Ws!X+cl*
z(7+3*YTOa6>)+({lzWIihxo$$sjJ)c;jPwx+b-Sf&(6L;k?@TG0Mf9V;b7IdzHT&K
z7OMy`_UJf0-e|7vY%|#?YTz8dx%}i*RLVWO4E2`QRzM@mBV4xwzrMKSbLj`@?D^x$
z({=)y;{fM1!<Xok_Y?x6^s@ZH(fe6R&@LJ9p2DzZH-fnAo9Bio-(d~>We%_|RkiB_
zf-h#M1Dh-Rxy*6ip(pEM%;CA&)_;$VzzpW<;9%ZJifanxC2slXaJN^{o$trOd%?K3
zq#0hFpEAS!V|7hCLke><v$HqDgOm%Cl`_fgH97cQaLfgf4BiP9B>xA=d8%T@^7sK&
zh~B52f7^{sc#!t*-`3;8i$9=R_sJ!=r$k<^-9tQWeeWP$AEAeRd{?LYkZc(wGtqu_
zq;cBgG1OPTV*8RpJiWe@n2X0}V{vjwSDJgw?c$)8fi@u_H#Tk**3e|G)LSNcB;*d#
zM9Q+sUV7BG+I(a$9on<v$CD*Y04S#CqNFKOp1eG7bIE;i7beKT?-!9^#sM=Kc`zGz
z4*TGI_=1*BH{;PX5#L0bDxDog?m&CqREXdc;{zV_9%tst#um_UJUY_Oqf8L=nyk_P
z36o3=)>({cT7PaAvnY4IQ(<nA8rbQ*j6{vVI*()Uyz8@(Zs%KkH-K%lbVmNpe|+c|
z&CL7d19=TB!|s*l#2}lOqyqSxno0svn_yL2D{MJtWtD9ka!W?KmSlNFmY9x{VsK$s
zFq{W)-`#HHSuG(MIGa{sNR{Mvb==i7Iw8vH?Aq+~9s?ZBmd^V_I%yGHIA|Oxglcxv
z5%T-x5=n&)w5ky&H7D<054)gNvLm)1f_GgWS->++*j5T8)_!ZXTI5R8ZF2R~{yja7
zTu+5~NBF{G5!rAFQt*Z<Z@rq|x^=6;!@;C@F!kSy_@{mp2b8E#99cVC=w>ol4~IB`
zeVEKH;LL~<XUXKye}B3y&c=QI!K1Nxq0MjE+zL5a@bIVPJ^lIrMq;MY9So48CvzY0
zRk`}8=@F%k9%!5xhY530o$ZSCQ5L^#>sknIqPm+lXebij#z`NqNzZ`B;`9_A0W`kk
zpS6e>N6c=wWg?*iUoD<r?6LIU6_gI9Vfrfz3RC1w4%*PZKes0;ElhvKQ17$Y^#LE%
zOF&n*OAOS#@#L`Z=Txu6-!Y@Zh!gb_yXAp*%YCNqL%a~=5M{vamG=8?3x2{#NJBp1
zD4d1(QFzs>)NPBl^g{qpO)qjS8HdJ$Y0pxa_45A8{t{qY_!8X&ZfOCHQ-Q(f;<Y#3
z&7?9n<(V1i$ckThP!Q-{4v&t3)&B5^12S&kDB(*{OGa(n#D0#0E~XmmquU0vfx-|u
z5qGI=!|S>~C0Xb2XyJXAH~E<cQ7pS}W-JehC0WvqTy>gw6+~fh9AjAOfgjGUgJ%&z
zK<Wbs$x9PYNi3R+C#kus9elcI<Y)~_X_He;NmiTLZk8i+|B+`v%NgK<_VncFJ~T&j
zxdKBR@Uc&l)h3PQWL<sQ9nOI!So274Ofh(a(b2}FN-}vQ6-a0B&Cqv9Mu<I_p0)WJ
z4c*JT`@rhrk2mK-O>a*Yg22S22dB)-IBvV2x#4K6Pm`g9))P_@?Xyx0cKSn>MdN=J
zlgAX1Xxz8eKvl8Rmgs!K%Fd2n9821RM|WZhH-(uXS4)T>quzI87FqTv&fgM1VO*Tv
z$A5xNtNiMCfNsr`@z*8GR>c#)BlEwU-qQV6S*IXlCinSGS&qIbvI;Jp`&lIpirk2N
z13c8L(j0l;9U_1G-(P7afnAd&`kMtRDT;s~WSB44nB538<95?5FTJNkk%xBy-ml~B
z94ixD--nUNXZL5Go_pl*I1<3TQ9pRRi<$TbgpAQH<VL8tE|W%5Un2?5YtCJO@$;0x
z-Vl;`BPZYYYjDS<(3`%8nWU_>isORa+L635%ZEM^Mbgnp>ar45D45xV7PF3#5HB$E
zt{Q?BCH(~3DSM_5L9ezk`V|~$;LNY8?E)i;Z6YFI?aHX?IsaQ!&NtJx8on)f|8Dnn
zd9G*PaDiS}SbVsmZ8APU+wn7V{DT)Ak6(mOA>69lLt4O(*=`*UfY_26j38#klB2u~
z_c3w-yPO1L?DN)Bf6PateiiT&-Y$v43kz=#EA2WL;Dq~}Z1pS1%B&AxOvWyr30N=&
zkw;m7ORG^#3q#JFTUq~M=ifbeY!iYCsTzC)fqZ)I04sH9X6<z?{IBRNM{^v#ZUa6q
z)_2L&r&6Suq!V2sA5izy&fOe=P{5}Ozq2m#ryFrCKb8*On7AjD{~Gh)Y>^pXuMk8p
z`iu6H@ObY(4o}Ed8`$MwF3%qvX!nXieAZ~|8fV;edwYL>zxr$2Ia&FZwD7hR4<q`P
zrhj3_|5W?BVO`Z!+CU;RP~Gx9i@nVqonP9m>>Ci$g72c@p8twd95-JMa>L%vb`TJ}
zv_6MFX(`?MKi%+N#6c%co^3I1SJ0h;{Jj}SKM3d62L7H0xdocWX5UnYuV23|w$!^I
zPxz*SmhNBiGeYx~c`K~xI$BPW^1~z1SZdz*x$ia51SY9pT^HjPmY1EGH)R0Ax_|`L
zHWJf`Fi^-2vdj{Rv1jx5+?c}y2;3}~=Bb4pnEJ>Q$g8_{VgHZ*2ocmGIu(G0S$=KT
ze%ntLtw;h=!A=ToG(0@KsQB9nz~6i=vx7RVArJf+HZHCAyEPa4TS~=ZI6HT<aIPcw
zjc_=^s1<eidaW3i&;l4?U{Nrd2Z-dV!+`R$+L&KJ10?<2ur2WSz{?qc$4f}CF!Zbi
z-;b7>M7_ra_rRPq0(?3P&=q1Oup)MInrQr0?448ZAR|__SB99}ng|jzA)X$2qK5f$
zu`fxUO3g!BaOa6^dO$Dr6gaJ6>aO!ym0FEMwwvWDm06>8uTZYWeCP>&oN28;d%OCY
zXlY<a55%VR+Ihj!41W_Yx>+Cj@>6do#No{OXC9t>vm;-Aaht)1$X8ty^l$ykGv28_
zXUmXJ?hk%n;$nZG-<lzljQT^?ejlrk1vt4!V&}QAIO2XfXmQ_YeM!?tkltm<ogFvE
z!kFwGoa-Dv1aaIg4|9vUO@b|x`A@yQ#FTgH-AvSv5@8OZcTx07MtT}TrgBNiFO3bV
zJlaLQw0su}Q?$?VY##;;#C}ySQ^+S)Ux*|=s^;G{fiX#2r8G25{Q<jrzuBJ5j?v4=
z2Drb;8FE$NEs;=Cj<>+Z`Z~DKW6H^6?ExE{anX3urIaFfNVBZWcP&Rn3Q$3(5et>2
z++Ij4Qfl}V8VPZS4h2ss14<M#+%k&}`m&s-yN^ape(xFpyZJiHF`WhN;o6`W_F!*i
z{WRP$iD0oyrpt+sDcKhF$$WFU{O8#hrEc_G=p^4upSGlYE!QTT86TcYwh|K&@u<Bo
z!kP$?rAsBceyVi&&~i<`+%P|jZHL5o26<WsWg`F{*_2*loaXEI3KKV_13YJtLsP45
zKmCc~-zl9(>&*PQCEKM0+Xvd;pL}Hhpo7A)0`ZJno;>?owX5zK4BuE50f<|`pA2^0
zzj~o@BdzuYa-?_iKD{`;TURJQn-$SFIC^Cb;qbYz=G<whsrwwS#yPXO@ehzc3WA+S
z$A|rZzAU^Tc(QlPIk$9#uYLMr4lhGLVzqQcX*zgFOg${fcJkxllaK(RbefyxvXg>Q
zKtKTTP%vU|p?&UT69|mxMpk#@8Z#EXiC9G{-4%i&M{j!o=k6|NJkUG?PNxsh%h70K
zi!|CbE;aQo7z${ta0M?D7V(MLW+YnDO8cIE>-gV4PflonQOp(bZy^p;+DG%asrdVe
z`o{VLATp9csC@r`7&Bi;16V{EQq+@;wd3RU#ePm|o0{6q11RS0tY@>O)5Nsns$y~i
z`r=MNuXLt$jM9=SC>=H(F=lr->Qmo(^XXn8%y)<slYI55s@H>yDzV-kYKmxUEIvz%
zcsB$Xz*!Y3(A1EJ_N^K34sT)2KJvw1k?3E1aMVF<vp{#m#R01L)Db}bk$COuxz!!x
zWdfY*NEpz;t|7g!O-?qWNIwIBm<%A<EC<;bdoV&CF%RWbrso<@@aZPFYF>B-Xm=H)
z*!cF@3}XOi;Ec?s>0&SNi#wI$D!*tS4JLk)*4e=S{{Hw0IS38Lb|jWaan>!e6&hVA
zM|lEcf>@B8axa|6mfs(+@n0Z^2!={-S6+KtkjNpA!i-AlxOxb|&%48;0@PxmaoxzW
zl|>~Nvs$`HU#oBF?pBbPASyegZysy@I%d9h3fjb@vMYzxan2Kkn<ZGH*yZ`kK3j~b
zSizwqgOz{-pclY6v}9bCBF+X9ylzrPkt_IGr3vFW*dqb9g}n5HGl}M=tiRdpJ=;z<
zuo!AG^jTK%xLuLJ?~q4?gaOuhH-FKTS;O)}a$$=vmXR-A&K0Z7nzZAtzhSfX{!P$#
zNO^mjuV+E2131!Un36V$HdNDJPZWiKk+YlgKts_5D+jla2e!;ClJ%5m@Lw|UB~`I_
zdR=ka_@c{)&gf6@hQ+Nc?WtZ*ynPFD&XLO_a?-54*|U28VA|`4^eX3RybXvun@FW&
zO&D@lWaLJ_>-x=~nIn7!y$U_8)gXoTDjYPgPl`EERH9bjpkyzWYV}o#<N(I^lJU7|
zIl`7G#OCGQ`-TsM0|lF(r&W6sLpBN+V+ztC^$zgGUg<T^YVE_YG{@BmVJ3;B5VYJR
zcn{LrYN!R1tr^8WW^nqVyVWB$_cow761hFAQur-nl|d>e=+y%bd!>ztLGQ4<&uUk*
z$;H8F$S*BsZ)R<_-e20|I-xV^Ec1pe_n0RAH6fIn2DR8s((=G+$6?n|+{vGlJ;9S(
z8Y4VH_S}&~$mp5X{KlXAD{Q|^j=f(E)9Nv(ReV1*sg8&gY9tDsG5D`%KviA+q&4Gr
zbyo=9pG>Ns5iA7<T&(5sKJWpl)|PA?9YYsCT(nKOMg12Zq0*N++E&vLrOsDsYoOA3
z$;7CCOw5Z>DlhSrQW<$aX-gajVoBXWfQYy}8=ucc!Nu4d8rJ8pn~i2)+nVe-CMd+*
zHH|&BvC}r||IkLSkVWGG=|06!iB{F{pee&K@M@(6+3Jg7Cxbc_T^>mjQXw#u#?O_N
z6(3uaK&!V@5G)o14}_|Q?f+$b@pMc&W?^TibYkc4-_ekipC<Y%#eSPSw*c($O|k&m
z`s=j@X&9>CL9uWctzTPqHsFriL{+0vExBFI3*S8Uf3{9^N?d+4f{9BDq-$GC`EE$Y
zNV66BT$=PTr~fdvKOQnCxC6O;Cdr?4AA^W3pJISK1t7v0-LQ5yo7P>B1Jv2<oP;C6
zHHbl$JUsG_<Z_wiaMepm2m}*~o%w2lg}xj1vT(e(oJEDNdYdsH{QnWe)Sbh3QyYHv
zlme@bJp90*(ATNhvz}Zm9lCLSFf+_ot1kDj2fIp3c>5}jb8tU-m`u-XP_6FS?hr^0
z#b5)&9P8RVM`!#QNGg>s%zR2I4?@PnI!=8cRgR+Gub`54Zh6_Ydfm(gam@oPLW{Cu
z*455M<a}l;YGiIlGohoi<EKB4#OtD|^E!eA3v!0vNVdIbbMUd-{=zSb6ZQJzDua57
z&qM~@pYI1uCk6{*Z#iD6`ZB0>K|1M`CHfMjY`C{$FmK`)Q`<AGv|`Nyz6U<v9pE&f
zhm!PFHw9(JUbMFUp16EVXXjWxgIntfh-C1Vl&HQU=~Br2D?T6dy*eD1iU4*as@Av2
zXPK6f_tChFf__@=i?8;s(?{W_A-Hqwo0U_Vo}#D6uhG8l8ji_&Ov-kb6DLyZNU+1l
zuJxd;#dv3ag!%D%ex8?OBUx?~J}gmVJ=+v%oFKo+&(D@RFRu2VY|kf%jfxN#Ejx%e
zjhW$3f+ti{OKkAIaLk{BAvDE$db=`2+Oc|Dr}(ThqX3t~(z_lwmfiWK@q5I%sEsK-
zySs}W;_Idu4hd4;^aX{TGRAb<Gx{KY1F56qQV+MjMt6pV4;AK`*F;lG^=Edi_66a9
zg$>r0kr4Jo3)FOtSG#ayA<f}b!o1lp3-ldCjX}ad+aM75`4hb#^wo#IKFG@P@MdKG
zfQx#pVuvnlt*OBHdQAHFzNMbVFF*F+d$^DwL_a8@blx!W&)aQ}X+NwNlB&vE)L<{t
z0Ac+#$R$+cZJXtvR8pMr*Y>^OQ$p~4Mk47nH+K<?p6P3-5MAHo0t6DKE^F8@w>!|$
z*TOP6?=iQl^Qni}l5xYJrd3r{4McHb?wI;4&7XqLUE-L1-&HUSk|?#93328O1)%yp
zGS?vl6_#!KD82aP*?cw)*-CA53x<^q$PSBKz`Jhj9@bC`MQ@>MGAThd&$B@qx-RF}
zCbYKe)AqMn$W}EqH4AD}@ip1EZqd?k7}{5A%zFevhpO=4m*x3{8I!5q^y`EO-4OKT
zw3)tik7E4Sv)Px}yjEbGM-VFwRI{^pjrg~DifcWfz1FrXC^)dKz$k_D)-qpSsarl#
z-m5FPx5tWor{<jPq~SReK6%pVw-yPzrmeq~9zl}cLh~|S)?KB~++W$O$S5$Tf9L-q
z6A*1EW)bCKObx{ukYaNODV?XUN(V~l=iA(caxJKFZ|ByY0;L2<r~GbN<-3Rk5qsQe
z;lGj)hOY7Nvp{T>Df)$&du@fEU`+{+A@k3ShTmRb|Cs)_G=W1{03ZeiJ0!d(N?S+V
zu0ZzYlMuN2N+;Cao*&Ta^OVP9O#X&_El6sMInS^J;-kN%*)-l;uJF_||0Hi46C&=(
zXT}#=%p@4)CLE@X%V2x|mIHV(ektoyd4Eav!L(T3K2S5)H*-<yF_xUHe5zLz9%-5}
z(B<A+#(fJl?mz%Bzf}#7swW<c+_$@NfeW{(b%(bAns;uo?S0L92aS}|NN}>*(u$6c
z;TJ_LN$6<g=x14H^f5!IZRX3OF`a@_O*p+GrE>EMT(ApW@z0pZ=dC}jS*oH;<yqRx
zmkcgEYdzPpz7zk+!YY^5SN~2<p|30Ed(@&*EkJpxy&7Bk(V>t}w!!!kU~J^(wiV{i
zGal2yl%}vtZz2_`+KY7Oe>0S06zI5wv|J?N@LYDkVo3&i-bvU!LT0)S3~I_!dsjAV
zP0B!n;Y!p~U+YN3)8=VJcF?Qea(4Vs$bN8~16>nAd2X*%Q>UiQf{URpwvU%VGaAA`
zf8PDPC>{K&S^PPDIQ(;4`ro~!<4$y3e*OiUi2IJu|Eql}!zwh)qRl?_V7G-v7Rstd
zttT14@|xBTny7_d7@R8xmp8I7zG}&c*@pb;K<63F>XLpBVE^lUA}LlwmHFtDJRq=`
z4t7rEX`PPNPm)b9qpNXQ<FPhqqchT6yr!r|P6Cu~F>LIq&Gbg-ba~``h8A-NO(@fd
z=*<#~YgW{$CXY5LS*$<NqS?)-rsxhlQG)PRntH5SgVz5X{2%17S6J3ij1d}n8zBX+
zcMFm4g9}9>oFl&BgN`Z_Fq9EqagSeZB$)Lg(!`^k=NYqM&9-WCwzsj9ft0tcIaQS_
z;b(jH_1L#vts&ciCs&{Q4$}Al;Ss3MP6GNDS5Ee7PqsG4ZlQ;+hi|Tdn93@ul%cpc
z`MYCY>G!p#eCrryk7?-0(NQmD__av_<Y(dBFR>O8{br9M6rst(SE_%p!u(R~Telw7
zdz1k&Lb=V8u_od)Qv97|R*ugL1W5Nf^}!Q{sY85hS_5XZFRdBD-@q+pR$;3TlJk-&
zWs55nq<@?zf8u(>9~!We*PhODXE*Ufeuk<wdxQi8Pzv%XFW3;ile;!^*^>M3mH{qv
zEqtRRqNPjy^>Fgt=_VygP%guH<<ljm(D%e&adV0cdXRC4=wv(hp{cybhyJkxP7=6a
ziaVBt|AAsHv)#Ze*U?bds8WqHw-c(qXzLVbC>y?Xr#$V2b~y2TJnuQ`du79VS}ttz
z#fKF?V@`8dqytEz>;2kfh-IX+>j>QZJyAv52YK?hV@P~5j^AMD8c8lh;&_`>m2?0w
zxMDeQB!6tj$4aM(#Yy+-2M1jA2^t9%<CQ2j3TU}ttUXNIepNJCx&@%o_WV`?y9WWK
zy@EyFl?tO}Yl8~8lE)Y>!yKK^SeF{AauQ0sB;%bj$HfjSRQ!{6ADOZOj=1RAzFz)p
z<A7fh0%xm+`5>}2gM0=${12e^hg~J+our46)`^|2BH0CBItEF~Kr@nxd)hT;Ny#9l
zKJp`4ZHSa0TS@d1FBX?&lDSNeyd)u!Z#bspPeZtDPxK(r)Gu<SEgl(XE=Kv2EgtGG
zRG%$U7Q82(BHDh2^=BLE^o*=4Aw7VsT(&)N`ebKq+V1$^;o+=N#KuHzseP=6nOjZI
zhT(B|rdl8s^A_p5L?1=6;OdZ<u@sNig<iJqhRNu)KoT1|sZ2bUlTK&|RF>n#HAtc`
z>T~+Y&_y`J*AQ2kmNJ3J9ACU{CjOQwFpFw_+mQ7+6<FFfdk+Oka&xoz{k6H1-r2Ql
zUWLNc@h;!6uuQ%3`}}&cWJb|M4P3py{Y4lmzlM5XGwn}elrakH6z4bx`S6Re&_(7a
zHh|1e`D7uJw<l75EiHo-R8WS%`X(e+>bCG`)DfSTx@j#ID3Hp<eomRVI3q~>bntQB
z?7YU%v6#{`Z{-MX5n_W9W*L-j_r;VF9vMgNKOT!=?@5*oZWd65cWS~fo-F8F`i-qr
zOOgP9gp8o|_JVjFSnSE<$<fLF$^P8Q!&W4x{L;b%L%^HZ67tMUN8tXCIKha}(3V;d
zKhDC>PUW7mk0(d$HP(PQEblyYg~VMPJiRhgUyNGc8F8|(adO<;?y&c7lcp-^cz4o&
zO!Wv5!c4s$kI51(;mE8dKV?u62iX+4zd@hh7?NJaybXHguOwiC%u*tpPn#;;B=e&|
z4~`@CJl>A#;@av*l+F0hE<Cwxd@b85MCG3E7e?4YDP4?{rDb5A9xlS)gVNF7j>l#i
ze9qqdyL3#3UgS8<L&`~~?N`^5xU+Le_g<@?EGu*+dk-mX#2!%<?2>4qoz}*KmDs;z
zYVUXdS<kM;p&BW){r~_=O~O6{fFXO!D}SkOk7@e6j2jn7Nlwg~Cc63)>?(Fes~?m+
z<S@&T@fr*6m<M;B=ap#vf!9k6Y=Up3iX5l^w%|tfqrwKFI7OOrC9>xYCpUL6B@c>0
zh}^=<22ygqhs<pk8X1%ZnbW=9xv8{-QZ}y<men!USF$4>JgX-z6)3uBn`<+omtd2~
zswpRTNeLA}`l`*upoyv+oXzx-F~Qr%2<Maa+{CEzylEJcxm&ra9SWBw!L-F&&wcPN
zILA;Ed(H)SVMcWdmjyG?e*wh@7)~ov{p3=8Uu6SaQArNNgP6~-8Bv45!A}W5z1v*O
zzY#Vj>&{t;d-pca_lJI@l8Y9Lp+?Dra<+P&i*5(={vDPE4kZX7vg~QJ&64skO$OVT
zE+;ShtaT?$fi0TnKRK?;?^raQE_;t7(HLr*p7S<GPuBA!)7(096vneh<3e!We5P;0
zW~<~PGEq4kgVUKWWUiWdn2+S9kL_X9k-e$XVDvQc@t<>L*S=h+wGF3jD;zomeB&%R
z2N9nFA=)C>%m#v72D7lj^`E;G+NIzuP`-w@A3yM!v@CODFH4owmv_)Vh^)oR=Sul8
zyT<#7DXVJ0HH3tO9$a{)S&1X%{Y*I3hmQ1L%6k<26zd^TP0eTKvzC%J99Zo2&ZX_k
zz;P5BEcAjH6ZL83$~P9+!!;lM3W-P6b+O|qPWqb!XztfW!|8ydH#K7L=;_5OtO)<I
z-uC*G5@20qEmZc_icn<lWxI7qj4Ab7_(&)<7)g8%ob#Lg3XV~-vI};^$xE`N*tE)&
zPL_n6-YyJy<#&Ir!2eaX9LZI3G(t6jS^D-wK*n>dFW-S<wTMh`_E_1fIO;)!lQ=J1
zu4oJrJ?DSjw)1g&a&Ah0*|4DcXY_YLj)Tz^pUn7}s#ln<ux+W5#<>l<88Q-W$yLDg
ze92FBedo}t^Vqq0F6`teU_h2)45`~_@7&NS{@DYUd&uLOm&Mi*aU#qxw+q$-#s3br
z`O&klkGJ|aBKEANcO1`Y+`n|`QkK&zij$=|yNT`O_LFaj{~x*fVmiyq%hJ~yf=#!j
ztXq*Tac={Ng+^uPULu@r#$*i#>Scw$+1qsnpP^q2%%xON+wG42KE#m#lQDch^aLcT
z_UvfdWZ}%$4~Zai$05rp*Pvpct$ILa^=W*d5-=nSN@XW9Uyb`5M?%hCF`bhY-a}HC
zd^!gQ7LT+mfYD#?uot-L>9d1IsQ=%J6(e6Kg)|Z#*4lY5wSqhFHwR}D9<$iiSLy+j
z?ghTyNds->6<Dvk?@O`@5}92snwg~^uF2~#oKd*I1!qnC&{o5;O{cM}Y-sXDi82!t
zd0Qyey%NW4s!d`{VazdLK7Oz6=TXJp-XMCGU|Q(gcEvzRT}|~a?Qs%dKiEuiSwcQ5
z@yo=|R~v|3{J%$QyfLXp+F!;5{Q`~58e(Va$*f|joiw9sulxBwlFmJx>HmM@BMFg1
z%BgZFYI4XqQi&-MGG|7F!Ynz=`Bcd%hsfCwG3PnX`B2Hk7#Uj;l2|N_mUH;MKHs1J
zT$jrn_I@3n_kBO^;;l)(^eh|Cn$~H0eR}1yYv(NPwkOk&1KcORl%!{t5?YsEhv&*T
z%amkW%~!J*#Fr~-@;ldf#p#lp9Ln-p<_LNBGWCn@a=%_h>#<Q-bnXmT=)neM95T_l
zWjIFM-G-lgxsQpiEsuIt0vyC#IunH+s1wHF<|et;CI)s_3ejnRF>JQh-q`J-QT0VN
zsniPAL~>A=Z&x4)Gp_P;H7d8?xAUwl737b=Ii5^aeI;2ctQn0(iSk5_#1|fTp16A<
z)__^ck)e2)@LDbU+uuvI?>Qjv%MUlE*R)*_(T|hkd<+FNCkZNXX{$eS&$^*Gv-rg-
z09=Ulr#DFNE)UbG;En`3aMr6@blf>^%>G9G)TL-;_?^9<xTaj#zuka44tWK3yk-xP
z)F&{-Wa}4#tm&toNHaG|;|%Ur{1%hFt2Dti-SgCf?ZcAV?N7qFTSciUP3YLC5OUMM
zt@$!gdcLMg@?*Pvu&y{e9U0u()fT+F<0kfVpLyhjPaTI}jpu6>7$u1B##^t1wRilQ
zwPNu&54$&qV(DDV?M1A#UiN(DyqtYozY9`8xT3cYkrz~xXq^hux7@TZo&yK_CI>1b
zw^@y3?=t%GWOtfMnlnTd%N{_%JG?PJCiN`F;vJ|d$5UMT4rRfO<U@><UGrz3a(^pR
z7j04Cew}&Br70#Rg2Q<8Plgz6VQmhWESL)dof-s+6hNR)*;qEO<$2{dNI1_1de3$%
z5wuT>6PDnHGmOA@8d`B01UY1w)*10L1v?SeZC3nJiAuod?2}6Xh48Wv(yjxoOJnw6
zb)L%T7LCa-8fJTQ{utFx_C-{(u|k#^EXElaJD{bVVcWhv*R`)*+uNE4E3FiG%ySnp
z<i-UM#*%kv3#dkn$L=06uGJtk=@}U?8?12nKdL5R@>Q5rlr9SyVkXkZHub2122DfE
zGSfHpSVlV{;S&8PoH_~U8&naz`WF<{Q!YxQ{fSLXcTZcJW?{eLz2D*~w{Re=9OHb6
zu!u<nLhg+4@hnTpn?s2lmaWQ2!f!x>cN?F2R3n`wY~DnoOFb5jkx|7{()|(!9<>dM
zV&Zc9EGbW;LDQ#yTw<J!5y1v>xsW8JP4{Gh=zk8izIe7=nJnVcJOUqvT;g8wj@Zq^
ztq^=VF(Y;ywjXR`Pmd2~SR2C&uodYr7=UJpBC9J()x%%J<z^p_W7W4YtPs1VmSwn1
zOqT{yg=d8n-19~h4w;9Y8>F{q!Y!U9!>nyCIUjc6PE(38RFFPjqce0r2{}d3I?Ag1
zp<$^n$Cj;Hji4np)~d`e!d{!9w;M71SSco*fAuSA5&l<dLOp5URiQ7q>Pv5oCiKM@
zQ5T03&T|$)j+9{Ark&+ooN2nJYPMvS-BD_B@zn?H$e~f5YeA7G_H%|ep0SEz?8ngL
z6oZva+;`p85M~v=Jk@D8`?B~`p$E6`wpH$&;zw`VBKuI+|K7pCBgN{t=#Uzv&)_du
zm>|R|@+?kT2p7_vD#@xNC?DHKsm{1qhG>1yE->a|$;TmLl45#5i`-V!TCwi0@>Gh>
z;ouwwDqrLZs4{lk-v@N`J;wewJ#$ZK_z{i0q5Y`ks@=ca^Wq18YqN-FA+PWtmbtO<
z=x1Q<y{?%Ra9hQXuPN7nTBNCn9%OrR?XPoeIudUB5(3C&A5xqs?dH&3d3)sT=_gTg
zt@_H#B7@(n0aX#qSBduyHfXH`w|E%o3qgz8wDL$J$eSU4eSc}>z*Ya=gOk!hUSBV{
z2<Et9kcoKG;n}Ux9lq*Kv7wiVI7V|-Qi7Ay@%IRyQ&W>iJ!>@k+zLo#9@@1`8^1K3
zA7>#tgqUU?hgtt{a<f5<x}s2Boyy5i9Ip$`l6Jr4OOAEUugunP@e}Aerno<3o|$s<
zV~u7ZYAqtQ?FQ%0>EFyuzjGXS|D;SYsAfj^B3^m3@}DMxdJfZhua|6UE`A-#khhVb
zW_+^wbU=sTe6FkGN%R9%<Fl+pZ3ohSzHL*ej53yS#nd^Tu4-&*nvzg#CiINsL;9%=
zRHG^MbfF0-M)<2BN6a;vTnV)Fta;w|)9N1YEgFJ~#%Ge3#+!{vZIp(o6<v<yQ-;vs
z+H&~9HH)M_y5~;EKVI<2g%AEm;Ry}Xk|*Yxi^Q-*7Z#W(k!C_&Fy*&zM00dB(>wO>
zuK>pENjgHllo8vEQr)@5Sd`9cMg?@7R=KB;#h07c`%dHnsDh6z8qvHCPy<u`sM+S>
zycX{~K8s1}Ch(7>AQS3dN-7L>e$r=oz0z)7>!>!ou)lvTtGJVy?m`Mq8nB_p-Zk3$
zx4>+W6XlhcLL%(X(J+!c3jE5|j~{;KyJ2OV8+YOTPDvHi^OT}y)?S2rZt`>UJ9N>S
zn##W`yUA^pO*<REHx@S*Ya_Rbn~}Q_xmX->7e{W(G=wa=vGcDq=5M}lAfO8#>TqB4
z>VE{z@svrKXi#JCA9FD!Jx5JRXF>~aMIQo+Ol)@AF$28ir!?>CxxvboWT<EVKn8!2
zauvU0?3h+!suXLne*WKR=^@QB3wScH-zh^L^-clw;ce)$=M8JoO4;gmR!NJ~Hw-!X
zPQ;}770?dHpGx(yLQ4(F$<dgcuy$NW2>Ql}jUjY_ipB3qoYk>wKw3;M#+%$0r`R;O
ztL8H896X_*F`U(WP+I5#=etnx$!<NJ8HMh<hK|$=5|Yu~AdW)R1_VlqC@yNUkR@)1
ztLcWRCcmV<xhKoHR^O)(GoJzbcELv$!4A)tH#i`Cueq}9^e>mD8%pUOD)Do0;frt9
zNNpY#7s$w7d=G(V^S{)O1E>6Ym}`9NkLw2uikrN=_*F*4!4X=QvTO^t%eci&*BCzC
zSzZY>8i$3SER9Gm8@b+W9QbchsOMi_?Nfj#F^$|MZi-(AfzF^N(*8zZ&)#C5cNp1S
zywV2k@=>&`@kXoTsGouzpW(oGQ%qgvAZxV0{_a`+)V@=|l_Y-VOrdY3PYoVo*QB6Y
z#3NZZKEJd-pEBDOd63_gKto!3szoUgY|M{xT*#l^`PSuK!z5@*y;-4}hCwHq4*fef
zJT>{?%cSHSe6Ssu%nJIMTBe%P--@eb*K;+&j1Ac#shaN-(Tj#iM6tYKh>PB5=+ryG
z+J(ojkSGgFha~LUg>}@`>o3H^8_j>VZ%$sY+c{?SrJ>BN5nI|V{_!(hQ}x|TutEKR
z7bd50-So`euvVPeUnlk#ZaLxi=jh&~EMjHwB(#=3$O;Zg)~C%eq_QRLCsBSlaOLrt
z$+EL_8pAtK*E^gewenpODIL~|_d3CmBkY<yJtQ)+HdE|@Bc&Bb7k$M{!iw1=V~f0V
zPeKX`3cyj<Xh2d`QZ@WKFgGq@EPt1NTEZ{0aZU#GN~jsXf)~(kLydQy#d-2SxSX$u
zcp<quQ(t%1tG!!1<iwGLIt3Vzr)+7LfIZj2p(*ceBDa7%vzStaTj6^wA8$A<(83Q-
z0E<b8&!#LGb4Rot{Mp|Nl?q`lqM&Z4&I`EbE(V9$8?wJf+g^*@N%K=^`vG*nshEzA
zEBu#J%uT-R-bHp{MnasI1H14txZCKc5Y<qc?rP;tb9(%H=k@pik{#W@O&2y|=wCXk
zg<QYL;02fqM!u0pcu?3DvvB|4+Fq>i{9e{B+O{_IEL`ttdaiFWvU5!7&Z?sZ8?keU
zjPNyM!;D!c)n{bfmf?8Kkr7;Q(Yk%{#j>b+X^E0J%vC<_rBaNDQH9b?%b$GC2%!nT
z<%b_Nsm(>GE1>@2M;h#P`uFOI--hy0#CADm@k!Q~C2|h5=G&*gQ&M91Add^M#knNA
ziQ+n(8La7337r>a(ddUm+fw1OQgM&(EQNvZF*T}c?3dK`{oB8EteXR}CLvq*J&vxs
zO0c0g-a7A#@KdP#b@xpRj^ZiL>Xcveo~bPMEF`@YV>GPenm-Jc)Va=$AFs#U;ENx9
z$#RzvUEn13`U_c$*Ff`xdUUIFp;B4);|@h{`4oj$&CJtg#AcF#IXGAjo?zH}vg;8f
zF0NTC-Fi#W<e7gdn{ia+?DLB%+ULXLWF(Fkp>(e}E=KM!4*u<|ScBu>wAC5wm~ZOm
zj>{a<o8{?UUETV`XQLxiju|17|Ik2hYs$>dJb1$PmwemfqIjJ}`?IA)^EKb#9czkb
z4(qzkJwCxFPlx*smVQ7>M&mp<S@^5f41PX4&Ohu7HecWFCga^d3wgUEn-cuDsSjIc
zR~)~teD_W8Nb;GGmalr1p^>(UWo11_`?&iJ1BO=faol`YH5912cs0kS_*?t^;_nfj
zx9Q5J#30*6rLvFG&t@L-$0stoS9WmXVN;@v0b#c9$foRL#{AEm%YJJ<JNFh*mD|LU
zSfKpA`cG+%|5E6icm;zfT?2a;j^d3CWP(t7x?<V9wqqQ($f2Q9S2Hhy?0genYSf+i
zlEOPEf2U_cXCeRo{FWA`C#_2k86h&=9ISQtGKJsF{LDhTS^mW*9s9eya35pF`nCZF
z)UV!0E%e=)BRyAsF&w{6e?61jUv}!1)e^#UV4~yF-ws)2WAY{LmcW*GN06vQ9|vpW
z4-g6DUY<s6z8~0!?xCTjnt~Fe0V^+r0(bY$qyiSwWqIP*X79={3X1>1%8ID6GnK(m
zZATtW1s#0vd(n|6giM&fY)9jJoP?C`183HsI$&)ObMaSTV93L+q_#{u!$%!BidM3V
zf|(~k>VFpBXlViI-Yfh$iyrZ7OG{gS{S$qr8DRD1g|*ItEnBP%O1obJsy&IyoDvfG
zCg)+RGuAjbb9nf_`;`&2E$o}!;SYA?=F?LDjq<|QeQjwe;@2}QGTk^O{ZZCc*sDey
z?5T!ONbwuqy+(@N^YNdy_dMAlE<=}oWe4Y$p>6v?_23(Tpd26MSpB=@-<CKuw)w1{
zvis7n2$@i=gk7<Fh$1%|lihoY|MGM%b0n&9T=2ZF&kB2(FJ*Dx5TRMBYrv81%gU!K
zt9kY{(`u6o&uV>B_u+|@Dm|m5{;xlA|AmOl<ixt@)yQgunsiCJw&P9k76~X7R2>gM
zt$N-%?jS{9?bSqjJ*L<f!>gG40Y9HF%&o13U8|}&A3eH$T1t4V3`F_iiKU<%8<QaW
z`=jt;n(vr?j}jsHUJFE?iq0QXu=vTZ>r#`fbfcDo6>d0vsY6r6TDSUnDXaC3OJL%N
zl1DpG-12Mi8XB?i<@TG%q`*h_KWg4n5K@y-R9Za6DRDW)x~(%)s`5wIc@X=Sg5L(#
zj?#2rD?%RAq+cc^I!%&QszbVXA<boo`)-wu_%RI)#!UDGBPK=s+P|&dl+tvZ$1~pc
zy1P>M5%=X2FR;h!8$z`%EIu4R^!n1J+k?*&yF#}sS9atVVpp}|CQUrqBwp|Bul}Y7
z-upc}Lf2Q3IjSP?Blpai>sRPzAA`wF-GWx|Si5<%mruCklw1eWDtVs!DLj44{jLQ?
z_K~kD%vYNcRX)S7UoW+sd-I2E7P0NeulyGE)zg5BV*J8nh32DdbK|>nk15^gVusQ5
z{@FaAC*KU?>PZ(37Cn2z*gd@L?tEtVsjK+0R(AX6ge?KTC`@%LSPVAwsVQ|ohrJ(|
z<Y(6<sBstuG2AUhsg)Kw=$<=N%=FJ3u9l#(%1Q)Hyk}bVzzW);?Tv2bUGbipao9VV
zLBH&1V#*5-Zn<90A(iDR$Jwh~i%-Y9>NJt;XwL#G*;J@+y<q=3dJd+YBe(u-lgVpA
zS`H9NRe3)LA)aE?#LIVbC~}$-w9L)cP~PBXiDz<6@_3$%1bmK*!}xe&ksviknr3IH
zMgWos>s`+UVh~GIGb$fFdApkKnfU34yzZFoC@2T{&FD#ocQOs!Yv{3$Kxsk16^M4B
zB3aczf##9x*crqh(Ce$6ulSJ7#*dXnk09DO#RGKh@J13bNTiKJ0Gm$m0KzX#P<M6a
zY-*sR;$--2SI~9?We~s2l5c*dpSZUdE-#Jq$c($25B4)pDA<GVW8quOGR5ovw)Akn
zFyb+e{UyfaCY1d$<%asZ76*y<@QZvIg9$%ZI>4ECnwZrBTO`tULD|Vh!>`Ry<jUOQ
zDiFhPKAP7~mm#{C(Ix%G*VLEY!EndUiC87TZ-%2k3Y7Q|T#$u!RscgV_ltc^)km#w
zu+i?2*^LDKDoQ<nBz;Hr2U}frmT$9iswncn_|^Fw4tB5(ZyGnOSE!myh)=XbOkzL1
ztwDwINWCR#5Q=X-ltZJ$bX|B@oCh=)iUXQjZ!Sz;sOb$A((N5u)OtU*^iws7tXd{m
zRi|dp#R*Y0cBww`1UOIw_oCvs6yoX1CU4@rZfS}<yXl!EO((_~a9bXUmm0&PsGCx7
zA~x`0s`2C*YyMB}N-$6^H;m6rfX!&qyUyHzjYqTVBo?KoZFX~J+EUa6Ja1|rhbw4S
z4cNhKl&OdYFB&`jt72Be^Xc?3t^DyJWXnsHBTo7dn`Wgek5t|kHlK$;UeBC>O_xGE
zFFjAIGElu-EZ6rlCWc!;3SJ@m1aiaXzE!fHdv4@b-~qs<&51j9Ml5xS_pkkD@BwS?
zy2fm0#2%LO$@BU4Gbv@Uioh4R36gCQAwkup%UXqOad)PRO$`ETT=x_KIOAZe=ihZ;
zn0@)S*tz0QnD!%?P(CNxuNT0Tp%F-9n7;MOSAvP2;ekAVp;5HL%7Gze@x9-FtfMPo
z)}E?TIVY=h@(CA5E{|2!rK6I-NzV)9M<LM`La=CA7?0*5HeJKuoH7w-F|Xe-`O?ZU
zzK#d$_YT}njl3f}QzO<l7DmOr76#oy%{|OjiW6S&-xfiI6h$GAgjuG!M@C;)yHBOu
zyydTTovlKec8c1hpmzS#+!arYG&PFdtpDEVu?`-Mb+7QPfua~l!;08pmh7X<lTR;}
zYdTZo-W1~R?NATK6PM_3+If`?FG)b;)Er1<^YeS{axq@e9Hc(2E6_E}mVz*(qU%Hi
zctq}JV#r`4I@sTg+=@In*zb|}Q+%2`$3ygB1(e1Bk0B|*p>|yZJFYsnUPYyRVJ-m;
zB@RJQ%9=#yL(<zNZ{gMZZL0C5Rwuvk3~&3Apz)8%0T8p+xG0XGS?(*0PV(Nr|E1m6
z);`&}B(CfvUt|RAHls|-m|#Yo)3^Itc@0z%UH;{g$e4}1yU@7lxTeC?uG%rN-YY`E
z4W+3-Pe02F@QrCyw;V;kOwUBkQ;!_G!|JOqLb!XQAi!VJYkjY4;^l5LqXz9g(+O<!
z9g(dEzdIP!ex9$ZhAyB*N3Mq*?6vf4#f%o1nu6h0;Tcn(q}{wJ?})V|ycPaDvEdGx
z`#s{b&70zXe29WUZ4-I}PDzWlv(L^ChU*~17^3As_+&>^vPFV}bKR6Y5wrr<UH`S?
z+=5F5m_vwD^%@~m+z`OrrPyBcb>e4)caJw2>u_>7Oj#<FQs&nzK-&<-j&I<|c~*Ku
z2<?8NrX`~b@&lvh_fGW6h~J(1#bu2CxreB8Fc`9B&YonMqR(RHo%tJ!<9rOUAN7Mu
zYXIxg8AzgKp~3Oo(dbb$YM~vcgj)*h{ve3I^(pBJ%3Y(LqWLhVRB+g7U?}6G#LMJv
zs&EJH%Bf>R2t)l6LGwFGwC|lNVlUjy>s0&>>qKK32{H0Sj2}zXz5PE~lF^v4%f?DK
z=PD1~nfDsdpAb1$oPUvSlcScNb7C3H95t#ah7=KHQZOV@Nt4E7R-=zI8BNmbS;<L+
zq}yfZaHDb5QMoRWOV&~b(7x>S30RCE(lb#krl1sis$)#-nI-`P?<xr4D7U@)-Yfqd
zl4%e#Ztqd8_q1Kpoc|)1iWJMqmpnWfh6?y->=(7UX^ndaBY+|vwmV^Ml&B;$+w|WG
z@Jav&I)5BBDErGA-RZe-kTaXzTlX%!9$t_I47JXolTU(ZdCka<xe0p9y{_HGK4KO{
z>zrht=h}mRU+&bgs+~wRgrUeA({(@Eek442CC3>j6Two%yZ_jaIbqE>)^dbyaq1yD
zrXBz|3{z!G#}Oh=85nQ9`3@Ow)Mti0tBD7TvvC5>UPCv=fLchggDIR~E24Ld#*ml(
zZWFWMQg!i}BvTrA@#Z1?_>kzEQoCH^v^q6|$yZlLyld;6{@t!)S*><!v*O}&7UZAc
z2CAor85>Nw<Sg2EM$_RtchM3kzIbrlxzhMX&p#`SM%VLpDbE@SwAc&Q%1KApv*M35
z<?IiW1M>Mk{ZsYTh~egYqJx-Mb<7L~Xi+4>#jwo-R0OS`83+<@UMrVGqTEGk;Rs4_
zyv{?3?Y_AQH_ZOj^}QCNTW3$tEf_L(=ieJaMqupT6S;+S*+$3Tx*Y5k=?&P{6)zXW
zpJKfd%nNit?jmz3%+IfmVlqGUey2&EC}4j7XB@X#cv#To0+TXwefrMga9%e9gC8FK
zZ#hPG03={N;|kBxfTqW9QkEi^SADYG{k}kK@pg>G=Fq-`9;~ju%>XxJ4i7+Oy{z~m
zjypUsG?4i?^5E}d|Cz3cy&ZsHy{@eZOzIm#AUjnXxi>XG-@EuN@ekNy|ILZ-ukLML
z|F;QlT@a@lu`=N7+}hf@l>FvTFpn(}BM`)3NrFraf(L%4=OF4e1YVA}amk@yPKJ!4
z!*4^et==1?Hn$`wA>D9%@@|{XwbLVo*CG&$&}hOD@y;V9b^?C+S^BSx_y5p(!k8~=
zJDQWiYY$xww@6Y#krT~N@H3v_e_)u7(6AE_!-_0oj>ywMf}P9D<Dg+)r?Yuwvia(%
z4XI8!6)G1u<*^$SIsq4Xjk<DjLsA%dd*Q{qi9yk0aL<K7$OL#y=}Z20^UKv|VRP+M
zvpjneQuKDubweos%cCSO&HD1o7B-eVwtp|IzvJ!w*2Sv&)bVgreIo_scq$!UsqPAR
z*T^&@6X>(LhHlkJU)!woKiZ0{V}V_{)hCeg5)$sc*j?(K;|`}H)+elc4)z#&9FcqU
zdmyytVp757S3|J5ZzflwWOc&7^RPOq9i_+4qS2>ZZ1|D`&94t@9cq;Hzvhl~ON8p*
z+6>v$hf0^WKKf%v?^4vH%JUg|Ru~$eE5S5wIL2^uT&u-q7kP*w>q<&;1-q`$SwCyN
zpgJK>nkhvo$Ry#l*38b@E7-m>;r^CLkDbf&j#T2la&G=<N4U=>cmwT46t->gVR+kq
zz&{7|Mh(SP8fU>pI0kGE{!C0!y}Macef3qbP49v^>rtVqgvoDucjqKiqbxqNoqT<u
z)g^C1!l$cYu}d@E+nx#JraPMb@4*wC@O}GYoD%8l?*NS!4Oau+(sK7jJ7fj^QXM3I
zoz&Kh0t4}8q?R{oHdt@HgR%d&H!zYOFiqF+#`r~ljFLj<VwaTJXKhrO4P4+SviSw(
zsvcKQ7ec;V7BY_r3Tr~h-c!mA8GYdue=XYe?<e-wW=A%vC3T~5GJMdFjyK-Ysb`3K
z+<j3e3+2EZor&dBFUt}@=Mn5oUD0PMiM4d?Kb}OZhaW5-OWBJmKblhPx`8<Q6YU=~
zyj)`Z+0=lmLvOvDrQ{TCNjb@-!Yj#VM)+ykn`5I?w-ayen98`ebRori09On?Y-t6_
z{lHLh9VkXsi;r`#tvJ*ESMaX&+Y`@={T5g9tSN1={h_C4uOK^cXN!d$NEy}}-;5Mu
z?ts*dXU$<J7cc0))rD@K-@qsmI5R2!ofm`?Ob4=EMa#%*J`P#`p=`#Jfmd2RD6%aP
z@)#6mvWYXlpVEBjpB8ZW*Yj;q<6SoYC3th)+B*<MR(}GJKedis$^>m~Z8@TEsNV}%
zJLg4wfual4tpF=bRyDa5JOq96HlpWVrZlay`l!kX`7?T@j-E}vcd*m%cAElU0Nd>X
znU*3As8`78&puiQoHJcvYhdUsa~{-@gm3<KfU5NkvN}*b3|yc&98=9=F}QT>k{vEg
z>mD|^0xzuWK<$cH_jI73bIk{1yvY7QKMQ~C$WzDFo#10h-%}+;`+i>MaDT?g{p}A}
zPGQ}cEH_D)xN;A>(r^D|sENw-%2tobW;*N~O)X1seyq7h`7HRfJ)j^9m{vD6GXNRD
z;~Mhuz@@(}EgCbscEkl`&jkO9=W(69wT?Xf^ME_gE1~?iw*eUR2*Q?48pemXfrvJA
zzN>8kp(bcB;t!OA=6Abp4*f(ybHT;YCPTRO)~lGBAL%d4JtlF;i5}qlLuQ-NUQLF|
z6N}Y5f(r;f7D~x+kL71_K*$App7kZ#)(}eC#-h@7oH6nAN>=F%(Q8+)Vlh3t`yI@M
zVyz%4sr1&HUDIW0oTDPT78!DY;n$Ak+}SG;-o+ig1~G6KJ()v23^5!)+U<6O`Ad<W
zGJX+rmg_Qu->kKym6A3PjJ(t*jji?*!6in`9EM$hN9XX#=U`E}C{{E73>)9%iVtXc
z{NoxMGo=Dbt~}czHOCU=qtD9Cd03+R>~9(P*|v8d3^VqBgPnOuAPg+|n-kXe)Epk>
zlcH}-c(A|Zvl)GP5nCz1^9f9>IO_&9E8Qco^`zHZ-==UA(39Rc`S0+y68ZY)w%3p1
zPx2W+WsUXFZH8OzZQ&xh6vyBBYRvX(Xzma0%0Jay7B^Fe?>b(*Z<T+iCU>9Z>9sAK
zSd8;5Iv=0pXH5N@#Cu_Tez#{o>)>x8$my-EQ@t@|*0nQQNOJ|=&6F~7MY<uIFalBp
zH^jlC`KEBwvm3hgl&`!iD=RNLdPx+!g6Ho;3h&Zbp2j?R6oV_=!4>W)vb;8Kej?^V
zZ}%6-FL!+Q3JtpI(`DFZ*=U{hr0s8EmnK6$HC|$K5`%k32H4knP>G1PWgO~R%qJ^(
zyxAd$pup!X)u?h@ZB67kU6)jDP~!%sU-RVo_Uc>>r=)lC{8+4;NSS`iPZYI#cX_2s
zqjs|SXw66|i3r}f{lt%Qo*&_{#ogJOG=$+SI=1)NORUeJX5K|+Od=|eJr%|!C8@-(
zN!yIoGx4zfK5@Gcd3%$0vwxmk3-;QHsw}eh)vuADcI&%a=hw(>WfP%u^O&iXosJHn
znO%$ub&@xHmKUFifs>n=$|)|=mG5RP9sqpB4`8|6)=Nm~ccJ1@vxjF1Y6T+?yZD5W
z6Un))3d|;w`M~6~n1VpAQ=&0oTR)^ZtU9(gQ?yLxr&r`9pf@SOf!FS(OIB5mrvKUY
zH)+!{+MQng(>Fc32`d+Cjrf`G2mLcWTr|teR92)v1!3Ue0)?!`4Lv1n+EJ+xdd+OZ
zn`-=zYJMvjP?8iIWq`NP|1tgb>(8Vzy4HgN+V%vo!@0-hB+{#5-T&Ir7kGtI|8B&6
z1hCDmOkcpcLd;k~*qY?jaBJ;`5M#4!5U<L_dT@)VS940h^cBNtMB5j`K_ZIt(Q&yP
zj)o%rKBpA%R?nIg{094MtBM|#ri0)r<`0?t>UWZ7O&h59*vwz1>PL_By}uKEZ7V{!
zS}7{qkEFSPp=78PoAXlA8V%l1_0<l`em~7W@yR*SjF-?LH<=K{m64=$vp?|lI5%u8
z+hhHVjZ4@n%);}qff4~k=^}UjLTRUf>>ezhUDc+%r{e_Xm0$4=$8{B?vrf6Fx?1p8
zHf~~GVEdnsUZ$bHmh{?78GK@mYCQHZ5xTsE#l7f`f|d@R5|z?ZgKj@fKZcdo&N`>B
zd)yP+cFtbyR*VaxrjanZeYvkS6T@l7cVfpge_972s>x-az$KtXgBS#gxuZCW#wYz(
zfYf-f%6l-X%9GY}Eb$UHy7f~|i#eaH)XN*OY}b#AA;JyUw}(z-R<dr;uFF1cJ$%gX
zFs`!%gqM_!%b06nTE<M*>XnY^h{&Mr<<fera=#$x8tiJx#IBcM1&uPb&7|uQr;tDV
zYut<*bCS>|{_ii`PvVjEHwIe)b!8IkH}i}y)l%Si*4FlX|K|1myURNZ#4<ah+o1H>
zO;)<>SGE4jr<TH#qz+Go{xxk!$P+o7(y`rXaF~1%=-neaneUeyFxaVi9FlFzjA)Mk
zU}bi<uzp4D++@kw9bZQlLu*BbsaP4x8LOi0qC53`R24-J{|r$jrPjCNgLnEPWdygE
zj-nhg)Sawnh2AxboK3B4Nv#osjjz4t%4rVt-}QJSdW^-Ew!O9GCaSp1sS@CK=`s?j
z;99-;s6~7d*LC$&d$%bxQadsZYRUxrl`m=jvh_*a3BL44PAi1<8!%eeC7@)br!h%(
zDQpDM2W<$2T!jl#q+n-{$<>LD7SeX?u44Z@=+VNYb{?HkWMX`m+ZTUtTr$oQWL_Qy
zOggBW+Q<%HK<{s$`X?Q2><m-t3}NP8Oqs84Npk+ok`3n);Q#pOGae<RTdqlw1?W{9
zMRK;E=IIfpw*ABF;rnu|h%dn)lsUC|MM}Fcg}g9^H=be_yj{9DXpn6VCXNLUU>~Kg
z(39{aO|}bdB(lJ~XZ4}aY06>qz%U&cPE}^M6S7a13i&P&gH;L7f#cpq?*0I@;+19W
zgUMb^T5dB+w-HFhWx)kI-HL5d2IOW)aTNE-iVVwjjp4@j@Ra`J5Wc}j)V#i0B3Iry
zmMX8}7+_>9Ub+`o{;rksh%FF>_Zx8i7&o8^&=$0=w7qnGF37cLAb9iW0ljZA*m-fy
z_eP1GX6Z*f=*i-lNz8)#CL%fJ79YECgQOWtKre3mY2Dlz&l=fEc%p=#yrvOHdpQYs
z$gXB$!Z~9192X4GMjL5vwu`}(hI7=e{`Q+)mAO?46^u}bf#%Y=o5aRrbpOes6r561
zjwj#d?p|g<=+#Sx@|}+wF>p_LH_IS4k-%vtzpUTIEI9lQKE2rkfEcbtGq+3PuOmc`
zRyyUkDVH&WU)_7&<RM`xspFc7Hp~BLNl_H_us5tgOaz~F5oD3h;Ls(j$G9gc5iFy_
zA4h2$FFy=?Z&USBseKnF<(4iY<s<`9h8Ya0$xxbZewM2K5H-*?K4=eU@qRv_OX{!p
z!Jk^x)c&${Pk>{nf<QSzSYqt&9Vyldv3CY9Yncl9uRb-u1s-!-{kqe|8MXs@ZRvJ!
z6Z&NnrYaq?7<R3eu4O#0Z7NNB|2>QeOx)N1&GFXGc1!Qcfd0)O`p2!jz3C<Y_@(>#
zdVJ=o`sK<HOI%bFc0wx;DZuhDoTCWUj$7HqZ7KlA>qZsNS!!BadS@~@_|z>IVWAzM
zM(!+-$x|)&dLo!u@tAKKyAMdV(KRYTjX>}s_M6`GO#-j>zZ?Sb-d=INRizx!Q|;oM
zE!T)NE$#V5EmV9&bEIyNdgQ>!;2?U0tyS&qU~XH&1^?Tg`Mmk)=1*G!RaGV~>NKuv
z2dOpFJK`^>%@<Cz`Civhi=TORSMBD7>shxeY48ui5ZLhe_R@SGD-_pX7K<*3T)sAu
z`N9GzkN^5B;0mq1h`Q7dW+$|4b7&{FoaqE};jZyfq6DDCa<Sv0PrY+*Lb8KFAZV%N
z3%>HXGtLMxwq0`T?IrFG=dyF>GorK}cHqL-J-ZTVg0IwE<G;5@?EG8%T^DHLcY;l)
zlQ+7=-V<t>P1}aI34F1mllRU-AWuksz{NkvwS#k6^gtLm39}dHdm$kP70FqCQ~0R6
zzH{(hXFe91`dUHvt+pRS8xxO1{X%!Y_Yr$nPOBUcd;a}*^^dGHTs@QwP0oTY_+Kle
zNH4-&5@XV0EYPSb*BuOT+ka<jc0Y!LGS@leK%E%e?S0qrHiO!FtQW`FEh{^3-r%Rl
zkt7oo?*t8YILXf*h4nxJreQkbRH_<S!iCiuDC|`y>Od*Bt;-#y=c7$Yp<yApSZP{V
z(k52svI`hUK)zMV9+FU=%!J;#!E6az)tg15t|%Vm52gW^IPr7yO2n)d&EFT7a;~wQ
z!~G{Cv534qL|d*t9%biSfp!q8;W8#4>tKnGx<j(9vYu*&(uAor)K{4rR=y`Ps;)ob
zic>!^brdZ{BL#h`JQpE=bZ7`t&MI4ap)dQs9BfNFRy*G$uKbE*#vheNC9AXCXl<Os
zb3cGf0MkUVn7OcPSFc@5o-bF>MNO(gZ@w_(kPNE*BX_#=0>3J)B-r103f#3e9E4oP
zuLdT5vYiUZSCh$tE3*o!`8<i6IWs<RR6>dk!qhtZT3IE_!*NW&{0{V$BpoApQOe93
z#zD!kGfG_8@wT`nTJgj3`LUQgW_v~0WRW)=|K!<xI_l4Iq}%B@(MhS?If$cH7Y^t9
z#pzHv=0PN0k~SpQv%TV7gO~FR(JL=@FCJzjm4rZe>Zw;tZ;ZKd#X~s#CI~JLL{$)q
zZ=BRo%&JefF#@rAvEQ6OyBkXqqhRCgG~KgrkqFBMgSf}xHcta<9lLEQ>$Q=2s*}ci
zun&BA)X}G~Xts=;ble$T>PGlf3yR?&0dZ!NOs5&Yg6PUG`HIkjIp4M3H6mRP+w6@6
z1$lvi_ad2qLEJTcntgqQMR#s&YjL5<y71cU(QkB(TF`PJruX3XfRu9G^D15lOH+U!
zNyj?r0&CRkkQ%3prFr+~&-)a9!;kSlG6=t0c1coMD(x6IFlHYa=Ixp+v!HgPs#dod
zpSgya2Q!sI)-l6viPSaehJO|hGB`_SuRar9i)$TEFf3s|$9F-x@QgUAzYHVrjPDEm
zqacg$lYB`=H772!bI3gLizYo!$F<)l0lEOUsiPZn$iF2tn|Uzm#9+(7>EPDvYR179
zo#6@`T2kO>6}vH<M<s<mtWgLZ`41#$J{5!IaX6UdD^<IC_1f3j$Q$K)LYOzrC{KR9
zpNZ12%;nnsJmPi(VIp|8d3xe-(Hd<ZNJQlA4{|7~B%G|ib4);Fm{YEeFB1d83#^~>
z={vuttr=|Jm|5TjJb*zM89v_)>ZX7p8-Po8sNq!3Z)J-2W)47xC6?E)ln9OG6Z{^b
zRY-OF^w~;padG28&n(@y?@E6N{?;YZ%0@WHwbDOSM#Rtd-@{WH(2V&t;XB8lQh%lF
zMR*Splv9)lttIJ+KCGkaNq!21zgw$(F5HrDw_W37oZUMzKPIeRHug!9iUKqZ>Sp-$
zrlj+#nl!FrV@E^VS0Rb`sJE6GrvrpzupXIi$xz?Q+x0)CLK-xRJaX%18DImafu0jA
zU2EM;o~WD*Hj3rbzdPR93_>~mrqn4~utCjaa!HXfK~&!rFp&WcLgAu-dXe)3Dc>0!
zz4D8QF}ZGa?(yf905ICe;G5~84_Df8G%&10j0aQj@{sbwb(}YhtT~LQ!0j7g5j;4M
z@}-!Ohf>S!pworrsokf;KXTO$aZNid;^^Jmi*$fO5<bhSZJ%}n_Amv1a$#4hc=MVn
z54G}_wQ+CAjCYH}?)TsNQ)tnXUiqjgE67*1ev;^$SU40E-}KJCl}m^t2kM@`v6ESf
z@qdKK@l!A;e<KgeGlI)@`cN7iva)?;SZnSul__>zObFmn$kMla*`X8FH*8@)s_`UI
zRVjq;7xzurvO6W|Y;omj?z(b+6)(85x##xZx%t9_m08Em*3i8j`Kwo6>hQ&BK6@LJ
zp?j|J_LsZD5JN{J1!x~QsF1<EBa5g#GVc}^7NngI1PuF-0ZQduM%*GFFdXhL>Qe(z
zQ_Xr)8`Gn_j+{czS(Mnq-=#jE4A@-n@K&TTQy4w9@I+}TWlG4b2z~hEUv5%cxA;!8
zAL)n?2FVAp;jo0btB>SCqV6!m4i<<Fi`X?h1v9n)rit@=WxI_1oxQn@#c3+g%-0ux
zFBth^$GtE<0BwP)15R-zie&`^VGGf}eX70uyPc;=>T`MFNiWCZ!^^>SE%I+xAMPq_
z1?RKqeD(ZD!Ahq2D0qFjNV#^Xl}*s;-`C4aE%m6L2vO-`*~>XOwLJ+VvL9=nKH4-9
zU`rqV=f%y%b;&|0B+{(M4Rjov0-{bF9d2~;TvTPc(X*yfe|w&>a}Y$<i71fjHJ5GF
zx&F%#hfHkqC82+663V^E-yF#75FgTWQKz>Qd}*feS!%n&(HCrtRD)~)o0DYzU48u_
zcly!lOsKa}g@ch~G-6QHr3T$I8{4b|!>w>Yu|F}4kW*!;vX3rx1^@1N6nWXe(j<Bq
zT(1!d{R>Ngr<rgy;$Z(O1oRd3{#HHBUm?y;|D|n`wFVgPh6={V+q-W?F&9n&90Z6+
za2jv*DR%F7^z4GJ>DfRBL>8m_;I}tOfXx5D509kRqe8%9`qz}X^~lYjt_kT5gY7lu
zE~ly4$yXq*#J`=Yo#_tWeb6#x46Fj=pd7fYW@&8yEoV>;RgD)Y`sj%NMQq4~^Hnf*
z)tj1Dg2uE>kqBfB3oGO$zPFC~ZUF&*(Dr^u_2w5Xa8;jbj!U~Q3%y!cg3p9UQ~N?^
zRcrjL<cFBrrcBIzDJeBsei0M^PZF}T1z3Y<^X-7uWlGjo#GgETXgTVUZKiLV98p%n
z*gd`avA#pl1BJFB@;}!U2P$hht~#LTP?TL2FBqvLPS`+QjA<QM&bAUnn(TZB4o$Iq
zu*aulR0Bw&z&EIX_nlHF2%BY-Gmha7wRj9xx;7G`N5}07nLj9KdH7%VvwCr0iw2VA
zZbUYA+qVrvqNS^P=J*Y0+1-Iwd0nNkLf1~zL~|&~^rgC^Kn(`{3jgSYkct>`5)1sA
z%B4TRrQx<4l=LON;h5J7`yw!IKj7B4S=2vlu66U6aUID5H)C&_%yC>=Z#?icBzgWL
z;<<F$Q(cm<Gb#Qewe=?K&%1kvW3^-?^$hJb@ClCoe9mtgMJIpCOSyj3tTLB=9>wv<
zpBuRX{0j%n$er29y}a4Zu)UC<p>C;kIRhg~gM!vAOC`Sn4{$JCXdp2~=e39X=Y38A
zFV6&940n{oQ-vTtSGIFgs;KMSSN4>CJz_li4~a+;e#ZV|xl&)k<((EiAGZL5JbCx>
z_zf4#Mn;IZFG*}3a}*{MG#juo%KPq$uHfsYn@$|>uF$L3*h&!Bb-&pqo0|5MIfdh^
z>u;XT=Dgh3ppZrO*je3Y-gEp4ni{uKynBMP3=v9&rb4QJR}>@Hepl5d;U(Y~FIn8E
zv)`<ms!j7VjEbWCQ%wwYl(@R7n~?Q+F*Yv=67hbs*sC)_kpG4%#gm_0)PJCl@j<<h
ztbBGb&->+%iH_uO+%o^ygc)^}BBX6c1lpl0RfnaoD#~kTAh=ySZ2R)ayt*nYzcZVq
zWFyBtHv_>=)6&IsmSJuAsB}aAC@>-(E*+2PIcf~t7hHalx^ubaH4{=}qdYN*uupvh
z4GIp_=goj7pt;!eRk4g&L8%<2^NAv-Rg+Rozt~+dK5ntSm{%LQ_99S8A!U=duqO;W
ztG&6t$0tYGX{U?2eL(NguRy(7kh9p|noogZu^#!B9MxZ^XVg%{jQ*uG__1r3X(Sr(
zg%ox%htlAxF{&yBabPvgnqLz%SCC%UJ4iX$7e83tofXmB+`PBHP<XJtw=2%*(Y{0O
z#Ii{klMg{aNrX^|^JmbWoNo-Ro%#dd2(<Ju711?M_m$jlBV~Yox-r8D2+g47lW6FC
zKcE})u@=W8VRz(yg|~KfO`OkU<7!lDBB;>>pYj%Ub#;lv6j_m^73m1BGQo?D?Qg<X
zHKld+400b|A2E0H(0$W?G4;m#Rsz<esU@4%#^?Zk4`G*dJOgy8WEU5R7I}0uOY7OS
zG||W6z$k92c_tQz%yP4(fKoWa^_hW1rT3Vl0{%txSQZu+&@>mewfgSJEZ%`*czPwj
zSSkC<OPOX-zJjbi`^H(u#%?$$iL@~qw(wPe-MmiBgLIzcH`A)YV+CWg`OIuG-NxeV
z8Pm)oTr3zrO0Lf-b3;kC(;n_H4ize#|Lzq@R3GSUqlrM$$!3NoP(^w!yvbh2ZQ|7-
z$cd2)@1Zg@46md7Dui#o^6#ARn~lFDJJh~s4ZU6V*sSnzm#^X*apnKk5Jx3Mr50{k
z<3KL@R&Ws~Ned?hw>*qO|2fb1^t>vxFP)8FPUNN>@RIU<l0c*Ek<xsY^iUd_!@p~Q
zo@{I*4C5IYiTx4KRKd-qjsUECM8i!;VL|f2_S%vu2s}(rudGZvOg-n*C2ut&I~&z#
z9{CFM0QA|8^KhX3H;^xQ=(a!cUh~jV(Q`)Zsw$$N4xdkaszd1t-W;aRhLO=LVCoa&
zFs16sX~t<||M8rJO_hq-Gi%!?5HTaAeh@pD-QV&)plnXp(Vys;w++~2INH(H%};fj
z`kXG?CWHHL_g9e1S_ijo@kuO;1Y3Iy?bCwx-fHhGsqBs4Nwy?_tA7yd%db<JgG`b>
zntFoWDNdS|0&;i%HW(4%tzQ;$s~`fp`nvV@!SGwI$%v{n=@xqk<R^zY@q~(i<CSJ(
z6-5q+d(@9I{WpjcrwuCY>$vJW{~W<b45+O$gD(W{g*M}#a6n!@2y6Ig<z0Z547nXe
zU$&6^IMQ=#y|~)a5?9q|H@Lm!ubJO!wO3>PC8d0q2S^7u04=O?L@AU8S^3X0cFzVD
zB^V`^`{455n5fRIZh1|s2bSDbR7)5<*|nVBfp&17-Z64XMBg<!U8aOHV>BvZ(GGg6
zbkUUl^DaXTa-`2dWLL--p#dw<-V~^bpalm|q;j!%Ky2l~Tj>S#T<6y7A-LyzPg+6l
zn{==3?vX-kh31&SFxAp((|&MP&F-`9AA-8kzdrhz*KGM~>Az~^{UB|y4D8w=?|9^7
z_3W*5FuHfAz;X^Z)Rb0Wg`mi2hZNYn+uwKUMMWnAIzTTePcI+!KkdH^Pz%e-UdWc|
z!=mH-kDq+%lIqbpE(`RaxX-F6Aj`9Ud<kD^FQQMSkwvZfCEkO&&(P4uA#k{@M(V=q
zlKqroj>QTYRv(FSGs`y`k1lck**iC?J}l!>gMojS*Np=v@wVBOeveL!;&RH&U7`12
zwJ|AUdPnYpo5G?-I7pQB@)mJcddq@lIzyh-cU0M-DZ{r%KnwEkXWSrxP(u+$I-tco
zKMFS_dqN>U!sjs~P<jq70@f$>N@-pdo868(`ySq4z+>zpXUWbuEz+BctVqDzt$O9j
zdP3|3n*HN1%=%2#i90A_=_P9!J-Vpn<09FTW|aRIEHx4dTq=<F)xlt91WrESHaTzF
zfiq8kT!VH1Y+~m*9DNe&h7<eq>_RDwBI<&Kf;(7M#aFf$aiZ2(bH#~|KkZD`Urohz
zd-xwiAfFifl140h0Uq1T)W7D0$R{YovhShj{IO&MxxVboVAiO#<E&W-3fhFrU$)Pd
zhVdb3J&P*;{4%{^k=y<4aB~jzvcv<(V~U7UxrM#6Yr{#^pL(Wk7Y9rXp%PN1XT%E&
z&Yb!9GVOd^od<#(ghUy7*uFW&t0PfrVf5)timfL<gdDA-Bl~QqO>hy>O3wuk5pN4{
z5>3>CifvgTB*q6;&}{9+WC}>8n!hcu(W^Xq!U=Lacz?cUpMJ2>vsVkuXc~cm5Jg^%
zj&6FcdJyO^Cj`-+s>u+x=X+zxOPvbDEJeW63EL&s>dn8jP;l&08(LpmBKPbOBM-p0
zp{74neflxl>0@~>+uKVjhpxEX>+TL+`%bWBRfbeB_J&c^fB@TDI(J`x%S^Zh;eHJ-
z2gaWA>tdD79tClbH^-KPkWJ041}xF?e$M9dl`{Sp70Vmu!F9>cI##M771P|$uytd~
z@qBu0!qR3aQTRM1X!=vUm%69q*MI)kzu;IPCCibL6@?*ymSdWt+5hkz6ycV}@%l_X
z+!ygcCztk|WeNuxOl|HXsM)f5BwGk??6*Xpm~08Q$jiTK&>3!>iBQ7cew|p1+D&;Q
zlBV<vI&aeYp$`$liA8%6Sw?1wk83(-#s5CK_+I>agg}g)(VfEs8@=5>lpPl_M~}uT
zy;kd(j*4Ncw*U~$uV0s>sU27=z{Z~2-+zIXABR(0m(#x5y!qNEq}ZO9cL?Hs<tKQW
zB#JSr!RmeK|Ix>SKQnYbw2d}9BzT<3iqQBK7>WziK0^zJN^gcohK6eVijld@0JqF}
zH|>1VRGeCLno)(Da7^uN_}<1!-t;2_&3qFflr7fdCYaT_)@(0NTa%k`NVX@~6^liQ
z`*1^Wv|96x_tAgSykgNT80Mb)`p*;Mg0EISh)8=%>8^@uRU^+#^BSF?R}=2Byls$+
z9$!DLLPLx*FD1Lw{ObB|vKctLCm&tchSh=S^!wu}#0A!@m2HaPH$ihlC26Mz;+>1D
z>c&p|hho~a7<n=_x}I0~***w^)E4wS?AK2BI0AWE@y((=ySQwlq+F#xD0mvy1oOm-
zbn#%~S#{azM5RSc{^k=7eqRM89KQ1PY|N+)vbN-wMS0Q~#&V;gr-@+WYX$F^q5I-S
zlF}Vvz|D7_5k9-ArE)jl9tsAK7_k7osb<miy!#kzeB4fvcV2)k2&WC`ALdoUy5+EZ
z5^W?{zR*CJ&#L311E~*^K`|i}9jN#lkWvu7qyZbGk;2edKv+!KVWWkCcq#5yy4@fD
z95vA~QKM*y_i8m@8wJorsPv-U#fQimwS=i_zj%kxXUYVuaT7L3)t=ihL1NX6B*3HH
zPu~<bf5M8Wd|%pZaEG65Q2HV^I!;dD?0CbI%c<vZmqq`OlwI6(AbJKccp{5Y)LV%m
z$l%0TiD#n0rbD3oa)0(<cE1;JYo>U*I16%EI+X}l9wy4jdug&yX>lCpjQc2%DMk0q
zr3UBzrz@?3t(n-{8%|nhmhsLngoOqkT^+VwK{EIyN>xYv^08}6b8Ef*DMani*}v1j
zr|xx!?k*;K>9c%*!y^D=?%yhC4*TcUdFo#0nfCnc{f<qfqi$E_c^D*{qejgbIUBKF
zeEO7Ak=;vQ_aYDO`vN9T+#DCqA6iRT85_;XRZ)4N6qNrqo}-bE#mmWMZ{@<|nTlOQ
z&Nknbyh2(CmDRvq-MQcNn*kro*fS^lCl0$6^pvXN2=WWZFQ*`N@sbDq_49tr=ht5`
zo?%fx1I-O{utf4+7CI;C%ZGfpt<5MOKUZAfwICws_S8yx`eUk@g795w2PM7>+rHoz
zkSy{mDCJU#5-yorlu?zW0pSjGYAgkf*>`g3!4p~k{HnT$zeA3T-rY-nDad;kf`4qt
zDcF+Ynsl@Vz(4eg(T0dmxueZV^|C^aG-6N%nV3}zYNwu*mASaxlwui`t(uA}1Njl-
zrardsD=R4Ib0m*sG}gg<NIJ_7+&IZ4ayaI8Cjsr?0-XqDbPiX@5e=h&16S<X%<Hoa
zW#m>IDD<l+1xnJS)t4MX4s+q}Jg+#-Z9LpGkg8_<Cb!92(Q)s4*6i=VLlURJU1;f}
ze$?HbezZf3!!4LklI8-|Bgm+@0Vv#Ky$v5)n^6F6(czp0xF-{uvl}%KZ1$pCvQkC)
z9ep3j>egQyqm|v6UHQ}5qaOs23$28TJMJGvjAAf;uvVN|wA%6LoP!-|&wBVKc((a+
zJ#u|X^7zt|JWqfr`6L^Jg4Tyg8L{Uef#-w%=-3hdY%ITgg3W`1#V-<k&J}Pq{=s(V
z&r6M6go_~wA}pQ@c2)IF`nOJ+#SfB~vW+M|x$>`+2xrg@4So#lj-szJUpt?)d!p_T
z6X&P?@KPM02{~M3Wj2%uzF2&AT(d>rwNKb+U}y{tj`;*(RN78AH7gK9C@U+otaWp!
z&A$r;_=t?tneFk=xC=sfcJB|^R&4h7c3h0RtQyw6Mh#%S=PxJ7v}JMS9cfAU2>h|q
z4^Bp5UfldrLrgQWDte+9lI+Sg!~b4;q-@?CVePzVCk#7G$fY6@l&7=Xo<gYU<KJS#
z+H?{<3eG!UifR6)063*%w;OZ?)bFwI3kcVd3@3yI|6GfEY_-D6p<*?crozt^h0f$-
z;U1*i<$}DqEc8^GUEW&ON#Mab7l`a5)?X<7V$R;(?%n=_)fCWC{zts)n3#=g?};aB
z?6nW<QC;dTpuIWMZmgZ8ssDCoVV@YA!Z3>O*<RQ1cAVYY>IKI&vT72v-)?^GZ{Lw>
zjBmX3LkQm-+SZrk(-rl@yE_<p+Jx+pLjjD0wfQpfh^>A9kfzMY?G=rle-kahFtWeD
z+v7-ZYdPwg2wE_2j|=zhMNl_q4nv#<@IU0-Y{{bLSAMax4C_j|vb>YO7mKXWPjLx4
zGwQMV>@DQOd$tSjmx<=}or_*5l<LRk-FA~hk-9n#M~Bf*%de=UA&syZzR1ZYsrTyI
z(a1uaKIdv1v9}PLloz*V$EoqWKV#+SNK2pUnL+}wg3_RG2eIH6lw{j$$av9cJf#|^
zEPqi#EB%x;PT*<e-cDaf?W_X<_?TkL9>Im8ix@l85u!A7eEWh4bc8JU^yyuF{W;GY
z6Av~QUI+n<32;&%elr+lKDttjXE&-#6X$J^O4!Jd5cu0*HzgRD_k%CG`>W_!{v&UY
zH-_JV-aJ>`=FSmf;<0$Y=5Q)dUe|lyBK%|tq6!(@&%tOSEO_hB0+TjRJ_`Y0)vsDf
z^JMaI$P6gSI2M~e);VO#dy8%L9j^l~Yc5S0AVzP_wa@ZMUM>ay<)9pJR5PU=m&8l{
zMG5783wO_x>4U{!eMI|#d%JADO<Bw((MHN32T-NUJZ|U$W-VSv5=K=e1@jMpdN4gu
zMO=i+MN~p;8F|>NDJ}|qX*b_rh@Z5jLtGs*?JDaZIJg)>x191V*QN!m*;FC*HyLR0
zH|KM`{*R+`k7xRQ|M-Y<tgW1rSW$C6%rQw!9}#NK3y~<6<C62C63Z!-L(Y*y4x60M
zl*62d#-!zx!>}9*IrF=}KmA*JJj&kh`@XK%>-oHYt&07f0?9QY_VdCB4U^}Z1F^y;
zu!R^6=<s$GE&~)}ve)qAHv*Qye+^|7k|n|$8bI3@ZH&;<In4tw_(QgDB$@O5|4lUc
z{rSfF`oRrf1ycL_mvu=(*ayxdH{Xd}y$|?<sTS`z49M6RSN#)%Ddv=Ff&vJpE8K7_
zNkxtPZB&pTW?L-j;m8WnPXpD#)ZwP`M&v;=qu$3hMSQM3(Fd$=-Do3d;A|=!dF*=f
zhusp|JnpP%B{*xsKwnV6vO>J&7{rRRZx>#CIqVH9kVdV`hC}cHqS0@r8s!VX^RU20
zDA|Rq&V;X%7#l!H%7`!_KiQhv>V>0b8a~Y{7L<9i!V+7r@P4GfopDa2IMJSls#v|O
zE3X3lq3-VybT)7Ek4&fIPp;fr%ccrpPeTgpd$fsokqn%^nE}yfU4%{EOjP$W9=<!J
zp!7yC;t+u3wvMK+?eC2f`60yIo{J9Igzro)K|76oGsuoIs&KNJ5gi16XDC<0f(|xm
z{G}k?Kc;-(M)8|(i?(c7ZQ0v6?0n5A7tYd}`#Zd<nIehq5(0^*4%QpG>bM@t>+^?-
zou+Kl-s)Nt*MfGoktStst^ICNwfO;dGwR7zNpI1pN^MQy-@VDck!ejHp5JSh-3j|s
zj|Bo~KI4m(B48aeS9tcHZJ_*Y?sLF?<g4DiKlj!N%<i_?+|fK$-(IoF%V%rJEukQF
ziZe_hDlrjs*ep)<WtyUA1M*S#*M~a?Cg^qolF3Ef^xUVo$4u;ZjyO)gDLyvyFf?WV
zx8YQN=`dZQ!%tl+mhRYnhh?_MQA`)@pQfTH_1S(Q4NQjv0!8SJec23M+D7n0s5b`o
zMM!5A(j;1<KoU5gWvp*JQ<ysC5c-IgkG;Z7&NGX(ISp}!bC7G?_Wt$L(i)D1r4P&?
zW5KDam$|QDT-1i$FI}z|iD8iboOTS_T-pRDAaMHRbY^-s1-84E4a_+3@Seh8ix~CH
z>%`UZ<?G243i--JEtY=zh^Q9brhCU*aMWLLcOvkfHPuOh^fcImP(dEW04RF;R*^9a
z6QfZ{mWVCnh3T*%EG-$umL8zM+2K<c^DvKMOnIV}sjpjYGq_^oDz5!-xvlMhE|;~L
zqVV9w7JgQEWh?Nzr_^Pg;1)v#q8BTZBljC#Cg<xHA$FtNJH1z#Gvyw_>gH%JnVr3t
z!B70ec+>Y~Uv)aD1}ZD!)_BjlrdbGt=eeZ5H|^HbM#qS;G{x;=@4w_Yfxm6F8!~vf
z>aBk5+-r4uwM(oRlvwB7wCH{Z)mQRo!9IrDNMJYHJ2)ud+m(0C0TD62*@}0uu`&-7
zm5$+ks*_UDHpwF{YtMX^>ct&5id<sGmrqJsgY&6EK4`^|7E0WHkRm@syMUKy`bgNz
z2s2@rV;|z{)2soK5Z#Jp-d|nhHNx#qM=Ck*#1ppc)xdL|g;R$?T8v9w0!CY4pnl9A
z0N8fk_LfWUep=@(bHjs*SMactFs|F^5yZ>%%LO&Mp-edHc!m=AEsU6OBidzY-4mHY
zE;mpCS4QfXU4KL=Lmkp^%ny3h3&4P2pwmgHbWF^9Ie|+|$gxRODUjZ@RYaOVo`i^^
zKM3jAQw2pw)5F9ZWyI-h5Za`B?3M_$+sd{=@xp0N4+f0yS--X@O`hY9c9#I3Y}Mys
zOmya%R!Df2XSM;n$^c<)h2-Q8d&LHkvEn=av+|+aXgIWaZ^h&2UuP(eTvhZR;TB4q
zahAnKz4M7i7O2tZgYGV(&buE$0NR4o-no6>ztJ9{>r3;p@%#>4%Y*;6f9ch>wA>s0
zv(m9L#|Vx|UZ1ONPvm;X%J-RM<xw~9b@(8y>S|+<FE|d4K0r(NzHKbcY7VmL)sqa`
zTc){0g=_ZDd@ca3&ZfP&-fAB)=(_7~_2h*<+ZhjC%H2!$w+c05KPqMXxxRc;=doU7
zq_`9*3aRd7`mD9u`E>Kr%v{|KH^<iE63=MsK-vG&hC^u9ZWY%X>l<h3XFn}Jw5R#F
z8MJ{H%hAty1-+lGH*17}J=dF}bBO%V*mKDGUeC2Kc^#Ax#ATx&zs|d18TO;F&O-ho
zGh*(2xonOuo9sNsbmri2g&d(mQX@xe1q56&%JWR@-Fb@!_{w!n(2?l*odF8OwzMQ+
zG2+CPr2dNg88ety3w}WwuP-r5lmU#fCi`lJasAroq~@ItRcZ($W<Wekjvuzu?-4Ry
zbucqSX=^Je08ewf_nd~T&(b+K2Ppd+8||%bqfJ2vOY`#)DWg7DMN9zx=V*J{^vrV~
zS-((RpcJ!0-g`KF7ZB9m;kpt3o1j`>(~~5)r_PTQTT+mn@6ni7UGZh)q9+xrKWrz@
z{o6}jp8<axXn+369ZeuEgE}q#5gzr~n^hdAI63hXcBjzEI>`m&jYw4-sy3p9gE>Bd
zq~`c}Sh`qV$)d!{JSD&Uqrl7`zu}uMwCYn^r<hOcP$HbMrG|-EiFf~+BkVH^Fm=m6
z+!y|W=tLfqpkuh+g`!vhqTeD5eEjAKBdaTb>+ijl7SB22>W_Rc(kvR>MC5-T$DYvE
zFHAtk5BiU)eAVcEV~Fswop$|f*%Ww7ya1(7u1h#`T1b)CS~=1dK)C}4<G0^p3z@$?
zl?*^K>Z?<4zXRV^KbPq(6~khcq&IC+j7nq^!fvX3{C1S@5|eFq+N=wt%sQRykZ6eN
z?0?rU+CEu?_Pr;xn4Q!o=*b9@0Yq9QLyQGxtbP}}@%k<jqnt7(K-mP<u`+!F7<=N0
z{IYJurE}$CsXnnYYRsNAAF#(`yt-X=)+-j2La&$`+!4~BOGmT30SsjS!n-S%ne{QF
zJRfDLb$sI%X2dYoEIJBOWx;o+k?kJ0&WCA!R*vLWYk|+R%N#AXeR}J!VydDV(MztZ
zCFb-iC7O?XUXEgXt`WZ3Y`${*T2!)&y^)B2q@(Y2BOD_LNv}X1-w-h^K5ci?^Tofj
zN=EA~hrKOHXllZvGdwvWPN?I<1$T?<l`B1&bYy_@nUAHH_xyR2qs=4X^Ks`xHbF6?
zWy~kIqJ?uW?T~S=(kZwszr)%UW9p}B_cV0x5Io`e=ff7jE_|rjgLcw4a=zcv=<M1u
z!_qhM-?gKHmZ0rFL*L`-Kyd*8jP{pC?{(A~ubUc7>?XH~)k;M785-MEI7a@$<)oL0
zop=;EKaxsyvf<K9Jr;(I^@7`D5#k;+xvSVmx~9k2Uc{4G?`T_vg`}Q_Ynq%bZ5r6L
zO<o4#c452FgSymd-u6JkOHUsU+q2d5U_Llo@4CQW>jDnQz9un&(dT*QZ<|kVpLA7%
zlZaer=b|@U$8iDECh&}y2NP%)j6QByNbk*^)+N0sGOT9-QfpSXrA15;KVO72k1J3v
z`dJPFa-qBX-2vA&n#!l#Pab<aG#?tv`19wzA&ew2vlz6p3z|WRmT&+fPeT)wNt<Bu
z1A;R|EUi+aa<)XKicHx!JlLZwBI77K-eB;Q7`+maHV6ohO4rDgMq=S?`mg@}ub3Lz
za)iVCf`Lv-(kq{>UwR1jj!zgh$KPtkO9%&I*!S?Y7=zPap*n2M2_+>^5P9~`bVLlr
zFGh;d_kn=C{|s}I&5!E3hCqPilsc~oNw6Wg+`@58+@&|-Y)A9b!HhkTaMv+-t!+?>
zDL59;4GP^FXlc5ml^VMLCQh91x=mK6{Z5|&4z~rxe7Cy?bkI=TdVybX(z&U(`$Y}Q
z{U(#OD#<@Hd<^&uN(WkE#p%D(^JKh&-z9=6Fmic(d|WLqxmTQ|Tp&C;a#_%}-P(k-
z^rCb|i)Dj6+A{;%3SW9kz7|Q}6{u54c(QN20Kn|lV96xYbX^;8t*tZl+!7|#zsPKq
zGXneXi#%-rOPO#Uidq~TZ7@Wy$ggC*FBYb#R>BmP&pvSiWq{}q@F{Z1tUGsum01S`
zvJKH8t{a~A^T4N!Gd?#~)P81udFetmwJ53F)yUuxp|G!4vv_3mz43cP+0?Qye2{p|
zBj?I06_#sUfG^PC5Qf2yq?~}s0G>(lx8bhK(JIuQqIW#crXClj6?4H{4_xHitIB1s
zv$4m9=|}aIw3=Nt=iz~H<vAZ*V$_x035Gs<!`8HJ{A`RQY4<6&q|Wf=nDuJoB{)P(
z2l|rd-pE#;d6A+m?{paxuQl{f>X1I<1Hu?zlJ_a$`Q=0_5BBG2GGas+#Ox^NwT9Aw
zlS0IgY8Rb?-chtcxA@#eNNZOWo8>Z+Wzidx<dZUbp?1_`ayizG+g$e>@~QT*#5WwG
zTjNBx&<fhQf)pHSa63)6wt>^_lk`xXpy%@OJ;vloa?sx75ql?M{AX<g|K51Bi5G9G
zQ9ShNtPRodO=4bx+sm<dzd~oD1vhyKt4hALKl|!L?4$|_SYfU1Vu$){phIOl19O>h
zF0Mx^V~KHd&_(}S1s~l8UAJ(>->E0uh_hMIP?$r-S-W-Zte59TCAVVswaxN|*NxrY
zJ+Qg<dX&66_U51F^8~jV%Vht0;q#+T^<*xUH%)d*HQ1=s3HDe8(C8a@upII*R^jqn
zth!c)6xwNFD^L-_Z1eJZ{9_<Qx`hX0ML;(CO=}w;KUa|+XHw<@f=<I9hO0^6BljW`
zyr-AaH?^!BlN;xKn$qRe=X;d%wzt<l+-hudcpXMcP54`2a$YX^PRf|RefpeP%9#4K
z-w~=F699!zTWty2dF##m#w1JY+-sA)gX_m2snx93!;9*1jtE3PPwEZaxbnaIvBeeD
z?AQB|^z|J-O*`@o82UeHwWv!%k2>8{PXu1_;K?hi`7Y8`zqj_9S0kG(yIi`*)c-@N
z(*d$fGh>$H5u4aT*|#n5sc|!|8-9)o12suctqi(4wTG<!QA5Cxis74Yjao0vlkWYU
zZYo_`%?h{$U<z>7-QZo}h%>$M5~_=KAxUYt?DB;hCC}fWcmsZfUO<yw?D(+1-D`^b
zY-vwz?;6xKG#Da4c*e^o?dm<ZU&h+#Tf3?lj}(x4Mq>fJK?wdaM#?b*N3~+$tJ~P*
zkz#opYisgy;6nh3N1Huy7=wzDFhEnUr#<;0axbvy{V^7lWFE6lgJ2~LW*Sqlo`N0u
zaA(~4sz){S&ZH>erU=ipE5W=~3=;ND9i^Q?r}dwzNj&vt=voe!<iVf@)}XLtZV+?z
zCPBd<)NVA<^jn_AeDUtjI_WWi=v*Z<M|MRIba590vQh9`sMIi42)>fY_~Tbe{nzjn
zfRUPQNa=5v7JdGv6o8RE;UzK0iKWn{M#*?{r3=#4r_f=fo1K_e9CrD~%ucIqoqO9Y
zn6rd8Y8I^6@0hAo<SNf6B*yChAaZ=l$l|)r7kil!P}*Q{4&ydrc&Bny`2^>g(-mPn
zM7&w$Ul`L{aYyYiHl~|1#oDNt__7H>XXlX=3w^9kgiX&*&?_(VMGi>VphRnP&llEP
z{lATlmO9ef$V~kZ&eQzI4JR~A@NjXZt){(Yj|ukbLdYAnhUe+*e4K(PuBaM9*-o0r
zQ`@7o_w6r%vD&zIM_8)8M_Xm|w4kCrAL0R5)ESrELnF^p?SD3G;kQwr^3}D14!^4$
zCb#^C-6{Hx;gE{A`|rs$3cFVH#SqD}C3^aqfrz28$^7hX|4<g8`QxM3lk217k`#*^
zqH9dCu?TVaRAW-3QFH03^uvdSgrFp4SMmCPp5DQiz;1v()3&qCGh<*mm}1Pn*7MU$
z7SD0QmDllh{JhKYH`VO$-$8*m+<P!osTUxea4g~L1EH!>|I9iI=;Dum;LB4@jUk&1
z=rz56vzP0~-AA8-o(~nncr3{dc^avEMzaEnmf75Xf_zX<Tx~c6u#SR(ERR+9d4C=V
z(Z}|kJj*0k+h^(CL(kf#1}X6Y3hEKy$kM9Cb3+6mUYP~yAfwa^@-<5!Z=kAmIj#${
zx3+Ch5Zs;qx5BEF6$fmW={OY>RpYAf2L6sNw((FOp|0pg`~C0A$g4ANqB22%Uw*@u
zCN?w|<PWl4%w{BXd65XTj6JqOf+zyTkpY(y!yh!*V9F)}8%|x{?LP~(pKb(2*wg3g
zyn9?!e|O!cZg}#+y{Y}{>A0215Y<nsGUoH(NGE>!P1(;YGV;QXXqodlGB0uDyQc{0
z)?nw+TOK}$*I&za5yED`*s~pyd~AJoEK@Qn+}Ml&Q^iJ*4S64s<P0aKPfA-c(nu<9
zA7|uHLY;<)JRX&NwFX2-+MplZurO>akiStbFN{No>0QrAyGxNFTV@K~-;Qc7cOIWx
zfgoJ&SyM()kck)`!)F>_t1VCo+mdiHW~RxG=T!`c7VHiq-Jm$8Y@k(d*>v+AqhwtS
zxXgwwon2|i6if#dF`w&n$&_%s^)SG>nV7{DmSAIczQrrN8^2b61NAsn8%46}uv{vx
zM2JdWp>K%rIc|K7l-LM9A*w*?-FY0JDk0j6G@+;!{+f{#PVVtGuf<E)J&(6=l980Y
zg}2!WU{O5nR<h})OfnGq5iSn-^42>_o!zk8DdizIen{t2)<+3sOjjS`Mw2$4ExOii
zfwju}4(==*F4PhCqJFR%W15of{sZ|%>MK7J_cKusnPzuKeO@x@Hdsaa*skw*BM|fw
z#|ONE&#ogYec%C^77h+m_f1UZMv5Eq-PoLSn5B>Nen^y3$}u~p@bJr27{2|=b?yK7
z!s`rVo}@4oGpTw}w;$0$=;`m?TO^^(1V4Fq=b71GbR1;fbvk}xFZ+O$dey#7&-Lb)
zIx4y+K>hr+{WSpH+?<ALe@S|4NT^RlbFSx)@118oJ=z;3G(|K&@yB&+*V?Q8DSGtU
zrfJ=-b(W8e;<b^otdP^C72V6N;Tlp2%YiSBrUUoJ#@RAk5|-R&>eJ(1CsVwvC7gV<
z{##sFwd7tU#ZR!X-Z+1fU#D<$aj{!~atcL;IZ(o@-K;Lt`?#823~ZA8UonIlv9x@u
zgEc=)D>JUQ9Pdf1idX;~KrdV0y@US#STRa<hWuSei2h6-j#&Gq?%h_i+g|S+K$7fQ
z8OACkVxG1&uhu6}k|Xyph7@G4rqRrU*kfNn{=vp)>6;ZDiTt~=qDQDo(lCB^%Atm4
zjhzvS&8v1hwRbGKq!$jE2#lTmOdY{I$seu+T{7zK_O?|D?1Nhxado0GxUDJ#l-%-$
ztNN!u<H)dYA))$+7O)OivrEnIR}bFZ2c}i}ct}f&`cI_MpmmErcWg_b*B*#bi&Y{>
zXemSW44|@b)uAE3TDM5A*8*y?%s@&-l}hsC@3{#Z`~B*dx+nIku`f<V8B?uNyoBEX
z5I#T?RkgYM1_r**cS*m2<$^D@Ba$DZN);2(XqY-&eYFwiMpL6(0ZvGjTe<`bSGEgn
zFhV=Luht?KZBV3j3C>sjC=HzM`Cuc+{A_E3JNKR0b@V4rFug+jv-J(c9m6x~rF)AA
zXhcU~fdEy&k%p}C_5E@F3P7T~P<YyC!&auKGz!R|21XPsEY->UU*c0P4U0=T>Am%`
z6DQ6kEgcR-4k8`E2FzMKLER%TAKE(bLmI}^G}q|A9#BIw0qYD$0<X4}mBT_66?2o7
z3bD^P<G?b-2s37hdS07uYL8d82p{Ikz$uU(eUZSK@7a^BuY}Ttl^7qD<7NPH&HIGt
zkCVnT@4vamzQWqf7XpGF#GcIoS86bXtvrsEXNct;Y1HO>f2BsbP}qOW9Wy&%bt57z
zJ}c7IWdRB`5zU)&8P04I1^FW6mD*_pt+C|~Nd-NS>q_svsr3I^1a-bCl&?KCALV`h
zSYstCWF)&Lg3Eo9i_J<gDdGHe>U;H3CtL5h-Rl;EIdAW)o~V+aQhv|E{ds@wfL_oG
zV(@kcE57%tWFoZL9_gj)(uHPTR6OU!J@+TS6!R+wkuL+#&k8v@NIoB;7V3A+B-x5H
zvf2M&Wxl56h*YB}N2#C1+#XE8bhAW~{Y>;J?|UVQ!LZ*_xhiup34}QpjnM72(ay+$
zs^Nhwk&>O6$piAald4H|?r}5AW=A)DWx1|d;%x2(5R@&aMcCY#w}DCBa%b--?{FHI
zO+&r9CeuVby-j}xcQ1k+U^inYEeh_=f|kM=m{mCOo}S^ODA0P;3fifarQ~0jY7H_^
zU$}I;!YV=2+x*ioI`PKPR>qazbQjD_R27Di;g$55Xro+!n+9_N<XrlXY7qO|{Cad$
zw>$8#p?p@0r>}3WT76yv^0MgZ#k6rF^sn1!{m_G3eVy!AV>2amX3rq1i&qpO#4FAX
zbBc7`6&yqhWyZ?E>N_y@aIPmSI5;>XUo4Lq@>Nw#jUfzgN+?SHy6ntoyoBXVQXnCX
z(hS8R+}i@3L+Xt5bDd?w(tTee4^~LlEwg>nY0LkfXf!o9$IN^WpL&rs?{EDJIe-y(
zjUxYmHYyDJ*b=ISi}rgwwK$pK#xw0z%z_RlCu)p7UaY>+<65l~^Cw6oV!DJsHH30z
zh(eiBi!<O9H1yr_oeWy%X6BlyZFfpZ7T|a62klVJM#;a;V~@A?Y$rF#DK<+0pqHk4
z%7#M;yS}MrJYe#=bG_L0MojkT=;+j02ypK`CSVQg6>gk*bo+Uv6r|D(hx;h=#{2}D
zEQnG`i=`U=)yue!^4;2U8||l5N>@~!{P2mG%?JS71K6AN3lPzZu*-QGom8vUM(3m}
z*s3OrtY&`;w1oLIx_n6D9z;1NTA!u@v=Zxo{8H&R6&&dMLMc7ct$8-i=KuQAad)PS
zrO=?REThg3JK4~*<(_&2_1(QXEd<0K6D=X~(LNX9>qyuelH%eseJKJfww4Ekpfl&-
zcsjb6cuz<@)K<;_5nv02g=TNXRMphffZi|R8B8)yfjZoyU`#>I+dNSdbWE)tsa1w`
z0R!mn+4}!IdZ5Vpte0R9C96(PPb5}Qg?z@<=_yXHqT$4V!HPAtxHkDpw-+et`{q?c
zGDVFqhqLrWAM+Osxg;D36Os>7hdn7@VsgUI<`m`|dS58{87ZL1fmrOYeHMi_wBc)t
zeC0Yj0_Scdx0&<%PaDe~eZt7(UzSD`nE#OZz*JDH;sz0g7I6yluG;kv^Xsrcc($nA
zyaYWcgs7L#0}_(-!dWi1CmHFGFqTiLkh}+yqGe(7!x$V9F%x=p6tus5?dls=xaY-)
zL3P^Gd>3w5yH&EDv92;-=wUP%l=xsjYXt`!uGd@(IU4uWYE?XD`T6+em^SH!=U8-M
zg?J^^I-07Eec_u%q=}_ercX52?Q}QXtLZ-%T^PjItpF^*6i`{-3!RfR`mW{;fG-@`
z9-$N~E7M)xQz+}jxy?B{UorG>GK-wbw0T!S%DP6Th4u8yY35~jJG~d9i$2d-<SOPh
z8rIfBZ4~s4rg)pzyto3G_U{KAoW1BZ_qDRyMP2inNM;rUg`i@lm_vwX>W7)DmQJKW
zgFQuc9%<($EoJO=tKO^VV*J_O-#5U!Q3P6Tn3yEj-+t39UR4v-i7{Mz7Bljgn$UZi
zpH9F(DtC*yV_r=aVC-vjy-XIM`^zHkI@S|?$wIp7z7*s}8gPJ@a;GHpAO{L4E}<1i
zpvkp4JrfyvFnPobQQbRmt_TeAKUZMQK8ul(_jS7jK{&yHw)}}EB0$NA(w4S2?(VCV
z;ccv1vvs6e1~NXjld4AFSGNLQzt1mRRka;md3qNn7+FRIW1BQpkFLS~kW11ga&oU<
zo4~z&1NYJ<_l?+A=H@{Db!)W+UXBBh&R)d}D#5higTwKLn$?JP5SReJ@Y-$ivu}OK
z<3E4rBTlw`uWC}rC74l=BR#5nO<RX^n|U_Oyotmtun(134U!dP+6#Gy;|p7UYDntq
zeN3pLb_kq`bfh5jid@V&uM59XNPWgg|7Ooos%&IGFTf9H*4O`(vyTyE$4aXwkG`FE
zVuCrAY#KLBE0A2$4f>5JRc>b;i}Hly{)70fujNxyy(Qo<3lY5m1Lrt}|LG<{FV*db
zOXhn)lo`>Pz@5$cb)Z58{Y@=S;RYYsh1a2QBSN<H79eT;40moA2WH}^Ud@Wn3e}8}
z{#eWVk24=S{QJsj*k|DD^36GgzVp}X3_YW6xJ2{XRTql~g?e9Ao30fdSif3A7oY5M
zNjLK%TJIUOx(Jp$Z)-GTV#{8vw__5~b+!iir#Ip`J};Ptc{?M&*wCFo)Z0a9wNYO=
zVz*guI-S$y_DOx<N#U_S$bDDE&*l<qsSPQ&crP`|*w{U-wi|R24uO=)Lq{?9dH8N3
zD%KMJb4VIdYRrebdNr|yuK1h=|1R1+fSx&sjf=O@jobES!G2s77rBRWb^20X#K-e)
zmKpC_V5n~J*5+?xdperHD*Iu^+_NBXXKu?eDe6P$^>Zc0<u_Ti(&1=ZTVF^RXNvi7
zxAe>FEZCRV4jDstufKUKb^~F}Z`3ROs>51=s5{f0^VOh>su3JaTHl{pY@D1^^=RRE
zTCqI&bE1jU{j6iPh25zc-_XP9Q!PjTG-JN*(*L9|INzT55<;<RIsyI0uOLutd;mYO
zzkkib)c~dKH{M~n9?cf|jF*^$GJhE&F|rozu&Gy0;t%~YNAKl7DtSG?!Ea}`+Iwq%
zpWn8{YtAV%4k)%4Ye_ZoSLj^Nq6*_ZdH+F}->eA3f2E%y8xSJo{jy9oL|!)>gtSZ+
zBq;lyd;=Ye7yE_2Z|u@C=`(IQ(XhVrGh92I`h&RakRIva7bmy$WVB?YdfA}Rjj|gL
zWZ3E6sRb_9Ir0}iY?0LyZIR@ZhP{Sy_pROi-R+H>(+^j5Mm<1!RAi!2M!m}I<NYZA
zOtb-wz@9xcn#~B*3GfdI4hf*1C2FLsfMA7(M-y&c6LCWhzTT+9#H>&DSIbgKGh%Zy
z7^|~)YXZ@#y1K<ZI|zVOzb@@;o3_u0_KH&o+2005G?m~<WXPE-Z=PRnNCU+G=Gni0
z|HA)A-gP-i&UZqnzqI#lSgvfZy=YM<pB+(2s)eRhyMY8h9&eU#J=~VI$e$DZRdV6B
zSdz}IDyqMo*~R?g+^QSeF=AK6nWmoq$D3!iA$$UY;>Z^JCNGwf-|b(jvFT)8oRaKR
zMpXu`MyOMYD=oYlo3ZtvM44>$5%IH*hdWoG69!AK0Q(+*NzRxDk@nYZ^6fm_gm1^O
z%TEEI7440vauI?x8;rcdTZ=!xA@>9G2WSKNB=Rnb0&rXKJ%b_6>*SPpuuljy%6iLI
z_W)H64%6(G>s7nVX%Z!_%sh;e=L1qaAAd3}dRv(G=;>Z|<(|0vkTn<d@%{=uV#2x}
zm%3r;%8J51b~+%7-~3<^tY%0~^lzbU9vp`yJCXD7Yw|G|RbEHof;*Rx#ileNSYaM`
z>4G|z%GbK7bIPLT(0R<as4*Z8W@E=YFEZrsSlvEWPHpaEY@U7A6;^VcySPi|-bst&
zQIIkVS)ZoJ5~UM*OHN;0u89vRYlqA62OPxb^KAOalRoeXJa>A)QerOf$fu2i-x&(%
zzA`Y2`L`EtThzR9FD7GF)4P_=cIU}^65yl$L1MvX(gjOAwt}oy5WOgg#upzJPK80$
zWb@_k4LEn3zZ#A{`~sN@ddmu_<@oFL&hb7JdUpF@VC87Za&kM4^FR8F>vu}o976V=
zgzO?C<f*?^Xa}04$)kU-)j?UaH|TI}I?^Po*`zS&;85-dS!NxS6oKW>nE~9HcZ-$6
z9*L(y|6o@931!XR7{(xUe2B`r9N<-ZGFy(Xuvfo9`sU~du3%Z}<Ue?eHWXDasiP%l
z__^7iUtyN_+5Yn|N>=|{oW{9(uS;JeWDKksi3a_5As{E@WS)PM;QDq|IjS<@!=Ek|
zkg?m|Ui{<bi)$EPwl@!ifY^=jy+!NEr*1Y$VB9)BImxgdtR}46LG0}805Dmr(&z+4
zoens4S{Wn$35MP3upw}hRlea^FhDnz8H*$d)b#zS(>f!#IzF>rG0&UF<GTm85Vlcw
z!ZY6WKm8aX(4_Vm@3Zm{v@qNo_6}B${4_F`@<_>et)P8BH4*58bm!GH8ja?=3f8p^
zRnOwBl|?~hSvv2@ICvW<8x3+dB&2@JkvPjY_HR6-AtQfr+dIf;hqBYQ<LP%W58%|N
z+(tcbn!g{^{5g5|p-I7u;U6mf^67T>+CpkPXtMLC9c_{XD9mWOiy~EE=3fI%;G>=L
z{CdgRFKWiAn;(AZF4ujk!{hS@#wRA2rx?AAouF#!@zoWPm2hA)PVl~^kpDfLgegcF
z=BWF`3dlMGretx_LK9~;5KN$5b#51CvULj3szD_d`-3Fp$mLxaHn<hcCYJw1O*npS
zdOEYXv9bBNzSGdzGT*}|T1_H$n~4`4|1|`UZjlDdrHlM59AJ<~Nh6vAc>#`(`iZQ=
zMe>j#-ARGsG>aZY_~)_5)ii?pw;44mI<0zfZ$jv<>nnP$*KP6++SR&HXi$O$OYpI~
zmI@lkvm;kyKdFl%^kO|^#YcO@wwos(*PK0XBU8nr8q-#JYewE~udVW!T(~w+<rF8@
zQO&oHFYQSvn$?kbQu5sHHigLadTV7>5hChbZ+IG8&y301M~gmJkVw{yIR^Rtv|)a_
z*@foxEp@a9ZqDg~(qov@<1+Q0WY|^N*oEdszt%_LV<L_(oV?h_Wo778Cv2}#&{FHj
z@VfX(=~&&;WsJw3XW`1b;*){m6VRLZl$h&;U2Ufu0`{>yt%eE5hb5omx$x*(iyJKP
z9(?!vS8P0YzbWWF>ay`v9IB6HxD?vOnRmu|>UAXIj={b>#~Z+0`4c(L`Db`?e17{`
zlr2XT50&S{wrikSt20{;7Y`>7+6L+~Qr2Iwv3RDgD@s6ad4>3E#6DCJZJpm%4BFe>
zUu@g=7I=?~!*wece1CG-|6(V)WxF_p5!cy<yp`5}xan6}JMX?=&wEi+SMnFC*1~co
zYNF@mxpxEe4$LgOTpdd@{<hR+t5Tl0X5$j`1h0&TqX)|s(q2Y?c%js6|DBfIHp_;6
zCvm<m^>e4><g&C*Ug}Td=k1mfiu<H~5}Pl8uQWO~oRIDqWoT2P+?SXJP1kjNzNJq0
z5`cN7@*NM_8c|Ja&T`C5vcmR@zn<YS)yy)%xPnc|TC4^qts2ZxZE2O%dNdgOnwSja
z@##|4&pW2)!=NR~ZnWd8;18GoLz95$@qzv1b_iu}igOfjB)w82{1^YJHb9w+u9kYQ
zIkZgr137Y9KUDyL`Pxoh3rcIgnstKlo9dzWJ|5K`I9W}=Q3aCG(>-pKDE+9Lh+ow%
zA}gyIxX2k7Yv=ja$ADcR*4QjE?@}LHySEj3v??{~BGg#L1;0!>?Tl@_z!<Feih6A_
zpj8>gl1YAuI!SJT-tV~<v>lIgtFHziV_<s9RW<4c75}P?Q+CI07|JLB!P#}lD!|Lf
znx=6ks|{3H$b273eLQ}r(U&&v-Y}MALvj!x(=qDwi=r$!4zW|h@wsMAKupHDt_gx>
zUvlhiKU1X@=53G`*GFqZC|6chf=7MM;ktp16ZlQ?K516xduwo4QPEb*H^d(Xuh0S5
zPJWcQ3Z~vFY20uCtAMVQ_X+)5X;NkR8fi57ZSHv&H3C{W@Hn*+q9F}zkR8@F<15JV
z^X!SGe<v<XKW)I$BuMRvrcsEz%@<LhvR{(b9rO`&8~o*Mi0FFbElLbNkLRkcVWD~f
zV_U33qt10ModW_rtps_YTrG6`3z`Gtc+P+I2a)wn`WaE=SBBH5Kx3|12_GKFppslN
z(YNY`^UK676)l=3jYeyyHXyvtAWD>EQ~CJnPLTicrOI~kxdf@tZ1)o}auZVG0=Ij^
z-*|Gry?3*ivHi^yEHPuU*g4Jm9-ouH+=|{@t%!Ouo5KQ4sQL)ytnk#UOYrN`o2mW_
zXNcbukbL&pVBgaCzf-sC=fOpQd9<D{bm0c;k7u`;!d#m&zo5b+ZpW8e28A&~>WXey
z+MDrM>MEA3!oL+Izw0$&+GOU0*s=heHkb5cbNs2>kfN@<DJz!Ske%_pgVMu!`T><5
zq#|lF6_lEcD7f64P^ak4a09$}+4T9v#Xo;MV|o)f0iu}TCA(pC2B73N8YchB9sW&=
z+)E3fkTib^l&22lel8Xj)TO#wi3i8QhuO0k61}XVDxS6l@x->p=&P?bg5UOHk2+GH
zP7fV=D7lZ1mwuV5|NTd!=|OOtoKPI+`yksEdttKN>n!6)ewk+@SMm>8H#2H~5<U}&
zekbkDGToQ84*a??;K;;WPW_*UvL}yYs+WPomzi15vs(SAJ_AWoM$QL`pr8=C%T4uB
zbVIXPj#EnJJv(xNL*^yNb4(^PRw)@<_@3__%^p|+I{LWau=ogOq6UyK^|g<0?YWT^
z=of7YT)fGN;QI#z2MZPqUHLDfng%8^69UIX&xh<!Z+aG)$7DCt9!-HO8$*L3jo*es
zx$*5F4g^99>xTeX(Vun#`#%O(@w!^j(fFzVUyE9SRbO6`!+XGIF7S_9{|*S#r&9)z
z+N2#%jf`aRqN^{9FF(_ar8v_@A_%yb*z_D_qSVgj=A56}Y<dRnt7_5HGhcWd3F6@r
z^T<BOnC^HzINfGW(waL66ZYX%!VQ+V$aeuFY59wLC)tvmB-%qNhEBOnDuc<&_-2Nk
z1bHSN4R<u`wqb>&<-=eQn<Tw!U;A<jbdA-MxtK*Cvw6SjEstrdZSP8KAYy;m=}yws
zC6Sl<03LK6SQV4-C`myIh;AV%$dmMS<UsfWFjp;W8j<$~SsX7Tn;9Cq$hb3{P#uDK
z#R=e_slRA_eQN53#CRdste+-{$9zA>)IU`McAKwVqd5PE4-*<@>x7c7lU_e<3Y^No
zrE*R*mh#?oH->&8Op?EgS=+df=Nsok8$RbhT2^#)-|sqx2E9c2_^V%0i4KTFHi90C
z2W9uCdqg%*7k?dpIj@9V-RlxpWZdrK1>?>fbu5?#UR26Dg~7nerkLAsT&|i&r;I^Q
zyPTv1q!YyWr9`$<-nWQ62rzFJ`IYQ>TpJE6Of|@*KB-bxlE3HuH;gJ58<Uy#DVq*S
zR@N{5#jr2#6Rqu8lqI`ccO1`+o-sdrvqWF;llb*;x#20g^-Fv!obAh|H@au;U-?pl
zL+ox-Nk>ToQ%?t1=Wapi_i3li)4wDkQb+_=#Bn738tlRKFiCWJ<ksZR!@ri-+%!Tp
z&guOE=z)X+oKU2<*4(-R?5pz`urOF3X(;6mC^Q^(OJ>c$?kztJ#o>H$ffY2mqSn=e
zot+(;i^kzT|FymOOCB`ptj1j(C?>5QI;4(mQldhK^b}j%4U#vvS`Aty1#LVy`8A%L
zQ{d9KEb$E%Att9p*Ul(SzT^+{8fSe5m3<;^+2J*+>Z~U+7o{h9`Qas-B?e{Y&!)I+
zg#yW%6Ou0V!s6GrIqQqOxPpY*q@JI^9Q5q8HzCSxK0({Q!AGA8*5x_XQC<SEn+rjs
zPt#(p(XxrK5A2x$3zlhj@7vpae?C4%XY14+;Tc5}Qv|@0q)LQO*)G(Ju%C(?Z3}oF
z*bc68y8&u3ZJ#ukJVpLB$iNfO<^a&5C46!Q2}Z0#T6yztEBLJZCO_+^CZ~hXT=jCb
zda_eyS~XLcQ)WtCf@a(WQI^$L@hN~}NV=0pTGx~kJ&va>iwr`82iyMqS!8Fz%3M3#
z24YFm`lhji*|btx_2jZU67k=Edd7n6h6p+Z^2Lbqb8<4ulkgJnUQ^>@9pMlZu+L+7
zhc)QTkLJlor>8Ur+B~bt;IM2{o@tuLXLT_T8GC{A%EK31gxz>Hq>YxW%fUS{(M9t&
z9A?@CWQ|IAgX$@<@b~M&$zH-(C+uIfm?A?elSQ2c(TELn^)${EATSRTBc(vh0onhq
z0!Vgm2_0o8A9svl`y4i$T9J^zBEY0e@hi2yA|!vyFb2`_cHO;VZ1Sho3=3$X*RGu=
zROp*Ul@}DsAJdO{Wu5K|Znmk01k|J05!}7#m`#xz`Z~|?lE1p@7I39yfAV<DD@zrl
zdhk;j7gk}}!%M$YXYWQ6PfnFw(CztkS$HXZ<RA3@dq?0(6#ftO^RY_iV{!-BXH%~X
z^tGua*yubmAk4_Ct!8ZEkAHxFIC++EsMC&?EJX;GSg$gd*Z0cBzsTeZICTsH8!XV6
zd(gEYh^}Ks7wWw*Zr1$C^yItzw37?Q)bc1+b%S~L%SiRd{UOYKw#&?^u^)=g+to<g
zL#$ZO;n}#pFnnxe8`+~45^p{eYm}DB5ei8dX^Fw}vcf*}jHrg=A()3&kNU%HBB?za
z5Xs91{5QKc&%#0uqnL<_F9s&{_U|v>cVLni-90Y~&*7E33=g-5#b<{-cmavI905t+
zs;L1j=#}M$v-GK*MTOD21YTBwxWzR^lM4$z_l2ET|C%(9Apuz7;6N^9i1Yj4Mw*d_
zFq92)v`Cu_-WFi;6#nLMv<dQRYwKB)tM32mGfRCr;Ao*A7#u?CIMv(=wJ8^HZDcT<
zGp3=wb4~J2i>GsPcIXaMcxXee?R)71deO=-s}F1F2plbUdwo}P@ng`z3#;1PAit1+
zV1JwWGDYl{@U4(HbxAqhk5RWrYlYg96-N|}%nbmm?5jMEP<#=QSNqMmRowZ*lFmm}
zIJR(EgVEy;x(~WUWj|nn%3GIb4g<)c`Vb0E#>nbQB~n43yP{W7u2(VU1>cHHFCd_&
z8B*8S{<~q~2?x#BI`X==hKOm85luxb9*_~pW5iYZa<1eH&bkl5z9Se~wHwF=7?KU<
zm2kA`w0@Gn|HeB3!BGg-)h3UG7_l6G>vZMG;4L~>oPwHpc0&EwW956BPk=T3zj$S(
z%r3nCs50QF=u?ml2X=R=OkQW%dF!-;J7_+4B;C>x;di#tt(B~P?BZQUHEkIg2T$dA
zR&9N^{eFzxfE-smS}`}}_i<P^ls`^^if&ZOb`igc7~9)C_c%3z6}3@eOLvjZv!XY0
z4y9fT-lJ21!f0PJ_Kcuw8p*X?7;L=-c!vpj`fEQjhPRg8ec$0kI=!rcG{RFPRvhwR
zek!<S?6JyScod=&8MHVxl>g6Avblo1sMjL^oJR6MGtvbT|3VSK;g#?<lF48i8f1W&
zZ|_UJDF$B$m4uv|z<F2lH5#Ty2Z46Vr){0rQvvELqgr!4590BFX&;tPExL*Mf%y+b
zb)ogXvQDkO0N1{4<V+Z`c8n!d#Zlf@T*{eQ^wShh-|_`tH8qmm064Gi)19a2u^UoD
z%NU?5Rmi^+gH2<szhiWVtKEuk(UASA7hmCBH|Be^?^9nyp=v+Xv!Hsdn>F#eYz$O)
zx_3$5<xaU)Dbb7N0nhZ7UUsO8GGFxWK90GPAjr)G84QrSr$;W=@zV%l>y8JgKIi)^
z=ztekC+Mf!42(flrhm`1ui~QTSl+l8v5%ZuV^!t5&I7j{-&i6py&L$&klM_dIa17f
z#QW<{Vv6;&%)I%`m*m7VquOxvVY*aXz^)=EF#6re$T{@7cpD`dZ*!)s@x-caxAg7i
z-KG8pU*B<3`0*sQ*a6cyn`hV#m+<L?xJb2A5FO?vmv<g5!72Gio2sG5S`Pa?ngfp3
z5BiM?tEgrSWO`O+*0qD3y~zw?ImxuE&E$dCz#PB7lH9V}a~dwh3gLzcVPmVp^cgw<
zu|IDAG+yNj&*H2lR*q=Yk1jvqMkVkT+N$0s_#Ut2V3RxI*J5bO>Y`f;#5;48CrVEB
z=~*55%Tm!1AQ4V^y)OS5KE5k12#!F=82&>p_CkY^vzJB7<&(=%kap4uw+qT7HAcEm
z>UGgpk_O~f9!Xs`gae=LqL;gRmpJz%9>{}${ATlY8OaaeZA@O9?23!M@dW|+Na(!N
z&h|8Ay*Il2r0<q$@F-K5&%d<N76onaNxBL<+^1hVT4MaKpNC0egV5JKH8YxklMAQ@
zJd$b!SadkVD!<B<86jQ+#_9Fkb$?A^6a4qBtIQ;xnbR2{{*bBn{1oXnxo)|hvh(vw
z!=ftYw>S9N3Jfex{p4>K;gy=)am9lRp*_5nj?0|yR`-b2%T33Tl|<*$2?`YCs>f5e
z(Vr7P^;w~1q=W=tQ2Y;^TS9#oLXra}qx54832(jMjy$~JHkzL<(ZBJ-jW%dOy`pAQ
zbUmAqYBXt0|Iwo$VVjNbbFP@i+%t)E5+23!<5%Jk`E>fj{sn%BE-4Z$U`71<^Z`}k
zQEPc$oBSs?#{B%#iZN-g=^m+v&vH4K`>VC(DM-taubmz7e<mU&D!&&w5!~*+^i*E;
zNqq<vQxqqy@5JD}tmofXx*FP=&LA@*D{9Dr9t29b&Su-!D5h6F7Gl@?2PxNhAe~+h
zh+5mcyM?^3HdgL)Su>z_0-su3kp4KF3d(ENzUKfx&e|j6fArTf@<Kt+l@k!5U>%fr
z5}0iJ0Q?0ood#y%6i_%>^@q^glCns#e6<0bY)4?JM`V5H@a%{7WpUr{0sx`*l8;Gy
z=#G<n)ctJ9mnSY($eL&<=DWD2em)*IY!KHYBnl4?AKkbDb!unQhSYHkx&F|(exFJz
z>5Ua4Zs&+#&8e5NnHf&!W2+^WCRxnRib-}P_)OscK9t(8d<%z!=`f4l_PMB7qGe?}
z`9{{<G}+;abv*Km4Vz`%1-o#+JjJEgHkRcKOZfAP&KXac`QKc#l}PVm4QLdZ=l{#f
zhINA7kksdmW#QC%PaFsIKld8j{=}HNmWIs<5t9KhGEk6BEXFZGlJi7f&-vkdxmxW|
zFU~*XlEodYG|=1rgzoR`6#!JnACJQo4^WxH8hNP)?{AFzP9FW)nLB29qf%ki=E1AE
z{qbJ;t2Dz;f=eImV{E0aOs^L=NTTiY-fi&PFty9^UD){*xII=I)O2w{?<UNxKEcm6
zpWCKVZ+Egj){<DoSEsL^__^^gJ0Q*zsIBZWMNY|?FrL3G&Ngx@m<Y^MXokY0pl1~g
zh>C_$?AlKcNH6VUmBAxe6fu#LqJRg=mdJI{Fp^a`D<oB=(ce!#vdupg?A~Gu6BLzH
zjf@eU{?sewrzR#RAYs5Ny_@d_UT@A7++u#KPg;Z{hkkq3(2%h+l3=fga~B|!YR8{m
z889@cb4#mw{pas9MN%b*7O0;yX;TYWg?I{7j=VB+EHvT(4Zp>DV~T3X_B1<`&81;k
zHH{kvA_lH~jPiEC;udAgO~4l3{I(O+Oj1yz3%MZIm|?Fw|JSX@=k}R`SyFO0kX~(%
znS_FRDe^VSFAxCceJMUg4TkF*Pc+htgw4HU9-vc||775i>03`UOuSwxAgqbr4<Mf8
zP1^?A>Rz=2^p?2npUurMecG~f>b!$mv&d`6;O=0x3=aAX?b^f6DTq}$xS`JGVC>Rg
z^oT}WZm4#<AB7?bP^Yq;QW7u8<*`K@yED~QKPa3raJ&$-CKcu5Pvk`ss}#%FK&pXs
z(+P?YKL9|&y_aeEOzjEq9F5|ORwu$JYIAR~GI<t6UUr{CY${dYovGyQuJ_;(I^rvQ
zNq1J4ZK5;b0r$#<mgBw8$rWfN&KF@M*$Z&CPurQFNM`f?cfxF!_nc$1v`2;(ZZ=L%
z{(ef6V1Zl)IDW7cC)nxwoPAG1<e9k+4dj=<TKcKYQk62Wj*K%!Sf3DhG^W4?`RqB@
zllW$^_~b|+^rkvJVJWZcRmvIM+>h`GAh(CwKlRGg%l$RCB9(k`O-eb$%?y3w%G;g$
z&V;pZn|_qeUpjq`5z25I!b^?x4N>OYfH4=v^`0*TE73J4EYjI$rkss~qjf|kg6)fW
z>8*_pK|_@HScesRnNNsI|K|@J51z18ELi3EWP0T(Z+AhNnAsP1qWHDZ7tYV2KeLPG
z#>z7@-^vd;=pP5OmHvOEaQh4#`831;=BN|5_BUvEV|tEKaOvXWF860a($40bX1}@~
zr0jvdK#{yyl5`b^pt^9xXxq_dTjcA)p3p<NqdF`ie`ju!BHs*2s^*ON={(akspi2d
zG4s~;BVF#%hh{Xhp99xuiJc9v9AlvN@4b6u;=sqp!3v4F;k_6i>w3ASM9;c&HbnNU
z^C_DdRs_LD(#sO0Ku<@HJ(v9LxIb(APc)h(e@oe1Z#aez7mRVCJ$+?}s7feNGI7i<
z4Xww2epD(|Tqm1+|MU*&okFn-<co%sc2$ClaI(L%b|rBn1iySZ=cy&~I@?8s#*L4m
zB;4mRnt5URf?q5%(&1GZyAj(V7^kcrgLMc*PUo`B?hG$iH!N3|nkT8{70mrx(E?M;
zI|}&~6PN~RL&BKQT?6Xbh3UD?Cz@mh*=XK*^ircVJTaxyAw78qyhX=0(lB)i`(XL<
zcem)c8XY9;@~HlqI9agi*!~lFK4?vK?=GKzNQf~4v0J(O5S5NIqCs<<aV3di__XF4
zQbc|kbw0sR7A=}rDQ!wqF*?)8Y2X;WCeKx{uAX?m%WpOQ78!6EZ`P{kz5j+ZF^s7i
z{*~@S9Cru*mjYPAy0KxD6pkQ1haA8fOqVxG_oDD_G-a(vw3lps>y_<~xDi5b_!uRM
znv4ScLzN2Bh{EE-GmK#k=adG!&uTZ0>@q6zeVZhksF1Errme46R)fDtAgMn_>{&h;
z^Rq}-iGC0}Go!emG{|Bs4X4&h*HP<TXw8gR1xk6l_(*HjTEe{3t6g(2iv7?ka3<)s
zr`2Stzd4m~lV0y=v~HG?$j&#7rv9}-nIX}#dG~M`28bJayJVN!BBHVic?qn3K2(mg
ztJduhIE-_&m0SnLzqIApY&6G%VYss+m*{Mxgf=3U1KXu-rbDEHE9Bd6UYcBOA8uY2
z8WE6l>I~3eJ&$~^`{LcJ+ZD;^9wCdIB_-aUN;nYcUn65W`5@?9(!4BEy{w!zQfb6U
zE+!$Yg#xQ!e9@DuM4UE@ei*NdSy&-e5}sGmsgIb;@qt0?d1BQH`Mu9Q(Y8xcYC<V4
z1dCWg83Eo)jF7KIC{&qI?b4~@coKV|y<0}B6nn+tX%t^xxy98CVtm+V3)gb|XDIaT
zFjgXWLDPKUbU2vmJ&V`-?hoW#^Yh&C-Ec=T;y(!6y719{XY#ef=|5|l6>ILJa-2<#
z=_UmVq*gBY#nwb6rq5>0F?Tv>Ow*g2o1SXJnE2H6B!Bft(gc8tuFbpA9&;&^kz~QI
zJiYgo_1R_9F52XMNWN3~`~~9&D!G$=Zuf=Xo%YZP^HN97mVJU&G;)pe?QhN-41<vA
zp7V;9GT*-4FCDo;;AeU{e`h-=AUI@SKK3C5(B{wKKZ)=Wb1QM;{H#}zY}!SSWQB>c
z3Xm@e(Ki@E@TSKL!KSv{WdBueQL3auN2_K!*<GN)OQ#$m4tMMd;4h(k=a<na?}<It
z_9?f+Q^Z@AT<4Vzb|5+ra;ac`;FEfZ-t?beP{{Ay6~@*iVC5}#o)7J-vwB$I9}p0*
zSfB59_B``h=CE}afzECWO$g@~Y&&md(49B+^5cYi%|ZTutzJSvCJ0pAhJ=ushPpa-
zu>^EM8>lioNP4@z_{10y%y#}&jOfbNU2YW~2d7oMd^qqSk1A)g`}&@$8+Nkp5`aIU
zRvY2jQ~WXWXm9B;3)MJNf{o$v5B_$US@#S9pN|#Q=mTsEnRGz@01UR_(<L_bMw7@`
zB@0~uhBPXEaI3IbLnvnGyH3Ff2Pu_E6Tp@zv)!<~p*$wrd;gO_PS4M3H&2l(DUN-0
z5FIjz;qvll4S-lzj6zf`vreISzL4WB#927~9E{rY;MSworwu-2sm$z!9W^6NMv1gL
z1&MabbT&#J=-vn(b@#2BZTnL@J0=5`wl{zj)0`vAP%Mnd2%;_NU83J23Fx|k=SOi3
zlDig-^R?kS-za&-ps-Kr#4<hHN190^RvcCF&+uz~nCq_`1qs<ZoJKva^!uovt6N5F
z9PLax2=i+ReK~hbmcpKpIlgEPs9$pYG+Xa_%i9hH19sTHlH^3F4o#)^G)}g!acrXx
z?K&h7!Zh?Ium2$`cQue)=bh1M{o*I&3+9-;a)<M)bQ~<R&WtV%6BR@rmn<E9czy8X
zzvw^MP5J2A9!VVZYwu7*rA1Q7e=ztPra|<{FF9`GBr7;y?UzJ6zEyIKlFK&tmfB_^
z3yjTd<)xQLBnj=j1+6}_w$7h2hx)=EaS;;x)q7bT+oj%%7bjbUUP}DH#wkXyn;+;s
zW=F@)?FxQs{2ajyuPX8@XZ|>~s7dx(aAgvSh4EwJ*R~yY##@56IwzZqj}61AgLgPu
zGU@6`m`z@a6i*IM;pbKY7yh`6p+i#t)77&kNnQ5R?A*<DffX&FW$Q*3PZpoolYn*w
zRG+rFMAiFtj4kWliqzqMqoaZ1rCI}0R)MB#>$y5^k_XHe8;vXF>^nEfpC=T<=e(~D
z?c{lXmb966b$WQG{fm{9o~v<ZGW}1GWR<H<ahZiGEBwXrJnuqY2%#b>Q%OoWN@G5z
zZrJ37T`~v8@N<B$ForJd5L=g<p!;Fm{hmPwlfm4KJP5P-r)PM0_vo8pLEZMjpwOA;
zr7!C&Mp)J8=vy-`z>fBI%kKX;I`2TLzyFV4vzw8TgoY7)x;EL#yh>zd-i+*VZCPcN
zy(*jQiqs|COXwOIx6rk(bh*k6$++nzbd&7g`TqLPUm5ql?>Vp6^Z9t_mO4)d>^_w_
zljJ<ToX2fNs^!fb92pte0k%=|)6@R?Z(F%BOMW6Wx4O~LjL;jd-ABU?S1fL252^R4
z;#y{=Jh@BvKnpA_t&dI*(Xk+R&4P8>*zCc<SiCN1<(|-_@%FBbjUDurkv9iNiJ!(=
z2^}3nnsT8X1cKq)zJ*t^O3^h!1$c0c-mgzf+Y3JW^B-iR7`1cim4B?so$S@UcvKNt
zanhR4PXq{Q%cCf~Ee<qR2aHp)fX66CgSKmSwDH%B`r4^uN&C^FBv}RF0lu1$eFRNs
z)ib<Li;?iyZ5YQap*%)O%zSbT<?KEcGlojOc+$n>55aE^B4v6R&7-DpNr`Zc^SZJ7
ztYS*n=s&Bs%YvwCYhAK^ivE9-&sRqe9d6DEdu*w|FfR~`%}aSaHMbDo4rb3u*fnT+
zT^*hh`<WCaUtnDn<0zwYOQ-x1#KC^P)WIVXJTBVQiL#-!PO53VD#w}y`9Kvdx9vO~
zFO6SCZ^f5=6A`Hm-NM^VGG7)n5LXuI#}t&Pszr}W%<Ty+Dl15It?o6fXNz0==0G!k
z)MW&!s02S<VZ=ZZzdFxvgN@2!<yxpjU7@=(0c<(ew`Z<g&dGlW{u~P%_k?O-$$*e(
zj|4fPSe<DEK4?<&7<A@#k-XH*>50k}_Yip_mMfNhmz(M&|Esx_4zHNCaoxC(clmbb
z`6Z$lXUWw2t}1!0<+~F;eBO>mEC%qm_}){yhBmnq%XSS>uFu6SJmtiIih}n;^BD5j
zH?JRJPlC+<ls0ro;+u=&ha!F=Xa>+ZP4DwHwMAi9>j|IEP;i{`FAIBJcZHl3+agkH
zf8SC{iYRFKeCOWxOsq44b#TObMe|A<5*3LCt-<PbLfj#FJ&iV?{TP*>Yzji2Y$_U`
zq(y4Q8X98kg{z_87k+jn9F6UzsYiJlC$hEs8BA|GEi5nFfVCShm%W~VvS(xetclr!
zU%eYU5#4ia@<9!pPVJGt?@qJ)V6ayQtVaBEmSRlLBlI11w24c<9abCfj8)VcMab`J
zzN8bD3_N~^Xl3{L8L8PGk3Y5Wof@2&$YC$Bh$dfVg)H%lv|jwgKZR%yOMM8rqeRPQ
zoms!`3xUy;&GMt3%?F{og=;W@iyDNYl0Y|)s|8x)rZ@SLJ6rvzl%2{q;^$USQ)99^
zpQ`;t&;M%c#I;o~DVrH)-_Qd~>A9X5hoa@>WiN9j%u6<BMpuEgzTWX4#l^+bG=NEP
zlDSO>3AKoB_K(5;+mQQ|BMqP=8Hsf)4*&N6<Z0fQ%(Rx%c2+l)-bd?HxVpH^w1S)}
zb&S~YLCs7~mDROfKC~&dXJ_}|pzlCg!P8fRXM~@pu@UnkR0tcvA&&6CMg&rYOh=kW
z8LaJ!S?BeJJ%GxsHZ+5@)3bi-hU#_J(T=k`6@MmXf}+yDb_JvU9{$ovuC7+eHY9yU
z3^+wFg^^AymYAAp;&3JM05@?9wUWU$(^?4WQ+%|jk@H+xous)AzSU4tA%g2!omd!2
zLUToh>!dgW*|lQCv~2B<dlM!7&_DasohiKEBYXLnNb}+@Ka@wNl+VFV`uFl<tQdxA
z!)Vv2DM$g<<e5Ty*6&N^4R=+Gi%k#DY}jZP1BM`6Zxp3j7A7I#+&kHjgseW}f3mt`
zkO2=Dk@H_lCP|kY!Z3blIDF61L9#J2fF#9|xJ}X`vnq@ZfRA{sR-k{+shdDE@Xb}V
zb64P4ypFD}1p~mh2Dyy3fR@(L;Kv(zR@eIKi_8$B&_Jm6Q*+Yi*1+y4iAP4SM>ds1
zbCKs(#O19RGi!(kEs@2mQv1UU98O70^)%$Arad##uiNh=PO@9fKo0X`$7|46SyDpz
z0w<D7hHuJNyB9?@s5(|~sjumL-tYH1|8a4s*6BRVRzj7rNy|B&x#u^_$HgxkzF9i4
z1vO$-zGg2~o%Q&86)4StgN?XNgY9!!qwq7MTa(Tnf%amd&VtgHYmf5?3G0a-f01RT
zCm)$O^}F+81I`VRuIKb)RrO$@%|b_`a=Gs@(EcXG9}b2-L|fGjC$S3a@r^{-ENqn=
z`{0;fIi@Nm%ayDfcl3Aupp$U_=n&~q3lNG-RbHf?P1(V!gC<#HNP#K7*z9}OyY4q}
z3(7?iOLV%KjCnep7m)y$PptZ?MdMSrvlt=Q8QU4;?CM@vEILi{L2R8#v3)3wO)`D1
zJN?#d0Y%>VR~O^ne={9*{KX76-12*=Jnm|;=O|6HqpI4;lysA*#T`{-T_4eq*<X<T
zER{>lkq*?M<>nRVb-ORbGFEV$&z`tFkdos25p|z8_TD|0lVdp@-Jhclyj?DiZ85PL
zd<j&CSJOd-QuJY4sx)IEq@-p-eZj{yZtJifz;VbsFZIAE8Uf}Y+xz<n6F^uugc(76
zQK_5?O3hIIb1tqYYCv;H(|#Eu5F9~)=w*kZd@xrd_nfmsHF!jI;#G*3uD$*o5N;x>
ztK*NwB?29l)cddCL(z<i#|ub&`BRqnzxpk{;NJOE9^FF+lXGbkG1E+Qb8BVQO7IbF
zT+Nvn_9owz7;~kJ#E<8Jh`dY8P2Q1_*2SRgwfd>nAJ5qm=UuS2p`73CxfN};?d?{J
z5d(QMz}gleu|WFUpD49OwMi(B14woq9y|on_VzxgNK}2gGlY>q^4x+Iri0Tfl({nh
z5!VVjj9t&>Jx3Msls8fiDyE0S7brh^u2Y5aHsV$}Ep$90+&c53d=n)v&wr$&AOQDS
zjoP|4X?t;*Un8sFR(xFCRlw}|ERe4?^yAHkEA<VJ5vDMN7^!zIt@?P~rVN`uVCbG2
zkhu+j>*CPxVGz9Yax}2K;g6zaRVQ*m1EOw&|CUzZfmn%YrKIpuMs-;~FwYh^XngvL
zHcaGl&t?jNOBP@0WauPUzR4D9fi}K6liTA9)b-BODmjQc{kK<mt}dZZNel8}Adt6L
z5=yOuaDsdj6~)&6sLH3p-VHXk?6r=4>CM-EE=t=>>nNVP+&j#J+l4|br|TM|3-$9~
zI>1d*3OyyvWdD0BZ6GUC%XZD#Eg2r^%q8F~=9J?=zjlc}M&&)uY+q@*nst6?xA5C(
z+J6#G)+&_x?sN@g7G6n8@_C(-Rd-i<&*Uy=y`68>9LM)C<Hocjt)P@UxWBw~EC-+e
zSg!=U_mY8eYr+1SsEcmA_T}jUuhL}BDGqqQwSzO810QQ5>s>3#xofl(r~Vwt-c;z5
zH?2Ekz-j_23L_u>Axs`EE^It}*yE@FR*RO?&C$ngz3e%4USGsALRd2d!>6#w!N;`P
z!sXE|;naK29{YrKxjDx9hvS_s0jC|6bL+8FU!5`$mPZj8<DL!p@4FEP)hDt3Yi*NS
zIW#X6C2ds5%bxpLYLoJr*X(@tuXN<4HKfMwToVw`3#BxOa4S!n-*mgUUfyI}&FsL+
zw@LZyte&0!W5f`;H`@xj0&C5ZuhnSZ<j(R?9PZx_CkLy;-fLKB5_^I(poZUPDC+MV
zsoMvNV~uIyL#63J)!<4v`T>S0!l?_76aG0bXTp2^-6lw!w@eMdFD4u@V|Rhr*^62F
z()#ZHF_!zm@aU(hF{o4qaEKvi{Zk<jKC`_?u>9fMMMfo4nYTMQCQFK>YTq_@cTkI+
z;F5AMp~u;MirkC+TnFCd?U#S6tp#b_s>zpX7H_Cr`iP2oB%T~dXd(ga$_cXKBZ-f#
zvym;KbG`k>pH!VQq~!SOcoUJ#!2Ecb5hD}^LPw*e<x+PZjeL7}I<X+j?I$&d$IY?Y
zO=G}I&7Z63oEWCWM4p1W7)?8<o)^=5bOS6gIK;CityhFh-8{0z9=y88Wf|A1*>nj}
z`Crln33GIKXhU=QuejbkQxg8A@7$H(a@P5~&Wq)B-N@mazTG^5RIQp7;={LdS0rq@
z&Z<&@cJfYha(c&7NuJV04ECCLM4_<qDB4=t)1VT4IV(Vo`n9YeA9XfI36~m6sT0o>
z3(?^rtGxHpH$dW`qhxfLx68ZZfs~#vmJejkc$9t6q;(-nwl>nFRWUCkn08YbQnS_m
zkY|wTA6*^|qqZ!uwvo@I^6&k(F<0!4W(@m&cX~V`A*+c~{Xi61M_**9m>EhZ!KLv_
zV9e}fD*rjjM^JAe&xn;3-GUcnvJ9JadeVaC40n$3PZYB>f;QQRJ3PK{&YKHG8rEY^
z&MCUx@Zael+^N&XnH-79-rciFGn}BWIr;A<Wx-#|v%&ZdK)%#D^gHND2<ndo0Z6Z-
z#phGKj;A?l(czJr3KDf$LzwS}H9lDRN5zG-A7<AKJh+8D%A+bvEOGeXw%=ZssYv|e
z-HkVdDpd>7(54I0mqyw268(f@w2DVwi+Nvrhu~y(kc-rx?By?aF+nj;a@u3wUlwG8
zTEgRZTPhCHj{b?xB`~Hyu;LT8P$o&=APd?>^5(s3;E?;?g@gC*v)ji`LX@Gn%SH#h
z(L3W2^oQd=KSfyMIHz|P?>>`NI5WMqU0gqTZHb{k*uN_PeoJz@Us{Ov8Q7CI+bO79
zCL`Jww8D)pZ)2A95Yq9t8G0e+R{xMs(wv?q(#u(Ru}z~9IT{y|(o<y>ZLpdw5*>GN
zY>+>5+hxwMtyrz0NK^48bg_^jRIj8XpslPS5Gb;L@BaarOKe2&oOVDP)ml0n7$BX~
z(v^thO_fI%_&^kVJMhX0vhD%-=w3{mL2f7YHZ2Zp_6I`IMd)diyxdDEX^c>k=U{H|
z!}+z=K&rn<FJPmR!GYQ9229ukNOq)DBLG|)EVF}gv%jz{5@(CO&JMgDonxcNHh6(4
z>%thg5zdZ{kU?6CUNdlAfJvn<rX<y>aVn6E@CcLhR7{456*`WXy=f?`F`9<!X@HfH
zlnDLr-hsAR-1OY+{DY9}EXA9PuVu3$|9eFK6d8E(xsK#)^)Z!n-?Nwk;zGK_nHEm7
zmf2s2-+XlE1KVmAH_3}Nvq6il`!PZ*nyY{Clm;dPj@Ty|uw^v&v+ZX{5V3aQQIPUk
z?^Y_x#*_G~GVi6o3d>Pi8J$ZTMmI|3cU&r(QhE1eRlN+@iHoO;$*akUB+YVek|zBu
z^o4`F3FEadBOfS$tP=+`M`VHR5xmsi9rGAfu`oO*z<ZKuOJCS9eTGG%7v1rr^@TnW
zCRNyUlVM}ZmN;BENFOjWjs71Q#I3~L31Zo)g{;L0O1!4f&?!b{%s)ES>u(4+P@F3U
z#f))Yj~FVWg))4h%hoNu(!0N-*UKYzC+s}DsI47K%U!<s?cD%KdI&V<{9oc79M+aa
z{NF*4tN*pPys@wdAyV(h?KpWE!Mc!_FvRZwff}Q0$~N|F1H=7ip4U+$qQ5IiWlP+d
zJTiH4(bl>HSB2p657oC38OeZNNUEM@oZ>9piZZgi(JgNw&9J(%d2f&(g6*!%tQPXf
zRvoS9aCu<K`_A?lyI7Tny~6G=EKkk-)lzej;#U(S&h=c2ZzSfgA{6>epU)7<<q5P<
z5(@KwEUp_oJQZsWlW?y!5?3>{d4SUg@Krvs-Lq>As+l1-den)$mhvgnRmyx5N(q{o
z)A(v;OWc>OS&M|fmqlkn(P@64{v1@(cY_#vy8GX;)Y$-iiuOe&_DjarVffMPWXWsr
zL8m8I%dI(T^5k7!H#ogqXtzgiNRPBPbQBL7I{I8rbn?}An)g|5^o$$$`g?{icTOnT
zRIY>4Z2h)tMLf)l`_Jta7|FwY)s*ymIJmxKc-8H-JC~TupC*wRK1&A^&`<Ok{Ipf;
z9PA7Cl;wLBGW^*Io&&!oqkAH)Ph}j0OoX2IxEL%M`l19O7Byr*ds7*sbO7?r&uw+;
ztQzK@Ce<1SVS+$z!1Zl@?<1tg!&zAyz<^QB>^5=?qkZ)%vil_T4AMP&SO03)`S9A=
z>fo|gEm4n2^$YJ=ON4J`^6|N66HCae?Xq=tG87>KH5`YrQL?1xg{cwI7PLH5YU_db
zNs)OKGxLt)|M9I-8z!p)Ze=S?f7~vA9#unL-CD+12V^^ksZlr*&l^I0xkQi(H)<+t
zok0`Qv8b#pj@k;I5~~ivzk=2cP7-<Fw2o;$UQ{%@WrFGV#I3%O$7tl-a`Ik`J83{o
zR(T%!&e6U!=Y0=h6~zu=@+)1LE-sFFS|n9>eOMC^w+xLnX`l&Hex13YaPTI%AdG`_
zMgs0j^(GJi6uykGTkkq*x(h6Ai<^2x2PCR`i<J1P<-}n8Hf^2Mmm$omcWUjPD?5{z
zrMRTHSe`1$G0Qi0!;B2ZE>d+mmcVDdg2>!<InAUDNF3NQ8mxNh_Iv~mkJ4D?4LN)L
zT(aqsKz>tQ*yJM5prv`t&F3G*Ig*#q6r^sYoumIV##V2sHA&M?+$X!KpiWfy>Sr7t
zQ{d$o?d?6c2I4`WSh%S|lu9xtIS%>q$veR-k8|^{6y%5_1D8?O%~OEV7ilu_K>Q;_
z6`Auhr4;<=#YQDL&l+Y^FM^-;GK<EV^NDx0fmT*<<$~~wz19VH+%T7!CbUKh!v{yY
zmb!+Yqoa}qZe@=EV`bH}9aC~|icC_X$sA7g9$8PX<8|-JT!;-x`c6A~@PuBwGOu#u
z1FJgrN#ci*jQ=1eaA)})`+k4CWtM8>Y`yO?67I0P{)19w>WVQLk(Z9Q)z9qhr3>9*
z;xjw{L}oD}hMjVi_AoeCiG8)s{Uu&vNTmLSW15LXo)vR|$n^tGv0U*ay~@L8TaFrY
z2t!KvuCvXbJ!Wg=%g`2*>1)!Zt?Rzwq}~9mRr99n(K1hCjawU?0NTO{M^Oa%BIwwT
z##0Mgl<U>IalVKF>w?k0I!{82O8Jc2#e}Ozo_6$ndGn+pqNwU}Hq<^}{i~{jp;9kh
z_O?%6EF}^;%yBQ?9Gyr1^CtE{>m$^`3R-Kc?^5nAqVJ4!tX3j<JPKWAQN5CHtf^=q
zZeY}AhA;k*6LDF<WvWUxY^cw=<8ep(K7}u+Tovxbd3l^%H7&7rTb1}X&IemTlPZ*e
zq_gM9hMy%}aJ+05PU=|S1Gz~vUt)}4;&9mc0mQIZNcjAA+<ex*;$m7}OZbEOr9A&-
zMhu9`F(pyjTb->c3&L=+Z&jCtd~lXd-+FC_(pAh(w)B_y*OtZ8ol8d0w*(Wvo`dEq
zCB96Qg-lu_VP+d^eMwCe$1>}gKXKFu;H)|iRvi0Q(vKHe!9;{v&zCw@eJs(*pX68S
zGxqqB=JYTHAt9OMncbi&VQm_SvQv1fpr8Qs*I_V6IO+qF?7N-WNIH7r?H9n<_X?W%
z8`-9mI>>&;KpKw>p4a{oyvt3iYB~)XUCSF)IwU_4|JuawXnJRlA1crOLJSojT58n}
ztn>9MAGJZTvwHnDyBIUAJRPzzyY{OrxaVng(+mD^RSGbE-t<J*c`{kvj~=SL=nwv?
zOlb(@1~BN}7wxJr5hl8c)Y1`bQzkI+<OZbM*f<-n<@;yympn&L5gq+RF3#U`#{RFV
zQTx>8R2kmxoZoKZGy;+>9+ANTA+`fBZ?;x#j3cGop=kh7FTFj<O_Z(6*6T+J@s;uZ
z`W2k<YwfYFj*d=aNRJ{!WBva86<QLHlKcoi-}GOIPkG&svJXNYVe*)z81S84wx9c@
z*475>0WQx>pJe(HLrZF!K==17<cd#c>ulhCBrasAHvLJ|-@^r+n*HBlmV&n&S!zv=
zDRG_z^-TcA(*%qA>Vo#)Z@AA~U{02OsdmOd?-a`gz1l)9=Tm;rJJ{vn5b-DJ#>s9u
zDo&3DB*xy}wU2#_G=d=JPvXl(?5=i06Q78$Qv`xL?uOxfm|H?ATGx%2t$!PsY&SIX
zi5vd71gMy7rEg{y-sf~@p3a3@*{Y@58PHmu#GV@CY;lla?ULoV2XADII@~t3y7I|O
zGwF>W)OYIh>5`(s!s$s>F`D~?vfaYR?b$J~8l9cf{W3o{3zPk`3MlK3MO`a+`V;}Q
zbWayhT|q8pSs~#s-StSy>fo?b_e4$c9`ztb<7nexyKTE{FRil`JZ8fiPF-7ZcVsnT
zW34}ch|wQ5$1A?}hA8|zU2DvKe0d)3{nhJEoc`U_Eq$hyLdQIXqqrEd%DG#R$sj3a
zYt!Yd?kA;=c8@!4NSb5h5&?I9wN#t%OU*rGlQh57RUaiD{>7=Co}Rhl4AXQ6Bn(0k
zGB6}UN~LN@>Xa0;yYz9UuI&}(DlMAO=AW?t<tII61G3ADjgIE<CS<TpxR|^G0E){y
z7_cg|)@|<5D@1~)zDnKobyU&W<_&Fy&NXiWmDAW<bGoZP`aH=O+hU$L!0<zR3w*kJ
z)sbHQSQfU-a6FYE$mDSo{}d%q=<#8nP4L%D106-9fmo5o7-MXV#EX9>E)g7%t@PfQ
z5-=k#A1ajUrok+P1o6~#JB5GPPz@)X>NXE({R%?H`#K6_A5!q*$!3JB<rWMT*yJ_i
zLVB3XqZ*IQ+Q2dZ0qW~BlzX2IXC>X{{T!e^w^Kqs3-IP3ahDTbz4kVhtf=9W6s07m
zdiv$Y#>PsG?%t~C(VsP*`2-7i$C8_ne}gj>AN7vk%S|t6CbDx4|6aOAQjF*z8k@aI
zuCY-1k5z?I)nN5P{_F;+TZf=nvbRU|a(q;C0n+)ETe!Qz|7W-L3C&_#K_4!$mF9a-
zV|}}}DJ2ZWn`C(Uv*QxOrwUbBdI}K8X*~SJDGrRU*nc!9o2(aG>RQfWB>U2a7Qxj#
z`z$(trK(DYcMe2KGHxWB$shnlysscj)|NA!<Y(~~m!Q4=?=y3H6sAC%w3sHU3qU#>
zhX?ohcK)p1xFy)VZAR5V;_$}b_c?4N+<ZC<K7yrb?7}eG8dY-xo-M9ia7+8_4-@y}
zuaPIhYCeibw5Xc3{~eJ9umWTmF(%D3b2}wV^TOpC0RXtT?gqn)Sxtjouev9%Qt^ul
z+fz~~<agYYA7z_Wh7@@-8{27-CP@x+X7EI}>xuTXzh2HC0QWrWXlierR$3pV28G&8
zUx-NIeUQ#O(;7}ew$%|oyh;~;e!V`lUS;?4Pkayp>*}p|9t^Hw*94XPFEEKqdaOws
ze{hAfSAH)}Ek0+?qF2Xm=Q>IX(frFay=!jxg4b{AqP1-Uv*5%V^PJ-sYppRqgzHo~
z4La$@A2Ljr4F6FHdu4>9v;?rF8jC0VXUO0$JO_d{eMkEhNB@90P{CsSx<^?f*x*+j
zkrQ^aDiXR!6&U|QpXvzZx86REGxgIdzqxVFe)#REm=iYj8IwVZNquePEdB#lxo})$
zN8$IXE%XgZx?L7aU%(<KmJ_~Wvb`_@Tk2g{i)s2Xr}*9!7V^kc3R6A;X%bpmgI<&p
zF<DNQDTyh4^fHf??~xL}INz$*s~-%BL^1ng>MriY_T~@wCKM@eeKMZ3RAtKv(<Ao+
z<@W#VOQv^BHqgteOIKoquABJ=qP#ml!+Juauby?`p}D=pyb6InC=DMbmK+2DF%Ug=
z5T<igC+=v!nYj=STaGt$Cf|7s%prHryJycCo_wfuqOB9*XkjSP*Y$CoXh~Hg?R)`~
zCA9#xIA<A;u5fb*mvFZOo2bdq%)Ia4J>FiGY;c5Tp#5_gDmC!P_hRY$iZ?L@dG(O`
zyfKHC*#K0vm40h$sE7xjQZy3VLaB|j6+edsdGO_evg(eG?zp|bpiB{W^=!v)jj_j!
zMeUr<*<Z7rQKVhsF3|fm1j^}=?~Vm+ugqTI)hs_D+H<5M$H&G~Lmpe}Sw|g=9p2S`
zFGZBnqLt7!vNcb4vz<{k_QW+6sjwCeI(R@73%T@QHJc%__$Nv^+muf|U=poFWF84>
z%YxZ@FSBI7Ye4>Cz5W(K*S5&?&3IK5r#S6!n&w1(O>*SsoLHE^J5O(URxY8#TvpnF
z;MewAfpsHC9YQscopR`&?B(bVk}@f0-&)GE$(ObgMUbyu(6=%%OT5|;*%4+5N=J3z
z)?%?{RyyAP0mSJ+IAlW1Qu#n72f)(~Mn=f?>jZ=TgZKgPOZ0oD>aSFiD1PBukr(B>
zs!w@5qC6_}21C0z4|sw1=CoKOOwq2cz)NK|JrXqCirf9xh^9n#o=er&`v~yVppxo?
zGt(SrWFqJZ-Z}evEP`CZdJ{s<&8(32Y7vb%=Ek(_H?tia!sWt<!K`1*uMr-03NT~0
zomhdE?}3H{i)7EiClj{OSK)W*%GZSI1-w5*h$eDdfrr$yz$@huOJ{{;I|`<}*~P@L
z=R6J4joDRQy0D0w2Cqp>dP9yG+HV5vi6yEFmhXl~ek###XF&&h@soL+5Z>C*!1tz8
zPn#M}GLHrnNH?4FNY&G5g*pOE6T+{1HD44-Bl!FT<qN~*9|p)`&X%dY#<~7RgZ8%<
zQ!w~IUDMr9OZXJf3XGmdVcQ#rg#F^9{kDWXdy*OK{hqp8%lz3#3KkY=7)f~B=t{Ik
z&Z3k54XH4gzdRXm6}eqcdrPtwM8t|?FR#6c5O+ECmu@h;vHWx^r4L|mEj#`}DNcQ=
z@$_(~VGhO2Im_u8SC=s^C)12jlAj}k2!Z!wIX8{YNMyfEX}#7m9o8UCosyj#zM1^C
z^063pHfTyK_*@}cnx#pxMYX6j-!rLwetqi(f39^&x)T}P7<oKwJXXx0)Y+%{sA@pF
zqVFAUEpLs3WcTW3ZfR%haX?*5O?L;rWp#0|I0Y60<j$W^ku-7nkst!t%zysFaZ;{^
z%*S_wQ3#0l|0&O;)nVJ2>+bPq7*9vwf2-sW+sC}gD&onCX5?6xF!MmFh=;U*QAM^m
zBtKmGJ<npa);WusmN|!xmAZR$B3tly-=E!kuYl{VR*FTBoE$h0&ZV3c7ByZOaIh3H
z9jaMeh>pfnuF$p?pWROs|7wj62Qk)|?Y}t2uUIWz-K$rx#@FGMPX`<Gpb9Q10?PHm
z23$5_v;NoVb%L@^R}Y9tX`EOLIsc~BxoEjl^zhH$S+J1^8-mf+;-^!Ij;5Yqa{^N?
zw&sK&kZ8QtTWb?i%&D)!9;GRRAgI<?Vn9DYE|vk}z$jz&u7r>2N-MYL6QopCHAn`v
z?H!zloEXTDl)4hU$gjHaZTE&-1GzxOyk#Be0{#AAu5CC9_JbCNt+=B<ideG6B47aI
z+tF59{Ze%#9n4+T2f`4_S7-1v6M;Vyc=^|mW03zjZc4Mh8wJ;R1d<2I$f%nzWS<>O
ziuTj)&>_tdKmgR$_ZrqTC&wahxA0CVk7MkZlul2mL6-6c{BFXwK#?nolNpFxz{SCg
zT&MZQZn)znW3Z6gikwlSo%Q&bGOEQ{JovJVTGr~voeNQ5D1*Sm_y~WEpSk9P%6mon
zFYJQ4j=k_I)vtg5#9waIoy)Hx3$j%3XC0IVkdns~p}Cv!s+=;)0Xb~&?)$Kd7mM#q
zW%-|Ls56L28d9x@H)7}%uemr`x^o4nl^w(@FsDr$4iDX*|5s`1gL4`N<@q~ses2k-
zmw`fnjkd=tqAXC%2EnRg1m2HMWAiy5wy~=!Ul8`$I{!vw3gXZm1b^w^=6pfzYx5`*
zKjL}L;zO5_@&1^{K#{0gP?+Zxs5Ms1)M|&)piR0sn|OP!Xx$_SCmd1JqN0cgfxc8H
zX$VK3ijTpU`7itbo6sKpC?<4t-v>H`wJZ4y%a3^@9cw$k#`>t+3*$gt^v_SiH`~L*
zlJ}gx;H%92A;hU}e#hud;l7lDzCzlbcArjP+R+M?u&_Rv|DT9p*k?lWj_LAyn(mT8
zYN#C0HX|qoBZv-)Kyi%7YZr^jkF5=<Z%{JznWt~3)Z%~q^LVEzU+s|eB2m>4XB)>K
z5~C%<2Mfzc7iR^cu8oHu&phY!8D3pn%x?l9*I$Q+V_o|i<}DDFK<d_+=IV8|K=Px$
ze}Dc6EPjlX6K{0}ifLaFv$$mh5(i2dSPXazDC}N%i&*U1o?Aw*_^HhWmQ@E*!NA6A
zQ|F2J;!gnOAgm%69#@=SY`&C=;K+Kl9pAEciiPP#H8%yd7P$>R^IH8xMMWPE^fgO*
zyrO{RXJ-Qd|7lc|5s}aCn;e&0&O83@Nlbf!FN(DHwM&#}RYJX}k`wIYOKqKOzL{Sz
z^Vb<z4f@_a8G_X)pX}|M?WXuD6`)7f#()mnd77IBTKP4hvG-yHo?QIb7Cd+G?X>QE
zkQ$imET%n%CMTCsW(nQ*<Kqqz4s?&^4K&7Ej6_I=h$6Mu)U`*(q=7}1F|rELOFx;I
zW@93kl`du#!ZgjwT2hQDV4m47nzHiAHIhyKdh4&m_wF*<^5OtEf)^nGu}an1(I?yX
zS5S8t#~F`uya}u}`q|{isJV`7u%cn@6F?W3H2qz7jiaGM9_HWcI3Lwfu(+Ve#UhdV
zRRPVs<$OJv;t0_*!1R<jMsPr?7C7HnYSA2`L7ez$uHAJUy9qvUvD?Sd^j!KY`kOIt
zYU`M`MPp-HUdthgf+(tstS@lCxwCNii{40Rri?P0`)!(hcil~_mL(<Mq^<@!3EcVF
z*;4?>8y?x2Ds|(JI{?m9jym>tWrZSAW_{C-Qcr+r`V54+H`L_INy(YMreN*94X0>M
zSKa)txQaPs-I<JcqBmpN9&dE~%5vIps!OkQ_2YbT%Adu_Fr=_f48D$5vb%h~@pcM3
zN2AXLai$x%O08G6O-!ubMuoh`EiKdSgGf*K7KV*%`^|i>O?|1B9mTI?eP_oiIbP+E
zC1M{cjhQKVQo4zjIPCAUE=h9H7#d#T%(Erz__*{xtGhO=l$p^f#)-<X7+JiKvPmPe
z!B9#%FRU0<Y*gdyD{|FCfqeV!x-7>3;}a9!yPh>*qUAZLMP7e{CU+T|01@ymBa*Q9
zqrTCA?BJ0#Ih^hJKl;<*@8b2&26c86q%r(p_ULeNgudRYzU)J2zt7nEHBH$0ymsLC
zG<FRB-GO<69U_<8;UX=cb<sImmhTSh$H1SNSm`*{3fnWt+)E8BJrTaR83T3~Gidho
zvrE(Kb*|?il*#*Mnd6|fPr1(mXn~F^erLl>!gl^TE{4D3A?#PbjjCmX7JdLN17&f$
z_O2A>d-tW3S>bSGrPED#Yv652kO6tSyy@>om`3?y7_HYJof*uZ8|M<@&XYh&AX;Ru
z>v`My9RI{ZNIHm7%K%4#O~YHS(gClh2GGC_5|$J^uaf*J`H88UAgeae*<cRh+X$dr
zJD1$bEv|i4=Qmk~Psro%UfFC=+bXr-BZ%QGE_&cJlY?4TZH8td);b&K*0w=k0)OeF
zW)1QzRQsy#0iAI87cijU?77*Ofle8n%sZ5x*d&>(N%Jekg!wVX!25LJ?)l%(CK1$z
zt#|5Xrk9kYc<;&^Ux2^3Aqvz9=a~R8fN5gyEBdk0iN!&#GP8vH;ijoN@&7=Xv7g!V
zXFP$fz82IX-V2%+3ukZr>;%0)kvvhI*n2u?TyWWkqP&!jupz00=-p3IXMDU+M7L;o
zzdBDawddpmHz5-_&!5_6qswDuZNwDDD%d@e*U?c76&ga0Q#c*gtjM3r8}xT?Ht2ft
z+)Qg=JEi)D62@BQG=(cHE&xxM;1w6<8`6AV0`I>fYVg7G2naU}Poi72HuBh6!sImX
zivrA7E45r{B_tY$U(sgL!<FOhB78|GA%^xC3h9*-h$6;_J`6Sp*FFbAP=ytDR)8$t
z5Ap3An!XpG^XWfUZ#k`t%1R)?IvZr@OL<ug4hyxJ`WX_uFEJswj-^(=DC$)x(6{-6
z1E3}hzKr#0U})cxU_}?lps%I;Dy6HKfw`QhE_gC)(Ri&U55V@e79=LSQuo9}1n(48
zM=U&KgyR-tG|sx-GU)`1@hf<y?^e$>D^g#sy>Z@fo=*EZPIG&DT2jD`yZqx%OIAh7
zr5`W1vt>k?AzZd}>r?Jh9`N>py3x{x8kpC);CphGRZRxA-)}%zp&`!5KfcPI?aG$#
zO|*`H35`m0?Cf8T*7ARqm8roiBOxYz`$iG3%G3m6`Auf^9Yy<oq373F=uGvNHLBMJ
zog6`lkUt({BWSMa$Nya74l8Rf%oZ+zEEw%P9NXfkdiM+?P?Pk?xTL<v5|wL%q@Yen
zEKZ+KQuP&rK-@F&6fi5)-LFab^eb{);jzN__~_QZd#{&(&wPHH^0_r4omeRq_k~+*
zNE;YHH`9)`bQ6wNH#=o(g5z8Fw&r`lx}O@~LcM#&Cpd|~(nMivm=ZCxg}mO})#~Lf
zWutt`k;#m{QDUF5a5{+-%Imi!_Z8vL;Hhe#i<p{v896gm8$0&BJsn07JL#CCHfsA{
zd|}}+b3dzp^xrOnD9;FC+@<D7yi#K!?|WZ%Dc2d>Ef@a`NG{SWqoeuvkSsPu8BZz6
z_g5S$c{}wGsqV~H<i<th7naQDyaIMhx1`4z-?|JZeRr%@lMAX^)wkkq1NG;GPz!bK
z5BxH5!>CU!=T@!lg{tsj8o4xmPFuV3+2TTpb^P`(?p)!4Py$#)-d;c0ybGv!JUrWk
z-Kj>bxSmpx4G4nkC(=|NE!MuMbAR9zX7c{NF4|m789iPo426podSU}R1ru3?_{z6$
z`{=(2eFQDVQ0fXhdkalFI%nGgFXxy#Ri!e&Z5A@s(V(p}y;z$OdfrrxyzNce0k7}S
z<Z8dphP7RQwql#may1DuSj0N7ID-9F+_ZyFMTKkXg3ztk@qKx>>YxT<&~NC|MFr%Y
zyOg(fv{>@%KtOE|A|%Ae#K+!~g#j^1>p8w0vIhE?o9~Pun}<od0HP74?lB}X6QI^P
z%g(puADjacjVa!MU~6V_UIUo25O;VWM3F!f!~W{yM38U9wv1@=^t~M=I?!;D*cSyW
zF~rPw>aGdE!na4O8316@7~nUj1pPZjm+#;hB`cceH(Y^yxa<5FQYnJDnG7og*`=$o
zSS1pIRT4Q#w+58F6$B2?P;Lvn?nw;Qs36rfZjyF*ZS35!;>u1)k|T?kIq>-jTUncj
z!knZ4^u3h=V!LTcPIqLcE-3=??C=iYg}CX%BmOCclN^(9OvbF$eiF*!z+Gw^RBUYk
z$Fp6bwU*6-&{RtJu%0KkKaErTR~LQH1>fkIJ}6NYE>A0H5iEU(w_7;vkv-R7r)|*^
z+IW*avDXV_UlkcPruE@;N5lI|l|$*tP$E`|UltL-^m1!n501}ll@zPmZV06I1T#s|
zY)WZv>7jh>AB^ehHhuL?sdnrhf!{t55+S@*6U)ub6ZDRvUv<Z`2N5!|L}unHeR(HG
z6WB{O88LrQmNCX4WTvakW2T(?g$k1bfO}ReJ891E`^N!d@AQ(QGSN@7j{S;XvK+I$
z)rQ@AjC^~={(c$ZkWNtZwKjK>kpRqmMR5`2Crj9rC6tH@dtvP=S`mNLn4r~NEz)W8
zl{!BF8iK$6K8raTqo$>`0VYp4OXqA5;^LiVBH|8TUHS_S1Vlhka_c0->BUNM3Z3(d
zg;WB3rmynU_34P0VRD#in@pQKkSW4fuh2bpLhe^UA}I+s$JV+@_zzdqh?8!~E$`+>
z#7y5^vMtc>M_q?GIi`E2SlMw)z4N+)tqbq|q{6EACwOEk(+Ct9hlC)P3Qd6)jAzWd
zGrUJX4C9M!E<pd$slw$sQ>%r*1+=I&nKu@yWE7WbeUq#)AO%nx$shjCDuY;jWyO(K
zDUzff1wO62@T3!c0PCxJG(uKU0I$@wpNeA)?3S=`2yNXuTQ;_Zu=?iWG8{_eh4xF4
z$Y;TI{~CT($V92oCIbs4N^$qaYCXl`4_Z(11{;!MP~Q0VLPT4c8DZ6Su-x?2Ga~If
zl9Z)N*}H=^RKfr58pSt9oZ$N=@VLeU2jH(~49Km{Ck<)~JAQ6=_N?;`X?c{-2DD))
zQbh7jI#hy9Z&&R_0aGpY=eN8H*QX@f^UZkL^2V;#SiUOp3K+~>9J3(Xd<;wIOkn)Q
z=>*p-BBz5BexD|!kL>R;KB$ewxBXY;hg!5QS%?-XaU*ZTs$)=lA9udWC0>^2RBpL=
za#wBA+SJ5!IApa?s%p)<{>+rJLNC_#3gqm=Hi{p!%T1+XZ9qja)A>iavfiD@YXx=D
zYo|~<JA*-X7@%RuH2{HvzhZ{>Q8F^HjPf^+5d!9GBzoJNB{rO7>a^k~m|MNpb-g?b
z@@d<dtVYoioS=ROD-JG>EaI&68Y(<;t~OkcZvx$%4(e;F#pAgl%YB=#`iYu-vc?YW
zB*u%^pZQO#vZO<VpNkk29v6F9lmW=BHvyCu1$PT{cpiqAa(u3w<Ct;gDiD|+$)_uh
zbGH19qBbB^JDwMOL))(sBKs$YDMHExG;9ArDoOLU2@F>MA&YW1TJEkO<W$q~qynIZ
z6q13!mKDMh08ODiGOpG-IA`*L2b1t0ZhNB!_iPin3lM;!30yP;VNXR1#-OYi+PXHP
z`*Z!1#|_)Ta4&7?%Q%0?CdtY-%sc|;ZFDaGU5rPdlVIF)oYLq+!Q;6d%ujeo_fLC#
z?7lS~>XFlQBKzrpL2ALEQ~2u)6qrbx!NwWctj5)0Qpl25A#91#H&Jj7d76ziX5<&9
zTkT%%a0leurbfbc--x=dT$_4%U{{MBCi5}4=jq1XrF0(+sn2?weuu;TFW!+|-v7N1
z>V|1<nU2z|L%#qJ&?aGj;rhC0Sp$8Scz>k#l4nlDpzIyquPEl)^=TtaR8!Q#kB&FP
ze+4kM*PnH>eyRMy!)86ioZ+9+WgYOQ2Ns%gIyKxj-s!E;3L;Ei^(6Fi4Vt*1&231!
z(f2AvSV*4v<cBVih#?l{a3e)7Cw(rfMJhsLr7P&2^^Ndy?)3CfJ9(U(y=GDPiJ?n&
ziOIwg8b}9Z%y%8F{5mw74g~G3omiJ|y}eDIc=6!2+uQ+byObHT`M?~dk!Dt!{_D0t
zKZED^{9|sX>Vnj$J?lt0Q#6Wn^t+hwZ>%vP;(Yc0ynU#n9oNIC+K5hq)Qe5+l{_@Q
z=)>&`POY7|E%|{zfA*KqR^5(|5WC0Nalr=j5oTws<)4>ce_9>0vq=Ulr;!*Tk#)d1
z8Rg(!E->%~TYtqz;ZBbRSi}nu*sak!Ior=Z2Sv)tW!?=u9q@`sh@q;*0r+#*w&(Y|
z+8kNp&)@9?lGOtLZ7dkd#}MZ4i$X2QxoM&3VyWVv<;Umpl_;Nv70oZ_$+0~exXjXh
z|LCONCCqDFl#Ag9$!X?vY41;e%E~(T)`Cti@)!HQn)|7yz8AL<M2?v2KrmLLjlrO2
z^dV71EFT0!mIT-;ha#R$M#@cVEmi&gifWmQ9MWs(@N~Iy1yxNKIatb{lIOQ-|Msz!
z4FiIXP2O$uF`{UEhk%>fQ{JY)hfOPohZHTt)He+>QtD&F7b(^r=tbrF{DWAL8ox+V
zg!9RVK1HD43b%2%-gFU(^jc2}Z(ZNZO0eVch2-)wi`T}Op0TjG{JGW4B*w8w;5n*z
z=!w!bG_tcbDF`q{9m8OK8ob{lp>=82Jf%_K8l$i<Ip=7*nGuLJIJXOoO&Q<sf}YU;
zI%|UBgt+a@>JK#Fu_1xf*JdLUBh4=g+U*w1C+X}C;ZLm%`fZx)_N<+{Kn{Bwo;vuF
zY%cGWN4hfGJ-JZd<|4rY#oKfz)JP$}3PLDO`PUSho9&sQCSM;I#Nf43__+D%0xSgl
zSp-uWWJ}}8$3)Q)o_?qz;6#^gS!q&g`IRmkFv#oZ#%r)X>(A*?fCxR{e*(d7r@jPZ
zER9?|A|2vSO0G{Um!S*Uyw8*hD69VchC`%7BX+Bx`z2N;Tn_KvZBvpbZUW1pJ?noF
z!{&+Oe5dZxN$|+*eebAXS<|vqkemw*yQ#VCBrTTvta@#?C+gsEGc@)psqCQ0)Fft;
z#Pk(t;S&xTk5u3PJNlP)^r!9U*E*`f!cWP%ds6uXW3#sUhS+5rZd=I!E3VT|l@&}i
zd!8b3FHUVxH2G?sjwfb6F#YhYZsfr!#;p)WOQuU8JDlQwm=iS@vtD@4_N->HtP|eW
z*laz$eqgUT)Ojg0dgpSosX5;JV9_52W~ihZA)lN)R>)e1V;^JcZtST?=`ZoTO!_yJ
zt&P$qu<?brnqN+*S?U>TpJH+-TOC9w<4vf$AChC90?|BAs!r3}$j<wFe{sS4|Lc(5
zeWY1rjb(r<&pJ5d%gt2h(g9ue91pB|Z}0xOigeGgk5`@H;L<WfiYc~^jSfEoZ{_#%
z`(up*q5A^~`vYDXonv6boy)y<(15%bomE3-O6lG<uJ(J=N(BH)U=;84a;X*vsG5Y9
z?Kz7@u2CJ01}xKGvP$<^pfwmDA8)K-WFTjQqj<L4(qF**cR!|lEswg=GW*RgndwJ!
zKQHRwkb>Yz-#g$=)V><OWP9$GL%@(@nPRs({5$oU!|R^XL9{v8s_PO<(!YBL@2$zd
z9s?|Nob$XRvO8R&64&4NN?Vrz2zs1m<ozwzqb)$Em<E@@e;bR}y*~%GDJ?H<I+&c$
zd4Wy?RVAZE)(?pB)`!Maa5^D;{xUl!zR<d!gQvNTme#~5d6v=@;gI&uPP#hPdL%>#
z48Mgv3^tOV6al#M;d|HpUPs2lCLmp4K<$~uij(*LI<>Vng5M3-I;^=$;zmu1@uIF(
z=zy!MojiwJ<|KNg;_we{djTw?)#qak;r9ZiQ*LFB<e8Yu=2>T&%Hr>>+rZyCA>~G=
zI`9P!fo?w@&&~f#@e(?=re)=F4uVbR$hJ3nsq~8dfp1r1v&cP%E0X-GYf~td4;V^b
z%QYX^;q$OS8aAkn#?ZuMLrl@7pF36AJaXPQy%ER0LlA}5ldwf1ibQGnwSLH;i_Hr;
zS&{bq@H*MKj*_An?~YHwynf*9kNPG03mq<Ps>Y70o&Y{MHeLyj@{=p3cf+h4JFd-M
zNK{SrQ_b_I0eG%V?s4{9G+rk;uZ5#gr{V!~Di@0f&R^&^=?W60Av-%;h}q=3THuM!
zq?hyC%{wrN{Nm@9!Q(sU@ZuWl#a<ao5{}pxU0A6;<oYx!5i=ZptP)DQXPou->D#GS
zqrEVxh{W1fOCv41udCyY2}$4CZqP$A6H!hXnel%34c*;I*jXtH4p<D6Prpg5)EnB2
z)`;8R>>FW>blrqq4D6IpA^xjwiV6+_r%RuQ^FY)c*#KbBOssnTW6P_np&F0Viu)2k
z(Q!=cX#MbTy?ncRh0LHoo<H3D7v@W)I7pjL-qD3C6A{6nJ8Ue+`#rmb@@Dwdmi$ie
zldDn$O0HC0CvV4x3KJ#~+nfI9G52P+v@F5CLk?^cZe^Oh&1?CUXM-WkPfc=Wt}7K=
zD^EX!IMFpu&=&okdaszYd;7X9UA#B)LRH3GA#L=fee75nZ*50nIzuN%Smi%NxSXSO
zx{|T)2dIpD^RKe>_8$)m-hqx6MB~w7Tf+XLJ-D@PY=B-NCxonlanm<2jn*ljsRm)d
zXmC|bdn{TL(@dH9Dq2xtKn~h%px)gTDs&nwS*vz+H__DcIr^rMa4?;)We<W$ghSDz
z0WiCrQ}=@@(frP?__?#+DOt&|JTXu<7R2bfKL^Nj&Fh4F6{h!g|G1%u78U2Y5vqPW
zz8t3^b!_KFMJrrfs$=+zE~<c)$zCVGh2Ce?IQz%~*1)l6?GGdE^=>UM<JYMbhd*n;
zGAB9yYz;Vc%pV=L5k-P?R4-XeU+|f*a5g#KWY2RpqaWWd3_^n?w{9vZq%r({zL|@O
zdRo1>@M?9>yCsx(N?u8B1el>~&@D|AEg+d!+YoN<Qv@~h7^3P&DkL$S5@T<wdHw|Z
zsyAWlc5xVo88Z6@40CpU{nuQ=!C{CtZQ8?x6XJR`XRdQC=!}6-TAhhIgjbLjJIyY-
z-#mJ`#X!CAGa{;GzVoZTMse(MPj|?Cgfs9vAxtQgU<6}ZtjK6;T3Yb3XOUEQPZww+
zL7m%HJL6kcJ+e_lIDE3K2)Qu#k^?eltmEIQl~<5^Xc&SGY>LSLHdcVmu!h`Gz+6=<
zW%GGE^M?(EAzl0hf9`U{cufqdJUo8!7HJgMVt(N_$gm7atu|Ylm}HnbnjuorZAJDE
zSRm4{hw1o9^yPGo^;T-&G#)j2<&lR87<CoK4G4MM6px13#3kR0EQxvQ69-cfS0t*K
zhiCKg0su`oLX8^&6ijI_OzWCj+2LI+;oDJbt#)`L5kqg3x7jE^A7!s1e3#wPW3U9x
zpY^@@A%uuHH7Sf#8F@wlCp7xQ1zagqx=TV*Umu>sd%I{E5s&i|xtC?9T6uy80(tJ1
z@fCZ^)NA9{veX$G<WkM31V!W&3ih}1#?v>3(+E?!xfn;U{WqfEgm%;A0k-4T?uN7T
zkiS=W{b)C&w^%h3=F_h-*5ZG3H9Zv0E+kORug6;p{jhxy*%`C<_mEsuqnofj%UCo;
zpKiF+-xIz^<<{HXSa{sA!lV0k;4$G~k8spU*rO7psGws8_*yWzZvpC02aXQsk1Ui|
z)D!mTU%#y0-K8b$Ob_sg(n?mOo~c6nA-*4C2%m$#0oR5+o}(r8(2;x1y`w5t#{ClV
z$*H6##zvCdu1+J6*LG=k5!Z2S$yt{rYz<XX7w&4E1x}8YjwKE#C1y%_*aRkImUPiW
zMVTxz46A8RRl^89Zfsl(k=DE*fo#~+(w_QaF2Qw2zOr@FddTT@UNM_;ag`}?`Ix?^
z+$af;I>qV5t7p(qVEwfEJ)+n&bS_x^>OC;ASqRZ6H>Ux96|jPL5(sUS{f(qEX)Vy=
zJcstlJpUZc+Ck6<Xr-S>vlop89_Eu=RzrX%C$B_ew7(b~)jjxZ5x-UMI)y%5tvGD7
z5Y@@><-6jrULHl(z8xl#pUytEW8)vc|L5f+lKM8!Iw8Jo_9rn04Ac?tg%-P!XB1><
zp<3nDF;7i#iM$U9;9qxV$LzYn#=zoc-vH0RVlDk1WUcH3W@Wl>AzHf?NBF#sYf9@2
zMR}_xsXMn5I-OVio?dHk{3Mn;188v0(!UL$&hs+QDJr0Vhr>q-5eEh=Y8Wx7(-KZA
zQpF!r)v8Ov>M#XPy!sB8NnmJuw6INZEn3#*`hUu<@Qa4^Zl1Eh8a_~A#SZ0+1Yko+
zTI;76pn9#|)&>JlsQ=N6VVG}S9z`(?JYbc5e@{(*guIv!f(<hQw_sCguLz&F1ND;q
zPyzx!_RLQHXkPL|`8tM;bx1ZCz=_~NIls0|nPJ4FEAi)Uzti+d55sAC1rmRYD}sIv
zzImHlB4cnf<TdfjyzWi13VJ-<vm7@HUvh#zo2ojA3B-nLHOW7F35xZ_dAEY7l=~H7
zF5mLp;ga;XXKL=7Q(Nacnun+B!?u)&iwjVQIICB)LZOV&X*P(;AI;<e2PE!-Ts&>-
z7r}ZIVxDpcO@CtQxPl2pwFVkRKc)GhRGPGsElVTd|FehhPQmUe6c4*fgpmARG$IB~
zaZE!(AN4Ly^*%_f>o1N8l}~*6u6}NQo)EVO%)jG|k&&!A!zq~mAa9;3rKGTxhT!z8
z@-l)P=p(r*<sGVy2V!hkyEekvnV`$eAHnCAR&yC*Sm=TTKrvaX+~vzr%?xvHpjXuy
zOFixipVQRyHE5X)0_!q)`u+(0%f}%2gp(Ph@VaxaaGf!m7$nuSIyD{3jjq}{_vT&w
zt)B1SrSZ^_h;6;(0uLBw4b|p)_2t}@5G`qaw}HcOtg!xswq9*scX050?ADeA;SgLV
z_XpMil^KNFuIfAtnOC1BrhN@`aX0bQ#QYvV+CP$8em$?Vzxr4d9QvdIkAQUycmR(G
zN1M98g0_R#g9!(JE#{B@xoK3e<?8c7d9%eOWgvK{$OXOAeTqz#w#EYXIZx+g;Ihz`
zNl9bK&Nn2xWPRx$$lIyor)#C;@~ef~^Q9>o@&L&oGTwLgxy$j~>}{qQyAIaLad9V~
z9OgvyBLCtj-b4;upS8O#s{Y6Stj>sHavJ}_S;Nw%g|$U-qYh*sdYv4hO$DRV3J6QM
z&~e8~yWFXfASm0D6f%D}X|AP7O>C9sL|<U~8lthkz>{zZbGTIFIzK;;H;aymiHW=V
z<;&dMmpMfSeIfdM_3+gHeC%s$K=h)3zchpu(<60I!gMTPw2FgoK7_-q#&B>kCmc>M
zY<)%7bKw;|QqaMXovsHz>`6QGN56O!#v4ORL4EJE+%|LW!d+m^xIXW2=c43d&a2*o
z*ZrYrA|jm3P7jd7wv|}_UI=k802AJt#m`sTW&ss@q<PZX9bngc){CSLezg(QDH_uO
zZF`g$`Q?>JpL7@3r!}EL2At60h=CG+9KIP3E?aAnhN<Q?i)8Z2%U5Vdl%R<9t_U4g
zQ3FHYBB5}S#P1{84_$)n$Rinx50B18Oy&PNuf@g>v2Ch*Q}^aNR`o|?g$ojXI*axA
zlVoGQT=9dU`bWxTJdqVFS9cy{8S^cK&UX$xDqTCE!v2Fn(!~y<pp3`52O4G!?PC)o
zc0DeQ(`L5{4GeXc@^BF*&s%hoL-s>FzXEW^l)tU|*3~%+aA-Zd8?Q(dLf(>(o4>zK
zsh^siJy?zAwMj5JZgr*6-kIKglK<g%t1?PyK@IRf-R*X9&*XR)V{Z@Sz5x#Infgr2
z5AyUKVNxkiOer%C^dEJRr2nJnyrZf9|2TfFva&L+@vR8C_O(foRjz%x_Q)PLWM*ZQ
zJxaP}m!z(JDO_^RZ#K6p8TX6uEpg+9U&&s-_xBusIXE1j&wIR{&*$R_XcD*3Sl`ih
z?6`nW%;|B;-e%}m=xZ;R(TSg3TSPL{1z{WlpB{CRp7A&KYLH$6%ejiCz{87ja{A1c
zgGJK;E2SoyBlVozs_WfbR4%SxaoxPqteA+%5LrIy;_n9ffp2r+siFLTo=wD*WFTs2
zMkR_lS<iOW3{sGzzoK}EE#Fa<7#b1$4POHY{1K_9*VCoQx3bOXWmN`OwaMG!XQB`=
z_-i&0pTFjR6`4Kfw}qBo*``{E*@iEoB2cKCIPt|}WTu+Y!slxOcI%)XaWLp}e<GA6
zsAhBp)@oc!|8~YmP75v3ZZ08kE|T09=rXNpU0hki6}{y}c$IT2#p1g3`;&Joo=Da*
zJ!)fB*n*1L!mw@`bBijg4LM7IpF4-}ZNmiS;9VN_Xru5NLpfLe@Y<WZazCe#K|+BA
zVa9UNYzU`?DKi(!;ExYXyv^Sz!LAA0S-~EF*~BUXk+S)qK@Q<?mzQ`<?|HvtR$IHn
z?dm_HU9Mfx2kX;IX0Li&8&5zTeI@VS(N4S@E|iYc{UM~N<x-B^gQJh1p1PM=KOy<G
z7$yFSIXR6x{58}zDZ(q{VdUw(Sb;gW%Ei-nC&hD=Z7UZ-fLrW1+;mu_?`?aBmA3JN
zQ|0)ALo{UgxN|0i-eK}Qcu|kSgW$>T&`SS<Q<&vkNnRX`rSQg#^~^OX<AQVYnpG+$
z427N=mpQk!8H@|0vYc`d+;M`kUYrMi7ka7Vcp-e!m%5k>Qr+!mURea<?8}tp3OFE=
z^eReU$T+T~qVlQ`@ru(JCLDsih>Bm)y=8iEbUD6g`FMRGt8y2BK2!Ej#&vog?#W+F
z;}Ybuj0k+7U5b=;p@5A{Bh^7PJ?5hfg#a2~4ro0mXrKM6gyX80OI>WD$yWxAB=-7)
zdyc^Ms{7&IE&~MeD1_{nkCc}1w=rvOsxBV;v<EkmG@6<@+^G$m+nsldQ%*YoDT+?l
zSYR-7)v%y0uZ4EYW_e4!*fBy&2qn3T9Cj~lL=OJw-b#o+yE)jh@RK8r?V<`^-0Tq5
zwzkfv90z(qLbX@0cvXXFXbcONP@2^j6IecUS&=mO10Wffw!&cF)~Dwn`l62b85rP8
zEskOOKt~1$`@oIhu|k2hW`w1Uu0~?Rq|p$Rs%Qm6P1{oqju2pUJ9hbW{{(lro<JEp
zSgx|@+ab<wbe|r|SVn1xWX~L(xVWfKsz}>3Dg8{l1lio&b)gJEJ5>>$FaxgAI+Itl
z6)^_)z;M{CLlrlbt4|xpYmLa?L-qux{jB(TSFB|$NlGt+>ziVB=nu$!<}jFekAke1
z<f|Ra%n*_DXfyMaqRt1uIoY@?3e1Y&gN|vAV>7-zOAi(T%BaXDk-y3FP<myYaI~@d
z1%|9AR=w807pt)K@77aa=3~zJ%2j+QT6Io*Ra@5@Sgj}~?3tXEl#W8dy!)O0(qnh`
zS;d-=Rm`5-n634C2}wTkjs~b5!2dajD^L-9#&hTQp~}ZjuN(UOJFWE38Z~^aV4n3x
z7xg<`Q!yU$DL$uvfQkL-WlvLfLm4MK!ZieJQm@EG==MNwN>20OTnIGLK{uB^!%@xj
z(I#J5eyfqNXja%)XQ=^3U2?}qpN+1Ul`ztFU|$k!#X0kVx%h9MF<GsEzhwn;+Lp7B
z(j;#VPwNM?ofuH-lc|TZbgJ{(Klo#x{@&koVM-In^+?;o6HBDq`Cb{6ErYXkJ`RIi
z!78ili_5ePPMIy>iFqc=A5Y_b@v}=C+`q=Y{tE3@ypniFBk4T`2}0c|?~`G!@`Zbs
z<Kv@Fk9Wq@|0_c3V|bqJX>I%t!$K0U^&F-9Yr(3m)=hs_ebF|4UiF278}LG33O#3d
z{EPaMb7*Kt;VFBjF5fd&)|dww1GerQ6^(GmBO-k{cBCKrGL<K^>xz##3t=K%a|JWG
z@%gO*&iGeuFp+)J`{Mh5-m`M^jVw!PvUSZ%>T4Eay>wyW`u1<2Y$D24m{80^d)BWp
zUHo6A7!<?g4V$CJ-}1_1T4q>zYn+BvjFqbLm?PIdg^h@w^QT)Gf8Vc2GfZwKb&6*x
zT(p|-L&_s+#A+-wUn60@c>K&Dg%3W!6@R!EnEeab)V06pM7h!R0h?u&yU={(bfiqi
zw~eE(+f@ph*K9_$`uZ2!i8Wj{u;`B2NS6m1B4>Z9fbh?mfPfcZGEmmBHh!f?0}9>I
zC+jO;0VHw%y(1p+>eZG>1_Fl9nV1n|Ai)PQ9!0)BM&60jE2NoON~7z|gx$CEL8lu_
z@k?%vknh+Ka%;rA+hGV4Ogk9zl|DX)nsn^}MDci;&ejoaEnGIGMa`{SL~q}qpZ0Mz
z{O01)hFecud&knt@xl=3I5mts>xah&5oa9xNCw4%togN}<k{t4e(YM*xyoCwvz)aO
z!^ruKyGQt$7@@S~wQ}C5tvaIG%xJOdcv5=CNaIY$iFq>5LhXgHg}sL>U5#~lCZbAn
z2em}kBRwYFlxX^I3!6cNS>}+x9f3@`wYi(V9@njCl@2fLZ?jLtShI@8QtP>OWPkZ{
z9E58r{~go3OZI<EcEE*;BCM`@6j4J&tV9*&oVlpjy#{}2v3HJc#iGf&;{V=|NL|8w
zUJGY+H<}vslTIef`J`g2LP{XtpL*hVx-;QQZ?U%iBjQf2Xasjf6Ip3e!$*wACz=_o
zt`B>{wU;$GPVB4iWGg`|VUXNV51--&7-P*a1Z!dr^MBp;VuwZvUau^vTD!09e1y&p
zA-HN%O-7WU2B3~Mt2k674Qo9)Wxgp(<_XEZm3}!ry}B58ovwn1A;Z(OsWE-nez|@c
zp<)^E?wP!_pZK@2ojhxK35!WA5?PHTqgF~PL&(y)l6PPTG}IHjDi~T|&L+_9=5Egm
zDh4lEV|abuF$nlL+uUPo$*_mLT~s$iq2veu*%J_Jo%0K<{)Bz}{p#2^dR1wUjU6#M
z_j8tu4)}aHkFu%KcJmC-gvNHXo?O+P;NeISvTxYu#?-z5X_Z^=e-}>a8RVATLVNj)
zWF$F#s=nj@{p^`!cJX*&7^$ZA%jwq?2xuKVJl>D~ao*L2nx&>^xygi?F3=q+@<dc^
zgOWXk%Zm@=ejN|c)Pq44(mK&-sukUpz&yg&uo?wV9fG^aKkqBzQ74=oz7Xhm*mc<c
zSDJsDQ<}f_jY~CL&k^4~&zuv8B@{ekhqswUB=jUi0TOrc5-3aF-H5%aceRo!ElDT`
zE_$x=Ul}*?vp8duNsFsd5hiw?mDn(t!|w$IOpp3FCFFx{uPpNj_JiyaClN1Vcu`Ep
zSzdEUy!Q!IFPqMubZxBsSH`U9u#hw(c%6zfjP(ju&~k8sU-qpO5=4C{<Nd}PPbXWy
zyxvDrGC?(sGtl?O2H!Vjl^gVU&~?1Jh_#dP%kOiI!p{Ep6JX6z?w~?$vY9$Q-+Ga+
zqB*E70*2%AVSJxWp{@Pw+{>4Ny|kcek`T)Wp~X~t<dmabH_wy4zUC9kQm5eRTHD$0
z`R<onTV)TAHrjN$PC`Ut#xc!fFr|FWR4&QrY3>|)Gj=$2`Rno7>Dg)U_~wJ2<LSou
zgSsvGLfX!p+mXA-xzx{*JKE}^<@;@+#`-$`Il+r@V7DzYz8jNXgbZobyYUnpgPnEr
zPlA>^$24;$)2l&58Ruu<c0{ZhC@l?p0*6$$26=tw^Nfp&Q*pF2lVkIps?PEG*W#s)
zCVJ)Sn0Ptb6jtRt`teG=W%QOUjc@6VRtn--DR5C2Gb2afpi4E)9pf?EN~m)Y)w;xq
z!<hc8zft;Rt_YiT&bxk-*y==>(aIrOX27cQ>Sr81nSt5oW0;5K;<QcXj6Dj40{E-F
zh6e2l1dMa}y>3~HIebKLh{V$<vU<1Pp^-wmuOZT>Jx0>6T+*!ar+288VXO=6azN+6
zag$%cNNU>lln^4~jgen6b}-1e`3@ud9Bxxhs97(`@a_{ooENr9KkR-AXR$kkt#A~1
zcd}eDHzUJVkjPOnCL4&5oN}__;|j17wu8Mq@CC6JnYpYnDlFG*fFIy7Fau_v*lh&{
zxsdobnDdtKln@aFY|-xh@MONYw7u`jom<Wxw^hWrK81d@`QVf%Zehg(e_Rdl;CY($
zP-O`Nl^HA8vc>?Hp$2O&DW(Jr^1aMo=UAzmT;$Hc%2es4{4q0Cm~wIO4i9YZnb!>N
zQ?Zz|59R+yy<#*)ugLodC7#6pZ)&=|^m&NzW3x$=t6z?D)4lzuC!#oBz0vllEi3ee
zyx;8Q7(&d!&lY}jN$x_4*pvZzNfC#43@ncf){7sdM8VGQi|8}u`z|gmyq-zx6D`Cx
zEAsIvUwn&v1I3Bb>!z?X8aXkFOQ(oli$B{=1NDPvzbuw|;tq~ivI<{p1We<Jt*^dW
zF3J2@jgC3nJptzuH|@r`n8M$tA{1IBzFV%w?>mU<hLojZm5s$D=_{<5ovRh$Rwz48
zPEe|Ra#%-E=s?{tqd*m^o-5&Uy+Rd6nJs5D?~*mGN0c^l+<iO*`|^?fiFnIjNK$Ui
z8^guCfn3v+@5)YhI!P=B5~3AFVgwhX@qUBemhcrwuLZr^Y}~gB8uD$Y=oQb47uv2o
z?#X>FcI68fZpLvdF#R2`Q4VX%zT6!TZL5h7RqpTI$Cy-ZvnvVZy)Y+9;KT+|XjR?m
zRxZ8${jbX+A82ESw)A^@d$egSDrk3nQe{0FO>lHVcRJZEK3G^ED+K<lX{;AwkrS-$
zbpSG2r}42#4Pr>1J&9Lm?+ZTLcu=)m!&?Mk{Z&?W-M<%CMTWXLMIC(~u7t9h-pyv}
zg?<*%GtEc@XY6-e4|jGbK%?zg0Nuq8fL?IfGkbnx72q9H(+R4}iYb~TQ95gD3VaZp
zr9;ifU+PfiL^z)6>#&3oYiX3mGWEc1U=sI->H)4zr9WMHZD&ZOrIpl&=6KL*T(=Q}
zx3bu7amByvi2_G^<Or~oZx<?<+NcMPtc13|;q}Dg!yf5Q&lNZ~P|@=KU9PcPo(Q1`
zEY;K|bW}4<!wFwECs2WH>a`f4@O;3UxJLX1ugd)#810JU$iL|(FUlxc8`rf^Uk~qY
zNQ^G=t*o$u8N9oOo+^K9`EGCwgABR{m&8l_S{PJJ;SxVAC`P2F8w>F6*3;k7#8c{e
zPUd@J4}Pz#gdR{5PTIgRFLd_I<YD6<-GzXSE|*V_3}$9A5<JY=+$GICC35*5*?fM4
zdQrlCbzR=?kY1QCu3x$1{$JwKj-zj(j^A<lowu2ImXTf`9I7qdXLqlA=1jnFfGU<f
zIcyr?(&qBIR}q}KFAZ7QPO$lBWGenuACYor@Z4KP#)7T~rtf-oC$-V|7nQD&bF9T9
zj+6*Fcoo#Z_3vJHj&^a+QvTx%$IYhyoHTqDsz%3C0KbfMy5lzhVFjUY14By)1S=t+
zPNJNaShtepUK`>vht&%on;2;)*jAu(02F=W<m4n~VTS_bB?<2>8EU7RBw%-m(c|BJ
z(MV#G;jvdqlPKi!T;eoU(HygZT&Z$KxPwcn69UTOyg$CG=1Chx<E;@*qXJ(Cl{i2Y
zK{iX@6Zh(F;=G5&i2R0{A_I<3sB5vl$R8`|V?Xs!y{3&m&oIw}d;V?pQ9nNu>oY!<
z#|59M<1Ycmek<YZyktWi(SL4q7JHT9J2o|hRKwnNvgx#>Mi>snPjLL29NMcIvd~P<
zW0f*UN%U)xysL!p<*SL>a_x?KpYYF4-&$4MS)}&EZdY@}Gwu^VgjJb6%rLDzNC+PS
zJ%`V^UvJ0|6U)a;XL+;iMfQDnZRNuG)r;KIIaRmVo9Y;G1%cAiSIFYKvQoV^R|bbo
zKlB+*NsPreKz&qv4!ZcgviObP<T2G~96zJGBTFxOuBP1RMTQT&E$K^qA`7(IFK}SN
z+$MWM2$$n2iq}NbT3zsVpvsdhka*=(7@q^L#jrz_Cx)KBycxG|L-`ogwy?C2vmCY2
z2&Ru^RU=31G2NIvy9I>0870ynTwCO5Om(2ev5~hL!zQL~=A;P@(4r*7-$5M`(mh8D
z!Qh2;h3Tg}1-Q^LD^W_m8{SUS@U<$p|1xCs>%NA?Ewr^5%Lr7pHSoMMxe8r$_5*!(
zxC0JB-5!;DI%mhn?q{HT0sgB>A?<SfKHT4h5>r+Qw$?%mjEU0VE(JXb&bQ<+rNOP;
z`O2D!>d9aGYs*KcF{1Qn?8J;c(9pIyxIYnj`*!B2*m^?6%lN3M>e>K;<3Vjq#DfQC
z_jR81{r6wRM=Yr5eZKhAKt8nagJRnD%C}OcZ|^R5?m5;hf4yUn!c&m}zcc*uv(XH@
zK|LggK)!CH@sTdSsln(jnsgepw>~XQ{Rw^$m=(lyh*Tm_`4QxT51Qj4bep1!DiP`b
zs9boK=PV=qQ`@57P}sA@`y2LkSVO}L=<?2-#pGS<(8UpHJH$Ge$Hn(tzr?9?@&}|?
z6$kIB(U7B|vCsAXHb`lt04S#(a~nS6SzDD8KC;8iMZIv$At<+vsfaFxJ~$$;$!c_G
zuI~f=^rVVhGTZCA+S)oTYM#c})RSViXY@=l{+v<*@6M+au)NBO6^##3sC<GC$8*!~
z*+w1(yrr}iJyWDd&O4IqIRn?I#svP(q(?6VQ<*&8uG=nNcZ-G$w<mMy6{R5WstEmY
znFmd>{$&Y=YgI}J#yP#87n=QkzjiQZkNZ5uyQ;}3W@|zUXK^>zG(pl?O&D3j;D$bM
zQV1-i#kMV3&7NL}j{zPcol#s{Tf07L-{lZI*Se`dnOx)2zc)AMXTt8E;pFgLSV@L*
zuK3RBwE%<uyey+qB+;3tPmZ}H15y-4aLqbs>e&^$;o?VNPybQJnlF@klN17Psq=2T
z)(rPF1se@Y-J&_S%G)tE>(wU|3dc)O+Z*5PIozeCrJX;=wHAzk4djNFtZG)h$v&5B
zSWdFWIsJ<Zyu$QGMd<66$=wkRGKq@{ly?Cd9WR3e<7-P9RDArNve@yr7s1&frf;}Y
z-J|w4H@<Fr8XME01Meh#W+l+W)FDEJgICj{p+Kog&=9#P^k%vNt`Fv|T<~OvRUe!5
z)OQV?aJ9mEL3}gZ;Gky3ry`sGvXZh1I|P#W?b-tQGmPQ2P5fw!-`y;2dU=!M%rCjQ
zd|}a}Gqx??b6<%Eh_Pl<&jq>%W;ffOdY;S3v6CXjicJx@EVh%Y1$iq(N7)15vG4km
z!%SJyGq&lWR`$qION{hlM`c6(I|sFwb1nGRnM^Gc{#Mqh_=NMUhiD5@o>o+stIoqi
zrQO#N^3=TTFDikPGlNC$P)JnAf`)IJJ7hqVRNOS-2$N4yS_xXX*#WK?QTxXS+luVz
zLeT68A?^#SBE<uOWvg8EaG|fEg2+2$4ahLIPuqOEb3Sl}rdAcIacP|5D!jniM{!Ho
z*-+@7-`LgZIa+&~aOmDV+h7a$JhYx}*Kv?0?u+}8vm<n+^3fb=8G~pg+jIxn$xwf4
zu%GN`bDXTi-E=c&FV8o`y{P<%yTg3DZ4FXhTkAgjjq^F<0Os@4rG%%0ol&PB>~0R;
zD>c>84Q0$nZfoZ(8<<kJ4GZ9PM+?mY;|O=@H#wJv+*w~ov~D<F+}Qd2Qo(vuGgV4g
z;UN4)W&U*qsW*By)h-br!vZ)L4{yshJ%V~pRX=pOZpUI(m|htz0wx6Kv(E96;ZjUy
z*6`O8m9+tuT!X-Cj%|iF)MCTB{T&VD1djbn*ti37gbZi@wp@)q{1_a6RoNM5Ly2%s
zFUXgZ#v!EA?GZ$0`LJ|TLtB583tja_<k9aHHnDD3g}h*@OmIs>Evprq5ERnFf6v9a
zqNE5ZR!mo6ly*t?BVe(4W-1|Kr0<zxki><~m%|}r>#qan7b5EjPv9i$bhwqfWF$FX
z4}{&I0~w5@8NQ@S;j1=`k`>SjXi)vTYD4X8$iLUAb%<nPO)6GN(cZK8uF!2f!Dz-O
z5P$I^agUGnhNjOWu@Yjde`o<n$63xe;4thDLQUE0cS4Ux)@v-uxGQ2ZE7*Y-WV3TU
z>#Zw#IibA{fdU_I&Y4DYY&R_0M%-XuUJ13GQz?C38HP3T!C{nUG^hr%76#E7j7~;3
zVG`1Y2Geo+yewnYTv<y=#q+HruzVyqz-!A?^WHVB;T@@S`y-Sm%fR~9@)y`dDCAlH
zGV4rD0V=ko4fN$CR*z~1w$Aq)JWV)qJEfla<Y<<fKU^BpD8vp$^RRr|2`ycP3kX`|
zv0oD}*Z)_tH|us;J4A+2oL<m7Nso3zntlL*CNL}RD4GW^o!%onJXrT$krm6H+~1Fa
zRuw~a1#Rp9c+Bx+l~%_V`(eLTDF{^lbZ(nnIq6<lQkP7*tD@yw?4P6r>4(z&W9Ryq
z{}!Wi8W;ZngOZ%+eRY<>P#lrPW_lx4V${P@J$4ouAH=cBBz`Vot<cBk8_7H5!s3i=
ze_|U`#<7cKW)W|q<F|LykM^#3Of?1NWUORdWRQ1Kl2U)(+UZjB-E@FQFl7$vYYLMX
z^a|WOiiZwi#F_)<XEf(qbL^YeiJP_{!UvOdLxGB{NPxdgdwTiE#BKUx41p{GtL~W2
zOuy6<x0#ky42W4LCksJ6lj+p4jEve=x)fah4)JKS!>u!V=MbojV9y_gHO`^ZVZI}2
zDbD4V-OlDDSvK+2&r>@p+0I}QZq%CQq>QaNQ>KI3d~ntyPt%5qJ}-ae^8A?=GOYri
zSo^a!R%lI$m|v*;9-5G__tDNR{{xWvbuL*%peQIpu&}3Dy1P(SU!nCYz5Z<x=N0w$
zCa`EHyKZ8}Q*KqYh{=crMdw0I#l$a=dcvGv%6i<*G>fm{#CqEoK<q0dm%Lo+*=jsd
zu^?%i(G`{HBn-FzdDxi4AzFz|`n$;mPmG+tH&aJkGp|y9E9SGcuoCm|_&EBB=X^NI
z+HP?~MTz`s>viyY1FatZ@^a4`4C`cMM=@0wNK0-UL$KtIj)xQiYDJB<qn*M#`RS=>
zW$W5pgX*9SDll8Klw6@^3R6xBYsJXc>fPE4Y7?e*{t#CUu<U&^7{VGRg@d92Lbn8?
ztRU|2T2;%pU)l+EL2*!A>#wf$nLW<E2$w2MC2(0-B`Mh!I(wKd#!>^>v#Y-^7#N7D
z4Sw)o7UNc6zEhGmFy&opU5$_?y%Ds6ZNxb!@fGeJ+J5-Q?=?i6v>n%NI%#TC)IUVL
zlzQv+sR<4OE-75fSmzLe5jt=bM*UgZNON59<8I@-#OJYw(@knN=ZG~xA+W2)r52OC
ztiQC=ocaGMiLe#MTh)~#V1tQBDK&R74_$m>v6UQ`EbZPaJ=3sZ9Peuad{F4MmCTgp
z%6rpSLHPE@L{Z(`0cZf}bBYtKv&UJ!?^BlNZ8ddxGMA6fN*7=`F4`52d6zuz{XHdT
zBU{NxGMo3C@_5hh1i~4@AN##(EQ_fQxf`HLpkxr~CGG_CY@lRGZDqqnLF<yPqLJ_h
zhMoqL5VLtH1J4T#vKn_YdhQ3^GA#5c4eTIm%ulI89I7&idN{AV>Uj`6pzXx9JCh6D
zvG!1UM>t$cK5jk(uKZXU0U;uoPk1_*9h0w+9G--oyYXjoPY0N_PY!96#RKT_Ckr@l
zAiE=Dl0{}_8k@0A@gsjw+kUWJ=!w0(^y6b|34N(Wm*^@o^E>dB8>~I44kbfZ(mZ`j
zvmckqCO7Hx{CpKRD`9M+CG2pGUi;VT*BZ5jT1uX0*S+THimAfG+ZKrz4k{SvWEdr^
z2#~vYtFfu{qFb-&UlXQ#dsBGD-Z7fV-1?n<&n;Q%!~dMsxRn07B5528G3v*7bDp16
z$$v!|Yz_3y#@8ay%Hzt?|5a#!x*1`9z6kGZ%}f8Cuj8~t*GKKx*_LkIsnlPRF`zmx
z?Lppx={IaKmiEP`ivu=IINO~M?m1nrdj*cFX*yVoYnHtT1}QurB$>-I9Wv)Q84*Zn
zFoB)y)Dt#s7d~-9-Yx{aXq~O6besec9RGT9jV7!3^2s$%xr@|1^#$VPhN?+T+;^Ah
z7F6b|+LH%8`wcW(o5d7ZfOmR&{@l_A?|O)cHCf3yPkOU!y!}mX_1jLD&>wz*#2{66
z2JVOjfJ4rxa>Uz$3Fpvq0Kqj_?Wp4hTs>kTVgp=3Dx>qJ4t8Q(D5pKAZD3750b6I7
zMyW#KV(i(pof{<rW!F$!w^KW2P_+!ldDYj@0;UPJp2q$JM^O)*9O||pBpc4k8^+=u
zudQva(^YC+|Ij<TPTCZH@#D6(#iWXbHHGkEqq3ontMAiOQ0|4X-5}NmY+?1ZOU<{I
z65hKu0zHZ;tVs9Pyqe9O!{99!YjxJ?|B~bI<{3Y19c|9;Qo24ns||Q(-l$4j8hiU(
zk%nEvN33+sIOe={Z3GQ+h77;I_T83l4_dDb^2o#tL#<Iy1U<6J*C7#It_p==<N@e*
zve2h<szPG9EdsL`znsO`yYhy^hh^z>h|!GE9CA=gi@|RudE6atX^AoOJKfXNNzKY?
z%e=osM(nIJtEEzY55=(-I!JA$&-vvZKOw;-q#12tOgzv^g6*8Fs|BuQkc*xhqbCX+
zK^WwWgIn}gr=xt0l?b;De^cQdMFZ==YzJ-EIeWA|`=mxC1{?NmI^XZDI2Dp-)b(Z1
z(&Vhazcm!$PM$bg|N51V4=YdQofKQwM@0u`XTe2l8_<R*9d=JLPyVC*Hy60+aC#Hw
zR~4ku6Y0j?%C#?M$F!*TMl-4*Z~shs)EB7jt<{%N>)}r5mI+O2_ZIg%-rtMcJi+bn
zJ*l~LSE~N(4qfJ?F7E{=?AL)6e!5P8l#*1GMgN%<Gf^3cRqYN9wBDn1Z>7N)C4CHg
z-{jNVs4G3U1@mi&ZGC!?)yO`Iy&ReRCg+vj$$vvh-cHF!P;jnW$)1zK(Q#oMf>aIZ
zWSFXXn;v1a;JZBEG;ZV7asRGGsR>=+(=|ZnfY2Rdxwxp_=L;C*Z<j`C9geG3+#`&W
zIq!CTb0@7i&s8wz{od`DP9D&CPeKlqv}e<;^h`=UW_D<&Q~p;km!4m-drGgpy9TGo
zBW2(#bFPGHY}@_zdpoZsqBH3IvHX>R97|FT+jff=J$N}Wcivo2VU>z|-?OywV19$_
zs6PJnfBVC|Gw<`pR4nF7TK)S(7JCwO%ssgib6Vj8Awt%s?FQ)50+$Lk_Ah&O6Rd2}
zb-!=o7<>QIr*6cmv7hLiPU`%{d44U%6ZFXt1c$g2X~Y>LLf{700wjRK+np_8Xg~<W
zeOe1&C+>dzdXObBt`mR!KTy-h|Bbjd$Hm31>tarcF3LwPqL}ZbtZG6&6)HR}+@99b
zajgP9Q>pPk8ub-wf1I2Ki3}|SovnVX1L?_|ZY?vEZNLa->;D?MaSJZ}^C({Ant?w1
z&=~4-wIw|%=T=VL4#EPE5C6O0#xw_&90v91NIN%DTqxXyt${gefs`lPri=<@Iz1g7
zCU7X)md3&LTf~!@GRA_*bf3F7GyZY`tQxh)2jSof(D&&z(rg}ds)J9V*H^)qarxJK
z*WeeGs*X<7RVb4~>`k}1R&KkvC7{mTOhf{ytY^utTi)hL+OmaCH(>aW<(R>?j(wV2
zXY!G}`IY{J8RKv9Me~*NVCOdEcS<)gb${|Z-_s|D$aP^-Bh50DUmb-ysPvDV--v9R
zP6w51+HMV+)SVb$_&^<DJ}b($GeZ9DO4$ARHD&|%kJJFndo9qi4v9e;3SV8)br2o<
zsn-dg^AjtnG+xjn;f6&knDxKc`nEjldCjY4c9kx8S!0*t`bud~ix#)?VqPmu#G2Wf
zoKme_bIyZZOcmp#pzuTEmukp}A)=eT12MunQDGD=LDhT;U%}>m;dwKpI>&ru^fp=3
zm!--U!F`b{@f^Ayt5*GtO)zj+mUSKT+<eYaX3*09*!>Wu?I&{Hs?FcGQ%_Qj_2t4+
z<J>e=;idJR^lE#56@xdrvB4R`P^amLrM;?2V{$K6S)V)ji06I%mC836{AQ|Fqs7(s
zo`3)S9J31I;8QT~4v5by43FozT;@XWWT}_+UtH=V%59e100qQjLEgaTd!(qIxcg&&
zAjn3St%j5gCbg^NC}^5l%jzx)PMPoXKj<j6zn2zr_-Aby4BU>oUsB;F^-?m=B4q#S
zLpFHJSMo|p()kXDgTA!sh&-~8ryeBbUVSll&fefGyJfnvm3($N1R%0-SFV-^b;u7!
zwO=Cq9^Z7WFgFr@Zju3mIY%MbZn9ljseo|hldeH9ddW@M^v3;vT4cR(|1LSoFtPpn
zKNjU4cFFJZ_hT>fW|6wz!mkuc+A`^~5KUk<Df!>#_404sv5^f_AMSm8>dZ!WtI#yy
z)yV6pMJ5pmeZGk;0qNnaA9v#a)-z0ef#hf7L;nouTDO(K^*7-e%LP^y%_mpl<Z*sz
zy7!jFeX?zxqjobIo`Kmc1Y2%4Z<{&$t59nPw~00I7(@Ji#7-U3g1e)4e)ClveBIci
zjNuy4vy)L<Y=3KHo5nqOK$UrE5<Zg=9S6b)C4$4VNXi9>%aO_GJO7dpD7ejsYnR@t
zY1r7@dO2<O+KMVdb;s1PaIa`8!f@q*j$1UShv*#%{jZVJexBx3;Qd^TMF&zM680V>
zbS&IU2s<z<>I)#uZA8K<C)ZSl5*>qQgr|VBr08ICGYD$ryRrH8Qo6Od*`$!CH?({4
zo+S-<2mx=e(jiz4Nacl3_hY&`&kk-t>^<GO;vCZVI=Yax{=|+Be26spq9V(?>;1VV
zUh+bssSdrPsFkhV>yVXBu&EwpX6R?T%p-Uon){1LiamkDAA@Hd28^#|mBq#Vx|1pw
z5<#j9m}<+e`KZkd_w+d8Tj%i);!AwF-Cyf@=2TNl_ZAN@7a97wK8V~)Z&CaU%$dVT
z+G&b8VHNZwR?XzQEj4ukU5ovSdlcG1oNe`sO73FRbku8sn9!oW7wW;TJ@FA^$gjkP
z90y5&wJWYvhG`bm_z9+rJna0v7%2yNvTTVk>c?hjczf$25hI!~CUP{@g*hqx1rd1S
zu!xC4s5fdwCfb(l;8}3@=yYx@l2w;Q&Q|Y6JuXDVP*m(vePz&K2smiKjb58AOob|O
zJ!_k>TNM1ro-*P~ZwaopE-uq^e%zmgLC2HJsK3Y{QVLhimD04j^X)}<21mh!9}4y8
zFYXT)C-YEIv6h&yyvz0^_b4`vFPjfu?D#;_6wljK^U+I7GQ)YS+8i?w=pRGfmqRO<
zT110bD}aQxUGR`g!So+&O6whu_o3vA&vJKgv-wkeAz@=b?x3&I8&()X(}tD)HwyC+
zGnVJ0Dt#$;b5wX&U@+ilJ$>2%SOaGVXS)f}+w1NB4AJ?O(G<_$o$|Zo&KeC51l9o6
z2*P~OrON<r_baArclnM3ze-l~^I(&ys!cs5XGkt{I+R;|9&g0g2d2xB9iRChdRvE*
z1>c?ngBepso2ZZpQ3&@6=23`G+kC1<j;Ba!TtXY1m5<jPhVB~nVf-zw7d>kDii|Ve
z?z{qH<CC`NQdH*R4w$Nb8R`Dp(tEKwXrqv**&r@(p3B%k)X+x%&!3n+k3S4P{C_q-
z+l=VYU*YfRj1%C>&$BQ=e}vTV8j9mv>?s0WEMLC06}IN^qc17L{c$U4n4Kn@9}-`b
z7$>ZU_Zt0$Q-BSZ{V`+>uUh^6oJ%y7My#oI{IH~#>CORZsi&FT%28sE*47TPl_@^z
ziryw-{`2v^*y=f3{G!XCDM2F;W@0yqLCp*OT%k!wC8?Dzr^f3{;To5q8kIi)m%-@G
zcp=SO=o%Q3zpq_C3G&6eSALzle2Eioh$i(vr(cM0LB;ykxfiW!j$Qh!dG>R*Pm)wP
zX1MgZjN@sV&dHj^+3D8VB~RXKpA^|C=*yvb&mLxiVB8q$aXJv7rFPb~hcKVFeaGv7
zAU5adgd*r#n*+kkNvtPWt!g_dgOK~C6G9!XjbX30500ZWfF}olH!B+c9E4Vy!jto?
zDQbfO=x`^yzE5ssU%(fPB(==f=hS(<h}8xaXZz*6_SE9(ij%9K{6IHf74I9Y<=-k~
z0Po!(wlXmlj;(CWKUmgqM7woHO+<i}F-ID)fktc{9HoN)v#7L0qaAJ34sy5t9k^{~
zy!wrzM8XH%p{voChMpnTY-J$nKz>dX#B|Q*hG~fa7`l~cV+pvQ3}C!-clM(+E<LS|
zU{>afiTbq;q=ZS8<KxqZ##yQ$g{CWr&e`t9d~rKp(9eM@a*)AgOEVcDi`Z1n<|FDb
z)(KsR5o4GM^-l{I53~WUW$<iY&Z(l(-*y2JJ`h@z3eYVFqhrYq_LhTwEXvI@gS^(}
zWYpjf2&6pKy{OVSNiXd~)2QZEX+h7-x0cV8<}gT$NsRR+e`f<Nh|+URjO12_n7fAm
z7S=TH&4@nkPLf*XGwXDReg%?qsOwM0L)+Ejuo1ENi#Zt~yQimggtE$K)#Wq&DQw@J
z5%PI;uq)O6Wdo6NXhY)*%<%tJ958yz!NXvAbQfT3LV8h--?Vx*9Nw^IqKdtE(4{U)
z-GwBf0n=W`RU(|}@7pX}`Z6ai0=zEO@JEd;2}f-SCoj)v2`4)W-RL>#WcYwTHg$U?
zP75TEPUDE|ec@Uc3>bA;-v!Qh903CE0wK_a67g_!Yl#!wbKGIZ0Z&!jJ2CVuf})y^
zrOm8U+T#*Kd0^1jJ&&JjAa*R-P3xBH%S&W!GH_YOEMFsOSs5koL_XYT1WAYRPFc7s
z`xP&RgX8{22y?Zizok06@<RWwwvD>vaW1&13?649FaDv)Ddq9N1<7T$;(rQ|tlv#d
z9cg*EN7eJxcTqh>P{OCTT>mE6%?HtYk8s|F@Og;K2TrC-i5FtGd1B-a3fg9`ZhuVJ
zR>8$d7g#hva#aQ4kQOsAQ&KTwLV!gVSx=$u1gM(-Hw#d?VIEcnlHb#TZ5^Yr9(nW6
zC$t3qxc2Wk5?eMRZ`(9b&@fz3330d4wJQ^T3JPTYGwmZ_WP6|Zo!<jXXD3S?wFked
z+Q6x>dw#2}!-aW!CnhE?j#q0vi>lB%{}84<KB;Zi+qtx*{vxM&Go1LRFA*^vh%SBZ
z>o%S9^W<j2Db_l2F;HFDb7x_pbkFwZPfkvI{;(ha+oZl^u`lQ~o3uywoP9moJ`h>`
z)ViU_LESc%zU6|l8U1?yN`*T#HRSMk<x_-Qz8L)g*j@bZ%~dv_B`-Dj8o8jlGd_YN
zdMP=rZ#kwJKKGsHKY1vir{}qNnO$Sq!|e_Ax`yJavf{&cIwDUeRS0HbXUFaxv&Y}&
zgCYC>+rMLw)(61(PeqLayPRQOTOg~a@?`EkHf)Jg)aY@me@Sgal4gHym>%z^>j<?^
z+7~D^3F>FUU(zHAN`#9Y0WMkj&t6yD-!m`F*hL4W$QNj`0%GZ=?5y5FQ$R8_%4qlS
znSJ~S$&NU7U35U>A1uaJ@eaSjzW(<m52W8Fy(wL_`*5As*nJoylD3tw4<cTgG^X3)
zkGa~>*U``O>W5P<gDCMJ*VQjZ+%tZ@c;E=m8$Jw1-s83NgW<l{^V)mX6Z%!m!eHKI
zz6R5+e=(#*OL`P1TWlMo)YiJ<iB+_o>`{q)L&y6D`JhS8L3>LmCt^_OY6lV>p6y@!
zU}Rb54dE9XS2XeKoooY7PIs=NuHm!(p|#HE&~fZz5m|bBu8maWI?OO#py7j9XhC2D
zuN(+pb~n&+^WMe)9tX443!MDuDB`rF$}*HYCxiLz3I#qh+KlfU*HlQ=e5NYO1Fz6S
zzOf7_@?#cP-H5Dmi#ImF1-BUC{--7kdcte(C_weRAfDot6UO%K+4qa-CZc?;mSegC
z=39F>dJd|VvaJ)e<2XXrj{s>>sG(0Km8(hhiSF>;w}I74{DY$fP!|~lu8k2gmc*?i
z9k#tP4*Lu}TeiQ?B*kTo=A+ACzHWlJsenn;;&<onxSJwRs62Vrnb35Hg{Ip@q7rwn
zSl(4R9L1~CfNLq_H+7NgqVxc3hE(#f_-hm0lth#Jz}x<TK85cs{kIn)rMR?jHmp1_
z19`(&@)Evccac|+R)Z;em3-0+h&cuFtbNtHZ0i<V2ehR!gRm8Wet~Eem1j5-r&L(q
z5}r<SB>7-2lEPYS#~!Lt#Z;E{Ol@@K3@!hp{Z0sec-&O*{8bVWw>3=C8e}E~PI5K3
znWGmtyLkSH^+q;NIKH&FP45^{SNq!(YYj^YMTD`RXk16Xs63Y#B2s{2^q8E2{KRj~
z6%;RZA01F1=ujdSTIE*0b<S^Ywsu_iK)|hFeTw$JCL`3bo#i1;SYc(5PQt-A{Pds(
zwZ1v@hAV!C-0ol2GFbF&aI2o6iBq&#lna2P7k=H>7U9_AJaRjmUb?4F16~U4oAK?g
zX?cR&lsZ&f&{9Iw+5W!Nb5d<Bo$d#rxRt#JNAi)ZHllgcCyMMt%M-DeJ@PInbGtNB
zhz)xWVi{GRJcjdIDRW8XXJo&lebh>J_?%W1Vxx}L3|338;_8cU1G&J17t;Z%Igh_=
zWe-jVM6^2tL{0GAMi8~L$Ccb4_W<Cbe-<~cC!C$UOjl(vwbm%*H1M(%_M{QbVVr#c
zj|d;o*ccw2Ne(t)nq>)W6Xs4A^^iaK^$TA3Mdbp7G<ke52Kqi+fbK7ZN1A&ayzJ99
z(_g05Hh?+=jUCD@`dckCjx^8{i7)-2Jsvh<=wDw_yNhzM%Tt5x)?+?<6GthiPi^Pe
z`3(9OBKJ1bm)<}Vj@N+H?i^b7J8x37uMgC{M72AoBV5!0XGu@gqcVgY=S5v<%seon
zycYWu=qU$LP<L}}=*3~MO;g!Wy-`6cO`5osVwI`s^5-=1Th9Rv0A81$C^*)%t4wSN
zNI}@wz`Z@Q_wpy~{boJBV~VRWvU+k%tce6yjQ?O!Wka;4pdrUaYjXo$y}KTsIoo6h
z-+EX(DP$EVU}>_Fw=k}fkGl<ntzt9Tp#>>|(iyQq^T3b?LNA^RHp~CKG*5GGni1lX
zSyH?uEr|C)GV&sA5Ca$kSHElrbppL=N;x-E>!Wm-!~x{{<AS%BjQ{ZKbR?c-WIEGh
zq;b4=-vWedky03W-Dn#5=q$I@)UV?A%Cjzj!%>Q7g!qr8zB=1FJGtC<Mmu{R>N5&A
zH(=skCjvp;U2<3#o(4sZf~n9%g!SIZ{dQ*^J2%(vnD&LuP|<^dHQSz`6Q0ot1)aAl
z7z6ApjmK?@*Pn-7s`M8bnp{yby~x0djyGmJFK_DXHQN#;UNYD?v#=9-qf7x3_xuO6
zdH67-#*v<#H@@M!&GR>b{-t3P)lBDcK~dLS9p@y(4KrVu1{c~Qla39PswW5U!rEZH
zc|CRrS0RfOu`U_$p(gsf1NDSg!X$>4DDF2O^j-b5^643Ip~kE`72(eD+{iXr35WR)
zau`#7N)vy9t*P|KwQ!Pza8Bb=TuOB+`Xy2?#RzFtDm*m1u{Y-j2_>5?V@&=F-@FJ*
zwMxN2qC~E3xy-l1aAPZ(%z^VW5jft#A`Srdv=CEc5miCqxB=lp=NPn3z3%nThWBa2
z@BdF!(s=O$F%-7aw>;sXZ`0XE*7QKKX5U&>8V3Md8)iP}>!DXVrA=}8A^*3wTf0t>
zQ_i!mTU(cRd(QR}&f37oSo<~GUVHVAS$5-X9fWNkAe-O3snqP%V!{ye-~mM~aKyV~
zzCiE8`0nC$Dw0I{36z^Kve~2-RVU%_3*qB7b(})u3zJW=Lciwb!q>IUFKC!As|2YD
z|0-bRIgPv76LoOBb-D?b6#WD<SU(g2bDYPfLG?z$wcLOy{xUK<!Wz8Bdp#!`;A^b(
zFx0i}<wB@OWz&4n3@z{(q$dyGd~`w<qGH3y7(F4l{N`m`vk^{j!4F?YM8M%8t3a7{
zZFsu&$*mDNMaWq}d|RvOR50LeRwX)tb|87&kK^1>n9Unju>dsw-5@_hH*0|8xq+Xh
z5i>9*oR>rDUY}ylazIxDZQa(9{jnj=u5KELg-0|lkKF5OiJ|1J*6V?Pc4Gb-5~cE3
z)0boNa+57<pp)7bC{&wSRy8&H2^^y*7F9<>#2t!xn8J)-q@>$0|1}d~o$Q9QnQt-K
zzYGx@%=Hi{cHp^MbgSq`p9a?a8U)A#W|Ih<*CXh2y{_374-j?al_qQYZfKZkN_e)+
z@Rt^p@f1`HJHCcTz%WPvN|}^$0Zt8}Yo0mGT0w%NAtjQ<mu>R5h#XdFyp4q>%ztay
z+2tGMhs`(&*5xBjc^^yd(D739(q1XxY$BL?f&K}tw9@7?b15{{Phh%25TOjSjcIT_
zgBu}h4gdb$`8>Ayi?Ub!wLW*q+nMy##P9tQc{5Hh5GfnvU6#Gwk}e)}t|PR=cWlO%
zA$PA~4a`!mk1LC9Ms|#?eBQG&U`lLrfUzu?*%$J-zM7D-gh}U+1-(*3MABfD7`_j6
zg$ief!SN6G?)&0F<H6)|p#nU4sIO2H7!&*ij}cb{<Y*-oy6z6SYru{@-4tqLnMrhQ
z3|{JbG4v}6tipGSS?^2tKT^KxT9DokCb`zq625LI&ONn(@QtJH9wA^&He{SJeWOe2
z&CKZc8Icl|%xb%=yMoQ9H_Re#`KzX+Bi;z+ZFy%8s}v<Al|vY0x`vI4A$<H|Q}*f+
zu$8mm{di$04;(#3f(UuNH)VS#T%_wIV4C&Es!0Dz(-S0^+<RR~l3tAe<sHDBbuN63
zq?^QTHu25+D|Q=L7*CHbrvn+81$mLsuulAa8h-Qa`0EYDpIf_u^BClye>Q9dLxq_x
zYmlGo2`{skL*^DX+y61lnP_VDX2Oh#cr}S{ibW0<=!u@gO(}A)4>O9fdP{;S76GEj
z{#t|E13Xu{Km`|2aQ%)Y`(^}l_A_zIY$2_ypbeC|?N*Ff%#T6^H!a`O6*usauy<W-
zE=V0U&3#K6U_-cSThS0pao=qbFD`IA0OnEe&#_MJ0>5^})-@S%Q!3-Moz;~wyYaGx
zwCKbAUyli`HNyOMi=kGYS8U1YuL8?_UMcIuMxKqio&4@g<Fxsqb24;R&3j4<J{z#_
zm=7cv(e7=WXqY3=v}2JEg+mNtHxqs-rtNPZz4Rp2O;`F;U{*^{2mkFmIf-mGhFOCr
zE3l2Qy%=*@KO3(2OM`<)z24HnxC?0+8W{oPR`Eb_T=>nj$paC6Mo7k{7rpJ2%0?}^
zj`({q_G+x^RSrK(u)~HS21~B>*qE}Y29>DxIyN9G#H?7u<ogs;6^Edp$KXH2Nf{vZ
zT*Vn+YHQ75Im;SP!pQ#34#3^{mw7tf#U~&c^xOz-Fmjz=!?lEZw;nHTVN(f9Q+&GI
zPa5<?OI(H+?&OBu>TeB<VhM&&<4`F0IkA$}V*WEZhQC2%RRv}PcnR7%?!syAHmvK!
zg?nm?2hkj&BuxPi{+{xP)*e@6W<EYSF+WUNQzt{=&jla4ULlld5fu!Mp$w0CGC!%9
z|8^t*Wmry;<j#L+)Gomx?3ocI0wjlOUNDgW@wwK(hh45uhO8ZU=IOYkc%<$RiqcOa
zkM-D>RVrr8<~O%?A&O5t7p7SGs;~GOk2-vzkMyt17qQMW9Bn&zBa67>^%u#<s)Gx1
zhk`<YnQdV0s7d2&1b#1H`!e0-EFtqNEeJ=WPl#J20EL|a4~Lr!E*&INvD7PFLuNP5
zr_VY11_#{76QmH(a<8JHTm+VySWP3w>>kf9oh~f>+!bc=iv<!F_}2K-1M1V4I^&{r
z&F`TCasQeLbG+?kd6kQ-$MA<gG2G~^lU7gr-{!_RIEvV#t8%6Ce;b%rRwT=g4=0xM
za(yS+WNrh{_hwvo-0^-$ba|*iZeW#TUKWm%Aq7F&Mu2b}lIz3Ep_r!aH!nIYfA$^l
zTgzp?th!kP5lw7)7cOH6e%t3r?1y3c&sUIHUm_=D5c)N&Z#||Q7xI$dD~_c;fm$Z%
z`7mq3ED`RN?EMhKOLd6?e9ytiAt{Ml_p4d?_!Z+tN3`IrHyGJ(V0v%4+`Ml}FD<hr
z`0v8F&KbBace*wnEp#`mJBKE0G*asB!8}MXqjTQa$@DxVR4=M2jx3rz^muwk(=K$a
zY66<W$`~Vg;EqBGdp4frvcaugQ0$5CTWTF?ScH5(1SE%RV|Uh90=k3_Zw?Atb;TX%
z3F|!hj}N1|4)yhEBGXy9P!xde_Vnk$*4h8If|t4%?#-Z3*J=6h%KtY;RRGP&cgu+l
zdd$67>ci1FM$UMOm~ZaK`|R*ksh97=X0);pN41BWg}})FZD4tM`RlK&X7ePiQj0|g
zA0JB>%Bv5U=Fg7&oDtSCmVmPvf3mt$M^Q4vcOIoB>~EzF^~Imn<LiQ^S)aKWeJT!R
zH@<aad~zqwzhiSYt8h}o7KIMXm%mXfk*r94RGkg4CNyeSkToxOx`o0W(K+N<tc+WT
z%k-cS{*(RH>wJANSTF;tivG+B#^@2Y;P7BQGWTOAhTzax`T%m5+Zv%~q(11J=FZWq
zC&4D1$2nF~8AL=q$WpKZgL3E~&!|eyL!Iwf8gX$8=!~|0lI~?zSEVZT*pyv8DS;gg
zplqe2@>GkxFBj~bS0C-=zZdys<nnp0-j?4=pn?a$MExCSo1}2r6BfGmk_b&F^w#c!
znRe)7Q|g-;+Xy}oRhFH*fhVXAHrUQVWylw!C&SE1GjqfSV$if2K9Nq}QptQmh|VU{
zD$|txmhf_^zJVwHhkAOErALdwD?BUiG^SKP*Qu-i(|7S2^hl-?ax~i^(nS;WXTI|h
z8uBkgkd?Zv5%A}r071>n`md){U{CD5F2mmf$UG<U!T5p#4-unfJn0G7J3wWWskVF_
z*2??YFFoqFb>@iHP)oW}eTcFdeVcRoczU_+C%qXWvBo<4q0Vq;fI~nM&(JS$(Rn*t
zUpLCotAS*87tI(J%o^21<0U*eFdk}Z`FUM?@Bv`BubRxblV0-Pl)EPJf0uMw{<MeY
zlLm@33rVR2E714y@rC+}lbPr?*9gbDm@Xt@0BS+q9<q3O7|#`WJUN_MsC{ln{I;(c
z4q7N7D2RTzVkk682wZX-f<tK-HH7~=)r{!qTQ2mF`^2R8q5LRuO<*I4q>RlDD;hMo
z8VVTNu=0`F{c=$CL3YrqBf-dauj-RGKb~_}z$L4Yyl9&4{+ZI|;}=P62A_z`i^aN0
zWEev;{XJqqx?T|zi{y>m@Q)Q{9(riGTs<3!i&tfE<+4}q>V~uCtM*EJ|7v3>mr_Ew
zC+bx{W8ruBxpD2YBTJ5{oQ)eKk@tB7KNv}lK%aYRgJjki>XjN9#ssp*aW^m4pmj)J
zf%Rb|8k-u;5#W!U&}egQY@6TtOoiK^aEbDix|x^-ylxG~A6o^G2d&Q-MZOk|ET8=O
z6*f}yJ<r4kOeknX`a~6(l_}UC&6S|80D%3y#l%`&Lu+xo&}Xmy7dMuCdAurYQmskY
z0(RXq{)Fas@@?Kt<VhOmWgTuABYcY-B$q8r&8X0WGwu0A<?eKC;D63f|JPfW!N5J=
z8qV5AbCLJ2y_SL?&e#41VR8^8sVGVSE?x_rcpY~A`KhTXQk({sV+gQT0awm!J{*7b
z(qKsQY-ij-sK_AHmnx^cwRgnzP3L%7=VX7k)M$J2q+@7#S$!$~==h*{JNiBL_Evzj
z{XzW%JEi;0@%qO)fpIsEzWd!k=e|jOJKF&AeRZ{$4sVGTAP3zouTZo;J5utW``<=s
z7OU3V$<VO(VpzS$Q*|%&sjxquh<w=QfVV}VcKQfg=MBOE@?YHF6(SaSUxHe6QJ}<A
zB+UxOR|#o@oZou5QVT^I4`__sstiT<)7@=$0UV|20KVlo?J~Wh4FBf2Db$TL#XncM
z8|~^&78YasNH#yKQJKb9*GFZT;Hh~g3sdwBDNZ(57m(jGpwDWFdAf#E=|YU$PG!ff
zY|f9xli4w>?wgGdncw{;YqfL!Xh3sY{X8FPYmc@@XbPaSTY~86pTPJ>%k7);=~%@w
zRrF)P_^b0l^+2}aq;^`gQ3~Q}{)b-8m1vGJj3J$(J*~5mHxW9FG_SUNrDSB+s)t*R
z_DOwpS&tbeX%N16^_$2CjU2a2I^ixypv3IkN;6j_Kn8U=Lz`?%+HDA`<o=>^%eDq*
zH8`Kxs09DBBJM=1!I(zNNSV9kujVR7z-*&B$A2z0F@xEF`y+`o^AC30I{OI$l!qTw
zfhE=&W*Csj|L9@`uV+YN#<Ej3DUZIEH{fF(2nY1253X|Q93RI+4<~<xZ^Vkd7cEOl
zVt)VKD`&c#1*-eef5y=cc=$o_L&VR`AYf7xRTIkod%Cu6{V+=VbbrzU7#+uI?g-kJ
z^Chxt=OZ!p9%)uutOF!30)Bn&y07H`m+r5ClehC=I~1*~;1&HpN9P{S^!xwu5rw2V
z<`m{oCZ~y*e2N({hnVwnK82x>9Fiz<7Mk-MGv|@R$hjOQr%5tJk{nhfAxXmT{{H-j
z>vA#g-S_M8d_JB&&rZ3%t+X^J$-mv~txco2%Gr76_8I*Pu_Ldo`}=7hw6z=fX7tAP
z%l{2^^TnhmywoID8EDHQHhY_+^0gkuGi4rEP>M><fn?K;7Y!d9AqECK&hCX+T3Jn6
zBdC1O<m<BDVD_dHUn2(8$+im*=`WXUWSj>qb2MbKvVNCevM*$v$Vf`$?CdW|e90m(
zKO^~wFD*`=u!e>Ug{Dh9eW>)riH$(l9DUYAL)uJRSxj>T5Rc!GM+~7}+Xn6a^nDNl
z!bBX@UwUvcWd^RP$%1u^c8Wu5yZs67-q-y4@b?=bCKez+Fx74H67{`z-gSmWiTfxf
zTKF+{CdyF1{^Q+_ZPP&why5C?a{=$f?`6#~Sij&}xg%J4S>1mCkEPe_jAScWm|l%I
zdc?UFZKXH>!6lK*W?SOhH|{-4p_hxV^ewP)KwyspyE$p^JLc-v=u!N_oUqX#NEFch
z3)yR35feKN)Lu-tt*o{h=BXBMFR%1%-qtLl1krO^{Pw#y_9OGC|J$OuW)Z1-5y<kK
zW(2wuNA8XQ8ks`?!3hB5(u$OnZGtOD&~yC?LJAyy(@x{+gTve4q4`it4Ja%R5WNPQ
zBM~dY%CWygI>!F8Mup1h(%Sr1%l+!J!Fj0&`b)Bo-G-bKA_xGJ!pW4SB8uMypV+WG
zChgB0I6U4B6PBk>;(MNv(zWd<u+hjZG-&?D0vOyk_Pev+DMCU)?Q_s~7DC9_zcM;0
zz3;UFs_cU|aZpND^A#7aix96G4>d{NBs-N<ewA%4lYV)ZjEoIpkmot{yV2Dwf3Fm;
zDT6Hipq$f#kGzCa8e|yVJ(wqNRVnqIA(E<QlB#dww?!FYS+OlmToM?b46ijKFc0k)
zJQMdS5ORTh0R(yeB?bjz@qW#Z!}64PI4ac;Ngn$2rLE@WYt#Cd8Ia;7Y@m5OCYsJc
z`S@XE{aJSTwLX(NnU*hoivgn4gSo?H@Sk+lI{LG@IKJ^|fG;8N>aX7UxB5W~jWN%m
z2>#NXKt;Q|jHo;&7IYb(e0Y|gb`-7MMYQm)o21k`+{oK>fV9l0fK{X+KsltACAr2K
zTj@cqr1*DOhl=(-NV|FAV$M!DnUAv%m>ZS_IUgLa{TFu8%}k`;9PVtF$EWR^=h-y3
zlmT^Vo&PvDfAXnU>ujQ&2K5h!uMvOnXOpoDhAlV|5<f(-Z>i_se_388ti`IO+dz{x
zKHnL6>gC3#!+t_-@a%e);NM8ehsM6Aa5qg%2}w&?OIQ}&SJ)B%liHGvG&4yu0QCze
zltSAI+S@)YmABN!HgiQA8RGEAqa)so71x&G)4gvgvc7w0$i!Oxx;$Nn2_r_#&}4y(
zoU3)u`c4sirtyn{$QtNBZ>kdO1!+8PNU_t;R*?j1#9mFefLrpZf|BetENK#t26U?-
zmO5|BY`NtQ&g7x(dqv%;VT&bG!Mp!T$ZK_e1=|N2coK03(EJ0MaJND(-vru%Xeni*
z)?wEEx4`OLpPe*OZYDGc4E+b)SWJU3bu$n-KG{E-y{1NWZvN!24$&HiF-i%StdT2f
zzA;<3uQ*Wy?tHQ+Atq7of=GAyHLD9X!tYkX&vr*3xfTi702&B@7jmO|b0R>I{I};{
z#gT8<9>A~uk7)IURP>mr^3?drj&drZ93+3eqcFc|vc{0?R|Gu^Y)0qqSC|d***55R
z;mFOGYOKH}BWYyk@8pwP72Y%%2BEVR?NO4Zd@E#jKQ0dFHm*<pQUbQ{zRlN@&7k}M
z>PjP8<-7ge4I{-#t;DQ$znSXls0R1H-_}8(j|l$9#t(|lU`tFf@#696{NUyL$>5!v
z^1g+PpnCns+4t?vxrZGc`}zOw3l4q*YnDeHV|8_Sx8za6YP-<i!;!@^-%6NXN(kdG
z<!9DM4C(#&gUoKPvi>v(4IEZ6%eaMQTL^C}C%Us_W0@b1c{-?{f`DiO=#oY%Vug5-
zfAcqoPA4#HfZCOtvtPp$em=!^n$oZ`v%9E)urO8S)U}#=>0emUy>YUEqR3D&nba_a
zyaQ4*D+Co8Q(fI&r`sP^=$agFYo&%*CZsh~*T@_rnXw@%q?ORg=6EuMfs9WW6LgqP
z;f%3Rd?^GWBWAvO7xGost;Zu7G4V9bA&0g7zNdeqK&qnaLj>L6@7bGK<UK7D+C984
zrwbPl1zA}o;BE<@M+#W5=>MiYddMf$<YT`Gcshw|7Wfq5KKZ_Bjd7R4^O6En39r9C
zGdLZ@r0scD>i*%K7mk1br=$rxSozR*uo)VUNGSB>u!wJ1$N8O#%9y=ZqGjwx`+%&0
zDm*lPvf^@_85*32jr|(l_J)UU-c0Xhfij741LGDbISs$`zjKPO%k0I*XqFodVeA~2
zJYUKOhyu2-(nw5%=yIt*Le~57>gsq~>3U9y5dY2mGiEq%vN?R<HQ?s{o$hI*dD8NC
zhBc*LUtsb-`(zL>I(Pra9CQqaS3<ONv(6nmlIOFDGE!%$OsQL~#^-l*bb$LJ<@sz~
zMZnMQt<0lu6-6jc<59HC8MI#t;OH}biCC<WQpUeS+kKov=)0w}g9mhHCJO8N^=WRg
zelzVt!<UMP#n7q@mxciOtCf<6O`lQBvhFJv)QlhLzqC(mPe3`tVOWK%3<oPr=n%l;
z*_dITSr?mg%jX7XxP(9DYIN7l@sL%FPd)$g-MUk;40{~w(zd%4?^pIHN9Dm{!qVkp
z*_#ICM#2b^Z>`?IT3y%dMpx6M;NKCM$5E`w@L-FPE1retj@QkWujdJt5{0MNoKvy}
z-vcSxvYh^^pj0VXXyY$ia>7Cx-|vcm=88jSZF5t*IrCQ7)nBf@Xg?b(P}vnI!pc#~
z<4+|})BsESA9wSstCQ7m-svX+JYq;Ly1shOKSWCmOl&}ZG=H=?KY#d30HW#&qsbWg
z9OLHY^UV5>$%o9>x@M)$3EuOj4XXM6T{EgCB0-u&u0;*D?QeVAH)(Bs3qL`^@Z>k?
zwYO6~b1jsQwd9~JzqXFqp0Mqe+Zwg)ishDNJ4><S{Uau_ml@{XjE_wWAQpY#sB-rD
zpo`#Yb=LZX)_C6=k=WVwmNn4bVc`LyOcMRFEP4R^k9$bTR??%z`*b-Pwohz^+G_|`
z&iXspOsSqya_m%|=_iy2+%~<y;@3=Dy{&;q{!J(!G#A*>U#nXaIEQkyU&Y}!I$z(J
ztpoW9!sa>WM!(mR*KqaIoXT-_g+NVi;vj;zNQ_g8>m5R#Zw=P>>Y(rYi0aFTz*w9O
z5UIKkx```cck8xj;|MAgqu5;?bCt)UnSSNeX1O;9EOdk0`~qKTr`n|$jd2UM^qge1
zFJ3EpIcWe8!C>cJYuB+v=$$aVaE!A}+F7i^P_`8lIBJ0cOq+}sWt9Ft0Ww&#lMLEV
ztPmOn3Z@&Mu)#1{RwcdY${o+9x`?UMU0OHna`-cvucn~~>(<bR=i8ag3z99;bnJ&%
zAu+vm{-p}+I-|&AUY^HKqLouCG~xmT9Xlg_Z!M1ix`S3>oQswd_@#-2(M}dLtTg!d
zQ2L|Fcq{JdFh%X*Qrb(Bd*P7aV(r?heq;zcCC!q>`&}}E^wRIv9g)l2Lrk)c`e8~@
zcIcc^^X$7OVxRXSxTV}2rYFHP5YStM6Ou$%j#FBM*~*xC)9-5DWVXUA04?#l($UD#
z*46sE_~R<aXIm6<>$BHRUp@SjKmTw0^wkKZTf?Oe5c?Uyi8l+VXfupdDW#>lv%xz2
z(eiI+!ljhwExK%=(tD(WQ{eWHIJo0!T`Vs=q~;>~76j0NLJY)TC6c#K_P>??{jI0i
z(iVayPb_7eL@?i2<8v3X8j(i7_$ev3A0*eGp~^Ai#gd7>{D4=cI==%w*L@9pkDIb9
zY<_ty&@d?HB)=>C<(Ad8JqNb5TfvUcWiYqxAS(5Pvz~Rz9zs8qGK$VC^`3H2I{FVG
zq5Se8)~0%i8Kci6d6VSwBD`j;<KBkD^t)JR(+?UI<{&l?>S@z<^YnOoIf9;7AT}8A
zHLe`5>gA*!6wd+nBa9JbOtIR<>;Ta|(Y*G^oxf8~2UAW!O@DNFy<)M%`^JWSGMp+e
zKM}?SZf%CWP4L~U1i1~$r8WZ1)~uL<h4{^a9YlMzg<+fYHxlH%7oNQT?_1MEc+&@4
zw-xB&g81PeV_f_ssk+)cpCF}FcZpkj<d_ly9bHwRH0>f<71d<5zD5`UTc6wnm!bUA
zFs&<BhJEE?WPMM%k@@t{1pmIUil-oy2*h}Cy8KV<HoF6=ehao&IGZU-keE%4eIUGy
z&sTkRD%>Mk=&unc6F=@|ZWtFYB-?(b&W_yC;wR+X=_q4}7d-A}JWVE3&@v2_RPZxO
z?X^KdOI~>b98v>Lv51s0&y<i|KLG!z@NT3{WzN^ybO$0lp6%=ef~7muZAcG3C9k);
z(s5JK5^GEcn9-W{VdTAyycQkuigJah`s|p`RDvc=SmDmGw#fvx8^-;dmOL`Auip`t
z%&UNBk8&dPKm0EMw9BziHM+N!qb3KNAr5Ln#z|f^zscYNOQn+Wp}fTS&fV^<(TFZb
z3mUi~?CUQ{au2t~FIn|pv$C51r1hL$rw&wYL2N*PqcyHx6v#vQm9d=>lx*(Pgw+_I
z)JO%Pr>kI9(c;IqACgGZl;l8wUf&;2HkneT;i5x8z(%F*O}*d7XkkhNh%>!UGBZEd
zaEs9k>xb$lo1c?=`%76nN0-K$k^tvP7gdCxQ5>)xN7FT9{~R6we-M~FBu~I6mo*6}
z`V2A9GtotHlrH3EjK2eWTY08OSm73w+bfo&958fp>z#q<^<*95=f*#T+=~h)>ciV_
z9)-Q`*MoZ`dwS*uj%3G4+e|9X)?<qauE{*dgdlvSKxyl#FsPPvY&eS9ITj=0)+=g6
zo3SOCo3n5K>zF^N^RGEtJo<L@#!u6&nXNu@AAD``=+9u^CImtamn((X7@4v`WOr`f
zP>!xAqtzwkoDJZbZXlg~OQ8R~C6~FmDU3${00clQADV8$9$y!~av!PXDLEQbsRvh>
zkz`FSjt67T?Qct6O`lAHl7m!f_a>N2=)G?X7<M9lwXF8c*9h1d!2v>`3jIZOMnq43
zurWMkYw_k6Rx$bT;ru42@>UKBl62MJ^-(UC*Xr})5J6+-&qEka_54ElPBglA^rXmR
zL$NQV-qOrbk}oVv2~Ee*GEAX=R7o-tEM5O51ScTef@ESayR?((Q02u67VDA!>ilLJ
z=j$(NRRI4WiiOo`8HD70iUo_3&t0csDwQohy2A%R@(+6qom4qIZU)zzxNw2BVsUf5
zBV=6cho0~W2yMT*u1^$;L3nbj4`1+3o=`XbqdHF4d=?Dk|H0Q_KB~?%#&+=6*fsfX
zQ_0mhm!5g6PuUrafOEs*SGtSJ*)0<h7k{e8?NsyL)sVb$CDIT%WpxUYJ;JZ8W<oL>
z6pUNISVnm4?jLwIfs-b0WpT4i4Y$S}Jc}dmXryM3AXyB1NacQ*A)Ltf{Re=(bV_$#
z)$b8G{$UxiCPL=bD-u-BYsQm;8?p%B-M$B@T+7R?@OG68cDU6#Tjw3arWjW;T3Qe^
zyY2`NzT!k+-l*B}%oSfEa-{EO61UJF?uk4rDF5~|^gCb@*kE)N2kzZWo?xTRtC_6b
zPpJfY(on5JNO4_FbzKl<Qg!&-xvcE~VKT+afnp-^Fy|o(t_Sbz>6rh~LT?{g4(xK2
zi5%a<2%kCUDY4NR@9UO`&?6}`+><l69o{HsFz&6_qq-;P523eg?G_2WAWScTZ`ec}
zf*o#jmurqOeyn`5`YNfP6A(PtsH1dYGPguI8c~i_k$Y$TwbTsbJzH13N(9Snh(sS1
z`&y7p(6^)^7evA~I{cRlc#)f0<BJ+<CKl6Hld3diI}rquCpX25ls)rV)&J3+^u`p>
z*7-ycXv<;i%#csWwHnOfS<konQr6Iw{xS%^EIGwD$2~`uSRlwe_q_27L8ch$$*A3>
zbm}9hKudJ*=30wxN}7w{AmH4T_v$r1<rFrn4NRAB5Z!)rdQzhdS_Porl~wY4@iRI0
zTdF3A$KH9as)2tOPAvuCb<fN`H9pE=hZI~KIcr~qO;ko^Uy3)7%cbYp{kPY*f_$vZ
zYnUsKyeY@RQtG6AGfMkW3vC74?CHmOlbQ54;7`T;zw+7I$jzjByUm5$Ah`OV@A}oQ
zwfe5t%n7X9_1-%E!m)`79$^)fyELO6IrSgP5D|PJd{Px_Xsov;oSC-*XLQbP9In<M
zu7W{1;_>3z<o_=8!K;Vg%(!H32G2$;6S?`Gxt2p$1fNcXPm!qsqV-SBG02TR(PF;O
z%L((?uzrEGAUHN4la7wT-MAS<V`25Gl2tJQ>rNsT^{!H8*=jEFq5`GDEKK<7Cu%#$
z)uUPl=VNUNzc_O3@aytk$uKH55jF6(ttmWwEr(V1ntb)Cp((y6Ii6`6E393wI0S|4
zqL&hfZk-Ktxi)mW7&SNw8<mMAO7hLDZ*XjC==q(M1Tr`}@Eyf_<(>N?8S>1Zvh``u
z_RD5ym%ZY&SbU1w?0S1uJ^?%@pW0g1q6}cEi$9fm4(u`3kf0o0HWP(#X<Q|NBj*L7
zR!uSo`tTQ1x4+y5r`$$Cu)p9nr!c=en6{qOV5?iZ`-H|?mu5OlKU(VF^wB>ju8P5<
zod*XGy6^3qMy0_4&5*-|vKp8Oi&<NaXmf0)>)&v)Y79EFFPxbSDCbIyCL-(GWiN*W
zzshR45{8u((2ujh7~Gix=fa1>@>WVV7)WZ)8R5VMRMSVL6(qPH?9gW|+)Kl_{3iWd
z^4C2@+&UeV(3a}piMCd<i-+kuD<YSTKkf6mDWXZTER2snl#J^~w>7ddtXL5Pg3r|S
zC8Xjt8G%}2se@A7HknQOBDeM<zwVohbbRo)smVm#rQk>eMwnP!WRAX~FVK}NgPeIx
ziC=$aEz2MTBD@pcvD$8Jy9Ifp8l0SGkf>h97i;T6=ErWUyu8L2nyG&gNM^<{@Mj89
zjr3X@kuPs|b=h?ImXn}{&nV?vfFPxLz9{{>B{5h(zoLg1rTy-c&8SvtBZ#_->}C%V
zG#A0g3-V`W3<k(CXK?BKTcp`Z@?<r<$O%%!xmisGJJ?D6sMAVyV|yjogd-ETu_$Rn
zWOnU@#s<1y$CzeaN$^tSlXcqr$jFhp2%OKmvht4P%s1K?CQr1LRd9-l|CW|XU{wl&
zt=f&fl0~nXZCF~?F{v;;AwDY#fyTg=9wK-$?Nv^sLKd}d;4hrc_`!z~9yC0jv8}P2
zjZdENS+plnjvsfSo-?Hi-syG6DMGhJp_%q3e*}*9%bHrL8cxWYMRjv3rj~MYu{6L@
zaSt@;N6N$w<Km3!R0o0MakMy7pz1tAsuZE`&2J%(Y+`mDCQL%tfEHjHSWEZ=_81Qi
z82~$0m=hPEC0>s>m-0H|(Ttq4^D|1khkL1KZ2(X(E$=_Y@@X`(_zoYu70U7x@NqBB
zNMa6t00y1Dd~%F`a|ib)Tc|nnhugOp-j1|TXSZLa3+MP<|A7HZ_S*6puoe0FU8wZZ
zw$?o-Z#PxoG-i%?n#cr^8&%&Eml*y<^iN60R;DFE40Wh6uSZG~&cuJ|y;~+CYOzf8
zh^gQckSvufZo<Y#7c^n2pq{LlXRyyQ#C%w`e2pR*%6IV~Ruv5MUR!>)P77TqSAO2x
z^ZIsJd&B~7*b=>E#BV^*t$_cRka@y3(d|#52G?ZghaVnEjDj&z8Q=Bu>?JldJIi8#
zF_;GEvXu-YWFZasj`-DQVl<KluKlhw4r~1cN8?BLA_~uk3B_DF{}ZG!Uj&Sd{x5<j
zm+c|%kxw`hUTe&Q-fwzKp=wo=s+y`hOLpuG4eN3|{p5uAFL814lnLbd<NrJD`kao&
z78gr;xL$ISY{`A9vEHcujHf8ynsDMEr_Z`5uX*Z0s9(qL>7GM9_^*Q}S0e#CI09L3
zjo&EvSiIKmKYN_}dd0;r@0-^ndz51TmfUo8nu3@GdZOJvYM73${`k<f>yYdXy8j&v
z+PD@}iA>S%)3`N21YTRed&U*uTADf&;|*d3FE&-~Z;7thS5|A4ev31IcFDR)9-@+9
zf8%S>mr2~M_H|E-z3Ms<{6|%jnkmo>$0%RENnf*?v}8;!-`4PQ3sMATP#cibdjo&v
z!Cpi=uxMu1o4c^Vmff2{P(jVkuotfdCxY!%bbN<Zw#EM-75WS!z!A&^Q+D_>1Cd3s
ziIyDR?}8@{+)MA=nyao^1M6);Ddl2)ysyJ_l6Od9MS8p;x6ygVh3R(`#N%kAq3bLs
zCK9yx?}0s!)QsRZD4Ous39J<3{>H|P&Bw)>1gxd(00GMZcuO%e_mqx_%LU@yo$uDw
ztzq#0nYyv*NS6aU`;Y43kpf+J6%-@mk}6eoa4&0-l-ZjBFW2DI<uo@_LK6b9c1%(5
z<F_IXO2pk0rEJ8_Fb{nbZ1EQsT;MlfeZ&0MLdQ|SD9sknLgL7Ty3x`HeEKAO3yDqE
zGoVxxt}#j63v5sAlMsl{2%_mdYM}~|jo^rLVP04dBSpd)<9Fm&%(*XAlv;nz%5yg(
zKHb|WFm_%3a`;aCGQ%Awb>lSMGEsJG0y%C>a!+Y@RKrAnw<z;=TOls)MHs?;+-fh*
zu2<h6ua?*4SNjW-ER7fL0Ovr&Yh<SJsB<#6XHNSmBY(e_fnP+(HRT_E+%=gjG`+@t
z>RLb(;$jdD{uqu^6|S9<2sf9*E&8`^e_|KAGVrzQ0gy?IU+wx&>`4$zA#C((jzy{+
zW_)pB<O#T)4;i}vovGBdt>3I)Ng2KH`L_keGMYK#DKnpe6@K>8vN7V5@!W{^^>)1h
zp5;2*-!7F!%_6Q74)$M@Zr)glthI#e4R!V+>$%Cnqh(Om7^SmOyP0?17~O)8o5S~r
zO9@x}=E^hz>{=Z34mMu=!P4U6r5CTmza1^<1=0kD;d<n=!ZvnE#xEh2?Vfs=SljaM
zt4^ty1^*6SPbP^hEO-I7fDBS=aAa9C<tb{p_7vhVRQl(eduv@+LB7c3w}>`i^=R?i
z-M^pg@-<E*1VRnybli90l#y|Rnm_wOv|w{7l#-Rsp-^XT;1!gUD3OIL*Tt`nZ70t#
z^+on%t9RD@&*R(;L>T|X|33J~IQn+AEAqexu-GG!!sLz}fHF`W7Ch^wsU?>4A?Nq2
zE3&HEmKb#G>{t>QBfeW1S$t-#bs~3`f9U;KTYQQozG>EfnYYuvei0mCG~Bbxct7K+
z<fkoET{34P3@o#Bz6mY*0`c;qMj2l&ySd$3b$d8)S6(0F1JX<X3r-kXPV!xzI%^Ig
zx0-5tWt_|Eeqk;*t2p?~d-WVci}U_f&TXxY-Fq9N7ddqwXckliWG%CpBlKJScK_`K
zF~09;t)N);fNQ!$ycx0I0SMD)d6D=r&n|}z*W|%P9sI%f4^8X-HQKR)fb4mh@lhl7
z_OW0Qzh?Tj7O5=19QGw6FkS}@1vHDQ{C2-tLD%Z+&i?L={PqvNmlie3b0FIIJG5m&
ze48KcT=y@V(&YH-v}KQfT!~p}e9-mW&H>szbf(Gx^l`vz<i)663W_(#nxh)PJg^3m
zv7v8shM+h8ysK2MkRLaB_WJ&iCZm)0$2{mrwmtIBf2f=amP&14T|Ad+d6OK>sr5_u
z!AMBfugl2kKsZ8wgSc%9D<=eEV2>>u8|cV5eO;_YPK7rXa^2iz#QS9wraaw@gPRv3
z;)Z@-#V?z}43oDHG(`;BVuu=T4-x1!DRQe3n!LKxcRy(i_<XO}Ue-!}YAvWc3dg(2
z3**EZ%C0kvS-rws5qf&HlnLB>t_Qz!P6mI=Dq^MzB98?Pa~fjZ(CS4QmMV>xZmbco
zfWB)D7abHU#FDh9OhClv`<R>;5C-wF3mZb{R_pGK+%NvEv;I>&V1n5uZO6+Y#w6%5
zBGclEmU?Xe&d9zx?Y|{!uzqps*~7s3W6#MCDn*r@s35o*yh*29<5>wOnMyO*Dp0iv
zZP14pSU6*~yn!xVa{9uUeZ1McyUyCq<NsOt%I233?&2|U1PM$t@56J4UERuV+<b)b
ze&%6)W!{ztOIiwSNM=5HOg8I#-E1;y%j&ZdM*U^c?hTT=kS3zZJ$X3I;fez}1tJs#
zpAo!8u9n-}<~7Hp=W?6#ZL9o#mvNHQX#KpTvn31b@Onq1thPS{Zr3yH<M)A-Bq9Y^
z%g<V9UI({ZH+-Bh=77KnU-r(^<)H7?dE3-j!NUX}+pGAv{URg`29%=wX4{7=G?Fw{
zLC5bmYYZ@i_a>?pM4U-zynC`I^X3EPgz##(Oa&C08KT&)5vx4TD{E5R*?PV8)N=tr
zU<_2|0s;@eR&$QEc5Z$ZeoayF{lnpYWdC~PHoywr-yLxRIRL5gGv&Xntfr^JyDm{h
z?><31UcbA$=5&O^-vjuz&Ic8zd|UICfyCs>a7+E~LLKjvt+QiD?o2@F1Cd{IKWeHq
zi%b#MVl}CJx~lX`!DE~>JM#-=1X@MYoh_5e&>mi=q(3e0gH<V)a79WtQ}UuIh<ne>
zUF7wYh-S8)2XKnt8u4<0Qt$m%>&y5d^T`(Y%}p3gF?xm*3c`OKfQMFPxs?uk7{tXF
z;?VJ-wjhYE!G4g(_;C45Uj)*Lcrh0<3;5Lz%@mrZkumV<KdZ}c|KYR8<5+WLKqJ<U
znowhP4ZW=jd>))q2oo@VYHBF=yV1Z?G|mVRbi*_rFRB1e4{q+FtM|Hb_0p)Bd*f&%
z3*OIa+9GG*ay><n;Kup3?2~_rFoZ?r<a4%&B|UEb%81RbU8<m*-3^C5xomlp=PIjN
zBgeY8s5!&;-f5EF?b-<8is?Y%*Fu+Ag_i!w)2XWF$-M#=+e2WDHOU46hG`-_?X9c<
zwY)A9mHYK|{}YJnbNWnJMM}ERkk&Z-yqi3vAL@!()X2iJwD4BQvfT;5BvP$bVjp_(
zO!or-y^1J;M0ABoDoTF*!kK3@oLExV)jFkW!tuOH*jW(xpG^8R*z=Dkt9Yrp8rsPN
ztNx)GE#ai`Vho3jrKQ4Jl%z-in#>3Fa6xip89Y1w89SWeJ1~NhW}wvri(VhZYh)+$
zFmq~*^G<jr6I4sM6kXz&)z3RK)<b~u=tI(pb7JLJ2L3EeW+u4B_s7J`1YtaK`8m4p
zerqOt{LC-wZ3Wg2BKSLV%>W+{m=U6niB0s^@qyAK;>85X)(bXm!}qdsEMom;ZQYFI
zgfhOSo!M0|e8d0xuNK2BZ1?Y4{qK&mkY9bbH49P)-5#v>5t0q~GZ3^7<=$?|VVwf5
z&WhEtI$qvmqYW_`Nt|DTaL(o@DZcE#a0X^u4%4`+uO!d##C~anMmMDWDO{j)T_CFg
z9q|g0DN$Mns6uQa3w7f+v+gpf4A4%trORkq!%Cl<FN{C<PhqAwZ8Uy)5@P#R#4-m^
zM7v_1yL;=WOuj8#A-Ea;r#%##!10R&t&9~e7IaHQ4kC?%*l;56xVQ_NMxARnKS?%@
z-k=7k3hS~>uUX-{W^NVi()aYDOGdR$cm~#~9nTrEvN8lJ^oETJZ^A^hYh^^Nuw4m4
zup9tAtv;Q9+WmQP?~do=o$#ifdzJZgyJN$@?jHdGEP#UQ5Hdk)v3~uQQ%~gHy1LWR
zuN{znAl9W${|fC><(i%Q4cd-VFKHia#>Lxret(#=mg3F?mw9AZgryf$O|HrD7N6_f
zyHEGzJh)&CtvD#^b(ksQ$+kQBgP%GHugfOGz|W7&-Efba`ATs@=vvui)=N))3)K+D
z%h}R<E0wePbbVLb>+mU$6DqvRGKRbY3}TR4VqCCuZhlr2hiUTkqLzu(vU$?<L+V2B
zC}1H7;&V0wHG2*=3zWqM9+jGLKUJgMGwKJ+Fi?dM8iTpa0PWd|=UhHkcu*)sF}ET>
z4ykU^gsaa17qk+IFc#i~3$}7VGSpun3xH5Kl9@!Lbk}umkMdav$HB5|Aq6489qPtF
z+Sl7iHd0HiE%|(WXO8oYZvXnSoOE-%aOeKf-uC$f3ZY4%?1j4)&mgqYtgMXSD#2y1
zo?^_EGI6&f0q#J{?%R~zkdfn?sW)Di_ExD-Ez>DMkrlLu6NFTKE_J%6_iOhr0#XtX
zm(EyA9e<YlUa;vqn)_CKh;8RHb^Z?3vA~Gyyd)wtU<$CL9z#LDVqUd|8TAQbGqN)D
zc6r9E!APMWP>Mim8Y80#02ZSg%Pt;!)Q}girV`bzAT^NfC$~YS7|YvbJZx5ymbaE1
zE`Etaldv2$RzO~p1iOp}QqaLL(Q_n%-lHf73-)cLfl9MzW^xLw1^@(@bm9X<A5ZzP
zR$2_VUS)cHUC$E5HmUju+o)+JYu_*L!__vIsB2h)!u@#Ar#yU-um0E5Jl)cu$0cV3
zF3O8Hy0X@c{r%x@qdv8Hhgc5pPjelWN%F=;n<d7Wo$Sqnq$CPdY!qF@XZ6VYC{sb*
z`LCwFTe5s|j}OS(+0QEYyI4PbBe0rN#sRSa|51C^PInoM|KiTu(|v55xEs<WM5Dfw
z4H&9W98=t!rfK_eXU2=hwm<GZO1Fs2Tk3dki%)JrCm1viL?r+h^AHSv8}pyZqBdF}
z<Auzj%jV5D3c5OQRt&SMcA8-6riE}iH*k{6)G4Ja51lhRbS<y0CQ@Y1L$Fc$B4qB@
z{Bk7s*UsxI%TCFTK0<xm+|T9oykl$mpi0<Cvr-G5BX$XV%FDEqLgXO01zU<YP{j+Z
zYmN7dVjJkm_mw1DTh?o>WwHrh4*DXy?3Od!Fm6|AL6*aJ9L`t}83_{VCNk<j)|Pn|
zIU#7hr2$!oa#YD9@#_pI%3%kXvP`C|r#3^u(C^|IS2NJG{P=Moy0WG$;ku0XHXL-3
zmvK^LL<$Dd`ZS^=KdNxsY^z3wY5n^%`2>8dO;rbq<$UTs`1V2ke>tW{ePDrb_~2ks
z{O~sL12K$Xf?mqF%Vgp!_!GbyzFK+-UWUxPUepb~moGXc>vx}2t2S4?4uAn-#|^8k
z=&ORfJ|F$mr@OD7eZqkN3=q2KkeM>wzeQxDOP_@l!;1iQnJK5eIz^RkVpZRMByMrE
z<6)Oou<IH`^Jqn0GHzE%x%AZt3aM8`8=l_n>5;5^ZG@GCq-!5bS*;!(9lky2*ZS{m
zpH{kaYNP?@9sO&>_xH%&-5a|*6zdJGC$6r^aoM_XFdJC^eb#Ub)+uUweSDCRF_AK%
zkp30Oq$wS9)N5CtG@+aC*XN(>SMFF26dD}S;#%yKXRWrr+l4bSB(1cl(N}TR`DfIT
zlg%@g<&@ps5bCPn+fI4LWNsM&Ri1OfX!V9$a|%q*55;-q#qFOs#Ac_r@8$gEJDMpT
zs!3g@?cA(}GW?UdfG2gcMa~Ydz7{-OMZh%#-iAtuogO4M(s2?d0PU1Mwr|G7Psau&
z*W%t0)iRA<Nro=)9pg4sWW$N=ZJApD%bgdgnJ!1wNPQM`Wq=Sgp`TEbUBheumzHdd
zW@p;_vY_FK!r~+n>yzB!9w032SFrzS2t|OT+FF9CBI1X+(sDYLO?jVGV2AsZFYsfk
z-~-dKkO1A6suT{jf!7CX!ESNWqvuL71}2jd?})s-!udd7enC4PhKl`qpDn=g<4%vZ
zwq+1pMuu6QqP#M=kBsr3+T|UEu`sd%^03DGxmXSi-$RT7avEC$<$+#!NN6O;#r6*i
zY7fItM~)&JVx=kwLBDBxFE=w^G76#6u}dYEv~O!mzpeUpSXDw=eMOY@rFgWbaLko9
z+>60}tuOd2SSnHRJ3#S2SAh7t{4=o49*g;CxuCWqXo!)?G{Z7_*{|71*8j1&*GNEJ
zAZH4o5i7XWxr6ok?juvV&CSW|$#1V6w_f+QKeP!Ea7{L6t0wgel2rIP(#$ZHvLsJf
zWtLo;K&zIP7Kl}oK*K6M?abunf(0s*N5bx&JbIW6eDoprn$oq5ofnQzdXgz{nNMFK
z*64*UH(2e`2hB-M9}s^tjWA`Bg+~|HTyJySIcuYSCPVvm{X@OTFcvJlO@I$8VLus|
zV201+31D+1r{1T)FH$Xk9%3Gj>dDzvS0O!$P|2HltISdIliHOw0g_PmN_nk+!fPne
z&a6NkCaeLEWsm?G-tS%5v;PxVF=1gI#tKskihmlGT<=H$wqq?~26xH76$@0&C~xCQ
zio>Y4ou{}cUpz=(NTI-D0AQ51eG)Q?5yIP=sQyoG1+YHY;r}f`O?p<z2pOBlN?!0X
zOTWoqHIY`<06=Ss_4&l6xUFTcxUB-42nUSY$isho3nNm7=@!2Ecc<=@1gF(Zw6Evm
zw)++r4;I(UH-7IQY>&K(h3am70*iI0?yvi^_a`eSzvY~4o?E@s9<jH@Y9=^$=fXr?
zxP+z{z#5E=s3bq-$xsV*n1&7Ep3nQ@@Ib)77s2&Py(JynpYyR%BHcRi-WfMPxBMDH
zCw+U5&-ESVL;1HEdfwxL`|S<(aesqE%ZDym3*-GR?(Z!x|6E=GL}ZNVIA%E`6FZ_}
z_iX?DvSnA2-a}}vr7QCr)$yoe%JSz+us61I^?$o5AtCChmkZ0w-!^4PJZ`tUq2|*&
z4$~$cSnMT}V-R48vlwn$o8bVO)nYnWg}1=Ua!Nx+zVxb&nYbCg@#$I-zALJK=AUO*
zcUM>XLstWSXAo1BbjcHFZl4R*GrKGHX1=8mpuLTwwI|f5F`=iPJBL{Qyi)#R_D45>
z+3UFG>BCti6t=&2)tlwoNyXTbt!sA51>zC6FTUG#n6#CAWy%Nx5tXPkglCgo9CRg&
zsgf!T$tkZQpwJjOZQ1(m5QZ!5B;@wuK*;G^ETQOJSNGJ=*}HwWbIZdVKWZt#Wk{@+
z&MC)FmJ|d<C<ZihIPTiUM%6u4uuh%KyhwYn9FPm`o$>6L1C0HV+!!z4Nhfcg#S8b^
z<&%<-7r(|xXYy=km_y5IR&W@rlNx6%Z$T3V-qUR-*oJJEGTai4O+LHV*PF>W`tYcH
zsoA2UF&|pyJeyL%3IDZ=QGhU-g+R0onMQfgom-DEipT)PyUY-`W@w&T*O$aM_tlQ`
z$&)Z)CO7LK(#qx1l<)0a5L3p}pww;L9?J@qi#ziT24BU0eNi{1-s%Ku0C&<TuFQ<I
zWY6JxfBW}X(b11qllg<6eMSHCb$QrfhpVKm;Xl<RNOEbqY@foOWkO$0s=(1M-WojD
z+uVR8U<Bkwez#Y!^T{=m%gC$iYHOR&Ke54*eo3E0N1Ej*^$+IHs^Dkbx1FFWQW*}j
zd`e(qn!2{yQxlugA38RVhOu4~nzm6q-dTr_mbviyv&~DQiuM+aVb7#&Uvv*5n=4{1
zU1eau`&RnSsb!3mB5SmIDKno0$~0Gt$C<5gAz8lh38GXxIwnde5gikL;2A_iTd`F`
z+5|@bojS(QmHeUt^6pPNOxM#doatTMo>Vb~lfrNj2D5W@H+FDq!1S?TiRVV6WTnkd
zt`O~FtT73&`sst7J@p3xfcv{V(l>tfVDw4HX-Dr&BRJ*x)bz}nCuL2T91N$d?dRv;
z=YYi#g2Iuaj$9|mnSS}Wg+PFtr(~JcToQh+NK+V*^a%VECeGptN7lQwQCZ5Nqx!$7
zs%C)y@86uq*bCK?EVcC6{GpP1u5#=yUR!o{AS^62RI_m0pwatpIT*l9s)}(9x`^7s
z7Cqz8a6K5B6GZ#s*n48OHwl4Y+l0wA5|AbaEoyMUn=Z8(Ka8P=Ke(5VXNn3C?8_7d
z5f?J>u-=Q~7N!Z4>R8hjgx~B6L0n91Ff51BMt4o-WCB%;a*Lk@CIvQ@^aPkD<zp&Y
zU8_)71vdV!U72aBm$U^Yc*H%~zNc&d$LucktJSP)6=OsLc{RnNisUUX@nT>K*Jh!E
z#*?wV!WI|gN0QNgvutp?pPuAXSHO0H%z2BYe`Fb3QfqUN4?IsD9d+xj$v`l7TWeXE
zL`d;V^UAFtNJ+BuU<^^l2!>_nu9K9kosTUhw;ZlH-8@$RDFPVB=j;DY_4V~Nj{d(_
zsTIVe15Y{$93=6tFBkRp94^4eDM^yi@x=VzD>VQ(w>lWbSqCGuH@*p^p<b7J3(bi<
zybj-@CN-R_MSiIt$Li0jCVz+s3lkdjUP20T@MQU=$~e}{5|duw(haNdMcG)48p0G_
zF?z?D?7=#8X%gzzHJopuXosmvfs>TWJngbvUOYz#r8m<Xt;v~m4%V>|>6q8KDoVL8
z)@2s%OEk!m{g|HlDcA<?d^*wnnpIL_QbBHspwh>_=c$5$y0F0qylVYs1@_?WJXx%z
zE4ojjQ>KG7ZVq{9uD!$uelnRXqpWsx``rjo!|h=?PG{DY+@SEd4A~6rdQMtbCWX|D
ztscbfiVF3ObA~u4yRY2RE|!L;eGTpnBR$wu5{@dvV&CvzHQ&11u@4xuZX)}OU^(;e
z&qdyeP_dWS{(~V{cUKFnr=7!5D0Wn$XdpcId@F!aR)F_E2pa9%eCcp6b;34ic7xuL
zlGg7cjOCCa5ik<ZoHvIr{lf?=ke;hPdyW)El*d49s>C3juKzw<v%~11E*<fhlYeD?
z4LIP06kA6|B!uQx8D#KDCXL>k5@dFK)ol}^*-DEqqoiZQH;~qG7dk#$4E@tLJQsdD
zokU19Z;E?dZ*GT)=169r7G`oaZjgoxdFvTCiBR$4-pMPaTx^2@mbd~XwSH)4QC2R*
z);CC_(39~fsm#m(i}i8Crn1@GHKl>Mw|o;|Wi7I4gJr-Zcz8BLo%Iqj5>1NZB}iXq
zE@d-Qx;r~HrtAKi;tMXCPHH7fiHqWD!t$IPxt5nXJ}?*anRlZ;5wHj}Y{`d0pUEaX
zEQ;GklH_B)UlN&pkHwg>i(DMrUbyWwbGK>k@XzA7U(d|OZ+Ip%vR{-(bzSaU&dM`{
z$$UKqs2MC2w^g&SEwc(v&TxIkql|yE@R2XpHI&nbpjVc@-7&YE(Y5UhRV3*%LX&P(
zt>DHP+gepQuAJ!x7(MJof^3}V$1lL)Wr{3!Zx$<WDAVEuf6}6A3N~roC+NWgB?BVu
z49o!I;wKdw|BBaj_U@#I^h?VO5ydP0wkqlGeJ?Syv-^x;;MSZz3W14w-Ki@La-{q$
zy>|(>lz<rJ^~00!p|)5V!4~gkYzyMy(wBdY@(55S<@5qRgrdJF706fU9dp)f0RIQ~
z;IW@^GI@$*i0a1Xxu#A~{EVbAaGQn?s#&-X_A=6pGdh!(c0peB{@DJ$`mN6G(@x!5
z|6Nts7Im}4%zXl$+oHn4&RvAe;dQnv_A-7#5Z7Nw_4ulHz12`U1q{u1P2Bdo^Qqp2
zH#h2i41cpR>{r49X(G>BXU8bqL!E|lBPxS}L8e#!;A#lNZOh-jRYj!qW@N-S)9=s2
z$r!~uJF9H7s()r~u))$KsOWgiw;JRLX31~U)!AK|{60TnXz_>VOY6LgQ3{!cN6Dww
z+QE9CI{~USsLdd<1}f$y!l4aaX^tLzR%{=dy9fcD$=fQ|7ncgI6?fw|r+&t<BsL@Z
z#Zk%JjyhZv&{2^8^0Ox*xPaDADw_<ZDStE7vX-~H{}3|XCXT0Jt8X3)o&3JLU*5H9
z@1p=vaCPkgg|dOigxw%fVEa&Rh+NJ~p<~%z4zHS-e$A34Sk8Ev%2f&ZR88kNq2TEI
z%jY&>wfxt_%K)!tZha$ix1$9QcCEiAuZ!oVKaRKgDgK}VTa1FUSK%g`sf`5j^fIWP
z&X6Lo))y3h0YH(3q6gq!CpM-@GSJ|N{?ya9rg?ck)=h6uWr$>^kO7j3qoq$4z5mSb
zCSRX;R%}`4RnE!N9Npo<j$<4TB?m-{C1z(nXZntx4ONU)7ybB@N`Pp`q$N*a0;qzI
z#us+J91j1ye~KHzwUC0?ip+G{2EsG~2If4}mX)lPq;A9Om7LC!0G+8*XiJmOL2)eH
zPgd*g!ANfMCq}c_7OxN=oz}W{PN&*xyRlgs(1XWK)R{I9?g%l$?ATtbwxbM@K%AAV
zHq-do96q;mpJv6L5U(_@RCI8YaaQt<LkDgL=r#qZ0h(gmRiM6Hi#ISdL=iG{<Fj2t
zBQ*ifh$BWFw4EWX`Xb{0_9DW<!)VA5hgQd1@!DzHl{5v!fCrB1tuF~Ld%^_m+9e6h
zHt)|_!&hex=i)-vtd4&ETwWZ!z5z0Zrs^Nu@8-OE_-os<2|#}@Kt6SN-$m5eJ9o`C
zjsDqN{w(k$=AgQje#xY-`5h_Cq4irwq2D$}4j)_<Ou@x|J<|$xRTSc;U`Y*&s+)2R
zOL?6=#BqDeI-9kQjvLlf;;DnU#9fyFBXVX~2xCFT<Y_~>xyf`>kN`b@XMIe8M|0!j
zmzkCcTpW~m(qUTN#7*R)Bdub=OwMak7EgJL!33QOD64Z)ODSL=NuN6>F&r$YG}Lck
zD)a;?w{^G@4?~C*{sz9w{_)5j8ynUD<}tsst))a`VA<fys!EE>@fRHw&!DBtta{pe
zdMF_H!qKs?sIX{H*ctwmkbXuiMXeU$<_f8>&CE#oG*ixvy>b=o9Z?y-JLWn<ZUOn*
zxr$%Q+e(0$%*!KNMaY0GXFGu)+!ym3`Se253XcHepKd*trn$G+JP8`Dd=MS!*c?W#
z&^*aqY<^(6;FnEdIBhSmtsiQrd|Hg@cHtRQN7ksiwT&Nikk;AvayXb|NsEP_NB3T$
zSiiwZ!cGX<xh2R_CCaK<so!{X(dRPF;QV2CERFP}Q%TdfZO-y{%rAd#;Y(HN!;arS
zAJe6F)VXQ@v>D8laCFTiU^w9wC~1P(_YI#Lv-Q@s_<HP%uc;M}oLm)1ewRzE>Zame
zd-`l%z0VbYr1=p)qkq9K*$^pHl;yE7p5l~9ym*2;{z95k1>wC{kp<1hBy>Vw1dnuy
zTK2^ZL79z^&Dh!;5TJ|&j`fZ@Yc?Q~Wnn=X7O(!xu&6U55!jaQ$0O_&x+LQBKVxIk
zunrgf<6q-y%9gZ(q-Qqn_8d;_?yvuzR040|Z(#e8Nw@i9;WE*u7O?4a`n<Hx?SbxN
zFu5vBG(pnpHtl~UyqLJyi$|OYp5nB?rVm~u8e@@1YvbRGl5n--OW3qZgG)Bcc$`CD
zqovLrwb_SI>7XH*Y!-GoC{KyOX1f|0V!WKNIc$5OUg*@Qv1fpEh)jeZ%f^TEP9<^~
zRNyx>A|nEWUq5vks-+TCB+Eepv-EQ`RgRf`=!XDH8b*f<*DJKz@*(5%@WYoyFe()x
zPHIvpXgYLj6*q~S#x`WR0Oth;11l5AJDuIpF*|?wqZNhO(2@*4!=XJ(4=7a1Ho6f`
ziVcd)y+7LwIG28dH!s|hv@lJkp?k}{AwKgvbl}8%JR;~egj^zAnq7}<wZdI)C0@^V
z5d_N8zCMepo$<(>gSe1eA4q~0l@O{Rz}f)-pL&X2y)~Ia&m#ueO*68y7x((zia=kO
zj5X|$A6FVT!B-5h%RqdpZVOfHXdSKYUro9K&W=;rQtSjwKJ_ud{^u(sDY;=xRUyUi
z#%w7crp9XiBy5JNy4xBrOHrYg6Pz3!X^fGP1rlm%=KHj|3<LVia@cD!c)d}cIun*>
zk%;hMAjK!<K7NGs?ghF$YO!7oCYbN_@3p@<9z_5*H~oFN+I+<J*!tMm_Vz_3F<TAI
z%QhC_VPS2_eRq&gCOzx~{JyUr%&pJm6Gs;NvKL3P$2<=}lmQ^42a3`964*X{{3HvX
zX+coSEZwKMIq(XzzoF)EZ!#QAYzmxmwxr6uYWY#4{=g+;oC826CX-x~g2Zb8a&|8$
zX!b7IT8`y`(DA5h!jFA^?mz_XY?-ySoE--Kp^Tu&Fq*`Xt%myRuM^Kn&aAhBkDeo^
z$a%`#;^bmc<R1Ww{vUwA9F`r@MyOygviks%vDW%R9d8Fny1mSRmI%bdV{ea{W#M?^
z{*xpT*sJL{e4m;J+{^Dmj4|AlH^e5lgi6WU-b1nA;BY-fJLvlF*&QEUX^0tqD)>K0
z_(t1hbBv)jm#G{n16`x`F2l8s_3?J>bsw@NO(^<4k>7Syn+2X}Uq%1|;OaM}sAaeR
zNWU_xEds+?GoE^<_{qaIi%+MsybC;aiq_{LSfRs~fmyaM*v5{utou%)V$&ocziKjr
zV2q&DHV|aWTQPLo)v0n~>~O}27X!~EVBz}gO50z$LTF465#<~ESPW-6OG9XWTJyxK
zxDVrq{v6jKV*Q1^IL)H#AHA1|ZOn}JYw`HH<KhVzL$~CN3<suv8@UENrQ<Tw%<(Ab
zN=BMlnllHVI}3|sBmJe?=Y&w|S6yZ7_q!b-Ak+cm$>D&e<-xyy-z<K1BdiU~jZWEU
zI*(DZY^sh)%2g38krl-EPs}l7Om%IZ{acMLK$s;2)n$Gw%6!Adje23PlIY`ijKUmk
z$@6D;VS|SoO2Rf;TR3f3HbS8&#aIJ2IG%;4&C>Ca4#sZi!SZiwQ}sF<Hr{b<i^Nra
z-GMS$ZNB~~(F*11>6LaWaSPiF4!9C`j2y#0Fcm&PB&3b9rlBOwISUWo<t+Hzu4GLQ
z4D-NbGLs^^8hltKA_hjEr((L>f3Wu17_gC~uX|U$SzlLxv2w*HE)9(*ozJM*(VP~w
z@VyfhbZ_x~@}?~5ZFy6Q9uwp%G3Zl;eKD$MH9f|XnVTA50GEpL%iC$cHQTbL36Ir&
z@83#q%i(e5kMX@dgnU|@KRcVBpMP4sdju&0+?TKyVY#Q3l%8C_PK&c?6?=yxq}`wP
zZ&@Qg4Vt)IR42{q{VTI}Sf`Hk&JyH#T=Q&3GfWxV>tO7e4`N3SRu&dE?}Woj==rA}
zl}{f4!8n7cYoM;KzNn!%(&$L8MNF#l@KJnzcXXxkn6bf|trs;S!a`MX<knfNFHH~1
zURvHjovGWH&IdBO#m&%U_zgUn()m&UWq_z#d`m<A*kiJ(7RLo+Z*vZIZi#{KAJyjZ
z!sl5G;A9(oq1-Wmg+yR7?Jc;_#hD*{{WDuS4(|5M&VT&a85Xo(arH1d0Rj)IR?<;`
zGq6kv5MVqn|M32CwkZ~aSK#JJ$6~*=IH^r4Kew`zo4QkA*}i72_16ZoF<b7m^XeXa
z$+Kg<U6G+>3J|(^R9N{7Z_P`@#zozqiP}(0uUWcapB$yddyW4@`bwBi@2>bK0C5XB
z@cB<^y8<M|-T!A9=s#<U5pt}0@%ooUR6t&!V#l1x{4VXDBLtYKwK-gp_0b#mhhu$m
z83TJ}!~E}u)e-ac_EOvLR2iI6bYV|94tQR<9*b!kCyam#Ol>OzFvjEs=`;E=4WgL?
zCqc;A@4_Zx8hZ^@5N|?$IcJM8XWPMbJ1-=g&|J^)NozY}xS2rk7vA(Di#AFQq=B7j
zg+w82BFB!F;&f%986mU^=m&iVraGzJy6G8-t@j*E&)0A@e)BtV*^lxIPK#Z@kn^Z3
z#B7f`ia|*z$M7(X$XleBd!oWD<&wiRI}a(Vdjkpm5!lA2FXh-2agrG$b5udwsa}-$
zVtp!11ZTv=jttr?!Yklzpn-A-%d((8A2E4%%tMqIva>qO8kKi){-R99Xb+bsQ&5}&
z2{ZcjAr23CRriDL?Jwlt^8bAg^>1_V>ixri0P#-9!tf@Uy+1ANDkEnWc-#=p(mCgj
zxxZ}x3YSp1HsdX-Q~}c%KV+q^F)ZUo2tnZIw6%SZd-g$`ljV+r9`pi_Bsb>n${?N%
z8kkQVk&Uj0ovRP$JABz}xs#cgT%WgN6L&TXi{i9-e|q<zo?gpwG%T{DR}?y?@5<yp
zze}PIDA;1+OP1{M?5{-J*pWPiW!TV^wN8|A{ff>fQ4T%``^Bp)Sh;>%Ockl|Ifx^S
zvcW67%q(P&N6H)y2!n@vQiCd3usxg;X7|`70~K6jh0m|^zP{{=oBi!D{~J4G|Mp8(
zfCL2P-Xw6MZgPG7@bLHVP|f4wclQr~=lOl*tZmc$1}3_1b16$t<dj+0=~8zc_mpLi
z(Ajt1Vin$ruP&>DomZ;c&q9MAyO&9KXoei=7GJtu9J2zns5u&cb&On6bo$|FzJ8Dn
z?6Y4O=lQUIWZ}>D^{YoaJ^M91|3-SWnRg_A%zu0UUIIr)$H>3mq_{`7E%EE^pgyxX
z1qbu>fwtxR)3?-XQoMar;pcxmeFkmZ2q#&=cb1p+5c4r!dF44gJq_UQbC^w?{GP?i
z3b<a)r`}alz8pn+i?Rm*vbSJiQ4x0dUm5HR2qc*IJvr~!7P+@o?D+I@8Np$C=e`o@
zVWPt<K)Qin%)4;hVcIg|6Jv02kT@B4`qp$o*q>z=!E+lAflw;t<+!0df9?9?Z=hZ=
z(lf^cE8JNAez`a5C-MxLk|v0nm?<NGPG{t~hEl9ghc&)0FSrEF$>jqArv%pp7{u*Y
zya|z%VdP-W;{{i;$7yKK8AOcWQ|oHF=6Dm~ju36)8r-X6E^>EnEV5^Q?z;u%=vUl>
z|KsS~<C$*%KR%*H%7~IwwkUI+FS3#}ax*!7oR>q9iZw!F<&-9eDAt@Ml{t^ZoDY>#
z&Uek>u7w<z#IT%l`d$6{%U?YlwtcSa{eHck&&{<N|5#8*K6wWfi+Qg0P$Gi+KcvS{
z0467eb0h1Wx<QoaKNq3XM#AL7k3fm(<R08)_eK%~Pa$8;vw>%|`bB@7_?^kKUspiU
zG85iNq^3B+zX1zLRn_b=m?&J8svZY^cmAj=n7YcrpQR%g4!hHUF@P`p%iK4pbcl$Q
zTUD*U@LlW`jrvTxMyhI%z;MS$z+wI!cs#7$Nwuq9>b0YH`?ic-N!i;N{}V}e2_>_l
z=*Y6R%6;Nf(BHgkYb<F<lT<$JZP7ll4iwrT2b+w&AWSg?{m>>hkx>u0kg7cXXoAv(
zYZ<xkcZ<<!&TP-u#DbKu591cFo44GHXKJ5B{}-RloSyynYzlO;5PSoTevi{jF6tl0
z)$D#~Wlom)2Q6>yTE%AcFKO(D=|>2_SH$@A3t~pm$J#=jMM;AhJ$IWwWO_JOvghim
zd{)`cO@s>!*T&%WwHcmnPraf9u9h5%l>wQ2IUy_JMSh7_{iwu1rZ@~QXk35?+a&yf
z7*Cm$R=$J;7#wTw{e)NmL6=O>4Wpwp2zNVNAqfDmsGDnI;dgp{9rlVIZbB{9*cFzd
zf=(W8o;7PF+`XP#M$1m_G?Nq(!yybK4A?HGqM|=m)(IzhR2RQVF%-(4tWbW8c;zXr
zVr!Ci0+G^cf(Ada*jkDrEyyga04`f%cXOP3P;8%=f@8^7@oh&9A@I4I1HFZoV!!3v
z8Y?r`Xp{u`6sd=UwU-%Q3}g<}34$+^^#&4~WtVtPUe2pZ9(QhJIGM)%SMhU$+objN
zjjFpZOJ^UZbz`GywQ3cR?IzUY(5)V6L<Xjv!^xw0^+w7EE9<J%WP;FFs)J$kg-OeD
z{bH@d?!dYN)w=pLZmpcf{n_l|SiA`B#gy0+iANX#!q^sswcc|zW<-Bylx6nHFepzC
z5ZXPxZ<UXZ-U@VtTe->P`DStI*zEHg+jXF<3##@Tyz}-(J3~8vcD5$Y?<_uj(IgvF
z<uigW0BH#wr#7cfo%p^!3rpgSW$*F}$V`8h`cbf8Q_t*65x!zdR9SXzk3}ET5S>{2
zy$0Yt#lV8Wz?-+F_T0^I139Q(vRpS>&AWhEF}RS~LMULGW1SpJqUGpiC(U}#9*5Wa
z!<eqigFiA&<MjT(=-#I@ay|PvoaXnE2s(A|RB)q=s2bZWFMek4AA}bg`iK~w@xu%%
z6nwQET`fvsy-~JiWHTL;Td_}rZ0{Si5nht&a_8LeJY8@;>j3umyl<s;r#CX}im2Th
zYObN{$|Y8ZKE|}k*q7;~0M8V2u5jpR(UEVzzU*odSeFvNj~Q(W0S~8D5)1t1&u=!L
z&us6~xrBY79gFV3!Jl0H_0?R~vqMx@(F33MVcbF9Dqs91gg2F^D;8w7`@>(()k5#0
z;0`XYkH3pb@Z&eMn@7_f={@(p7Y$2>2U6tn2i+Lk!0Q%R$3QHt&n&2&-&6;VrP836
zAGB|v6a|Bb?~?xMC1>>+_G{Xq<td<)EwPqUwDaudF_?ZhBIj{zX^_}K;-r%c>$&j}
zGsEBz@V>?{`<?!$mheMzf&#}Y+QDu={T#p`%?)_oSINk(U)szg<B^O%19Nk8-sOv3
zACOBNUX5qKXz9qeQE;A0riQJJS+uiE&>j`jCv79Hbzsgmx<{);0rZVH$gEkYV%t8>
z``uku0QzySq|$$8h!N7Rx=vFxf`E|3N5oaWSA*$(2aVdKte>FUiqtbVZhj0`pcqnS
ze~Jl;3&~fp=YXSSaCV;)+U6pWwLE;U;S*l(d=p?Ff|OM`9<w`4`iZEhs6}62M9cPc
z<xCyn%bk$ry`}|HmS|&?Z9oHcO2<6x4+}6r&<vyz*F8EdAcy+z#@fOZCP-8P=xvhh
zj5l+poc3}(ge(W4`qiNLYP(WV$94N78l(FnS#L2Z;|A3~hxaORt%N(ym><K-p;mJH
zhq7_il~?S8JR;NmcEg&5QRe%1Nwpz4-6d*t+b39?@03%FQis`(pUB^C8e9xlgu(nK
zv0jPFjsM_q;f3)KA=z&VbDtX$U32a5!%qa$=U~6?%(e|jfHOOn%Z;S*G+sP40PpLi
zMOGdNq-s_Qy+1IJTQ>2Q@?GBTfrgX0+0b9u#rpK$!_aA%R<2Gj2tlHg$NUz1In<ad
zQfe3hn8P<G|B`UJpTRySBUv|gB@B4HRa1~Hx8gtHbJLTkX722gPi;9);BgN7e&>wE
z1Q;mgVuJmc{Zh}$zyh_1@mAuH8%r}}l--u=bg87~9gy|;n>YjhaFM4=C^&(QZmkT3
zE!&5A7P}-bp5M-E+Q~b=38o3g3oAW9=spal?RigMWcLF!!Ti08<9=AR$2)7h8xo>S
zn#UsNs(uc2R>w?cw+cNDGwl>Ua|ZjuJdakcUCU%Zu>xp%P>h}$?`^g4zYcYdJ`TQr
zhNDEAHhNx!{Q12;Gt=>7KCo#kp7aZp)4}k@Ix@2e)8;U|8q(`4-F}K2X#@Ra%}gfO
z4Bk6)Zs#{I&*`&wV0jZckFhZF91M5HYM4ED%Sq`^jB_L!T)(}|q1A=ThkTz>5J%J5
zl|Z$Kz0v!Yzj$PW8ap2^0nLIx5=Vae;~s<$jiJpvsMX1u;s2iI{I>fsFDo*bpfnH=
zGW2y2kEDZvdbj_=1JT)@x0P+*egS_YjCQp;cym3Ew=M#k^OC20?raR%xGuGOKE|}b
zX*gxbul$2nf`zb2Y)~AZWbgurY+p^k!O7H4r=)_e)xnoED&apPad<KAW9i$y*Y)4f
zg<!;v8XC<FZiSAdwEf6nXZT7r>Tdegh%|fPT1qbahIEi*o|>j=*$SUtCB*?8GIvY2
zkNJV9D#hRLnI2QJDHmiGVkn0;fkzPEqO5V>m$Odf1Dw5io`0uj9Y|#;(XX%w_%y94
zSdxHmorwV6#BkEs0xgHQ?Y$e744JB6LFi`M_g?6|d3I<M``79b_e;W7CdxHnMh<k(
zbegN)IlbU@jVC|?=d>t`6pB?9Jnk0vID=1i4G`d39`E!~X40=)`rW?jcrbT035vX8
zaj%=+PpBBYm6MoBZmU$pUA)w2A=m0%)g|9I(c<!scoILhbr|u|=JU;F!oMZ6KJ)V{
z8%M3TV2Q$b4{%zzUapP5{};afdu`!IgX{IvY7;><-6`lcg>Z*G=F!UmPNx!S)HWd6
z1M_!$K|b*^?B3ea+WGB`oi%pT)@5AT$Sr(iYt*|ncQisa<!!gO<8%sZ5tOu+U+iq3
zhKgt?QFe`$9b_sonfZ^Q{?F*nS@ndbho#L1gCBVpY1ZS&T+GQ7arf^pV`;*~9_H-_
z&3H$7`SF3Ww$A+}H?s$wmNagJTBVsx)gN!iq=1_+Z4lj-`w30=xck`u&QU3s6cY2A
zERt?q1IyMo6o222II!rU3@1wBz@B%E*)&PI0fb5NlMvKl^kpk_sA5j@Nw7g6o<N*p
zWL^hkmecIv4bf5HMdeN5RD|RTgz(Fs;wSDjX8s)e9<up+aU!&-Jl>gc@7<Zcc~+iU
z*q^Z|DFgQ1i~)j)yd?$Uwh-*T2rE3;U2{7VV*LAc8fA#?P>7MIC)76HU)>(|Vy1J*
zzIiOq+w<kCC6lO}|K=;{{j8arw2FAoBPuz!E?eW5RqrxlLK`#pdsmi8$S-U30dwf1
z|5l5-7MHfSp8HR%Z{F-5&a(*n;}f>A8n&5o{P|<-*xe>@BbyqTU%C6#7yx_ko<+CJ
z+%ZAl<G%G*$afQqklJo?DGl6kW^00mo!EfW*R~S-5LSBYo_vP~G;zj2Q3)vyRUCM_
z4thtvZglgdEkk}F|I?E$zv@h%mD#(bxXE*eXihnrU<QF#;%=?q5qQT2+#mM+mCR&X
z#q@kmas*F&S(Bt)C-rY06-rCt{TlZUWBr#qh=(GjOyIAXR%hC6Ozo(}1BSAKhZ!Kc
zV;A+^Kkj{d>$=p#Pmf4w*(8_Qzl-{fjh=hYOl@B8N2>M?#TMDR^8I%p0*1bLP(PCP
z^H&@uT29cbKtLbVYEhB>Uqw<Dd3k;4J=}WWOFM%5q8)Ne_R!kG)+!VYeP?lAU=OP%
zIYFvS6yK4T{~eK=A#~+sKEF&!F!e1iGkoRY``AAm&YYCti>-VV32)QHci8_>ICEU5
zbr&LHRmQM!*gFz|c?9OQI^e2qQggF_#DeKB97|YBZBzPiBn}fUDHgGM>1YOL{Z3;J
zEvesmRE(edE{A5QMRy`zV;?S>KqML%&(HfbZmbxn&dkh!sL~Io)Kllx4%qzIa4~6B
zY=MMl)pBT*AJ%aNZ$5vPDvF;9j-&2Hyb6~=iC-1N4rM+LMAj)=($l}at)KjnTLx$x
zo+Bfp4kXLYF~87IEV$gMpY1d0ES6N34|Kj!;l@0cNCg_j5csm|^S25?{w6wR*3rlo
zIB!R{1r;o>^}j45v`75K90H%1b#KQ8ErQey9JfB9%NgelJ^--L(x$G{BPoLiI01@W
zj%AxMGCj8}es06NjveT%2K4ZBITQ}9Jqt9}*3i)xqC1;?pyL(x2Y>+^-3$-z5v$EL
zJu?L{NydfHm{<I68|bDA;Wg6^d!tzEp9Sln&8ab;oq6ug#tyd$<jlRkZuf<5+}L0@
zP*EqZ?J-qlF8RP(Z%c14i0)Jmdyh7T?2=3{CJI@`+ZaiqDaGFr7(=y(!3{Sp;fZeJ
zt&*b{rd_i8LOs9O7v2%6e0lmb@)&9&VvSoP@dsS6HTH_Q7+jJ%(|R;3;WMhb)milh
zH)QB;n{-UYPy2dKG1KM!_7FEJ(A+_(pZevHdq(m)zoh=b2k!<BAcXI3nxNygg~a=R
zHte;!y`r0uzfyI2gv?&ZG2hp0JZ|GPUwb#}&~cogT!CVOot+8!%=4(L5%_p5Z(lGq
z1p<PdoSa|w=U|x@oG*fdf0CWlI1eJz;#7MCbMyru995#GpX{PzGOX6M9#n{}BHdst
zB56>z-319i`ajF*<*?L63@ZqX>DD8R^xTlumF@X?JG7XA2&TKdkMzaIzHWGBW^rcf
zc0!JT5Qwt#XRq>g4#@n@3tQ*&N`$6p2RWvNXlvB~-z@seHfeDh1VV$(LukgK(`UI!
zb^t_)Q^lqL*z`Pilwr4}^CYTyudI+93K^+M*lVV#!kUb<j3?PO%x`G7{{Fp?K(zV{
zNBS4EcZtFKghlgGcBa&}3rBA)cd@(%PE5DwabAROUM0L`7WJlfc}P7P3pxWwjyBdb
z)ilJQ`T85V{`jxPq~F^ssv0amX+GK97RP63dTV6!%&oZLIc@Ni>15j-Y+oVW$S_j0
z^#T`Qle+e1E8zz)y*5wB>_PBqSZyeKRJWPy`En4*w611MPfu%uRkJAU-__?#?c4u)
zC|akJjZ0_{go(FAnaIQN(d6_LaN{k&t~$SFdKZg<%&)u1mKX0v)1UG$WDoZ6bx@V)
zQFnqV@%}Xs!MR+n6^@MOO=$Yg#AMbvTQh8lfZT>w>oq|iKJs411y>>C26d^MgC^?m
zTOA~Yc4jKo+HJs>5s-B#OBLldM9QfE4a^GdblzC@b)-&LS>tY;O>wH<ZXt%|po58o
zyP{+q2KwoQE6@u1M{fPk7*bA6B1q^I0AS9{EUs^R?~Jjlj2D9dT>6Vjhg5UzI)q=y
ziXqltaYq-JCRXV<5jajnGgTe^eED7CrwaR>yh=K5oNxbepkL8)l4h_-$o^3`09<vd
zoVB&JO<tZ_N>|029Blw+myZ+(@4O#KobzG|V^qr@BL|`QLG+C$^AqHx?wIg~H->VG
zHq~w`aay|-_o#|@%~K=<Xx3w2F^0$8D(u2mi~gc0cD%x>_Jjw5jK!aIR{M*vtm&Hm
z?w;ZV$kRO@Sa@dV3d{C2eKfRDl1-$!4FJ1hVeTJ`u+8?S4WF>p3DK~0PMRh7AQzuR
zDgKed#U)XYRNO1M+yVdQ4~j@6+$$|_Di5r00ph*SBC5;xV@btbc@s<e*8KE*&#;;(
zAUar^*Vrk`pRg*Bi(JwJAAZ$8?Y}Cx)2L_YL3*yqo{4fU1>u(KgX^{9DBpdTYD!XE
zTcw-?7fz^B)Z28z0*IfKMozbVoI50XxMt)6GU(_B*U5Vdd*8phwsIej9v-jvAHfny
z#Ei@$#7CQe$_s0VVhsK;*8rcGnET2OCyR7xw>#{$C9K?pw}RC;9+Y$NQIP;IICG?-
zc!_5*Edue}pQrnD;*D~-ekNxf<eI$M`!B@3c0XJMY2>B)&u@S?u{m)LPyylS)FYr|
zzl!T&)hl;ts%mi{+1zkrd}nP?2l+UT3Rxm^e;l;;0=LF>lY$O(k|((Lvsf(ark^o}
zO@u(_|A`zzOZF4?wn#<3gMcV*kPtxH6^)**oXvI{()4*5%Ii6uH$esg*Ap*OQ>Jd~
znm5z@K?s<)xIp8Lhy0n{SuWifSKFCmjjgRsiCIlLsdBESw|S&hqhc^Z7dEySz(v+F
z`ye3%M0My^qa!23Bl8?GIW*KU`JzdT+?zq(ZSDAY^dAJhA4gMH<0(jGvF?xnze?x1
z#SOdnzWnLX_)yc<aMRZ5o$JR`*`%HAxeakDWRzDwaw{o2CFM|Z+QQUJ8c=F>NNwE)
z!JfJjcCnr7v15+ha^)vrCtZfWj7QVW(ax#{tRER$$yNVC9f0H>G+J?#YK-pb(lrA+
zgs~d~1XB?7CL9?_p*cz2NbyhQIN?WbX+zPrqiVeCGDdp;(iPrsNVr8O3X$;WL5Csm
zfQ;87vEM{~+K?5isybaaY;Jl|#0pOa0NI22%k$ffBx&5+uZr|G-^mLyuwOpjisAm=
z>DRgE`0+^V#@rHEztlrnk9ZZ!y*f{PN|<jk)Ufhq^&n8u0tiO<`#KV8JX5(hX4-t*
zucqEy2O$Q0;@ZJGo^%5^@+8-pQP1BOVJL59`d%1(!fp}pd~oNXp04rd8C~Mm3i9nz
zhee}cgWBVTXQQCKLI|9vd)wkB%i=uoL~Uq5SlIc#?de?S(!pP@*Z)}=Kgg(HYENnQ
znc_dSD@4j(q>=})D5pI!XrsaiOZ8=)i`?reP&b|5rZ~ahrPOZKXWrXqCBCdnqp4mO
z9QLQ~9L9Ytun~826nFDj$Beq*3%)mN_DrbTgjW(NH^&D|l1px*#Zw^$(=&bL%nKAU
z$1WDpWluO0BZ8&N1Yg@i!r(6}+A=cOt~-CK2u00ZT^edw06@qzwZgwrzlHj2cbV;d
zw=SRwB6!i)bWh=6IFN#>EE;%Qz4OO)n-{izey6Mb#rY|)($gI)W==cc@oD!}JLE$6
zhLppL0x#e_GaflNaCi$~)f*fdbocQg9K%lOwfn`?wi%EbAKuuoZ93M5Zng1>^cy(0
zkg_q8cO1}MZ2Xu%Nw3|a`z-o6EE7dKef$gR)NVm}Icx2b)~`xtulZ?ELVJv1%!!*Z
zYjg2PzY3`%AJ@KK$yQ^}#tq5|Oo^b&f*+Qu>ILdFMF|Adx??0CN2>m%5)Trz;%)KQ
zi}C0uEgnY~0vzpycj=VKbVJn{j}-B$NrrS{?a2$0kO@)<4N2f5r8;fmFqLIRq4Aps
z4kTp)vWu)D{rWo&hjlv*Bq$!Qx+L6W8{sYSKH8U6-ID;?6*T~)F26Q_siMb#touTh
zr7n!L?&awYK{=4IoS-5-T42#lcN)Pjd<h9heyyC_VA3>NDyjvu&3VPNarM01vTn0J
ztj}N(n0^?`1G>i3cHZs7nSt;i|FGlkf9jfcHaxO?R_>!d;}5&%*a*JdYh~()KS_~t
zW-J}w9(&>SplQ1a3@YcsHp29DgRwHX?d@GG9_#jDe21h4i~ul>v-O;MarZc=mIO!u
zTwEo4OBb{YkEy1VLyuC<V5f3@YD+SvXMzv6E$V0IpeQIxbzKvC-uo}o%%lKlm)Cy(
zuITPs_j{D?rP5XJKpbhjU-vWh)4w6Sp1>~gje+EIYxINM;l&^YE!W-@$P}%``Pp#!
z`ZvZX_iml1W4$Og#X&eZLLcWPPJxAtlC*)C?`b=yau!(T{Nso{H6pAQzLwq^sRQDm
zyQKFW!mns$(N;?bQV2{M6x~S|M@?m>w|Q4xPUc~1UEt}Qi=Ha<gAvKCXnNWy7eB+C
z;k#;-fB)pr>TABB<=x?yrUh5xJ`>LC>TWN-*jYOSMmG!+ErF0hxGHn1N|`yBOA`CH
zr<a^!?U~-DZ3Yk{QRMUo39e-En4^a_6@ANugOZl=z`EhvXEgn5Wh*iN-brL_NfDDG
zRiQyBfK1~Ot7ao8Wx1ZxB|3+1iDJYOp>$VBu2|At<o*2%{mgx@l5vXk&+6U0Ot&Sp
zy8IS#Mh540^&(4?MEnvcOTU?+oeMtUo9E_AX6r<`qeHRS=HPU23V5q5;%$rf_3r;=
z(2Je^WvZ&BWD;@u+HI9V4%xHLNH`68f+2IDIIMi5r61mTv@vXFdzgLN@b~YvwJwh9
z{`O~qPtFlqZj0EOs@S_Hf4+4gV;Z>S<9Yc>y^g$(c%oqG(k*XuF-z+A@7GBch=@Z`
zn<t*h!agz;Y`r2qe9?v;nqtBU8FP2*hVOCP$xhQAzP)vV*&1~eTbsWtZl8?Gq|!Oz
z<DU}vgus^l>s_4}JJaVkdqy|8JEfeIL;MhaI@3uJmP45oEh=ksP@-8f<S!UIl<49T
z@XJNl>ztJsZkO^Vz~qM!FL9<_#$X9izsk}$9yP(^4c{w&0LeimOjs*T;$y6A^tZ|L
zPhR-rqf0#<Y9#W-OMhMKE#9^N#JkesSVD~RYFv9rPv?X$>T>CtgPb8Jm~l%`4d014
z(Iut?yJ~_}y8D9f&7nzGpMPLk#8$*(1z2{x7(}D_<CSly0pG_1ed6%)bAQ!G`f?7@
zFCay%KYvJPTFGOr!dG~C0fj>Q|0IznROXU|!)K*sOskFs&wzYoDB^Rir?kAsC{sUz
zVq_DQ!m3|O=Yf%%OR4qmuHZySq!}n@Q8BrmEyMsHk~`^$J1{UgjAh@68@toEaO!yS
zMYQD0d&i9ts70qTeFk=~I8e4Wu}una3x9osY65%@sy`shxM$Il|2z5>z*)63=Il(C
zd41=n2&S!FQ%k`*OL0TeyI^Ll;y(jFuA?XM#c~~~nU|9@O=AitZHU{8oTtONJ4Iou
zhqmw=JKMYToT0cPTOY>JM^_z_b3MOSE{}-vWFok=-c6&(N@I99;Rq+WOC9zO455WS
zkvS13WM3ySfeEf51V~c>^-2{>chWd+z|(kco-%wl<3Qd<GU8Rj5}VstQ!{)5EVb*y
z?yXI6w^z9So!CRDj*FMfJ(-RS`=Um!7up`4lQ8&opfHMxgBLZ9lBz)70&9TQYJV3q
z&QyP~dwqZM5_pamG|qd*iayA}*R-TkXvqe8x1RYgJ>DZwAeRVY9H&p;=YC6xQuB7K
zZR7;v;f2ewK2?L$PU*Q{Tv&QVt9mKFGC7ln7<YyOaIzBr0*dXdM;cw?m*Tz=PG<Ot
z=<G*`DBsklr%3Urdpq0DZBV%%#S*_gRqYa|T3|~4B$ZXt#viWsmM0y9;o|g8w)HN(
zAk5lxL`q5UH#E15+_Hm+hCtV0`<Zkr6fB#+UYJ#c$jO<M`yaA}*di{A8w$w0(^=M=
z+0ygS1oj`PDxL|&N^MQ~9mG{Q|09RHG!iLs0+tN9#mZC|-iaSu4voPK`4%vBxN%kx
zxKYjjDj_z2&r%xX^g}$6ra;N2CCm4A321?px#umGrcq$XjbaP|kY7B`^md8k6qzoh
zYna2tk$Q5~@UKU{!9hXb50FA8xWr$|p`nn9{ZtLUR>X@JuF;ml7b7zL-$zkRTYfzM
z=doP=LCR1u=-0Sz(vzt#?Y_TJHLo~zK`wD$X$=iw057AVnAcIV#B?u4*RQCq$=7P=
z*9p)A+6M;^v>Uo>He~WTZR6w5JM!{;aQU1B5-Y1OW#{*4V0HWrE)bqhgTv{Fn06oI
zTTu!4vL$236HHWVxSlLbR<<A~8+=~miWy?)9xf6l9ug5BG55uS;pLaTR7P4S7M=Ja
zX@;+8Em<W5B^zU`t=#k!#^$vhLM}%C42Q1}<O8#Ie@}WXA)fkEpjbTk=>FReVDH@D
zB2HYcrqJ5@i%b@n?_+@!LNEp=(X3VQtTYo6eyaR=R^{W~5hVkWGL*@QLQCMqTtF^C
zXF%3%pS#GjEB~!0nu?)Yi!w?N!mN}wmUXDwS81|j*oVVd)FD|3oVvw5@N7t<Cpf9E
zgp6uww>p?q%;8NuiiOZhD%|;%(^XaPRqzcLDBy{zn(}xs^Oz<1Q8{55-O5`B?X$rQ
zNwH(1dydp&2>6RQIMS;FNon06XDY&k#bJ;4zD;x<Ws=T{OmubmgTuQCWp?h-xtE4-
zx%T+ZN@8GL!|MFv;^Lh%zz-@)mmtm!JM-4o)cYPIt`+aoyU93M1xTqe*gPj_!8G~;
z^=+jY24MeD(|s@Wn(l|4(#m`i#hO9Uofvi*03W?E=4XX3@9Q9o!fGZxeC2@P8#0GE
z_)Uksig!tNy#6vrK#)Cmy>2v=w>|7#bvCQ_c-S9JbN@)C(2C@x&CTDxe+Qj8c8EhP
z`L)NjbV#l5Y1G8kCiY5E<Fu5Q@|Sp4gq(k2O^%<cVgKs$e}@)KKqKk>O}YT6jHGP!
zG<r&-ll(K6%+dDT5c5QPJzhQFAIzQRnnV){Evat^M6rC@E1Hp<NB_Ns>6{4p8;_2+
zW#Zt?>QY2-4yGrAoiCyuv@CW!)5*f^rw!p*SHlfaZ8~ue155hd;<b#<R%tSZy(^1A
zk3maq2wONA{3=s&&*u}pyiDr~!ufNd1=#*{KM^p%7_T=llv8G^c6AG%7Pv-dB5V}3
z21|x!c~hHfzag)ygGS_i4*yWn_wM@T4zH0w+eRuT=oEf%ZM945a+T}Gp3u0V%4Z`l
zuQau`lL5Kq>_5Lrw=r}|ZY3Xb>&8aDD=O{W{!>Eo@5FU3Uss`np=5O&OW=}j0bTgQ
z);$%v!?dT=^?ZdSReE>1zjq?Fwq)X)vU@QYVUCXM;qGeKorDyoUt^`8{d&3R2-}z*
z&os=b=#;nMO-{G=c&9n5Mt)k#R2;}Q!+?EHUpsj9bfF*JtPq1b>~9^NEiB{DaC&Be
zJ#_ayPjcX{3X<szW`e4bl7~~*eu)`M)T_};>;bW~M<axjnGm5Nc7^`aJ)eS<Lr!^D
zts;eSa^G*L<^4(~gE+W`MPa<h!)i~q-H~V3C5X9PlJtofm@z^a#i*cpN$2kLg>9?t
zEbgpXG<(#38I_{@b-IqF6?Yww(}&~>;gX4x+uu&25PS;q#RD}t-)A-Vv(yhzEo1k2
zQfz*v3z&5&;|KA1v>GEJJ(xJsuHIZQsMO9&Ui?y@ehYL#wJR>VTlwn5J%(10Yk)`d
zJ{SyjqGpY|c4m?WwJOzU9DeO;`sTx|>j(i`MI$Uqzm`b?*|uQ8`{g)IkWxy*V=g2j
zzK~r^JG9XM0^FWkZ<?SkEIdM*NcQOFcrF~vQDzo?5a0RWN6UmJ|K-Gu>_L81P;pGU
zBY7wqx`wW?QwOvYA_JY5SuP49(C>gL;(EAOoS`A7ZKCYk@W?hhYy${Zb{3Cs1Jrvv
z;4^&z&#8Lly+awZ0Z%RKUs(SR8I>*)5)>MLd_bv9$_cL47nSP97}!8~IQR?_G7u=(
zO~!M7<5!mhHnz<0GgDLF8`!tB(Ko7e*qRWY8w|f-d{Isc6&x0}GnOF=u1HbM526!(
zOa-;D96is8HtY7tU0Li+oo?O`eLi=v=_Uj0N{7cE`4Sq3n>JS1+)%Ka0vpo<v(k6#
zo_{6?3t84Rs$u`PIu@kc2BfI+H|W2Ix14*Y?&xNjtC83Y`;eyjFkZ&!t;z-~MWZ~y
zVSvJ~lN1^YUKnXbw0Y_swLnW|6$X&<7==tAF}effuIqUj!}A$|Txs^vC+#UId7;dP
zx&}4o{^nNf9?K+X<Schs=Sg_QjQy<b*?4d<83|dtUJh74krL4@cEb6e?mv-u&^4vo
z_Yl$4RQKfoQWivxD|;qoU{bF^T@ab)dxytQvHa;#O+k1uupG~0f`ecIQ3<5Qjk6h*
zHBPE(YI_Ey7yd;}#U=Qw!Z36jKU{GL``gIK%2pwXUBZQX%H-wcwU7CkVCL>-EZ?Iu
z`7J3T^Igc@YzOvTQ&3ms>1Kf+98~JKITEmsQ&&a<zTQO3-3VNqXc<)(`dQoSvYXp*
z@RjgP#Lk(Eav!J~NCI(vSQII2MUg$omrS7B6st|2Riirzv)-XS-t*3oG@TyYjyzWb
zbHHDVESII>@E5=jMN*;{!f>Du-u3$gcetKLh#ib|tk~F!|J1g}-cRU#o>Xn6=T{dz
zwy%73zZkLA7wlvh3<*mm@4&M}m9D!%ef^crL(GsSaeltXRDRg>7of7V=u&r9ZAQ3d
z$M)Xm)Js4&J9Z%%0gT(aK#iQ4hiO-)6o$iF(5GrnPAlD}SoYE<`I*U1VsAW;=$#Tv
zL?4kCSh+)_*K7d{q0nnXR)x>}C`G~|?*Qs1-7&I;&7J?Vv3#BPJ9ur4*Dm#1;@XiT
zuS=QAU3Y@WdQT4>kD%hc$PJZ|-|t0eWrF1-(@_cORUl`KFhRy^i~wITpj-6q{C?q@
z<!a|V8hDhywxXL=Ume**5~tzudeTI#!)rg2aQfMV2URfui6AbabIiP1JsA?@4^+kp
ziP1^vvEL((_Z~{mpF4vzRDoM6UTqvlJLK~P?Nh$~L_*f4L%jrwMLn@l`FOP*^R{PO
z>5n`GVNm_(--Z8~=j3!^dBG2{&aT`W=$3)9Fy^u=7AkhXq!@{)wqg<TX!ZVWU)u<e
z`rc}_>@r4sp|x76N>sxKnXh7vG4h7g2pA?w9QP6X8o~S^Ip|Wr-!EKa$M;QkfPzk0
z!hgL$PqK>$ai^q;Tc}#cS!<B1*j1#x#sw1Y@q_mqK}x;4JEbeK&G${NBS@Fj9)EOI
zMzw~PTXu__mfLlA7<D1`Jwb(Tm@}~m?r&0%o87TQOK7=h`?fB4YMIu6_fbOwsd!QE
zO3w0iclINA6+3>gmiA3Eeic*$+a<t^adqR|>4Ipf4^w|z`CTbcmrnd><IbDQT%7U9
zeUjV(L8e{&p!2BtL_TA9&f9J_$<|1*(6XJ}i$4D3A9O!=XH%^=WwUB$j$3kg1!%<f
zzJm$s=MQ?3B=j4cj5spq4>13<N3{<WmJhCw+AmC-v5ZZqKZ5o4r|fk}ls9yU7iUJx
zTsam+eJzVKmb@0HH|d!Fr)YHZ=kRHtorR}i#u(iu-4q%haVVqGDp(Osum6j-kPpjg
zb9giynSj5TYd7+3xb+PW14oKE|JJ*fd*fAgp;DiPMPJjO`MR4_iHM6y(fV*4yh)j*
zeum>FqU)(&86ht10iQ`x69uP;q7lN099@|w9G*~k(BbI@jt|wo9>SA;4-YrG27E(I
zIu&aTrbb19*W%cV<W`__93mFTp$@*wd@biFc97F{6%5^CAgWF0dJTy*RCQdhepfy=
z$ydfy5GCf{m>K2a7!>~DL;{WK-{r*&U7a6hO}IZOOfoEV8q+en?BB2^%XDE3MSooQ
zEQJgIdCr2TZ~Q3RsN~+1?L{;q+}Ie6SAD-ne6k_TgFv`W-;0v>pyFpylyA2NArr1M
znwsd0Y0HFEu&voNuZ-Pq;*nnRD;s9!V4jkB!-t`jixkJ+K;25u9sK!FNj|U*{0^y1
z8~s}Z_Uh5V(@sjqE%N#R=!_ejDprW#H=#id7VQ(hcOTOzk`K0wpe1D+;V*&bET)NQ
zX1GmkIe(70u~bAU9CI;$fX^UG6170-BCrC_JzR9V1QL4I=4lrfAd}=f1owQgXdM+6
z5Iy4H#Xr6Ns-+x=qmAV9FLS}hU7L$j>otR3@n%Tli)QZlUJHw-oUXJS0%J&8E9&B#
zrs(ryAR_0jeObF_fbidkBsaardf+bKS@hXnZCamVsZnzfepx5cIeWpFx>ZHF?$j&!
zAY(`P1xWpq*nb(1Wy1lh*u6Lr7j(SsYJP$7O9616Y<SBuhJU@K?cCAM^7J*mQ>K{f
zEdHh@TS8WxQZo%D&X6y?v&4TyoH%)9`O1UYw#Powch@vBEfobVvD;5J)bnj+@$$A-
zF+~xyb)jjMn;nisv*-y;dG6tGXQkUq?11#NXA+MB9wK+^|Jo=KBZ2FLLe9<nG1Sn(
z%Qj<ob{je5g4UwkrI_|`-DtOmSqWf3oxLEip1xX~G!B;u?SUcSUyOPtzDfCa00O?X
zBAtS8$Ajb7mg3N_2B7H{iyOo~VvQ{v-)<iL_RELCL<$J-ze7-I1Jb4u_=2d30xp3O
zFZ(_bTUl1OrJ8$-;hC;Vsa7_{$DAs^)pax72w$e$`;QL*G)8@I_x0<wVCnSjwb)7V
zNaK*DcH@MRlT^O&)0LKEEKVQG!=ZF2*VC>ji}HQo3M(~ZYeb5yD|3TGzHBY~+A!Zk
zy4?eS?U&};=cYkxbc20-d&=UM#5F--AFb?zhKh8th&a#M5>WJ~r{@CBh0GrAL`D@?
z&W;<x2-15VW^`uNOjQhTg+tR5jlbKzdwZ$ezud;F+>uCv$p$qMN;d-35Vei#ywRQA
zerc-Um|FhXuVKcmn10geW$EC#|5*?f2Gc>2VvulO_gG(Ed8G;Qu)B1-6&_>ed)vVL
zi*}`JKyMH4mnph#?uq+<P{+52leApVVtre?lFJ5ha9K*RmNcEo?2aCf(#z8kX;yc-
z<(}&~dH=BmsZB?@OT)|>e;85BB(WS`L*^{ameksku$}pkj^&vd;I(q`sn9la7gNvc
z-cWhPudhdGC8K^)e$<*D2|1IM?0I#-#Qal@RB$0Y+5^PLWomw0w-VY>?t1$x5JW22
zSF10vBn;#x{mq5MYfg%%nlAP(!k;jg>2ECxWpN$rKj#Fxv1q#8matI$YWHE^k#$6l
z#Dvb<il?dQ{Y;uN-Jx^?B8)UXe{#wYj4gp`N2G=9Y)MBrMMg<^F?wsrfUud-<5li_
z;xh=pY6fxWHX%(*s%r>_RWuFiNOKPS7=^xZ(M9zy%SAT}QjTWQZWhnijYwGwt9GS5
zGs~gLNnRE*?ZtXD;wnDrQdMdQ><SVwj=`iLzLX^RFCqQ#qqSTn2Gb@5;X9;RcUAb6
z9UK|XUIp}o>dtYpQC`LFa5BikfKwm#6gmlWJ+Pd+v8)+?K4cUALhs!9<L2$|Dz-#H
zda}%%R|8=gmwmY{^O?309_e`T6=+T#@h@)zErj9XA;YDlY#YD0^IMyXPj@x}Vt?m1
z3oHrNmh=rtLkGrx>C)sMIe38t!%&0vZXwH}SXt)o9H>Z^uQV}Hj*{Tk0F)OrTFR70
z=AP;N1LBBDiH6np8#aC{&4!ssL_OCb)BR0Afuz6S1ttxDM{tkdJ1LZy7t(_fX>|qa
z1WNEdFa>NBq5`rXQdjRxIKPN5Jt}poJ$QL`(Z85^z3RCQUQwR%uBdjmfLKwgfSM45
zij#{L0I8I?zZ378Q~td?e-RkI2>iF^2RM#DhGKGqw31%RYAYC`1~aZO1cmu;wDAWI
ztfsW=3QWLDk_8V~U&0?|Ynrh&y0rcz_#Ml`OwL{>Rf|-X!L{<7`~pTUU~dJU9Wk}n
zpU#}WyHVvcN^p->v@4d(^}I*MDtDoB)Z2=yGAif9e5E<mqR*WU%^B2>^WT;ILCls-
zke|{SUk?tM_3I_E01Izrt`3}`%~)@hp^i#Vy4}~*5^nwK-e)SAmr2?iTl*9Io<y{;
zcsvko*{VBDV%f-YjtU8*Y1EtqWF+}6>&J^1s<_R$EjDNjXAHJUIjRZ5bc1d2Hu*^4
z*<`LX_uQY4f+_Oj>93yJe%Gxd-$k}Ik9}k2a({VOJ(M9{*u_uBWy>T>OazXG`Ui%U
ze<?Y_Zjs6a8jk0cq*j}k4zJ(v(`vh)J$*Ts-aMuo{}58C;(|(2$fX+3uNfIf=roMo
z&x`DntvWpqMXxq|r4g(}>`x)C71*#6Fkl)jtzsI3ZnGy@#p_K%FUtUJWR$m7MiWUI
zs!%Y#Qxw`j*k!{>&U9zL$)ej((iPqPtuHa{UTghs@Nc9QQ}n~hcL_=-pMqK6>KG_)
z-PWNwWk*mo?5jnrg`*PcwQ`A<&1ZvFx1U3Q&O8tuyQAAK1$ANouG9Prp@@-|67xE~
zX59{7Zkpo4RbNT<H!g!9PiJC)_wIL%vv5J8+F?>F0#{<Sp70ruidaPkxgCBtP?53M
z^*^)+{hE2qMfr#W-FPwRCd0~dA=DAH%(HU;cF}=ii>Fzp-O`@wI0E|~qA!5b%~C`X
zit0j1#lvv%h^&Ww^3$0HvCzld8K|?%Pn!VtC&B@lZ7I!ro3f??n)8^-@+njbBKSjQ
zU-Q_V{oUP?pQV(NWDO6!UUebTX>PTe)_EJ=0|Czwnn1**$N7kdzkV|qE14PCi!IPR
z#XwA?3wqM6@m=EzzPC>aqqF@KjXb{OT$Xu}A1rq_<T$iB)_xqchM%t%HJ*|PY5Gb!
z9beX4lWD;E+H;TDG^|=ezIP9a%(;?{I#rtr3U;S`MvMQr;136<=Ysh1ahI&W1tyD~
zwWz4}ydMXiviv%E8m^LsS_e@7_@y`n|HdoPHI4bgC6)eBI;a4bg(qE;9KZ`dFy?ek
z-O;laqrT~fP~}kum%ud|iNy+dsnE^IKu(d0H~IP_muM_}!J#YQ;0KlDgbxn#r5}1A
z_c$Ef+Y*WsER7D#`mKJ?5`Wm*ZQ*~uLnj6QH#a-G+@>gkrdPY|C|ST9=3gd3gvBWS
zA<pOj`70QhqrvmWz+w5E|BrTGD%Fdj94ZJKd^%o)IV?t0x%Dfr=-D4)tU|ElnYWB5
zF<)H|d5YMMRcjmla(G4qc!|@_m;DBf_Hd5B5bb(<(*x>`E||o&<}(lG{ArwkDTekD
zBWZZJuS|RMSdN^~1$keAk{cIn3f=%jeWg9&>4c8j0U=L%JO(8saQpG);{O>T2TNvY
zHq3+i(db;7T~QLP^JeqJmy%i2f?o1#Qq#|jN=d`ERMt8m(5H9|;+*77x-$KZ@>#lq
zoMXpB|H9JQ_W9V>)L;;xmCKf)J{45ldI~BG;MgD5jU~$am(=SBQlN5u{``>|1n$zz
z^`=qoYCCHN;0(GnI8Kx+&zz*A5?6V{g&-mA@Z8@FDV{4qgnLLwT?PA)skx{se7<M8
z>wYygwLaKwy{BI*^{Nvx<*0JWp1!PWbw(w1Qd^6f$$omzpZxEL2-uz}p`W(1CMKS(
zhNd5aE%aJU^lR?xyD!Mc--dux0EOUDaalZbB1q`!Vy}Pj&8D9|KC8U9Qtbb7$pLj;
zkXO$|cq0BV2h1x}c;dhU0hSD2L=}Wv9MVa(Swju}Z@_cf+cLAdVY&pLrHN3qT*(oS
z{Y6_#Q)4qTF?<htBO$LyjXr;o63<)nXTEQXTB7&qPpHR((b{1#zYds!F~L2i^$o%7
zvHSg73<b(Z?4nj_MFI1eiwkWKsgvw%jMm&Ua46CLXB~*znXxpSR3~)ApY9fOpm$6u
zxBr8YR~B%X7DQBh#^KLfU~<V>3@wqve%{Del}uwB>p1#aNBN^?d@Ep#AwyoP{As`0
zL8Cd6^OCL^RKz9HVv$Eh_egMtCZ%VcbY4VxWc_FLz25&Zn2X%s+1}#xy#W8L@l2S$
z#~g+~Asg}+)X!ej$a-{DO6P{-9p|diQ3v;^tHx+5;;S_Om8WRy%k0YXcxGFRl$%Jg
zj|5J*^OYC{m6ZCwe*$BN0P+u$pdMGJR8#%qZ`{J@0|)Z!`E-%HnN<Dc%8?^(AOv`*
z@^=z}f`HH`0zi5ym-cGl&&o<ql+Q>*$bFyU&`<!2YTa{J%lI{DZUK!TzO%V&%yoOM
zot*h?T&<m_F7nrcc)KRkR5fHsT!`D?uh&9@da=UD+GV`3m_h3BIMvl>@UsS?QWi;7
zxGlH`mH7EB>GEq${(HywtAW5MN<OlfKJ9nWh%X{=-sxrkc=D9rG1Uomm1nZRpF+)%
zz!T-J#O=}-!zm*11(e%n<<pK|wJ>QE8@%n{1AWy<m>&azOURkf3HaLDy5NP>Al1Co
z*U{DeVBpYVs9f+5j2;l3I868=DS&_!3gDQ5TAg;+_YMbwtDSmBYD&1E(d`m1R!`^D
z8#Y!1S;<qbT^54DFEAq#DB+UA$UP$ns$mY%r^DY?z-0_`G>WAm`NJd4v;&?B*+IpB
z{{Q#F?ovOecaD=O{Y3$7_PUg{gNTrz(-8bGUb5%zWpN}v+}1<W{Zz1I#>blj|K-jO
z;iO^3#=RS?#h@AysIz1?zOu;n1sE=?bWO@kORR2OHuUS=m-2FQX}Twbb=T;0w0b<i
zcl(7dzW>suasRqw?%Z`o5`kfdFZ7g#^M;4b3YJs)nrH;L{L4#p#Q}<?UyV(;rxa*E
z(_t?)?W&yHdq@oWZuK@Vx}R!q4UcTe@83&MYSF0EG282d*;!LN4~%#eDf^JHz$SxN
z61dyF4hB#V*8ouVCdl<7<%n0L3<*)p$*N1eJ$RF0P4q#kgq!YjHRLp_!#11XHMBAt
z>Y<z)r@u5kZH{QQtQUn{124=wKs<dx04=$9q{P`K`ig}9A@|?!hkfSx7S<Qb_<#8T
zX2^aq2#=&tt@ncv<;-@^^IpdzKv5qeg0&aHAW_tAx}TnskqziLs;2ypvn)G4bLVOd
zne1^n>buY<O~ygRf%FC?bRg-WKzI_|Ki0^EIHw69I|E)v-G$$S%H!e(F}b$?geTn#
zo*mMNlEQtr7*grtXZ{WuA`s(9(!@A<q{H4O^togicdy#yU2G66(a4kWU(tyAuR<m9
z+bwO7S(igb%DP`__*v;YP&Lv6laIVafs7T>JnX{rD`GMC_ze}bYTo?13pfSm%uiT{
zkiGUr<MG;8WMKzBt)EH6&j;2m-#h9She@E}6Oy6J)}{c<C>xzX--kX88M7Ve0lmOd
zB4d7v-tS#UUL6CmfkMw>8#k%kC{8=#LulRojrlE|9*aIT<|}P8^O#d<PRlqIWY7iF
zA)Oa$Jl$P34BMf~A|SabH!E{8ca#NR!?O-qfecv4P{2_Xj`67-fs0ME6pm-wA@XTP
zE#js85t+R46q<N;y@-LmL!g)YC3<qE276lZ^v9Y>8oN>q|6J|R@ejX_vQ>iN_K0!_
zTULpA+{m8uY1)|2;5Kc6BO4#B%OhxWX=$Bz;1yi2QTJ$i8wjTnh#=?C#RCuw`8J2W
z#>k=B09HkJQFjd<jf?)G^3{u&PCjO>NE%u$(#xH6i=``~L6*ir-TuQ6E9sPI8X`v=
zP-Uf+ES=j}tO--Nkf1<5EWuQ35Ya*H)Tog$?UA3>>3MefAfrbMvWmWhu*TT&^%JQ2
zvI+teirwk3ZT6@su~<~^k^KpOb2|bfAhrOPq?=VYQWUKp{lL>xGMg5wwWo4$@5>7Q
z{IS>-{-9RUn@_0-*Jr1!RW4q$Qo8+LC2YF-+5>94Wp53w{L>lw@4}LIrFd6V<2%0s
zW{$D3%wkft)AyhwDDy<4SN2~dg$ocm%}WE7P-&RM0~5-~hSa!@NVK{L2Sn-TyjNF-
zj~v&%A@4vm9gCj%VW1yn_*<(m%3A1>J4j45)yRJ!pWy};d8CD|Po?m``SQ!U1exoZ
z29VUHI1vsn%{JYzKLB7XIhl5T4I{7*x`)hf2Q70XE8F>L9$9U^5OffEXB3Z)8q-jF
zVywhCoe>Y^GjS3d84ZD<nFl#eqoqI#Fj{Jdyo6k9oO^AdrWS=bCr%ZS)F-ya7a-*5
z!dJ66X+=deo}VvsI#u{kc~r_F)xAJA4Gf+4S46NJKjE(N<1YZVi{Oc-h`v{OoG`B8
z0!5>*3sg4@hpjDy);aIm1BZVptppgkpfh6X!gh`IG8_*F#@9eD5Yz|g${^klYa!xL
zV92PL(8x&_d2zyNvjgt4=Z=K`RF5;1a=X(d4uQz9)}vKU+|LJ40}pf;%co?r`~7sC
z4suRskWR>PqW+>pdB56#Vm5N%-tXA26Yo^q0d!!thaiimqmKXkQ9f8soOxY75|ir*
zP*00Ny*27)ol;6INh0q)q-D+mu33)oz@8Uw;{s`flC&t&5VJB7FIM*`G1((4ufXm{
zZOPnOv~3)wHFxk81toB77>;IW74O>}F8q%3S`^H(y2v`N=FH>)FD7AmB+6HYmfdz!
z;uQ=`H$BDtssU7qUIql8(L+}fsbd<M1`Bw+b@$uuwb~Pj0ETLf2eHj$ydwQC2Q|`A
z!4{Fr4^c70%NJ;<l<rs&P_>@^aP{&OMpx}I`&Shh)dLjFX^^;EPjSsj05A|xe+pY`
zKHkeI*WL~B!wPIh1L|{(G19LeJ_Vnl<;*szo)SDt`q9<22E;XnVE%|b1HAh{I3(`k
z2Z{0V=}cZyse%CF5kg0n_V6h?_iEoeO*8BbNn5CF658mLzu#L^*opjvK51An=+_;n
zSTH#L&;q^qpgZrNn9`WxjSNt`UAPgpQnVH7bHtqqE&#!L=g<9300&`3G1C)^a+H6K
z!obUzoYic5=b@n-o%(#dwB!9C+C%RJwy#@Zmt}4NzgskhgUb1KtBS-<HA;ds4jS?e
zi5wW89hD9#XM^r{fdp<DAlFPy<S#ssChOeRbLJJgWULp`fcVOJRHy)k%oCv;_7X{i
z=b#j1l0@2mY|XELZ*60?=}#B&Oi*NOG_zmAJ)0<e%Cy4-E-x=O1tnObZB0D}&iBna
zB+64P@dHpQ=eT<@JshE2s=&mfksRo)VTYMy7_{WojT?L*MXW0^_~7B<1~+!_LzlV)
zvmFFt(L>Z2fn6KelTRA7ES1jY#K5u6^g-%g*9V}es@S^hbW%K6>3{a|YW-AeMfp77
zQ_|W%>~Cky4L2lsm47it*9R8od?qyD<K8TZ!rxsVXynHH=t^<*GsmtI#)pRitdV!w
zutqu5<$A9<)6DuO^im^Bn*+2~Ljl-}I0v7~kkR;w1k<afdIvV64bfy@`^OEk;|Yl+
zv&>|mZ)8F5OYM}tq*Z!?687%g8<^4P|Lsm8+u(1Vz8~<~ZusX@P*rNrI~9sG7VbSF
zlNXbGu@ej6Ary&`f4I*P{i{lbREW%de%a4Fq^Y){X6Ea`7)S)$N;B<@F$l=01eu?B
zX7l>w!)EB&Akq*$wF+L7+go)Y_FUtXUKzvFI19u*@*o^cB9*Vn=hAUrgb;s!1Gz`O
z5qL9Ji-bQI(<#xb<GwS_zQ#aqdsS9Knt#uCdED*|AQb^4A)f^L@JpKtJKPoc7)Wi8
zc<)AvDQ-k<&fM`u;?Z6z(B9XVL}oOcnhz9{awO1Ggac(BbT<*CKxq<WbihinV*|#N
z9fatze1d+xcqLeMW?z@aKy0S!-vBe}%20swP%9#&`jx&EykeLXk9ptj3$PU#b6{0V
zq(h)q^FiGyE9*tFuNOPnUhI-Liy%}cu25k|IP%0u)98QsPum}2=;ZXQV=tbxmy7a=
zWwc2XD=;`XJx3oGA@wXU>#R2M2<nW?llBSpT69bQvD#h}+#Rsaw#uQF#vn<fLa&6i
zTE5r2pHzGkaQYOrQ#vx7#8%KV%7*w~?0WVLkB+YRa%;nOl*#b#((M8CN^ADr+_KZB
ztHAf)FS@e?sCR1bH$2m600&pgJ=3<4OMUY}r;{^<H5}EpI}Jiz;BEQ2w+8W<c;49$
zjl-{qMIq`%yMf2!CjCjfzsw2NG7gT(CB%|$xn~FuV5_>jG|iZHH-lnkA`iqJ|ARLv
zp-+E!_J@E!JS=5>ZSx=Vp$Ewlu2x4Sy@uxk**$k1zlnzZc?S6n0L@wR*cw<$k+X6T
zI0aO(r6B5%fs#1B0Fvd<{0k%>T{XslP-_By$yCCY3Rb0(xOpERZ8nqVEybxMReY9I
zGHr9{M;pVO4G>8cAQ26!=Pb=ip!*>2E~3TiDWeg>$SIv3`SLrF;-#hfNHO=~FEMIK
zREqqm2d^->lPEb)un=G9aowJBZMwhONR;jfp;DK=_DwoH-(#I<8UFpIA*{n7hVVe`
z4eh|5F4<Eu>e5O#P*gnha;R-u`Xlt;*Y3v}+bJT+Veua)7!G)Fe5V$Y2m+a$nzz3S
zB*f(+DZ)?;;nRi_{^a&+xw=L?Xsi;s0cKZAx?9JthIA&vu;ik_2!{&HkR?eRoGoMw
zz@8=DM7Fdudz{DGE8OA2uU&0y0%<#7lAK)5d*=r@9wg3z5RCXSv8s)&89(W^u?=n`
zcjLNDij*6xVzw<=BziLQ6oBIHKBz8ff{=D^zk24mi=XP?SArK_k3;awO8FA?rrY|7
z(9Z$;t8wIw6*C6FU}D#@X-El{F6bUpiJ&Z-*Ek!Q&0GHs{z9}(FtizIB<aAuO4Wgl
z{+%<4wGq~!{G;pi1P3=PWWwBTk2r(`-e+s7*q=vVM1g=EC>c5r?VIq(%FX=WV<E9@
z?tPY~o-&75Ha&2?>cO{G1jSNytpwEgdluJ#bN%6}OXlP+C&j7Zy4AWly<AHr+W&EM
z?(s~%{~sTzMh+ti$tlVlTRBZCMq*BNlf!Zdp;;13&WHLsrI9q}(Seb}oJY>(FbPvF
zQ-q|fS~=z%e%I&M|9S9uZ1;U#@AvEVeCA^5d%le|0;SPr*Mf}K`_b;-L49Y|$YzSS
zn7xvR2#(zU6h<I>SAv)BU0+|1PogYcPzxx~4-$NGOi}S(k_ueFV6GQ0t?4z{7}b${
z&(-$Jfs&L=v5b0u)?n}(WMX!O{|{V9B2JB);AORBcvwVh`lip*-1HM1iYIuL`%8mn
zh9X}rUzVPlvirT%5yb(Zqa**KH#sV-m4`^Cg{ZomI^_RxJcsCZK+lH7TB;*f5~TeF
zpP3^$pp7I?#wooU9K6xd;oVumego9GyckaF((KQLK*<Ew`iIr1$*LmpiLgd>EZ&%m
zaIVh?;=RCtpV>sbYEmxwP%rclfDMnWU;txr_iOZCIZ$JeNM$G6`AV4-lCQqIIYP&k
z;!f;c_KC%veEPg8l;eqo{KPL<@++t>b@JvE;|Z_A+?ZMWxDJ_tf0<RyOnxbhJZw&;
z<!YtWca+HV&2~!K%hBjIH=UGVrktBFbN9^tOTaDi^C3)fTAliqfq1A3{KC6l=G%jW
zCAdHIvMyTS!4QTmoku*AZ~NPrjn(t_qG>eEzfL?i{%@5G*`T{`ewV4S7kDc@rgK=7
zK~C_9h3F{on5%g2C({ceWBfdx$(ZCzlf&tW9)^#Ha9nT{@Ha762rH(RkjW=p20sX(
zx&x)N@Tq5<#w2Ne@xXCIx_>$74Wv4z3nGS5CA0&S+d?l9_UU)|d`-SwI`F{Mv^;va
z;jobcZ^v3mZ||2jMF~u7{hJ{e4-95uYHY9H+FIli_oTrUNmZ+UeHzWV%9PN*`2<cQ
zp>z)FZMNt&?ra0ohDm2)q82xJXLdXhkZ0#6+9WXFC9g7_)po<^W(AXET>JYXPb{Kb
zzF<j#U%YDUwLttKdey;8*q;Q|Y$%x%X4-x62Lvy*FQtErbJP(In(oPzE-f-X36Mk9
z{v*JqW6yA@HBUe7ixI#$6F^N+JU*>TH<#i@^U1!y=IEzW+sr3$xic0)RM_LSKrVix
zYL0k{NZ#R(J#beQ@$m1f2P_#o-G&<WZP$KGByc8RNn2T1^B$wi913X8<_`M>>&rZ|
zz590|G-)AmFnliz^PIddUMc>@b+rovH>4627_qTUWPGk4^FDE69iN-@+&>RINWq$`
z*kb^-=AAl0O@9JuD2I0FAG|xad4v87VtsQVCd1AlUC()cJwEL$k~m$v)&S0lnu+xH
zjg6b*H<kc|SgXX#3e}@cdL&X1bDk>BjNXs2e+emMb^iSrNFlugRh6;g!&;oMf)XGe
zs{Pq9w>cc{4bb6tpoD&W&e6iFEM25B;lu5=GmnRs3=ckMeH9+kQu;7^d*@H%WQgky
zNQfjtP`rRLJs%V(^WXpazV0oZ<uY~4V5lD=D1U8`g}vQ7SMcwIR?Akq|0bh>*_4%)
z^`&boFG85~<+!Tif4p-Euk2#)f;u+N{Coam)?L#Z3-?8s=M=@TC(`Qtalk)Z>ffR*
zIIZ17&hGda5;fA$^erqW&2O@Slyo{@`0gbH=J)MG4ik-lk_=b)ARsgB@DxX&e)tC#
zd}mBK2|z#h5m3yS@4%wv#25MaTN5Q@y}V#Abw}dFW*|kl#Ds(g^tJKNPvlF*VcprG
zTn9^w><lm!UidEUm0RpBBUj=r)%$7_jKg{ZZ{!uvIkIzcganGG#f^tbY4hB^%UG~z
zV;KfMz7q#N@7h8UhCy#6R6Cx!JU!u<we<E=(~#~v$L%T<R>cA)>BVPsl-Yx@D-b>2
zBN>?u2(A6z6q5l;4e~#`Kmgn9kb-G)pO&U5j?i4W3&zRxcEbo}O^JDa@$<=@3CmT@
zyfjA7ABIx9ZnnEXWjRaNzHkMU-3c$sPtwEZo)#C=+1Y>oOf=mzJWOXd&+(Lz%P>+5
z4;cmm!_pr61LmiF%I(dm*%SSqLd}ZDm5{H(wfU}o{21QmfYa#j#E205M?%!qfIa&6
z^@~^zSy+ohj-&<GeX93wbfcx<VV0W8CE}ix;y`_OFu1!)I)D)<%I*K^+wlUYCSMP9
zJl}-M?VB%$NF<}mxMTLe^oN=po=Pa#tEj{H4ul_pIg~0Z&QRbN;edMCm6aGMS+7mC
zhr!Qi``<T3Wr%aKq4{=pvMG+R$MwzgwOCE`*4QL?3@ps;M3EG}$Bb3ARJ!B6VSYDj
zo~1Ddt;euPY#-9BUDk^i$tRES<avF)+A-ZVtu@O#__QroUy;TZ7%BR;n6a;CD*&m?
z_9Njdf+sQ}Q$BplyLS#@T11;|1|&E>xfdcW-TljDyHZ!<#P{EKHX+d=Y?H%V24KVJ
z8rFuUr29<5ipW2AhK6IJFW+``Ech}SEf+F)a-mb6SDxphc#c?xbGphU$C2mnmU}~1
zuX#Hbn_hgjOLW4JIF1^t9F}88y_OM*yeA6p2GYy60ZIolUJQc2(i0KoS55<YiwGS+
zsi{i3q+l8stI&-jj4fM_QOPEP=!3{3q6WV8V3>lv1R_&uy}SF<p&;gwui6?SA}z~)
zKQ@K#-{O()ZjFoocwz4q8ht{F5PDTGzS=>7>VOLgmjvfIFo#Nc*YU4MOH)%5T>fn*
zcYXrs4ATSIbo<Zo3gD@V3JiQo_VC!pepWGXmwaNV;Iw;{9z$9&ml4=;*2UHJ&+SbJ
zBK1W~hC2H(&k1eaT>87G=(F7FX0<I+$3LQ}4c>oZaR1v+Eu-2Rk3usMHUP(TjPqN2
zkfndv_PlZiy<x#k1;iL~i}7GL+Dut0o9l>PhK#oU{$zD+ZE2@+fm_V&tl@7oYd1kb
z!Q9AZStMrde>$@(D<@%YXPdxK^k5d3n|JK-ndpuADLa?sfYJ83wBvC~d><f3P{p9T
zWr7%&^%9xL^TLbPgknbw)wSbd>}|%#OJck`KTO~_0xhK`)h^pBxg^jA)l$4Lk^ioq
z7qUW`(k_iP^an*3v=P8mYb2Oj8{jGwm6+iPM+KIpD#i|mv$PA0b)zN+SxUr-GV3=w
zCFXF>`3J8JY>i-{!NK%#w}10pA_{!LD~T60shO&ASBIB(v|TX1<HrTL%Vtv6Fm)v+
zGRToe7v-Imn-xxsvdM-7QN<K!pnbuAtO%qdCa$xssP|A<g%lm$E0ki@UtuvW_qc5>
zK!|#yH*r^8#yypuZi^-e>zl6wxUFDbg`>3*eVcMo$aAK@Hf{JzsmF0@vu7#}fEsxw
zP@yD)Oi2lV>*c_bj!S4W>Jk7XOSH@INJ-l+1dNrp=2}U2=%xh0J!H-oboHjCRTB<T
zuOf4j7z=<8Ga~r>IxXknS4e<b4`IzUjE?`lm0NsuczF2x@*{57_S(j>$mZ57r(<zf
z3S=VmvZ^J$01U~eE9j|e9bYU?)&2_nwEeY6P_EI-6Y6Sa1X8lXWO~VpQ|L<z*Wd4c
z6*y*kwZ4YEc*Y9;94*!v-sVhmz;t?Ts_<>bBT6xFOQ&bLNBPDZgt+4+c#s7woTCva
ziT&j+*f&2ne*OpS8)@8}uDLQ*^AvHNv)|~M7#0}^iYkTHBWuNh9F{aP0pX>(8+nck
zobbSW%*DRK7Z7qjX<f!Lo0Rs`*$i~1u@hl?QETU`W5-6`V7N|d;nD#<m}2I@yVnqQ
za7K_odzULQFZe%?jMSwwJxLNoB;{_Yv6Gjbm3jQ;1r23$XTolU?8*2C(-*1s3hCI3
z$bsX3Q}h@NY52Wvw4vZVi1c|yiXb(Yl9Uk42a=|dtFde=o>-z;@zrKC8!162Ldr>O
z7G>yV1dPm?tG^;IIrocF*BbGha>xnw72TU}tD+eCtxp<;uLosRBD^9=NFkxRTFwlq
z%&hf!Md~HLztMP1i5jetkz`ZSgT4&z4{)P?F>qpTg~{s`+uo?8it}9~NsYP@oP)lf
zNa=WF1zZ37vk(@!K760cEgw1iZ~XzFPeq4((Ld>%QfH<bXpn;1!y{m%+27~PEkM(T
zwQWjRH4d%q%|NSQj*JfNcz_J7&AK@7!^$bpd8TU{f%#bBsAv0lR}yH$StwJ8%rLKf
zzAuqN73c874gRF5P+s{ms$>;aeCsWTa`DQT_J5NvpE?G1`j#r1U(Dp}l~NG@ad$r*
zwPB@3G?ei(_P-90m13~N!h1QtS3L9V%IFiHTmOA>kLKpCEOQ$lpN$FI{`XzaeaGH;
zd3~t_kjd38VM5j{V67bt_a@2qK9t+#g;VJ6S?SbE9kf9E!o=mRXs#eab3SfDujAi$
zEywbyL^-#e?bVpKqc<(=2(F&h%$mdJv6xQKvOb|!8cFn-*R$@*-r;B!bVRQI`}He5
z;E{K(?5vzk2{2XrYP*j$L{1JZeGZ|K-(TalB_roV61%3-JQq_4xd}qqC^Pxyrmzz!
z0B5tm_7WB~LRpudh-e?mj+c-7cta~;_z7JAv`S(2G$+C!G6P3KF<K~KVUh46A~f_%
z_H{3(f1dxjRp&CG|MbO_5CMr;`J%|^1q*E0);DR<jvdyj@<D~&Bh%A(pax91mVYS0
zI9C3NvN{{bn~oJ8_i?H!BJ;ie;Ljn`Q<)b%Fc#-xG0zjnfi5H25@w7oP-qhD5kU&<
zmO0L_dyMlF<Vh`&3%+dB9k7w3Z<QUCjt$EEU8{~sW`buCa0MH*4I8C-?!m>v<BT>Q
z(v<JPE$M9V-YtbXecC?V&`M35ZV8zKx1eDDfGtbpQtNy&dEx1GTSbh*)#*H!pwb@a
z5$zN*7AfWRZ+>AH$L{0}zZ_rruPaaRc?><OI3g{mQxsIU&r$2yoKSA^FnS16hX+h~
zRBvnCwTc_kS;QQMv&Hc8X+(UQP46SHm%o;q&<EkTk|xY}7$#Ldp6KSM!}Kcck<1Um
zD4HtXfnj(|goZA5VehwdrRVZ%sF}&Z1z8Netg-vY&-?5bduf<RzR&inYp-W_OR6a*
zw$xta+yatN$Dsm{@!J^(hi99TBbG_|#L+7c<8_2gF^(0Vw<HNe&t4^C#Vq$qz3oR}
z6ylYfuU^C<X;($TL^RYTn}yCzR~-^?YKA`$%`&wYC2HbIjmSYa{xu!BO_lNUE<Z5$
zE%e{>i>K!9)P>3N9RJs+0_a6baeOb&-%@z@Nx#Q-sqhwLo;qa0tJrWs+n?2-!7}>z
zBok7?9Cd-2$3hU8Hn*3WZ+4w2DoT0yA8!MTiA6XYMP1DWak)#+yqy_=B%~S@tBwYL
zELKXPJ}rr^&xyCk`q8XeFDeOR4Jl~V!d8Mq`4qRR<LB7Qa^46y3v9B$=$&7*s<JXD
zb#iBYQ*Qg;mLGj%BhL}oiT{8}=3(2C#Lcep@viM^5sw;&UNPro2%BH>&bd4TdLt{`
zBDtjJ9^+5`_xGo+rpJ#{c<oKOAaDerFn(&Mz`kzuEq8U<Smbzme*dU<c75a5ujT^-
z=n~+9o!v<FP{X9xxxvkm&Klr!GS=YVa=UFDAQ~fofa@Me|1LaebU$<U^Vv9WTovGG
zg}02~oUaMF5fOMXKFzZAMf2=v0~tgHt@sIlHkgtcRL$KxTj1iDcs4qK!rgqOGvV}!
zPSX`QATP$!4vYa)lKZTmp)oP;`zjo{_V4uzJo)AhdA*h_rvnVNai9Gy*UWqI_UNqX
z1$}F;$towSSLOuS#~{2SMkkXM6;)~c`(P)6^bRfojcK>R5DTa~mJSX%Moq2)02OWq
z9rqH$V*UI(KR-V^-a>%}r5=*m%$7l)mQQXB$*|-MA{M{MR6}|tLobJyhgV~^qosY*
zI;9_3dYq|B!|?MMi7}rz&C!UNNl)EDgbs!)RW!A67AP(98Ri$;XlT;ixFwAy|GH@?
z?d8yQQ@R8S>N|I6&?_O`5^Dj6dHK<}!~c1z6T5i$BnkBpGXkl!Z@zWOwz3MGlC*7D
zh8YyJL~c@%n3a#}dZZ|bixNtE(o#{Awj#Y-ngjiKoF^7*A=@lz7mvvM6Q5Rg#*mC)
zezIAt7%QV^l10nkkD@tF<{V1;8_DT5?|rvfy4ZY^O~k{Hb!67&=J<m4;F4G-pL|g|
z+j*{)?3;g3Q8mTt^W+V!8#fA=ON-#-z-oo_R*)6;UxcF==6qRM;1|ORptbs^Za$}~
z&}cdoh#^pnBb1`e2GCUU?A%=2>cO%5#ds1uG67E;t8^7ogFSbm0|Ev3{PPo%a8iV}
zRjeNj#9b4Elk~L|imTOc8Sv`mN1eUpng`9CD&Rk3$|q0R=Rc<Eg^1+{07p{=%TT(I
zHf%}WJ@oa4xIEt-aDq_$V5XPiz6^Gt8JL;$-L@ir%S=xboRshNN@o(r?`P4nH8HFP
zWmfmHmN1qq=>@boUGmbCh7q?<3U2j^%z5=$vs9Jsl1H7q=<m#sLftAV-7Y5xz9xdL
zg)A(dw!XL*Rsr7=s`~o%Nw@ItF4^vV8-f!)3tB<UUo|Hm2@c4sq6t(Ak$OOh7smK=
zb12*M-@9DKq=T=<R9G2A;Gt^xDqgvSbx=Z*#V?e9A!IgQfJzmK=n~A?JwZ$EmkFdh
zsp6QceeTn3bY11mYs5+_8Fvw;&vB9`+qq$?*-~7x_>!Tea@4dqVs4%h_~z%48Xd8z
zU%&iXK55>ea*{86qqK%}{dzCCKQk=Q(&ij~ygo%YTzDzGZDS_S5nSK3+`AF5`NdzI
zn=`BuTl53!Ogop|^)`6WaA*$HT6boa!Kd1>%B?MYefhscS!?uOSXY@X^2~498*;K3
z^SkDjL<4IH5hLvigSd_D>BPj%mf~f{5l8cC_5SFNRi@0!e=q0b)=M*<;48z%WqF5v
zJpFfT{?X&erC-DS#b5;j&H!lHm89Wo<GQq(nOg?IuG7*TZnUZ@&Bn3i#<N?ubp&C@
zZF<3IbZe|*duF=#0Wk?%R*cU`i@Y1+zr+e~YYSZ%elZ}@vKQmmI)WX>iS$l)ZAF30
z*x>T=^5CPq^o2%1R>{*l8;5AAeDMF;0dw;>%5#L*=pWgNVb>80q-E)9hzNA=&x&W8
zTLd7Jw6|@g6TvY_CLsd~cbjQxVSz7#;l%hPfWFUd=}rEN`$=4?Es`YAEa8^fRNl^X
z{i-y1^?mT;3<Qs&^|W|rq`u5VOz1u%EjPnW+m5v!M=iJGInDlKgFfYCncju2L=ouC
z>fGecO5;ueB<)b{L2rH$-nn1N6m-BU5NdR3T2@#p%3t@g@})Nx*h@hkuT^+>FN|qp
zlW-^DuaeW1`GRGa9V^b6XC9W8-QLH8<|K~Kh8jwF>E>yOz13F*EFicC#g}o`V9o&i
zfnG`IW1P11AbthL3%4qw#^ZpF`H#a9J70VD6)xZCcwGDtUO!CFq&$Ms5B`xXYcMlM
zYuPcS;6>@Cu}PGL&_pLoTxPtjEZ@6Zr#*#0y~~jRI0{K<D!N-xp3=y~t!N<bf=x;P
zJs<XH^p8=-ejby5j6eV}dQe>K-_qXE`oapS0nW2d7(>|yM$fq6^4T+`nSP`*y$TMz
zN)?w77ig&O;;g+z%1l&T-Gy^VJ_FeYbbJ55j_z%1z*(MB0|K-#Pg0pbo!x)p!L%es
z{O-L7^(i)U^!EwEl5k?zpK;o1^p8_s`RNd$EiOLbD53h#Xn((@^0Kxq?4_OQ+^u9Z
z!-=5qazVS+x{iBlc9b8k=FnZqr4X&mZP6eDo^bn+Zydrr$&?}aRC)6%c2JD$@J1y=
z%|baN4f}Y%8|+Hmr}oh8zh6C%7|S~_Cr1CQG(HZ<LaVdVE#Ki#=@*OO5aNOje@!j(
zQPM`GJIx_#l6Yz}>+Fu@q)tm$CB7xu8z<)Z-L#RR!?=ccLL(~sI^6VQO4>-t21|WK
zd8}7(C1AWk$N{dB3c3U$q09(9EJN5gt$QOEnCJ%ISA{Y)AN(r(d7KU&qyFA#mrF>l
zFg?*KVYeX|`R$wNw|#<f?{b@V$qaSvJJI(>B^D~zHM6k5?J{~J`Y^k6NaCP~!J5L+
zXOho!%mUNHkw|ahQr=g|g%v_7RFXvY`&xAcB{SI)3M25Z*?#%+n8Y*^-G9WuS9%ig
z4=mfFKA!$9D=bm-LE7$GEzfi4Zlf9hJq($`!4K#REqw?cU*g!>PA^P?d7^v!XE3&l
ztt8~vF>yR!+PIsui=LMW+ke+tn<8^J<YwL4{xNW?i^Kb1cpAS4*zVs;3*!E}UFS+m
zkj#R>$!5uLd?MN{v&Wf0J0}JX<kgc>G}+rCDE0Qhk&zMG$8AX{u+RpJkLB>F;HY>f
zIoVm5JmUiK03TL>vvog9i^Lvr%(4ZqDHr!;*T=+Bxs{!bfR0l1cS_8v+)f~<XJHWU
zZ{99{2OSz&#bMXh7N)yyII5ocy+ZXI6JY|vl4vOBBgZreRlJ$FH<OC=hANlpTcb8B
zxisBoZu5&G=+g&da6G;=mA^X{RGTPEZ$;S6%YK)zXc=Lab}u5Yq1>q?&@)Ix3=b%d
zdJIiVxPl5zZSj(#O^F?RpV4kb!eNK(;!pbHYndA0Cn1KTf25$WeOu8xJ6j#wRdUg5
z+lya4ooLdNp!ioHdWQLF4$u+5dTtK&$81d{jwE*VHMZD34oDnM-g^OQhWL-D<y>n2
zW~*xpJZ@G@c4q#7p*7tBE~I$6wjm>yc$I)fes`(yqs8;_@YZ<JVBexn3aZWWf;8xM
z=(ij%5{e^;ftPMXA7?leQ64oEbHh0<Hs~x*nsRrbGns4~P;j~_8gOh-8^xe8bd6Q-
z&y5;ere_X4?)bgBS3^QHuq0K2$6R&LKA%FgafZLEr$LLiq&eO^Pjc(h-b84@jB-3A
zi^DXRkN@2&4`^RuPEJfHbXH4)ftYu0Np&2#jGcUPzrhH;oJ?lokU0q%C3pjw6_(*C
zZk#y#M>`j^tPE1YIgJL5HYXs^J%wz~2S*cBs2I#0Ubv*lsgy@-UlFc$s_)TEm?GvG
zd4I2B_~hTMG$Po*`IBn!qr>4DL|Iz^7A2tqTZ_nLbK#uRb&6SGIa%w1nHnloaY)p&
zEPnT4*vo?C2K;9GL9CN6yz~Ht^XXbwUlPnT7A1kC(;dp|c#LARJCmHG`kvdnWIibq
z=S@@Lyofuef~3|n@%}~js6B{LP%M0llk!saF@8e-Iy)&nw)}_XtLHe9<&Y=7i>F-R
z+t&D2%j#V5v`i)`iGF}!tt!fg=2kQ!m7U|>OT)lv=|QjE7=2=MxO!uyB_L*t8#54`
zljVG<4$vtOkm4}nK=-p}e8!_vc5<ggXeo6Lu`hA_PskM+4p^~1908eg_B*uiq~TBX
zc7*8!+Lxq&=?>b{m#>h46%Bl?WcVI_$DCkogt8Yt5X3|i(3foio+=X~#*4gwy@N7e
z^P=dM%h}n`3s~q3@{=K?D*D)aGYfufY1x<a+wE!4qAe$~gAJ(-d%MAzrk2~C&RC12
z(WLe<r#bp<6m@Irh5EjZL@d5Ac)E?uTdJbr*fokf=w;O;_ICRwf>$_^Rm)dF_mp^F
zM2QzSv#`HXWPH~p=$sW&v4N%E4z#zc9h+N;BA~O0FgN<3?=|v2x^4-{DN9*Kqy$Q*
zuPiP?a0!$2l$8mKQfyz>TcL7~h#DzduSubl00OQ2l2(y8CLH9FG+>gn(aQZ8>m?UL
zDfRbii})^H^HT6BIQ}<gCnTn%NC5TMHOul$J|isW^2op6TH9Z{HdkAwx5pm0|I2gV
zTJ~#`>J94{+Zqdq206d6)wzk7^+jb){dnchNX$m>{rmSfw`>S@{GF{Ipv|W->%-s>
zG=feXvz03eQ{d&zvCJvXF_%yOPYvXE3GQ+4{AsKL#z{EQBq^}X?Z`|0R2J!A*1>X*
z{WGEZ1#z(yAQ9B?T$%81$O9r58y4UcWo;&I2Ie}HKV3*!wT2V1=<m;q+yZp{S@K`P
z+i)V~b-6b1xfGOib#2exJN=Mar+`t{Tal>!LEY)QpM*5eX+};<&ux0lr|f=4v4i&~
zK9I}Zik2xMKSMw;C*P6s<>BGw`&sT8xU-QP^>OPr4L^^MFIJDEH@|K4l|PPTflFW4
z>@CCSjkT$nKF~Q}o8_FJ7ZN$!+`h~<NAe>XFj(oglbUj2;UaV&>ZK8S2`f}zHehb{
z2!}zRkW7hVJ9AF&MUqNTMHO)>%3A%c>VVe_M$@jHizR2HMdBu-H^&<%qwhO`|1*QH
zq|VDuZLb{AtkNAzq3?mW)+r1aRpYAa?fb>Dzi3jg<vyKDlmzfK2#$G9SD{v*EX+Fv
zU=kKcl$J59VuUb9Ngv(xX>)e<E38*-qS*9Scas;jnb`8mKTa;A<u-nx4k?>z2{+HS
zQ@pzz-sV=-wEj_Ug|0-irb)!sLq+127s;8{EQC=X^@Ar}+4!%1mfG$(gnT@mEp3M#
zL~AI_KLhCtLLc?><XNDOBzvAK%dnXqSYXi=Ia-IjDuNk5a80M>6V?~HlPu(>f8NrK
z;we~9Lk5uvR(;OpDaiE-F%{LisQvu(6NAM^`1aTZU1s$8)8*lTfq_$PmY{9G2U8G@
zt?`x5TOIUkTckBI^#jO00>s<xX=%`#b}osI<3|6`<6hFMU$pvZ!qa(e|IGKTypU;+
zvW(E~Ll{8?`wCh>^9_0Vf*dM2BV7VzeX{CK^x~t(|Ne2#p6?d78(U^%|MAH?K^!8|
z=$X=fO6oY|_X;GHO%ysxVlPt~tgai`;ok7lXT<M?$eKPU!zy-jUGBpm=(wE#;T1%N
zDV1d#$G(4a_e~RTew(|zY0sN|s+wQxQA}LvRn44peG+ACGnfjEUsO$s;9T$J%8l|O
zy3OQp(qK>IY7!~QjE~NRg8RMV`1pa;KII|yIQj98yEwe{@TOc6l0Pm3R$46H(oVny
zP*Cgw20n))S!n-WIwC6S56R5RAAnK@%PDW<w%4N0kWg`fluj{%F|{bTj5ra&aVeyF
zo?-vTi!gl{G&^=fr(&Wl3?v=$pkKLc)g$rpuU!dRRh6056yaV{2BJosmSH#1<e=tN
zA^M<!hFiEb>Oo_+Y;Dgji%jl%?jnGIKjlk;f>;|*=jz`<Sf2Fwlj^UPN-dfAoKVe1
z9Bs6L<^*V9jp_>VFql896>u58_j6d0ixbi7e`n5a1GeMBT+AA4^{hOGc@F+5+?T{4
zHHL_Q;qE}hqad571FVV+3*^@P#&W9_cuC}Rd*J+x#c(RM4WjZPEhbhMk;?EQ#Hr76
z5A4qATOVO}CNCWaX6v+@AuX3d>h3IX<UFLDgpvbgY^Q%Nde*rLIc8~Xw#Te)Pa6(y
zcGdVx7huf!(Z}Wn>NrCcI$+`CT_Ca<h2<~RWoqqXLzbF2OH(oS{Bc*mw(*L`XQM{z
zh53#j!(hEhcxkPSule?x-*1G-nand+yIv}bW^s}vy<T*54DC)_-XL&hyj78_u9K_z
zO9#{xA#Pjd*Ps?>kv5_x!@JE{ie@K(wV+mrzrW^R@8hT}@S5&`nQMc}97vW$82M`Q
zHwpo_^L=FnJnmWbYsJg$c*-M;=rf&SE(qRi+90=92Du|ulf~XVN~vx*p>n`8h+dz+
zxwA5uheI*ern*|d<{J-YHq`4W3Z2tarb+!L{HS1&G5K$BC*}Dz0QkFSK8X{9p-j!f
zgB;QkoLeG_d>Ed%B&^ZVH%(4nFjV4|eD4#0FM_qGSdQ)pDd_T*=CqQJQLm2Bxy!ki
z-j)OkhVa0dKQ+wuKWW(L8yqaJ>R_UPg$Lw#SortY8p+x#)Z~gn8)Q!!WDftZ&Xz1E
z59Cb06c9V^Xi)q@{p4dW!C!RtO4QKNa4O%aBuKL)a2qA|(P^ZSi5|*c6!%KHGn`<a
z9rOmNMxz8T)lzI8A$(!#g$7A*ey~`5CbtLlY$$szooEdx>hw=1Xpcf05hn0(s!7F2
zm70<Xs~)gSwt&}ss_1`vSOh%WYjKykd5p%vYG;3~k=#Nm;IYtC#4$g;XE&$%9ua`*
z4s)oGo(`jn7uM?tTON>KZRy(f7|OQBThTI3w_VtWC_+$CE_>nP=q&L^?3u-1zeaZE
zxm9ja=U|p_oYZc=lND{mkX10TnG9?HVuwXBo7t-I<hu`J9|ni3cfj=f*U^wVIMRO&
zFz_-b(bMc`I#l&5#)+RReKJsm;=ZzmiWa3LQKx$fZ?CH{(T56)Zc01^wj3;h`9Pn;
zmua<f-s=D?_MO4OD;623qldcgm-immX*fdh&3LI8&Kr0%KPmZnQbnK{l6?<>5Z^w@
z%V%ENL{#GC^>jX%pFdYbZhrBxpnhU*HPYkCxt~<1*grQccUEr2=xxKFshe-mcMx=k
z$&ZzU#P$p<FT5~ChoM)=HY_N{3m{9x`~`^Tl*;mkoAIa5%53>bZnWEbe(@>QiTg;;
z4>{a*3QHrK6L!f%Sn4-h_x3o+?G><rn%rL9nFa%ufQeQ>k2oc3qcg(LYhNfIej}N0
zXP{96A>b)=BsHy;&Ri@r?aO|@I=KUrhe*KhBx5Ush&GT-$pmwQn1!!B)qZpTAMjq@
z0=L@&qh>5nF_K~C$~!Z99qWYk#|18vt@r1!3CrO^@}zg0T{ifIZ!=(3KmAiu#k0)l
zN2`D25&Gvj5$e(MdU3^(iuyjZ2Ft#5gRvh<GWPKxT}lRLOe#52F`$tEt5H|rD8;sa
zp2*pHCAswXm*=m4H+EumbpJQ!=t>HAS~fcbyFTV486M_iv|q7yzT|+CJm30y|9IBe
z7=%?nchQyR;5ahkI{CJ6CTZjg%F!8)ywv?uJGyBzNdI?!FqS*)*V82QK$%iOpS``8
zH;4%)H;ddXt@h|;x~@+_$>i4TDj+}X{QJI73=cF5%gz27P|E5As4uq^<1IDK?M2lx
z8DNo3RpMa2QNmK4f<#T>bn>mGzxRM6;D!35)xozQyMCyUN2OcedP(dqxppoQ?-pc}
zbP4aBzw1eZe^pf#t?uB#vBb?S{S-=}1J>J%N)}Kn7R&NJ5izc9eeNH3rb??}3=F$E
zVs<Pnbq4J;`9)2fX2Q(W0h!04ygk4K@Mpy!8`Lf-UozSBLXZTs4~c<e@$AbZI7Z;-
zIbhdQ_@Hgg+kITH9lzG^o~g`u9R#>D`Si*f?%>RDc~)07M!hNk9POtKZSR8So?QHz
z%T;xh&2%=gfqC1c*XbQZgg>-!ig!NUbh9jhUgmH{DZBGQ*INsu9d>hErZYXrnmUiW
z@gsthKa5qoWXb6DDQ6p`9?4ijGkzXBVW!j#N%TFH!y-u@OU%`%4z~bJEln(PTAPXh
zaGoc0O=^*()1w?9F_KmsK%>rEsC97-c&{{PfRM>%x#b}uM!r-j(5et+l)Vp7ZcIO(
zrmgzXKM8hW)k^uqjVdw_q>ETfjY|*HFtQIyl-1F`%6x!=oi3qvU+~#$*O5adDHo)e
zrs>1s%iu^CwR&(8PrdjA!tmk?@BLWH(n{>A;WoE(*PcAk)pPB-c5g{4<+ao0_qd(V
z>sdIVA<vAj$K{U(qJ7DbOcnq~Vdk43RPoZZ{|ANUorfhO2f?y?N#eP^@wczfYSogH
z(j<c(G|hi_cvfWForFRhx$nD{^9ZW_lCkq7vh|lAZSkSW10!A-A6x-Jz+OBaoOjlP
z18;i$vhL(Rlwn@%>CO0#%xp!`xbswgp5G!7vBTecMkCIqvZijPK_@@ngH=BRE!}eZ
z8>+dOPS!J_r(UmQdBWzJ+kcKR^iL1jB|r5%@XYNxc+r}?nVam`nOe@Hyly=FRAqOD
zHturjiIWbNrJl)X$GJoPrqi1(V=ZTQ)|Xi&L)-lT+~wXDx$M5P{+Jjx9`Zk(8*OgO
zuKP@P6D;r)6XqvL%eW;mboQ9!UQ^RBw_?gKLu-_`Lbst7Dxq@0X<?~%2n_wo9fvj>
zTH2$70vD+Ef;Be}UMA{d(Rh>8#H|IO$Q=K(;_)dYN-gdUa-HGs>ULc(uiZa`tWb<#
z{nQsrBVQriXg%^C@FnVND)FO%l%fMmhF%&9nkkDooaPY-LjbXMw!u-?!KLf8CE)wP
z=;Nu<z6tD=aiNlTBHUoBpHtDcN*hln-L5x`(dZ8R7D-J-<e+3QCc;RUd~(vrc-|xs
zE4h>?@_1#l?R|4?m^-{6U<6CO%3P^u6v>zf(#ikv3hrJ5+oD6vrbe?<bDl~4t=jD<
z2IIdhh1^P{%6=4YpjA%I$1Ak9=*76lMukE=-H6*QE$wmuJe9JrA!8*|BX+iu{TwPs
zyj?4v8)dXS{`U>+BRSbd%J;5;G)aM_Mr)i~rt-l3n%k0lc))x*j5whU0&M*<mYP6#
zNB;l`)CIiIoO0D63KMOXv;)|Lir38v=T}lqO=%f`nqTbS77!`ef4lBRB=8>+Xs(ky
z@9>5myGt+8f^&~{myyBJisnC-fdf}|=Di4a|660aq>ib>V!O8UDy0&pKC$vZdDQi~
z)|tr2ovqcF_KwMvUY7O{Q(D2)rq8Lo9*Zsken>a~hGb<OtPEuP@hBEM1?@4SD#f~F
zQtX1*w2?#9Z`y-SYVsM9au~t>_YFdd0$<J-8Wgq#I5|C6Dk9GW^1-O}OAZ!~>}*h)
zmdA0ct^`sK6c6*V?`#IamfyaU)+`XxIY#~aR)l$qKq1#=8Wg0qBu2dvg;Q492HOr<
z=_UN18ZJM~WzhW(Wo84q3Yybgc^}1G5`<ZE+d?3<v}ad_6LuL|m`_QMga%X$V{iG}
z5o%^+7*Tu1GyN(SDm@NJA`}9z2@SHKU-kmXQl}Q9SgaqFexmuZEAOilT20~s<L;5G
zf2yLF^(NLKyGeSI4nvu0Z<(TS*}ruO-)H{1Aeo^**lMhxR9NI>^_={{jA*u2Xk&&u
z-t<lKk#264bYU<nRQ?V1Te<=Ui6!iUjb3ZV+m+Tj%(bNn)Ihrxd6i1V`IRqGu@JIU
z$=yR@0adXYNey)VTUhz+{PanY7AiUJHB)4^w&%!-qeW?4(fQXM;*{JdUEQE6Bp3q0
z5$I#Dr9}PC@$&LOV5Fmq6X-dTAC?P5)JTF5un;!=GVYvS$c7Fmaqu}aX%HJ1Hfdgp
z$<9(9#*v2Y`GG#3Ej<Tz&+bB9SVE3xcCkgWw6=@8>x5xjDZPCYSgZ0HOQx^U_Cs>Q
z=fS6GMesI831M$yROFIuFAeilw}iVk9>DCZOvbnqwm^zJW``Ri$u9TCc_v8^;Kph~
zn6=`Q4QT7>(A*>=>?JG@IVqdSufVh%K=x!~1QV8yNY7Q|7nd6B<(+7c9?>2QZ(AR(
ziVltlaeo+8i7cs)gUX;F1n2Q<FfDA4*+fyb1=SQ@N1PJON8D8vcpi7o;;Bx=sia-y
z2z0QOWoa?qtn+^V{biri1vcP#1{T^(1EV|k_ih$&5;sq!PM1>a{ZCamxQ<xHRxacP
zH%f<7uRg``WO{>v-B?Hf;gzrL;~vS0j)=#WJ~G$(kXh*vNg*S7w`knAHnD?EE#uou
zS$uK0yh?6;4T^y)3odq0NA7?4Ub^(S+h@s=3<n&N8pYC?$by1YD|(Vqk!$lj@T#gx
zYG%J-%JjWEg;cgnG6El?Kd^K{&}3xUe@qm{6R&K=B74eui1RYusZ#aXy|`mV^05sI
z0~H`XZ<bf=@1|c>4<fVonm%BYAg~ENI{5N1Pp4Chw8Q8RN#6bTCUD<L?F*VY@ef_9
z!#V2<x0#3hD~2l<*xC489x}=&$5+vBEcI}T%lQ);xpHNjV=|e_CC}?4H~!v(eoc9B
z@>i`bVG42Jj&eb*!N33gR}ZHE-Mr}q$s>pA<8?Aa43#;Re-eg`V!4C!-}(mo4{DjX
zY;OY13|NfiaFnU!pj!5ssBkY*SZL_Sxz*8Y>_ys0xrbzKLm*RB;y}+gvKOgj`Xbm!
za>w?p^`mm((vy-jOAY9Gc4w%;cS`FL>HBA=epzFfeoAj{slW7a1Lc3uGax7!D2`^Q
zJW4Wt0e_}V$s_vhmNpe?E65XpyhKRLUWM4SCZPm41SgqZlOF0zoj|7^d4Li?OhkCy
zq6N1A`CUXRYRQ@vT6|Oo9MT?4zfkXA35r<$HX7m1Z;bUP+hf2uI-fmIq`pgN;EpNA
zSbn!kIi1ay9mHcmq)LWm07LEF3+&ut|2&4{8@kvoIZhaa>7fuwRc`J7fF^6~&+SJ+
zZ*d%PpaApceJj>RWP_R`0-Wmpj6UTO`kFu7%_(#+s_n`3da3{89A(-rLk!}-$WsFV
z%zaqZdWYBxxaq<h1^juwTW06v<%`Hi5IX+8Bn@x0$Of$b9p;3}m+Rt|r7B2cxJ7r6
zy211vJ~0c$)cy#{!Ge<b>lb3K4=4T4ZtDDJMas^8bC^Q?2_|q>lhgkk|IDcgvVZzD
ze{?;8Wp?NYH5ts+XT1B6RSy4qFF?42RrHa+B6muW4S5Zi`qs3Q%ec*dd=SvU64-tv
zy%hmB2}<1r(TTZu6No-a@&CYaAR)LaBN_F1a{P>;Az9G1@n>_iG-o{W`l<2dwhsdQ
z1Ph|m41(m~dg^%m(sYT|>Z(ZjG8Qh8x%PeKHjy5+yj*_&&x%OLPS@k@wXLqM%FOF2
zENmnAM(Q$kLPA<7bwuHZLH7)!b7p{LzR*>{^sL}e!I`%9)>};N38XUpfO6LKPjRUI
zqnkNxWpk_6RPo5^I0Q_|SV*BQVs-Y{uTg`DhzMK0uD7QMDK#)jI|CWU^g?&T?SAat
z;=U#ty*s-Y&%ba1d)Ps3GO6KDD;--J{bQ`52DkJn7pef<cC5yA+M$G3=*Cai>312|
z2t|Bf1gnuntoqNV4L?u*=@eUS`5*M_mLWfq&Yozg^dDmoZ46-%N7NkTi=eV}tg3%c
z*p<^-#+-C09U8w5zD7bk000WoW&Bt|&leT>GAHr}NChe3k&}7u+PhE@|I0yu-*?K|
zZu$^&*0Kh1K{eyYKn0S1=J#zBgN1hi#{%C9rTEX+^>F2q5ONr|OP*Ad5@4t;Z)9QY
zL>soA4!av7ZOaVj`Bw3iZ1M&Zt8>&bJ1a+(UqO2A;ruN_hB*B_syoar>6*;3=ngWQ
z#um(BG4y@u231ejuUa9;C&t?%*URP-|Ewr-fIbNA{IILmw)Y4^iVC~u>uf#kP;N}1
zPtHxYZX1KCUzm5k?~oBm)JRcLRMS#E_Rq@Qt$fl<d6Ga8YSgK2q9UN+^9#oH2JWEG
z5*U5-0-7FRjPZtQnsgmEj0#^`?yt(Krhs7E!s{EY%i&<Jl*9-$lvzP43v-RS5xfW$
z{C{gnHs8&Ua~yAvOJxCuk132_5O#8z*&>;h&OQFu?2u0ZQX%7#^_V0f4)%(yu8w<-
zD~CD_ynw-BlMyzx+njgQ?K(A`%Ekd4`(Pd03@bXLOa+adO*d=?5pX&qc6NF@In9%j
z`w7>_m`)?1kBK%&OC;R^dPDI-0tI#36oB*&L+gyo1m%brD&JdcHB|?m?#iodJ<uO~
zm7H<sJmdMW4=5$pSzW{mDdjN&v^0T7OFv&Y@rWWd8%lDy0aC<(X}KOSQa-FN8#NXi
z3kA`Vc6(v5Wzj!|D!=v~6jSRa(_we$)fK53f=Tl#Bp46o<&oLDMwao<iYZj`YYQYr
zy61{93{yrG;E#N=Yas!~2vnzn;mku6{n=MtNG<l|EXqqz@4(_ari-PApK#2Hq}S?l
z+c&=kVqR}$ph&g_7ZpYM^|cn`$p578pBzVJAqhP;*Mw^xUYD@PJ~cW<5l{cKMY2jk
z*;i20SACD6VWD%4ggNlUi=^QTO1y=Mg4Aa<!mwf>>cQUgX!D~ChUp0Hifw)IkvECl
zDOMatxu**9Z;bgOL<q~Ca5Q#sgZH`fjNuS?UvVAFhrhI5#Ab>W<FCeLtfoN8UFU@p
z3xg{cCM1u*WN&YV1Wc$ZfE#ZNz|DNr9{f+N{aa2LebB**DkjnkFjX!G>+U>2csb1k
zVlZw6dhKD0-)1}`Ks>wR9sL^%|3RP#0zh&b54VkF0lwcWt0;1Af2g$a^J4#^yTC6G
zq;td7lX8AQenU0n?ar2!y)AbpvKu?m_&dw9VE>(Kxs~|Mh7^V3Xh}9VBdyl6#|8a&
zpAm^w%C4Z_z;)n605s8CKsifZ0;L|fS{-v%cJ~2gs!jvq^W1~g77$}YkJrqBg)1o-
zQr+W<rA3|k8>N@(btXTw<pEucM{fZ-G4V{A2DU>O8&-Ee>CbJ;IN<x9CKDA+3scfw
z7F#f-&YNORB)lpcRl9WSo<d3q-q4!)$-_x6>cMXBKB%lX5SxdWk&~6RDG`N1%xV2$
zmyfUYD-sv96{qxV(7sgd#t63aG|8dXlSd(_vh}158s|l0!xX8nxgPSzvAr^C9-69D
zanCeN`ZLMt@$vDWw*)MIY%otfh*LfoodPkP9-b|izMDPcAz753xaDD(GO-D=)%+ey
zhVj+&SCqTSmcqz8z-jFY%&2FZjy{`ScrhmD`DQfRrS=OS1)-D}yI(-T5|jf(c;jlM
zl`Qyql`OrmNF3hl@AvOLD?x`?i<_WR0d`X6w`$se5r&dmJYF#Z7Au6&nWd~D1wIai
zctDhcFBV7&4T))-oTH}w6Ey~dF5-mKAq*vJi2fn00I~sRAq{ArAw$w-qM7EZIzr_c
z?Bvt7*wSG?8elM>OO~EJ6EEl)#)aPElMg+Mf&V)c;@)mmvR|~OPB+2c-T_M`i@uBS
zr7Z#4zfed7;})oGJAs9=5i0&Kg3gAX<5%aSaWL~kUL@mANt5K7pSgj#p7$71(RZ`s
zi^yNABNTagaD<Y7)8tx2zT*YV6LCl?1-wKm-hr{nM)-kFn6ajE!c9c1eQd0pq?eFb
z)arOh$97`J&iYVHf#-8NP~VnGP{|Bww^~)?r&MLRlhv8tii$!;d`d9CLHSFxOXmD>
zu<jUpf;Gg3*1u#z(zJ)`JEoz!TD$gcC|oRc{vQN=xNEKw&u0vkQkJxb*r1$v*6hI!
zfN=PgB9&9gJ$1ia&8cg<cwl4iR2>$w>mu1bC+}UnX`XhIxnJWmHBeba|AT$)Z@HYQ
z<>~5+@GmhL_P)YN2zq92c^XEF28!isPlb}w%VGR5vAC09)F=nA1w|C1@<_v<I5O2i
z;nk?nv`xMJMA%%bc#zD0LSFj{9I{)+WFDy0KDljdEa%~piS2({A%~w>$<&gc+6A62
zx_LUjp9hF9DQRn0wU_jx4WkTYZR1{yy9t(qHU#4bfDnI=Q}*b=ar>*<H<p1h+NUjS
zXUfDz>H>+iHrut?HT5D6G)~3t5G(Ox!{JeZHLjk=ar^Fe{Um!<C{IpAA(c-@EyhL7
zVt9cRHST6^$->6ZN6S7En6;(HG5WK;GW$f$HS&j%U|o-=vAH^RBG^7wpf)x!4I1E6
zJZy_oBls%}DC4~~H5)){wft!a6Ho9QP|#HN<q?ZjiUk343JSzjK`hjeZi58w6A(W%
z1jX?EuVuU}h&bNro1f@X=a8l<JaMq^6&=m3vzrr=U9UBTHKj_>aWaa&!iu8v?Gg{?
zDgR6U{qM*#9C|JLQ~;jlO1o->N`Pl%+ea|}?SBHc4$`!EPfNw~(ifOIobGMBuLe&M
zB?C$#c>S8X*&6ea2R@fC@g?u*n^+j86sY*#Y<ArVzVq4#!H?xu2DCNOR#8V&dubBJ
zNzegA&4Ge}$31L+mY+Yi4|<Kwh<`Ww1E48mws)*t4rsS~{IZ>nTKaJn(4#m%&g?l4
z!cGRiS0A5=5$->sK@?obPIY=%w<inXDd<0@-d{m>31CYsY|h0Xc<bMzl&JNqDGtTr
zf+*38z#~4KoJdJp_GoO0crkWE7tNvEznB{s!^q}xxmE4qP2(VvH`}qZ1b%uHgb8*X
z0xgFe*UP=r0JS#-@@8mXf9;c}lT=DvY6K8$N#Qr&O%eiyP>ggU3CXdq!FjcYOq@b`
zk2G}od6P;dypkT}k<$m{bw=G>t`p9A2`XxOOmU8JIujK{1ON0Si`h77G%2RPFC&NR
z%dBly6-hh%uTB{L?EdvZpL;~OLiYtM3b*UxIeRQR0F0XwSom%nk15>4`7R=z75gwv
z!gmB}@FyHn%ea|R$6g3!T`Nq<ln*swP;S}0O*nb|;-c<M>s>{8<LfK0n|Pl(6wgR!
z0iL#WL*@`L^x(?#@{aASaou-@hhmP#Jvnc7uO`!?mF+}O7p*qjOBgiLDZ#6vckj9N
z6Q{PgPr&Gbqm2fYHxOSO^WoIZ%PWP{eWImep2{ZxjVXhC>&85D*7s>#dh6sf9QME!
z+V@{NjMuk_RHYq#o;XOQ`x9lTQp+tW|Kcn46)_C_X*5M9s6^T6Q0ZE)$iTLd=U*jT
zU8Ry&{8ygoE*PXT{m#8|f0Skp4kgcvxen#8a1S1kOO=beqK{E;rarFw_wjsxyWEDG
zxwog)?)zzqB~Eq&=)<YhZ(?@M9TC}_a|CbF@0QuGo@1+R<c*v9z>wnA!2Te>qSQmk
zj4*z%Y|1@G<pU|}h2n6^QGm$KS#qm*SN>FP16(D^09f^7^HtagHjlXRZvk|(HjDpk
z%zk2(Fh7lrwT#ck#+gjwX;{t2E06t@k>`$qbtZrrXh%zKh|Ha(q;6K?Nf%k#;H*_`
z#T^!?Ns!F2SI!DV<24-1w-0vw^PUs|@ve!<*&rm!=V|<E!yjp6kJvK^gH16<ejjTA
zDFqJeDoM)0WI7MgX{hBSc44r|{<`R+0NmWT*a4o<;3l5f27LC_Q96x$K{U%7tY&`O
zgE8fjVV%N<CZC8vwBY~9yQ=bNC<$+4CwCXodmvZJ3jJo+6)~wxTV_VcVXV-apltcH
z`ofe<kb=HJ#?xzzYsx*f13<^)uxo6aXu~IO2Zr3@&ZUC8vn8GsUC%S~+uh2obRuk_
z+|b?P7+8@qpZe<;hF$~X<6toF*@t%K7Nuj6oYOo~{-X^|&CPodpyq1-?VC8yGY9*l
zY8sX{xS;OuVi;3J;r{)+mZca0YLSB&^_yPOkwp>;N)|WK@J_``bC$ygl|%`GVnNB-
z(6961scf8@wNa9ISEQj#+?t+(e2;n<{eMMdRXRC6uXu^sou(?AqPw{@wLQDN_^Xy+
z!0S=5)P4!VD}OrMWl1=bM0$sLsa*gie-@3HtB1;me}R830UeL9+Y5m404{qs4G*X2
zV#)($_)FPhUeykVK<K)J^%`-xfF8;r)l`g~VE+xeGIQ1HsiEKJ7jARS>}DF|SLZhQ
z`SbM#0&=MD0-6&h%)w3ieGymnPC`8SMu9Yq?%D;B&{;pxoRe9X?67Ec%cPnEK*sf5
zYA4k2a8rj;QSw?g;rUD$$h!HDiFqVjV2#-|?~SqEG%<NoF>K$9ZWjSjXRfDJlj%fr
zENoKS_3U4h$*S;4eSaZ%n*6$-;p03b!HIne-ZKUTA~)HGVik=rZXfcaSS<aNS50hZ
zJ0Q3XB{-c-q*hclfQk7~Wn}|PTiOe&0TODrl;1;v>Sgv~Y7U(^0r&}=-|)@%DIcIi
zUI?{bb|9kU8{=GZvP4o4wrJ?i<yO^c{uj@6MzEop<|m8msJo0TlzA?iiz~&+!%F!G
z+WH!K-V(nD6V6{k9b?p%TEovh*rDh-Vg5b1E&Jo5)6f&Q%KHz2E$jhN_K7_NH#UL<
zifvSBMe<Z_Ed{QZ_A#yYB6|!hDDYW|IPXJ0WYYoVORSZwgKYq)p)aDCrI_S5NWDt6
z(s&WKU(wk&Ud%{^>XNC<vA3Lp$sUl=Guip~sjhg*X^4t}Hz|6kB<g2Cp~x3!WwMT|
zDr;70j#ercafaNSd}B8ACMm*u=l+C`N6blnuOuJsq@>$}Qo4r|Z9|tq&5Sfi4RGtY
z&6E=cZ_jOyohKPh?ad%3xm$62;lbqouiH1P8Mty{s?MoKSyCoGXOz(adLh%>U+QIh
z!E0w{K+f{e>0AFJdkPnl=Q(G0<`;*{4{8;}Y%Pz-O+-5dHJA}yJDjUVE3MIvgbZyy
zM5mY+h`A%9rdvZsY%B8^<CBxtOWK0}TQDPhcisH`3GBTP<*T$M8yL8jzKE-i3kh(0
zJQF5!UO0THX}%dF!2Gyi&7vhPho3fAj*rEmzAMF*Nu<zmm#KH2ES|n=s?bB;FB((;
zpkOU1h3@dSJ+SR9oZ6xHuG3Th*ydi>Q_%@dC$c>G<e3c{b)`<Af}=xs@F^r3gwcZe
zQ4F%BCUu5PD3y=B$AG<TeT?Xu=Bccxk$-uhFryQPw)<b>!f(NxlK7<2%H<UQu{~*&
zrG7Cmz*iPlus4-A(k~bAu*oSgG1T3w$Ol1a-+UtK=utaM<UJ#c(v+FifYs_#hiy@2
zH;`D3-z77-*>G?NV*G&ai``v%NqQ%kTW!`&w$rF=ejem>H`tAE-)I!a*02QMdPp?`
zz~)W}IAgQ3?!h&dg!APRUj3KHcI>LoE&eIev9$#f_p-9Gab)@j^*Z9Lb}ITD4I~lr
z?t{vmbLPi2Gh~kpQSoc_8}Uc)-&~t+Qv*hOC-zX|NO)V=@~`2+!4%IM?cz;En~CEK
zH@{qK^cHR+x6d_hKVu8<Ld@&Jg(v?UG{SmmvlR)nvf^wzi?|DFXim|4HUj@8gQIT)
z@gw6=W)`m4(tc0U_lKK?|Hsj}$20x?aePFLR3jBqOq97c_eo(cG1o>e%cU%dxrE$O
zZp|eTX)ZA&X1R>ymV3EO2uUX7QZD-<p}Ajwr(ci$>>t@apL5>t*X#N0-EPT>3=)=0
z_r+qk5Kk_;u!l9Xb4uKfv?KILFk_(@BN|PiGl)u>tKFO1>a;0}x55ZH15iE*obzqe
zmQ|-oMnCKHAWJx^6~BSz9S@)LgYRS{W|9;qj0aMZo|yCOeo;nIz-(hMbhm4NWw7aY
zT_ed3(;*SyK}{~<aPR-cMHTHtMLR!3-nVZLj<@6EyBlTxpJL)En6zd(Rx0<%<LT#6
zBVNd-$^CW*eS(Pup7W3WhV13i{Hb#vUakt^CIy9LF|ruo9)KV`Y`iz$gix_=h?~XN
z68NP<s@`|L_$untt80)qm23KVbGh=}R_?dXAbE|HnP1@-o<I@?(-ZliM6vu<QE!(3
z&j>CO7O8*IHXNqKUZ!4D=VE37e1SNlXEmf&f4x`y7DJ0`NEe#l2ncA&haKI11U2ec
zVr&76MEvTQmr6^U|6|Lw=-s`0-$fE8?<)*bE!VbNT!rEpXoK-L85u<-^YvbNRyK(I
zA4=VBHu4st5f|m#TTG=YM?E)Bhx+<nUU{r6$5cOSHmQ`)KTqwYzbu&t8{O0))c($9
zdBz3R&)vX2lWl+)ZwfArx2G}KKdFWYJZOGR`r$r^iAcOyY}KQx3tk`6eQp9#o{8ws
zw}vlk?65TV2eB;HDS61<23w<B;4B}z^{b5qLO8DD>zH|PA=#aA3@Io6*vJ>b5h}`~
zVk)fWSKkh~%gI+qPnOJI8n2nX|FzT3J?iPa(b1KyrRDCB(7^X~nrEN=mOOzzkXfCl
z5&!(Fl854o4x(nE_Lj{+EK-l-jV153SSe!KvAX`*3Yi>!y0i}TxR?`d=UZP;L>Ss&
z5hRPoRFMb$Pst*ebWZ5#;`3}sm`ODR0OOa@VbO5tSIkG-(-Nd4j<-Zx?r<9;V~Q2~
z8>Vb&D=zU3AA85ug5zZby)bd2L=;Xs<3Czil`@J6q#vN!b-^sF!UK@*;I@QE{l(Tr
zw+80hFHKCggC+@r38Z$a*9>mOflT)^ym7|^>AWxK!w4+Y*MK2~r}?Td0Yj64d=M(!
z-ndAsj6t+YP~2$};wR_5Df16=ldBl-iWCOLAX@hpxXHghV$EI=ZIY3vtOpsx0{HRO
zFX&FhG=S(z1TO~3=2f~0HQ|n6<|mi$7UP@8R##tSP*t^l?QX9(?SgD%&2Ey#OiUsF
z3*{2GIJl$)=vrM|p{LPNSrXn`2?QbdM4-mBrP`H20g2={1yZtH;bFs#UKs#5Rh_As
zwWezqIT#&X2Zk=kwKZ3?luXJiOP0G)UFL{8Y=<X=?GrKAqxh_*BrV0i(E}Gr*xwii
zeTwwlhE?+|U0VyRXZykP!Lsrf-;CvJ7TxBAj->LXTU>>Sok7yX82DRxE~2O?jbEw)
zZ9DH%0;cVS3}g0YKJY|DF{@@H`JjP9?XzlUk4BG-j0D*VEvk<AjZ|Tz`9!r~N&L+M
zw8%RR>d-_!aVR9*0CnCYF2Ee~9D|PJ?{E2Yiz^<vCJ)G3`^$q(A#1-8ynnmiRHn}u
zaz82?-O5+gKMv!Iz#^g&#UNa4HLch=Kq1a6`*!u`xpC1C>FnNT2cIKdbdQ&@=iScX
zmzM3((O{1X6upLs41KYvm%Df9v$dQ3^WZaV21#`}zG}yPPG2Ban6izV#bE#5G~ZFH
znp!x8KPR5)sN>+YJoa>3lR}zE!a}^4xa(p>LLXm}6{V=P+a^wk2SPws+JjOYbC4s^
zCglUij6g19>+j#Muv=}UUrL!6Im$Q7v?Oh;6dPFNX211i;G8q<ORaDO)6_(C+~oVu
z;6-^36(cFruhGq(5}Sy<KWqfgOBq>x+wj9@y@S?rs|iF+Sqg}A9OgZU{BNt5ugcI#
zmVI@THY~Wr!5)c}CM;aAT*03L@}jsS(rDmW1=iL_yK~cDRE%N~fU1==T;*Dy8w>E#
z_cc(96!S_((R>m38!JhCgUZau+oUu)+Pjy|cn9t7&I9y}%_^u{I5a*~=II`op1xD(
zIqbi_WZo5_tkhE<xa|JlC5k0Nke7EpP#x5xGAn1W)s4w-u8fE5{at3Qc6V2gKT^HF
z(7wJ1N}v@ozEXyU&a{9kk97Uh%DvzV)s+;uJ$stqA)ohM1fZ8Z)AL2FA~<qlBp!za
zakck%+^rt7lPs7)7%P=K*|?tLu%V?_dDXR=n$b^Jc?>F<fFzbVrT{hI<M4<V%5*QV
zAs{d$JJ>aYpT!m!A^ZkKEB_$xa=LiC0os7Z5rdOj9I4SMJQ37x456HS!p^mEmF(+U
z_e?~pWG6ppc)_1??{v8Rpr>j^j!2whrKQ{#|2kmqvO!QgA9FK3WD<QlfVg1_^(GZs
zAjWGhcfBw+&~vz05z`fW{XnrQD72Pq3a`~=BA*rbFlR4^d6D~Y&^Ce_?5&JZhpooR
zMvUROy4lgPKAgN!4048~rY&_(%aaD@xXn!b>M`=}VXEq(F(LN`My+ctFv@}KcXI6h
z+DPdB0@#M_?j8_xrPX*OVR2pWAEJ0h@6>(7VWGN~Ml+W$yCky#zoXSegXp)2G~eV-
zK2b(-@>`swHfxq2oq{Vj`?L4Q!Ep3gL%C<Vi`TAOLlPkR*0bW*0U9v@dBLGhQR|u1
zOj5fykI_ZOTb}o%R=P#*kwZ`mQT7-FGl}W!?GFYnsP^J_PrLO-09e+PkPG0(jE8;*
zgi%{8f2u9Q-smG5Wev}(;)Yawr8pHNk2`C<-}hPf+W(ueUAK+J>uF>?J9g|+Vdd=h
zf`3uvc_J8teZ;b#^f96(BROoi-=$@Qe$axUZ?0;a7ECouw!!$GZxlK1tSs*RT+F@_
zlw_d>KxfCLwev{!<9?4^Y-F#!OTJ?}BO&wpH8`mV&b<F8JHCAPE4u-=Irf+B<ULPM
zXaU-r9#d|P<prT^Ns1dnBh(sLoX-|rmvK39guHwPER`{cPf2{afw7Vs7>gum+*duz
zJVlfPXw*LkQ_dB`4nk?W;}X*lDju_OSM$_oU0q2fure$JBDW8*!vd6Vxg~qv1DseW
zJgpDS6z~#F!j@Rg=Pv`=l6{YJsK%1PEt}Hs5&r6#;0yrjL?pCv4Zm)n?wIsB&L`b&
zt5Y3*9Up_`BXKF_ohvJ|$0eqjy&}h=@q?=w-NU&4jhU^j+U4$__jur|IxeJmBmyy9
zGJuKZ4+}FeKf#%rR35N4TYdp&nCspq6<^f@g<lD+*<P7G%3Gnp2*g+R)2?9*;NqeY
zzE$(X-FK$R62+n))XzmGHNYw^3!8zhqVrGu-G(s(`15<|cT6m~|IW8}G-C?FT(d5~
zr@(}Hz$@&VlAKBZ&M$ELUmNj~mzSSZNT9ZO2JCOL+F>)@h5H#zn|iXKSlI3zvcG3<
z7oot%Aq!7I$9t(}JjJ!$3IOW#{$rtg76{v&wQ<dmot1?BmHDR4fpNQ0jhd#03}#{q
zJDS^Ew?gxGigQ7XvGoB6C7vJ)e<EtsjWkU4#RR?7=?`?~zrYg2kL1%RJuECC$mbCl
zQ%=t+9b1ntb2*HJ?00LFEcKkAm`iK1poJ$p?-nFN0}o^yVX-XXk-Ke{^CZQv7<eAZ
zDb6yHg9~)^AxZ0Bb~?$?K_{ehV0@9Hx>m()Y42<0C0OI`aSqK2@%T<dgO8EU1%iBd
z=br!0h)GgX6pr)uen`GX|Gu!+jLLRW&(se->%(_N{m^@XWK5nwrOG2sB@@=b(^SS#
zznVHT*{X8NPQm46mNMafN%YHdUx#<+7#Ne0!t&TO#H+g%Y#gRDpbqLj!5bEXk%IG!
z?Y+-FASz1YtIDYKwIZD3tPkGa>ZgvFIeMl}Yh0NYy>R7K)|E$d<V)nEVb=}I=BzVT
z)#;#U5%ZQm4!MlAAmmmQD$BFT7_=^p3`w@IiiISl+>ZbH6_G=N!72S}>@fouZCrGq
zHq!GZWEL39W@4pGETFd>?zCA3ibY-Z-J|hixdnA%ATrmJu>$ybWo!pY5j&uN?phBb
zYO?<GSn#;gsrl~d=@VgUXA>Z=t<*D3CppHp!7xuRifYzh|M~DD@5|R6MkmJ>-+iV{
zC0T5(kMC_~H|>w%9(mU^g!t^P%&Dh7WUaXNS8{v^Ta9fEV?$`2_K2G%-zcy&DVV}a
z3vza;jgDWIVdW*y);?2yAWCG+%*+gp?~aZ)JXlqjRs*3IAx?vN_x_5(K~><v_^Q#1
zg(CD${JO7Ut_`S_+|Mh%w!D4E$dTOoUH(hK(BPm|;nhlYJ}l+xdlHAFC<pJ~hmA2}
z&o?rf?**jn4&ZuUA5(Adhl@Qvn4I+0XT-ZvI=RnMPcHJZ3*>_x4_m~5@Xo@Xw^~P&
zM%@PrZ0QZYSn|K$clFg3i?qHZq!pC=^KukPhr@`F!XLYttD=zjb?`E%#0|twf^JX!
z=Ptg-NPX8`fO&MbjagEPXDKo5g5ukcNHTD4v-j(UiW@g)q+J$(!V=7LQtW78#k;lE
zCEF$P;3)mPt+cBLV;%tRJVs2rgwaw3Ru6yc5DejM!j%;UVaa*eO3PdmpsIu>iVdni
zV}XNW-H_ehzjOR(%g61gL??EHINYibSJeIBQBaU~iVyEyiZ+ZF;ml!xrzxwn27X^S
zscJS^{#X-~eoQ9GzPK4ZjEOFSiBsgdPv`BsQw~n)FIn5`;2~TGc7$%eHJfuqrDR2r
zc08RB?feqcF{Ub`Szxn%Y#YqJUDNZ37tFHl96XczfVtIJTjTdLuowKXu(-ZG`se4z
zOM}N+QhV2%_IClPaMY~v!E*BREyw*;)qX_a;95%|i4T7_gYKz<0I1@;WU%&kB}O!g
zqy-3rG}_<BvM~nE_ooNp*YleQ&NqL}!>`|d=!HBNE79ZrO#Bu9IbP9l{=8TDq{Dn-
zPgp-XBq?M?UFMY2+S)rKHYmm!?fHg+fjGz*$v#PZ%g1N%QTs!TcwS+p6awYXc6OZi
zeCO$0%R$y;_kg9W1XcL<QsXUa`Na=hlHi;nA#SQU3sj@Q9Nf2_i`P}*wFU!YPY}k}
zTij0ID~17BOOW8xN$p2ars@+AS2TW{!$R4@1p15&=yyHhND5J(R^O<^#$cu?q6P&I
zCmUE08I5NwB|}QB=S9XvH**XddLW(45Y%7_!E_MM10S|hv5^v5XaMc$im|cLvHiVc
zq3;{&#uv?Gn1G=*rpi*N_FzqIVF<Y-1`Z?^2@3<$Yjc0m5Ri?6%X}2@h*KU`FgmMs
z@!n;+lF20S=^e%0$eXKZsBLVl|1L?fkhIa0lgQ`L?LonHp*d4u{T@wTh8rR{pl2Vq
z3TQza8z$mPfaB0~=#m#r794#7KTA?3ewPIG*sTcUR)U)<{6j^R$kxK0;Gm$*-?QCn
zri+5tg)C@o>dc1C<<C=Z`<Imzrcq2_ai$BkpxcWp`{+;v8+Z06#?U}v=<?2{S;*gh
zS3!Ko{D9P8I^j7+&4pHJ*Zg<G@>_%rB}s;QRZQ|OM;k2Fxq!1lwdJB^p0_a7TR3XZ
zYoArR_p8n;9^0=#fw6ViOlMlqxnT&P`ydoxxf6*e6s??<dw<9I_SBpN>6gSeTsAK9
zpw|^9*tj7O5GofL*ef2%I2bOKWg(1sOlrNt7WiYl?;P*rCk@4=mY4xd&21P?sw2?r
zXd(&Ar7Nno{uhfu7-4w!oVmoR9kGZ#fgd$ln*8C3<OTLmyoN6Rx%}(~i4ziBefXuS
zSFuEw#LEQTkO~r;mho($I{bCzN#adO+mD46&%j(N1*S5;`k>OEA<M!_J@BezKwHmY
zTo$_j9kSa(s@KN^gHdLO$A5`j3E_CfP%XM8L3Fw%Z#0XM9sM$<5v*Xs3aPHnl|uDS
z##E-@B*KwZRvh4!=9_lq;nwds&xUzxotRk=6SA4;0I!d3XVW(od=N4gYJiMtTm;dw
z#vym!`JZpI<Hw2_?hWb8NsV4=zW`objJA^68=$JX-cyGy`pU<v4I#LsUSyJB$n^KP
z)s~BSq<AS~3%ynt=tnMkdM#=`+Il~ialx!45*7d$3hM!ru1bv6M@^&dawT%nAp@WQ
zEJo2-IC~`|Ff^b#<AT+9)5-fXh#Sn3>>n$mOH|E<#=78*qNc5(7Z+4D_lF#JyF&M?
z>*}_5Z}qlZ{?~AmK4D=>0cAvywM%#!vghs%%Omg<Y;hZT&Q&Ax86NQ|sIcW?g-4e<
zy~<;|v6u1<OG*0ibjg{%ZU=onqD;9zvX$fEk?Sl+vGnLEyht$SMRzs_eRK(W4zM96
zldV%eSvrX@?y##Qgx<*q&{sK&p!$JiyAGg^-z=oBNxMl@r1aRH^qeBk1UVgu^DUV|
z8-TCp33#$Oo~I`Wp=ZkVoP;GyT<Y~TfYU$y(pp=MhQ)jm3ikDVe1fJ7auOy85K)5b
zb#M|j;;CSieF(pDr|$XJKLOjY$4L4nkQ8?QSTK~Zo3HpLF<9&q+i{mdniBjJsBYDq
z?@F@5a2M#(AQ5SYIlp|vK{^EN%`qPrQqx+5b>7$1)P(%C<Swfle*|Dmdm+}+Ymcav
zKTX0C(FJ2c=Qz$B+s4kn$jAWu!Lm`$Wb8j!PC%;9R)<+A+Td87J@mQs6d15|;6_$6
z%Z7ufMDG%=TS$F{jP|>Fl0JpXw;ma1d!<JO%<G>%c!MZVp~7ROuS@uxKsfcMFC4tG
zE3V<dkLKBl;?F?w0Xw?=U=>=bS?KzGh7hQ(J_R2OudFMa%`HD+c%PnWvX3?eG`v%q
z)rPS5vS!FWV5BetY>ov*0~m~-+HZ;_-3D<&j9YtW^1nJTnP_X%GX~3~fd_4`&soK2
z2<i7y8%TTv8pmsPE^{<DUyKB=G3zzSbJ&VNt5jw;rRJ8+EbjkY+}{(~JH=d3GwD+5
z_J#f0;f87V1ifVTtNmSPiz)WCCm6}<UWZ;I*r!CcFfuQnli7R+LF9dV=R`s8Jl5ur
zkl-q;M6=P7&HskhQl%gRQX#_2aNec*Y43B9y85SQypypO#uBGeQg=R0IQK6n3??&#
zVlbw}jofR38|l~Q$oHc^x!v@f38`O5`|Q8B)5D>+nmfOFt<NB#P;V{vNxT0Gh!%Gm
zz5m^3;(MH<oxNj#j^?x6G5|O?lB*HO&15|qH-g7`?|undjJXFk#U>_`jQIh8XP^T~
zo7fvpLR-E!y3<tyxLau+lUm%`uSxGTg^xxztT!$%fF5t#+G`S@)BEj*A{K90Shq9O
zn!M)iI*HH79!5-8cN7OXUxL2Vt{JkwHhQRlzuI$B=~+LAWHFNY!HSX)4T`>?xV^aT
zka&~5TgjwemsSbBSyNkvG!6qk-Tq4Pp4+q}BPr!%thBeQg6VPIytLv~+z6Oi(LBjb
z4Gp`aiy_%qnIKuxNoIfcaEaH5XZeuLylH#?=z=Mf7ppFO46LY!7J(jp5tyJAi&Ypy
zxU$8lfh~6FZr?XH`_oPPGi^^t2TL3OAMx4;g74G-k+IcB>jT#E7+LD1YeP-o)=rV#
zT!w0?$e%@GA_IkRj(f7u!cy?6I)&-DH6Tq<v(th0`Hi|&cA^XsI&la(NdfZqSW<Si
zD<H3OA!qFJU>-#j((m=#U9TC#HZAy<xNA~Rh!V`SuNr2<cW>S_L0}CMTiJpWj`Bia
zYu>zA!a{9GIdWKlw(?Hqp=b(r^V?l;5pH->33B(&&*0qft;WSI|MIcBxx9eWE8q3*
zb6l8{RgT|q&?)xABFC+FF$MV^wN>TxyU&Er1Eb@A!RW2O(?)T9wxBy2b`uA}`(ysB
zfb*%wx|(vqF%XS!Bx!S~p<ydlp#Ox$X9;_I2O%1vgZZ*Ck@-{JN?+vO8Q3|=BPR1B
z`EXHj0qP6J*49=Bc#OiMR7=NyjE_?+6F=eF5&6E*u6I+3d03*GP<zOKKu%7bU%hjb
zHLvP5<INA5U!B_=VA;)&RC^tZ-qn!*4RZb8H=CbjEclO9>d3ySAcK9g2I#v~ZC~S%
zjl9VCm-0U7d22m-_npjMT&;obn`Bc|xj=lvO{;A74{dF2{TWnpyGpOGX-$d(y*2u*
zb?;av?e+F<cuaaeFZ-28Tla&5PvJk+0Kbn?Jof9su7>olwRshqS<k3QqMng5Pb#Qe
z#>AC@7o>1IwsY0*63=C^cxr!zQp|P2BembWPmxiO%mA#RsQXCO{H1vrCYJ$%_sJ1U
zcJS~wY&$2ZV%^3MtVJNe#ZQ@5lI2LV`1i#7Yqr0VFtVT`!vf3gx-(*Kv{dJg8X-d9
z2h$h{CN7qB_N9Nmc%qgVV@d3)oWU2FMA7hiQ}uRJZe8!HypMt{+O67Zx-$7wypA*j
z%kP?>RBphypNPx15FruRp|Voi^J({SBTY~K_gAI^FXjWE*Grye{FT6+L%b)XMQtsy
z+<b`Tb3P70aZSM$iSj}831aNp=vc)=)1N7P;7EA&xDteLXJhae+3G5Sxf?R*KMGp;
zPfy8%9T3Aciy4%!_<=v>BE`A-Fu=@nzu*JyR=%&>$Flv8t%q%Z@c5nJ@Dtl50r45z
zKus-AZTSzhKg5&P*Z+wv&#oQ9|K1&KX=`&H_dAuET6D{|rUp#Bz!|>jQJ|zU-xC}m
ztyw864s^o7z?k==(lf+#(i|V_&SjpWa5glGX=g6wocxZF1yFDrk<QOhn|40W7)w#_
z6ker=3S$dO9W+Fuu27U@%gaj^w(hNqMyjOOw-jP9Pp*?EXYVy#GBC$<-Wu3n87Zw~
zfH<2;zeezyd-<3#>s>3w>k?(uvwC}H(}NaD$52i}BsuY>Hpo`@xDn6n`yO7!KR+lo
z53d(h*=K?#e_j$x!So;Lu2*yWZxhhy?Kka?@2~Ig$?tnq*ByA(eFu<e6)6?I3oU<a
zFkr#Bewl1rfho|3dIts7jJd~`ffMO*=qHp>K^mt<FBpZYpM9oT$b24?s=l+{O5*AW
z_KK@{|NX%ti63=J|H50-up80~XE+QHd~8-4vQL#$q`l93fT6zB;eOP?Qdf;*(-oli
z0Qxz2nFH@Yc7ANf#)xTWUko%={DPy>M1d#(L?{|4Tt31!P0opSC1YbL`~Xy2q4D?~
z<E}HrKn4xHRwh+YTSXW-390wO$adBL4!9(djC^2wXC`zj-vlAYs+W0cg~5E3!?8L2
zw_UO*-V`3Df0|1R)h>aEgv6YnDT2a=GtKREfUoaMpUd%za48S9AZcnGvAw?!MTqmP
zty;w4J%09!{C-f^@V*{gB!YLhy1}eT`YHzUHZCOng8nOtfOXMoTie=f_X}!omdZZ_
z$HL<NQoFok<6R2KTi0c+M+3s|<!I4CGH-#EZZSe1&LIY8do?AVh#BCzDvF`8>vjfZ
z<@YNo^m*H+yd{F;dXQ6h9&#p*Av68?zzg=CC!{BRk(DGY#`w8ydi}=T8;fN80cqdM
zfwFFKRSb{n5o~7&AX0f}ozk_$NDdrpxyaLC`3~F)$vSDwnyKH#z6>b|qXG>S02EGw
zJ!NP9a?8bUI3rWAV}B?!8>nH309q!RkQV#Bo~lQsl_lUF@j8~wXJv_d4CDCyLqgON
z%Owl@m-gEBsUBaLOnSe0k@2tipt)PRzX5h<*uNSkT^h;lU3La-Pl0rAq)Ib(HAP@i
z<ZLncEn+Z;H$wPim)|LXmH;r~#d=rT#SCEjQX0&Q*@mV`wgM={d@e|$l<(LR1E~U%
zRi2cX5j=3QZNC!8Ffz}0)ytv9b+Jra*(}Pr`}!-~H)mbc7wHq;EtZ*;Om8OyM03sI
zljPAWyy6L$du!1*4}E`G0~bGb$=1*@RQ!C_)fl1yqktZqOk$)Eun_gLM<<nhzEk>!
zg<kj`y=90rl#o4C2klep2IJ#nV568GGxue)1Z0DHYgz>os5(8U$UyZoMl`@##^uX9
zdYVza9U^T0s>w|v!02katHd?^RIgX@bw*9eypO)PK>;brI%B}BG02ZKB__+t-rt!4
zPnSw==CG#?ib+Fa4Vl8JXBcUK^Hp_#gXrrn&{BlvVGn~E^<cTje3FxFFGSf`7+?&F
zcwpbsGyT{ljUp={PcSDSR*x3JD`0PHtMhKKkQs=q9)f{rQA5b?E#U;P&3@-ym6Glg
z3rF@RsUF7)<lLHm0C{ednXfgeYo5*^6u7kkCd#ss6D?A5Iz|`VlzxY<*EMYn^@c1z
zb^PrQ;;9d<Fl#!YYMFbJV7}1b-ix5DKDhaJ>_LM9ou}m$$Qkw;8MwleYVy3v?`Ju}
z$Y0e=P!v0br#wR;fLx;)?(Tm!DtY}Z?@&j{P&1d776y@3lZ^GjP@g{*k|M;2#Fxar
zS$byWT2Ofaf?xQuJ~kpC7789e(=_Qy8KbH1vhbfE#pTyfIRlv82~>d>NSxt76^-$r
zQpl#uR2M<T75aJ#JP51RB*&?^xXcSWN^)wAR3A|xr>zF28aaGxsX<B&tG$vU8lw(;
zj-Ffw|M4hiieDSH?42#<^b22yze!k*u*Su5qIdd?M3sk&5`N^SjhEG<_C_s4R{j3c
zW}vV%MBaezt$Uj<cf=jg{f;xRMa7jMsq{nGiMH`)VSCcs5_36t%}I#bzx?UmV}K9<
zfpD41`}X_g+5ft`kLKe>MUH_o`zC0%j*JAe-%VE`Zp^k^g^A{AiRTd6W%DF!#O?sq
zQ1YcN0qmUbhzh2oaXzyTjPk{au>R%y++bg?8sD$;;8V?#P||WgHWqg{p&-5tL%{P~
zUAve5P;Peb=g+;p@aIb$Q<(u{_V)Hcizvx!VG|eJ()II;4TI=cVA?dO7OR2;nj{Qc
zhZqC+aMj>2JCvPFMMcHLkl%i8vmNn=f8M?fg)^P6#OHbm;qUh=0alodWaSulE=`#T
zf=j7EhN$I&=3|9^&sajzr^O*85=$dwZzD+EgyyyF3B!`e8I8Lm`+tkF_Xixgxo%|<
z4C_~2_>!j@64bqdHrmhNc@W;6Ye65Zu4#Gm$jBk`Kb(9}P`Ohkk%U!aewRLDq1(!e
zoxkP~$WZ1=MEBV)%5ib!{H3upDZb0kxXyC6uI?ay1=+<IK$4B@o$L->lN)l=aiNLM
z|KWr!b7!JrcQ~zeizN*}ED@`fS8;_PxtZdm`ZCGIRFDlj-NntT4cj$fJE+yzlC}xz
zra~_Ew9VCd&EBrp6>jMaRI2YmG%e36Y5$k2@Hz$Z+)ABlCnfgvVlmrX_i9@|RTVJ<
zY9N9Hf-D7i##-tE96jR`UlZz{X50S>Xz$H#B%7XlJmJ(icklbcpV~%c`o$G-87^p4
zqa$rAXTE}QY_ehm6xVYZjULgB1nn5tbPH373%MSHL)~shM4(}P{%tPshas1ImZl>8
z?~o-yPpUp?`jS5PP!)||jCq%KGRbsOoBQ6F!dD#vr!Ep?Ic*lwtJWK@Vb31j_I{b-
zj1o4D-u=m92Gf*$O)r_-kK5bX86kWTERBTrG6q@;f|iPhB3gCZaIz=z1g;svE!!qr
zMs<3;E9V&V-A_;TmXwxKT{lDbn)V%m5p4CT<KD07E^Eg>noW<kL-yZlo?T<lE0<$F
z_%0F!QjaEx$U|!D0f0PR?>NKBOW@4OQ@)dResv=$k9>BVy@2$$(K%ceB1x$$8F#Oy
zr@N-pgOE5{lmj9Y`x(W{XU3wshI|J_sl9fa*XaHVeS|KDH)Hm24x}fa0K3?$4RNnJ
zg*DIx6YF=xAcqv^Dqzuap(*2zCk2!=xRQXtx>a#3F{b%l9_VZWSvII!o6J{OL7cK&
z;OQmv8#kW&o)jhZX%34R^&mMYBy4&z|07Mv$AtyiuzNjYni|GHd}7hB^k|%!<l7lj
z_U7yA8F-X%nHUT~ag&v}u*Sv9z!@TzdiGbS<L(U5JY4#-FtQyT?cC9+=3>_~bgB1h
zA~NMhBG!#Z5h!U=KHvn?m<9YKY*ph6%!381Q3#(;_x5N=x@k)xn9uC|9w@SIbj|Od
z4e;8Vyr-6Lm+xKX(x29Ykd4uw$HZ}5GBSn~st6U?rNAs?C{Rg;E&kwng+ZKAK^NaY
zk$?;0+eIGiCvTx3qgJhxkT;bK?jE4g$s_67_&h<1y|dmTcXxY{v2G2v*+J#{MsUDp
zV-BI14U_M(sEr9idK8kFF8vG)c332jPv*ibAfn4Q9D_Sis$4scJK6(+er^2^PnFYR
zF-RoTP?{nxno?BM1>!3uRy?<mV-QAkbidj&s`r#gVtJLvrNvAvj&J~blg1uHvFLLB
zcPGvm^r}n(059wttgqZ=>c21&Ykwyc1Vj3l)w<6?`_!0^sPPC?cJ-qEVK^_LBpEAJ
z8R#{bG2kZP1P1;o=~#{NOkreRgID0)t(a=G80ai&-0<1x?xm0y#X59(9tg{3Ra<Is
zO|iYY?b9A~T%W0RT#c%M*P|pSq9=l`Jb`dnx=}=eRSK45iW4E4cd&f0<Rq?@O?Lc8
zE-Sd!QY)X90o>rpRYcQ{cis6&82jNAoZ8PM-`f*AY-$xR^DGT)+-Awnxz}aef(O?h
zGaiuh>*{Wx=e9}+HwX)EHme??YQOV?-u0v;HT@XlqL5dy&8yRq{66V3;I6RtD8TPg
zL8X2k!1VeZs{{FA2Jki98VDeuFEaj;DyGP84ky*Jj@r%IFlma2DR8`3UqGwzju8`U
zOM=%vCnoBTV6j}BycV5&uTIzIIC?8nD@rQQCPeoyZwwTbX4z?$F~siBNH7oyXsu{<
z<|><#0fI}{8;cFII8`2x!9Bl%WGRGfW;OZ1lM4HkS0uJX<cw=p;#;7LnhpuzM}tqZ
zrbM2%kWqoCmej0}SEWv2PY`oQA-;VQ*{aK544~L2AdwdvHs&^;o>FzZ5_;v#h56q_
zsg8oV@^TE2Rg|7K6VYtiUn-b?P+;8YS2<hv<@iS>`JN!k?|K<gT{%BtD*2tII^~^1
zX{J}jH&Jn3$aO>>?5o=P%FLZgp?p_xclX!G{D<PUncq-*%MkneGY}{*AzK4;@*!*G
zt9KN$o~>607}>~}Bb>P`;+|7QiUz<+sJ!(#6dOzJFIXN^C&YE2B4N_b1xis$y<knG
zGXiiBAkP_;mC%kPf*?`cb3V5Yq#R&1qkEjoZI2b5fPIT)0Rh9$bgZ-lI29n^Y~ES=
z?C#q=>H*u+Pg#3&b6bHLAv;UY2L(W3GF~~waaw}NZ-FUrTBvg{u*F<Cd(2}b>eY<5
z=P+QAScBc!-@UiJi3C|uQR?TeLoP@9Mc$l56u5efDAWHdnFs9OGKW&$dB9%UUy9cJ
z5mLj+=P~Y~Vv3<C3*fgKuZqFLkV`!{2XgsUG@+&Fly;&8x6vm=n?T?<ee=PhJ^9eJ
zg%=mh0H+}pHrDirwX}3kgPbBU6~VAQ91(*lKub|>%Q<stXB|(4K!_wTnrS70E(tty
z)x9<yXzT7kel{HIQZVy1E?bu6ok|~)d7~`fN%PHGSb_)#->|^2_`bKa=c3O}-TM#4
zcgVI?0B_Sg8(0iRuk^d{R;AuaKzelZAci^H0u$z#NCLhi83I<zGP<inEEiR?EQnUz
zW}uoJoWB@tK&a@8>IetDI`ZkL>mw=;Da%}rn7-@5UaIq}8ogj@r8HL0=XlzM-hV$`
zA99hd)Hqw)BhcMZ&B15$^%|S_&5r`VsAeMTbpYE>m`s5sf9ZA5*rooT5a<EGM+Tx5
zmP9dn8iALcWObs9B-AWUd`s7blad%|43swi45!g`v}+MWQbv(YQliqUp3k`GcJ~8W
zfXD!q>e+@tI_$L^bG(!Xt8?4woZ0c-R2VNr!_-Se?|8z&^nF_ww&b8OQHmCa+SA{h
zEUw6n{cEe)_DN?tl2zPFMiaQ77xLd$9U2vnp5ocxmw)d&Sl_Y=AdI8G4{}RBPZpe$
z9GqEyeDx%g-rO8v@Jj03<olYJ3%l-Emyi1ezr*$)+FxoDy1FYCP;4?Vc)~jQ(YGhl
ztLxWJzh&PL)2d+)5C66zABYH4w#Wsb<BJt^0S~BiS1q~S=m`0DT}epMd~Hcl-4Gxe
zEp9dX`8U+mc?{2hBW-p_NT^`Dy(-``!oC99BbX3r@Qm*6f;Qb5_{u}|<ITms?JYs+
zCp~!b$rg=GbuVzx9;1){uI={kZyG~I*iX%^tsUi#>lgS@40H^}lXjRPcNI>9rH{kl
zpxeU<FDfs&A~#>_(-ZH?aIRz=$ylhXUL3A^(%0J`saQMY?$8vXEZ?P2I?y7dV}*OE
zf25()1`V~->b}&@A)Xf@Y!DAS41ildcUGotF`G+sIkUs(DFW9Dp7{Q3a9n6c8P@b{
zd3<aZxQ^lKQT50*VQp-<R>}n$y@Fw&-Q<?}!S(U_VAkv;eyKp&p6)&hhOc|sVI6Ws
zuIYej265hB+~2D1uG;<Q79HQ4*IW!x1#IXC8d*h;Xg3YHEPNo&wpIjLHVkxrI2>Xq
zh~SiauOh_T%Y3?SdN0H`*aE`^|IGTkvtyS!7PpLz_4O=!YPOtR7rM2(7#bQF(t~_)
zSht6ZO~2#fefgk3_OiIiK=n*sz>}lgBq1o<CDP_SR8~;EF-yn-w59m1UCfv-e2W6E
zFzreclozvn8%I6kkuhz-O+!3!WzM$PX1CR8v?L-PtS3;@ezs)49`$k|<w7#6wq(_9
zTm|f^L7_O8Wr{9H`A)J}%K18eV-)&v!Hw;$N8K#}Y^-@K%qKnH&-N`Ty~$!3a$W9C
zWeK|&JaAA6B98wP9a1;4W-ns#?}Bfpxx2ElF=a1KRD^SX?b`H_JC|$idVpDfmL2PZ
z=YhQ)J6w>S4@#y2g43ku?!NatE2{uW@=M5iw`wR$waEilF4NVkxx2c#iD<oj|3>61
z&Jyncq&OOi7pL-LW8sPp2o26xQ*mR5M8s}Y^qJ_rQD^ErEbP32xDV~yHjo_gtRg0n
z`S8E*gjy(HP5y)%m&1AOQ_ShyS|%NBc6h1j>w_s4qhWOZhIK|sQDqMfE{?xnSyg4%
z^^Pcjvs%crW;5t<wY5TrdB3joyiqQAkRF7Ph+4gWC0m|ZIC_eF8Z=G{j#}&1+x1Ka
zBJ~lJ66l8r60(4_{--x|4@3b2j#Yb&H4MZHs$_U7esUAl2_sZpER8Ix6;PZaDRMex
zX0{7jlzhn-&>2zPUjZVwmSf{L2;mUeP12xwlBKyh)=$9h-sex8rPf6P__(vM?52k=
zX9`@?Qx8kfa-{s0K7}U=95^XCu*z<3hHW;)7@Xr|kMs2~j&-$9P2+Q@T+rewYB0)-
zWe|+%JAN?+h=!CJc8)M}tgRc-^6B^M01%leR|!gd#gNES&SE?5M)Opc!Tq<SBuDW`
z$~TBE_CP2!N1k>9wjpI2^zvv&(UFC%kv9uxAf!U^l0{SH7!o^TYr@{}Zd^?R_%nK3
zl-a+zE8mpW+h6-IaKCGNYkwKFJ5#M0vc6Cqu&&zmP60!KfgT;uxDvmV%p2pka;T&&
zSP!*4i4RijhX80V12*%E7lzy<rY}|i&*1gipAnjxBD^fX4xZSNTLfH-<jG}W-xM^0
zVhX-T+ESE?d=j>o&EO3CVJU&hevQi^d%_XhKcg4Dyu8A+reB`?gw*>iLH+Iz{#lrD
zKlt@!WnH0l=^~1festeyLE+=ZM@R2JdWz!h<o*C4SN*f@3piCY5dt^ndV&&>OI0W3
zlP-X-S5Ove9V3MoLWuLacg1{tSZ`e9hdd9r9Y02FHVfS)Af_JHes2>?N$>9D4BcL+
z8{b`6B$|T=#ZH%F$ZL{&loj{pNYiGwSLpT)-FI?5(KkuANCN`cd6_#J*>Y&;D<!5e
zmsff|7U$E+Tz0a)anYjFzmA5jb)9>7Fz=iFAUs8O60Rn~E-@t^y7{#7&%+wwW4$~d
zf~`|UFayCSxR8v82k{jQEVu2e+WS-io#b#yIJHe1K}w2MeHap=g7Nd{7ibvqs8l&2
zWSby>zKd|J8#O4H3Hp1*ypxnPvXJlDR5RhxPa)@!$k-Sl8@H`HC^F|{H=m0<l7d^i
zNd5iMdbdwQ9!|U^FkHeJrar*N9&d%d$NoV*#R-GlOv50O%^f&j*d!{RpBt;VO&{`X
z0xyodBqE79`4h}?z*bfB!72P_?5BJ|?v#n#^^!fdsm|?ulD|h$+(FT??Nah8x7zee
ztG3W78T{H<GB&Oc=ry-@_x>G@ZuX{HK(!5APX$~tf$R3v@V}P&B<OsP@=O9EppyAM
zwoO#+6g~+%@HY;M8qg}#<+}JW-jei}{2bd{<5pYpM_d4Z{ip3}IC93%UJTIrY07F$
zvY+)+S(k{e*f*s!%|R~@rwo(iyM8rXAbkOvTDgWf$?xU?5<l4zVeEKg!tRz|5ol^#
zK4VLK_nGB3$nuUu{_(FXS>%G7$MR#rGAo~y%$IBl5q%F4C0oeU&tF<32aGwfhyBB~
zg6tz62ZSBP4io6VY46Mn|0MH?w%SQ^<4MO=5?)ji!WB$kkkS?W1A^DLR_9^t6eViw
z8&P)792;)l`Z+U1&DW8lUZI@Z*^hl=hEUcXaJ`s0tq|{<cubr{v-?{F9ozjgCgzh`
z(MK=jK{WgVwo)I9fZsNe<itvT8|L7Elm^*zC_0@v>VT{>z_5u9+MfIN^9Yum$S=CY
zm#b4aQJ6U2Q&fUL6oteX3!alVaoF{Ld($Hu+f!kR*z7V(OC)rQnL{l<X*O!W9S>V2
zzr3fPLyGuP>EV|X1xs<wGuQX;!>DBb@B}U3VE761n7*tYYW7%DXoxy0RT4k^AM6~H
zY-Ccff}|$KTOsn9Ez70g5#Pna53bov%x;QCD#RC9ejY4cRC57yK@7YH$@>cHBMS5V
zii6Ujr>^M3`6Vm`IXS|`wV#^Tr++`3^MS|;NA?BiLy-xrcQPe^6vh3<N?h$lG|gC3
z#e_H?&)tprQnZhjuCbsZg!^M%$&tR}Xy0b4ycv*u;v}7u11_j}oyKtCNm4`f7(nVa
zi>Bf`5)c3O@^okj0c=jJUp_K_=vanxR`xMR;un+tJd%MpV!ER{hN2HyTo0Dh!<6w$
z<s}I&{_$K_y+nUcQgSgyuC|o$zmIcqZq29$ouCCTuHzJ#>85|(X?bw`sbJ7I_Op6(
zv2!e@e33opG)9uxthOX+z|$8^a&k!Y^{h~_FCSeqA<Sj=;4WdSN=%^+34eD(LPJAK
zGop}3BTI_(jN`n~P9ImMS<Fe+EdYx*MNFlkmynTW3BF7JFnx>ht|%9saqD@#0wX&B
zu*rUQh$-x(Ey?l0pk~A)687<u?Ddg>Gl>N+*mcSIvky&iMWde97hX)N(TYah1GY07
z7*f-g!PbOSZ=u1VC@}YD&=7hB({*0sd;dgxtuVfm<Qbv!SBd=dXho;CCh*<29IJz!
zJH-b9z?=&6&e{^!b&qcGE}J}lPPb3Cun1|2fD49;($*akuz#-?QP%a~O7=5qIcyFz
zr;8g9kjH2zD3t|l#V83cp>7m#gqrx4H@Ob0P^Nwe;eXtxsYIcdi)2Z5ml7-st<EA<
zO>!%%diJw5cZY(h$wE=#KC9DR-ojVmw+91o8EHN|aJo%f3v!z53+5bmDdH0R0#82L
zoKibQdIvBb-VqUOirSL=NnLvA5e!{`DU<(Ru4`a0Su3W*btR<wfN1J+^Ja48HTmZ#
z_^D<k54yYc4aJ-=t;QtVSDtdxO>X6*=4fA2cq{pNV!ezazSv`Aj%a}y7_c6%+;N6o
zfBSxdF)-|Bzo2f$x~|Ia)AoR?M~#{+cG={`dXC44nsqy#dkpBo`jqd9(*Mh3T`}!)
zt5qSs&9Y{maj9Rx(Uf8A{M=d{oE9Y?*Cc+<DxA$u;4~cZ$|I#Mw*w$CHFW3dxf@>i
z5h1uAH2m;KE>3hIT^p>&f)<f3A8j*T%@|P=?<atO1Mpq>*liRjJY3?R2QoWSoy&SS
z7b9Wj7!TB8Qc-4&DCa>+>s{#|4)^4e#Yj#aoTU{%w#d_`4*PaK;X{M!K4EXtF9hmn
zWOHWj`Khd1yLI!LHHDm|nC2~sG*;!n0nyFaWyy`drA-Xk_k~=ueu{@k%|;yGL-$PG
z?p-q>>FIucX&w>NQzIojya2)@1|<~}=xgi~zqe-lnJ}Fk)ImoD#!_y+#~%>~ws0`X
z0C?y+?BCkYaQZI`MC&jN?__+hhvsXR;0Wnj7<D*ZGvXCDOJb1v60*+@4#4k-JbdPD
zdxt`_QfU3p?n|cug#wESs-EY(5C$YRoA>32)h4H%It3Bh=oqWou^wn7dnEq^8d7$B
zOqL=*H8k}1^=%Od=!u-j;{$Z%=;%RQNB^vkn(%47k1jiZ8TX|}bf3U0D>}e`4CtFz
z9&W4*?r&@|vqQsq?4FGNygLkZnxmf5k#AW5ng;eaR)maa-ww!06kCY&q*-<julDDz
z!?kP9{}sd!S4E;NKEpZKNryX8uESB@TTAx>LIaAD_qxpfVJIsXqpIe?xVo+O)!)!P
zMaqxh#YfIzdLlX|IV}<St`Hzkq*tiM!-=N-_cgL07@iJXYMD~JwlR`ksL~m8_VOJY
zD@V*soY+ilHH9eXLA(j17<QO4(rZ!@zzb9hstg1?=jBSQ9sV{9{fL?bl8&y_c7Ycu
z8ZNiVY$p+vn-zUH%uy|D?EIWv%z!vCjr0|F8qh0y5iXr+Rv1DHCxx6G$5L{XJ4HBy
zH}=UP0=6e5A33s6HDBAcso-FWpl9r6T~m3MipF4u)KBXu(eL%Z`cR9R=ZGFEEqChR
z5J4l%@?^B5P7H;=)25Tl*AT^riF3{4#}<?jwsSby0mGVsH9%~G3&p6RGdEk^_n7Z)
z2Z&RovrP~CFz%`VIH{8V^(Z*up^hhjDba>sEL<i{2SEPd*GDmY>^;M-=?p);&&Y#4
zC>UDs1bVeaWqN00L&>B+nJG@OjD64~H#N_OMqI~KdbDt!A|KZ`VvGKW5}7m>G((_V
z%3dX5Pt)=W^ZvrHOJ^Z(Z}L0=r*Sul8bDALd3gD{Pszcx2pg;^k?&kQBfshp1E<>>
zfy}S<Qqc#m1i)d(7|a&2Uy(puM_3CR#}BfHoyswZ5wM80cD~B_v`q`DO#g<u(Yd+I
zqLM5q-bXWlqrS@HF#n>x{bUfc>?ZWNZ`H^Gsg}-lCMzK|NM)pIoCk|3Sb00@@8Ns-
zZI0(&qS(oBEHuezW!Xo)yWc5Jo9Jgh))=(@uWbc(J~ek%%uiaB<S5=twm>8FpmZsY
zLXrUcZ5bP%xVYtfS3wHP$(2%0{c%*E@WOYEp*J7sd;~(Nd{lI~Q#tZQnSjnH2;*9;
zLGpezOXfy&7B#-@Ife+|xgO}9YzmQ04ChVhx^C&%6uLFgY&=Kh{xoR$qLe=U052;~
zpPZNBh1UEj3*%=8ZGqZ9@1?fZr(bLhNZTK>9jG@(+#)Mt00zTXB%)W4w<shWnO;^a
zLgIUHR=z*gC<cC^Inom6;cs8p)bQVPEM+kA@POaK9mR*+3)L}Tba3G2NENEB%h{&~
z;p`vWI1bcvkx4ZB(5?BP&~kvDvk{%H7`o5wlHgd>$tQ{7l^?YQ^GCRiqT#75HJ;sC
zbxh~QiX_&=aSGWyiW0bZG@2Rl1$-_9;8Q1l9>>ewBn7U{g;bArvkVY`aF>?<Y^KsG
z!(`Ckqd$#GbqP~&$nR2vTM@G*4dSdwyl5n_NqB&8&gsS7(4BV2E002wuoj&ZfIzME
zR6Eu6Zf|9Jwu{*dZ{;taO^5NpO`#tn@20O|{_r}b=XY)yBZAD*^J!raW<4wdc^3UX
z?l8sqlKa`Se^8XBy}#Z~O`D**(U6d>dHIUSF-&33A|dk)yBI=(f9a=sylrtj!hI`#
z=xzt)YoWtU>4+(x<@QN7MGU2I7>($wIwEo3_M}LJN;)<R-GlO|K0q-xaNwp3&~8+y
zB;N~1nOJ-l%&4jv1y_H9eX<un8gw_QmfRtghZfc^(ci3V8(fp!#~2jo4kRyc#Q>~3
zKQOTOE2ZJtY#5Pej!mjj)1bEyR3URQHTzwKXJGEfE3KOhxtOyN(sV11{Osyy>q9Oh
zDiCvDZJlSRnUaO?A7H-r{Pz8el6tMIS+XzKtk~v9eD}A<^N@a#TkOKZU}E`UDv=Ww
zJV0!cI<f12Nx-B2FqlB~q_X$a#Hp>ErW^PrPZ8r3F+)l{oD=so8bbFS?lqP?T|awR
zzf%e9Gcm5hS0(ebmd7qJJCkRk2N>No1}N10jPq(wMrWK?Iv^F6-AVr@ayi%7fD7_5
zn2AP+5<jqaxn(wO%{dtp6G5b9<Y>#C3AEZg!_Ri6E!cXf5MzXJ9ubgmMmP&oUDJt)
zMA+g1{^qvpg3Mx8Pt%*r7RoY&f4U%j#h!Q-+<YwaZO=1+Ry689Vvg~*cho6`9R^|$
zYK7&?f!bm}!tsX}=|Da~SAyK6*)PnItudGaa7>CNy<WUTxsQ+Qdyr+l|2sU9u7q>+
zeGJ~;t*+_Tbt-XUDOK}co@>eE+2SKem}s1$!)*{mD>80D9FXaC%+d_m5_bB%{qrGk
z*n#}rZmHLJ0~`{^3xR&5k_i(!CxlxCQk{RsxzP||35bL1O5I_4(v)NbQEd?)hj`$6
zUz3N#m#fV7n$k>P%6FX$@Keupg5nCQJXA7FK#h1-;Qt!yXH3>m?-dsw1m?LT2@sk^
z&xxIsd~JDJ6pa{Y2S5}wjJM=LAjhjEzv-3i%>Kb)&g+AR<sZi58Vj`!l$|WZHXJ2t
z!S`L2gZmzkPH9~xVWp}LLL?;y^&}+h?m28$X)SfGN?yNcC`hXvXkq;fRQpo<>iqZh
z$6p_mk2F09{E#HiEcY!CQR#dmU@`LFg8!vnup63JIQ%hSBpe2gTS_J|-|yyDmV8X5
z3y5M2w4uvAL5qTTaHv%fQiMs#F@1TE5<Fwke4MFv_ZEXNvw><s7m_DhzmT3|Qx2<D
ztGg6IPZ(v*6!1Bgob%Er@b}%h#G{k#E5<?h-uk=Q5BxvX;|+B+Ly<60i(1-Pp?KVY
zsLL}pCv@Vj_>^F^&%a0d)Q#-Sw2Qoig>F^v?><!H%E_}t+>sM^W`~}kZu(piNw`pL
zwe5S!T?O0e{4zWH*rRcefXlh3>1s~9+HVKF#)L*eHuRLR>1RR0Ddk?@m_K!kV~xS<
zo;bQ=$nHd@G*okMO%?m5<>$p7{Gd|lc)fOOA<8rTj%~{1GTEmj#1~^Qct66|<&1}<
zQL(yfJ<Bep`;LWID+x=fAieg$^`~vIxuxeX)J;qhjn0AIFX?r~FEWbxdu|3e?07Qv
zW@hqT(_tF;;`2_{X^|v_w%?Bvki3|Sxn^URtTD5-p5@c1bJHue@DN)%27#owI-Dfn
zJ0W@qlb3HjgC6euo$ku96XEr_5)M-?9qsM+8-IbN8<Vig%r~*x5$o+OIg53>p1qRY
zX*8Azp+Y5&3u9>Pntpzfd`z1=2kcl8w8vnFkQ#w=Ey%;h*8f*1DRF{f%U#ZBhn@d8
zl79_%l1QambR1fIgeEgN%cM>j#C?*a-O-}!UhZ--P7dM7yAKH3zqg;2<17rrAJ%-P
z_P<1yfT;`e{IKaXSs<5m82-(gOTVWEG{;mB6hB*htlxBaj9(^0$ph;Bl1WxXci6l!
zBPNnd0R=S?%ZKLUW}sx=#6N)xn_ZYMW<01Kf!|rwah}0qbf7UcuT?7)P?WTHGJL$!
zG4BAWgb15)HbwGbh>0fn^ePGBO?ua~JFVP*75;r7YQhLXW(jm?n328;Sj2hTr(!XK
ztH~Lh5Yja`g^@pBVzq)X(a{CWWEGrir=dytSB&%L7>cV9f`U9x{GuW|`57fo5hap)
zQ(e;`VMvUi_3&+(eucpb;ufp<zDYh7YnT(U8oIN-zf0BJ?T{<+5Rz-{?p`=H9=gAq
zFfJ&&+@_R*WtSvM`$&{Op0G2@#EL@nGo&K^yYz-kW1k?i81xA)IOS$scu)u`TWyk<
z81Y1!5_Ky~I@4W2A8Z7+s*>$s7{UJPfB>`sy2JXh%O_JG;JhX`aYWoiMDo9RLK0Ne
zf}0EE5P9_t&KvM5IB1&^6HCQfhR+UMea#<6=zv4aeSoOR7+zZcaA&x7z`Y9?&q)x(
zw0ihX)9vVKy`#30oOE4h?tA)~O`R(jnSu7#U;_Y5!{IotdX)`UHlJ~;d?D%xWoP!@
zy&l|r=Lyp^6!X!b1v=}jNCNdmz@Ix!(iuN}1hccVv2k%&%#(6ycq>Xe`S&oR#$2;s
z>y5%4BU?=#)#`wF*S*!yy;8@}-4C3};2Sa;(BS*A=3?(tm0v$^$>*tOKKr;$E;r|c
zo<P?JvB2*h#RTbf(qPvaiVWrbgu47;m)|pMRs1_E%hs4r3%k3EA^Y2drGJh^GpWqS
zX0HE56U~p`D)yVy2tF-v+<rVwf4rh@v1z-J1X|ZjR|nsmuPYo#d?!4wA4R-q;@k5C
z_Vvi=9-DzjI~$V$V4-amx|g7dJc@bp&vyEuCUCcwrzL+jJ;<R^qs{e-NN?|eR~8>#
z05qD{k1gi$h3>Jk!xOi+2P0Un!kQru>ze>)7Hk%><m$RZnno(YC$3X~>x*l8@)o#D
zu!WgRQTQU-%}vg225MZS4_Ec$s^g$=0BR^&9XpMmTd6zikC|QYKSvUf@88t$?)e`_
z=N`}W|3~o=HJ36{O(m_;*AUGmCZUm-`&>dUA%rBCx#gB7(ny+1%q=##O}XY;!o-+r
zA>@`=ETrY@&hP#G^=I{Xlx?5)-mmjI=Xqj9bP_**FU_1hMTk=xxNy$1oYP1Sur6^>
za08*kTNPztZ5Tl$#YUskwlqa32A6J`t6+KmMOvvZ%B5rT4b7-8GGx4%L`rq|4R__Y
zHt!)`*v#I(Q(tzQpM42dHM#(Mietjn?7K+V%WlL$9QJ;uKwRtxiGK)nCeylnCm|j2
zuN34?+T2NpeTJVS@Wt>atkQfh+U(h{<b>qohawJH)Q$O4tj0I}kAjnA>Q}i)TNj){
z0#yer9vIyYTXOQonAS6n@^!zwLsgyXE<g$%zg?$`oeNU8^$BE-ufw0=^u0P`HejtM
zco78d3~s*&zfodIvM)P~wtNJ*DBVZvK<P;+)WvaVfCgSoOPP{7$(1yea`!&fyG%Hn
za*};&&0N<q=~MQCjZe$qqcaVsfb$%a;Qrjrn1n*OCtT>X@|`C8Lu@nLYY!_guBw^1
z;h5l$Ge*{Is)trWi}aAzW*!wufs0&rKUedRyhFnFosfI?3TvsP#kx_LuC;c60`y&T
zFPr-*%o!H%QdoOR+?6;!%s~iJ#GxyTPUqbKtBDRa2&=(lI(MCyudXRcsnLFr_-SBM
z&u7EBXZ&8w3&L%aBh9;WK#%L+U9RjmGxKGI9Sgv&iquc+{@y+>47JT^T4^f7J@jGl
z@I&2WA108L=?ywJYYsN$gp@g+n-a!R3H}QC`PXwqEP!_79dw$~alC<4Mf$V0_O+%@
zZ6fMoAB)R+R-Nv|5rpxQmkXo@0eU$=pJGK!iEThh!fq<+VFdXe2Q$%2m>#gLfYM9E
zkuxy2g-UGS^D_|h52_&rUt-BZt_pk*ndTolDV47c25{I)`Hz3_dDi!kF}Q-nn$I((
z#)T&%UE)Z*sy{5Tc3A3Doy%AQg&8@*IZ6$!ETzgfY@p<#=kpZ8qp)1XX@`mgq&xG9
zyEPavn(BOVMdIJ?{IwLmj4VkL1fi_8u-uHHO6LW?VDwZ~IddI5UheT!&=2_AWDi+l
z(IL#m;8DNU$a~=HyldLyE0D(O+h3vc=)7TeZ&JRf^o-0zXt;Nb-MNYuK?s6wT;D$B
zGlVaOYC0tnPY5aEy6-#CEe_PWg`MA<6Tm@fF1WL<y%V5$0*TO9v-|EX>FSCRKbe_G
z`kZOwWoNk6#qx6UkX{L{*FJHTjcF?{{kHdYF4A{&MOP$YfLB$qdT=&=V(a(Z@@cm(
zHH}_D_m3asO<}FC1+BR*%x}38Z+o@JeZ<PSm*XLmEr2(}YLutfe7dUtlDoJ(xBhl3
zBRX_*aqQLYVup(8X^7^fOLlKTfbke>Ypq?R(rNvPRFtDlWfD&eR$C*-3_k*DGyrYL
zzW*#;V%iwl*%11)6jW;d=Ud0=3w)fO64y;M&(c>L30*Qo;_e1ktI@v<3sv6NqgYk?
zIC|PBHHueZDLnb7W&`N3EzLZ3nBAbevar1YU4|qU0WU78+FzgpZM*1P(Sz*NSaKzL
z6;3pV1y)Z)udPk3Z#m2)M^5a4u{Ax6k^Xeo_W&+>*=gZ>xn%L{qVn1k5BFnU*IjTF
z6*}shRrd)it>-hgu4xML=$qJrq9T&RYqbQ(ySSetHU_2wVW|F(VOh<Oi9!NJUg1c-
z83)UBq4W%y5fX;@NXg!R3eRhUEg;DtH)$0CHnO42uv|9jDayszvd|D`pxWQMx9?yN
z#9$%>)LO?loVVBRSB4<o$sMot%IbSH6&nwNkjq*yjAajM26sMG%C&sE@^hBIJxq?&
z;0uO-K**msB7^F{J3i@hGk)!Ry|8cS`kUHdyTQc8bNr=~v0_<REZT)WJ*S5pUI<nl
z%O;qsk`ZgNS(Wi-#HZ$97hbwsPQw!>t#j)qCP3^iK!J}fAqp*d{v7DUH?pYy&0nMs
z9(mzqd~RyIn6=K!Z`lK8UN<O$)0oV<RpCxPJ<FQ=(ZWDx;0UCZ6vx?_2+dVrBdCCu
z)d_wiAJ&C<%*y~N6}|nN)4#-XS+1ICL^SlmNAJ!rF47KIAd9y&f(r2K*P_!J)qBll
z*v5Auuf}JIRX3P%*lu)&7#tB*i`w|XYGejc$tb2RsIW>M@<LV=h+yfKpu{_a+faIr
z?+Dfq^D^)P-_X%9{McBZ3AS6~$QBlQQo7<(eN#3dXwf+CS%85&FMp)!qyt0bY_}Ix
zUYdwJq_eji7FuGYgA~J{-5{#xh=*+g-rHWfIdjEGM3Hwb193C#K8xkUR4Una7+7sz
zxfX*FawDPiK{YYu=1GE3E-K31okvI5AL-s6#tOk=mwXdUcx)D;+CQRRgXaV}GKu-i
z93Ka3fjfacK%TOnwKD+OJY%qLyi~~v4~cFX-H#J|qQ$<oaMUe}+CX|yTR=F^bmWZG
z9R!Ho6WZDjVK7E8iwzPkQPq;9&ymb}mx-E@TO~>MR8a#9++W?UH<`2if-b~sA26Lf
zw|sGeq*Qp=0vsnC->#hgCw^jkD%)=&a${{7Y?ryAXpw9X!r6e~1QF+ZVsJ<+RuKMy
z<cof!A~!%?Q<3{wMbbT1$>x#>#=zh;N$GFP*T;epKE~Di0yMR>!H#rVYPTPtrK7hq
zcK-!-&R@Yz>~`*qW^iOk^;=P_?5R6Tcc2LDT%Fhb6tKNlKdyM>o>pEnFd+$V-_^`3
z=2B>to4&hiYwCU6xd12XBh=t(%c_?ClJ5=T!oQn+9!<lGF+^2(Cz}0WXvAvYOWZYS
zUudqml5n>7#HSHq-FPVKA=PVeAh*AZyZ*hUPeSngl0e~5ekc4{N|lZUPL=wJI;Z{^
zHCS_W>DxMX^DrSL@wN-8h)ne=g|{$f$jUZ5poh->Pepa@>E;(V1ipP6zg?gGAO4r6
z;_Z84QL2%)A8|Z6p8Q(&S3wHH-54qP<SIL&4cDaB8}{`;`$vCz&7k9`5d!m@(a0wi
zLFA*or~pehKBz?3a*#$6OPHUOYQiGnbl)4?$1UY1z;&C8gxtP(-_yv+snnR`O{?su
zh^LzWM!fk^CR9qo<ada>LGgBuF?>tXN61`YFV*~}nsn5xcsm#fQ*5ugK_(XqopLB5
zy%%o@OmiwA<;Qw~6@q1=S${v;6M4o{lU-hfvxi~kH{EiMpocW?gKpsCqB*swE0C9#
ztGw@I`p}h8eF?H>mm=Ceue5y5f<VvWm~2La_pRE`nXIaE>q2M2B0{E7<agHui2!JU
zM!k$QT_wSte73?0**T(0NXYat!8563&Izl#myu<6!y&hT3ME%F2q_XbMpttFZ2kT9
zAOmC7;%@vDcZ%?CGLB?l#!C^?DI%o}f33?&c_Dl32SW>Pv<!HjaEjx^&}Ec3+$^KG
zt83w7V7CAd^SO8|<PZi*`_|aHyOOcHQn||sRo^S+lOa!7TO9S<T@;D-%sNYe#^C0$
zxxL6VT&ctZrSJh^!4e<<#lhl8!cYD6CHfL@2}kRU1zgg3Nl~neOA5u@U*Asgk>0bt
zI(V^VVU9hP{!E_xjoL4(C*=}KVlD)z35$rV0^^>2*?)q87F`@ItvdvAFe{w+eP`F2
z8&}^(t^J;}WXcfkj$h<t$(Ief+xddmTpnuu&?4;!<%<w(s{E{?vYwuO>Bp+~(`CXx
zW_kkqbV?LXW1to9yN*N$ZNJwr&L@0%BJL(X^P@ZsXO5LPn7|`C<(_N-X%;zj*2E`0
zVuB1h97Dg|29N8GpQxc}eo->Ybz-sZI_4*L>aON-iMWC#@Y#fM=YsM{u025FbIN`Q
zn<q<k85zIEO~s|~3%Q@9z*5=dW)ZY>Q@KuzQ0s7zuNx>svlINZDoOdcQXyXcBh%Fw
zSy|?;*$E5OXGbvK(5hiozM#%+V2<~N)t{Z6agE3P(Ob)E6A|3SA!qF!&hA#;IJ(4|
zr^;T6Q!SW+y0`SejG%L0tk4W4D#)2HuVs86R6S3Ziz#jOvDUa;i41Ev^K0==niawp
ztAu<*<N8H&-GrlRnWL0X8&e=rpyn6Yjo9mJOu1C?_|owMz~-RttU&xj<G}vGsJoSa
z%EGzUh+<S6)sm;($@fXPgb<f3t))8Mq5UWV$R$NZ?T%+s4~ytck_WbfCstJj5(2gQ
z!LEkJo7fZ8i{v@UCgair!=s`Ov;k-)+i}880re2xPCAI_Y|zj>9Q)Ws`P;8MM6dE%
zPL&f~2yupTGOqN^K0Pa(&C<|ZP!=`JZkAk)bFCg;SZ%8FvIIV|uLGvsUk_GMR+9E^
zSqmQA`Kdz%=lfWit4VXgO^!Js1N^1nflEYR($Z37*b4<mhKk-oJ~CAnznztiauR%m
z)Rr=3smzf0j*$It)QG=aYa)had9cZo%_le;r4cS9n4U1|%cCpElz{if$}|*(be>`b
zlbEJJJe#Y$hw%R*90l(F&7xuiN*Yb%V>0m<f7LZDcvO$==LAh*ZWRhhW3iO|(}}n`
z+a`IV--u2b!sL4cq)!S8Qbcx3Nc(N)sYZDBQ3PkG_gBG6(gZ{%_{?STP$5|g5v7{8
zTDoRnXk?A`AFL>ICn<1ClU#_Mu!`$Yg!L0$C*B|4l;I^m%x`WLDYeX@+VoLTN^2;F
z_G+ZDfOg=m{dl>f&y@lK7FmNbar0)I@6j%U71ExTJ6iS(gsYncwsZ<1NLFf_k`Utb
z%w6@w3{qM67g^+J-62dDyvwUR6v%MJP09W?g10LC0;vR<3ZGVhok2{mufJ*y|2lU2
zwfF}@04>dxAeoo9Gu<D(zOt>@HSLvqxQWIGD#^D$?zTs-cS*JG7S!=s$2dYtuL|X#
z9edRy+A{lAE)7R3=)yq@tW~uH5P`iXp7KKV<8DChynrQcdhUaasy(;#2^1x(UlD?!
z<8-ig0m-N^Her(diTX(d^n`6nO5fGc+WkGYy1S6KE9Ck*$^->@A(krMJh2qLRkyn{
z>KDEKcP^+kD6*ag<oYK%C%ec(QCln1;AD%vR&Y{h;H)dsS=q$<uSH)l4bveUqqs1%
z>)1|H+z_)dEK6V3&Q?CDD$9?(BqgVLOb9n(*xkA#g)NzUBG=0En#}Yp9B3b-S!N0I
z9UNZ_jvUg`maI)3D;vW6A-nPKBb%`sKeY@`+^TpR5q4a$aCLmR)+foPA~4u#NTV0k
zf}#RfY=z7oWZ316IWD`n>#EPC8DoJG5-j%e{l2KH(91C-CD?aenK%>^S+TQEv1l)Z
zy`^F@7>?}@Zdx9faQoF5?rDi7`7^^tYt_{1fs)wbps>5Mv>T=2xOm4e^G7kOv&%f7
z<iOjWrfKrZc7BaG*{~!AwiH49@bVq*{OK7Ty8?nmk6!1x|6=nKl(6r`prpXorL2kV
zK9Q#EOC+N7*!YA7r?ihT{CjI_D<ZVD_n<hg@}+%t_Gq@t*9*R80>`>U^Y_xyxBL1&
z*3w_PJN2{p;_em@Qmm9<xH(PX>nu3?SL4X^a)4%TTnhM4g0v~{BvlDB*_IUeK@8zP
zR0ZJ`?EQK~saPA;3yK(|aW#k5(#WR^tOQM@EwQs?rCIX-0bH-f7Dnp!Kzc(pPZax;
zGx$+H=ZvP1Z1^5X3F{0d2K65OEcI9|XN1PNJ_eeBU>Gcu_HRIgJjeRmFWnRj7U@dl
zYF<e4szqVbc=@X*V=WF_mi`?lTOP`|$d;k_j?R)Qglfd;=z=ft;@>GsvYd+J`L^tg
z8gT+q0_qZLgqXium?-J3_l)F#>>IYWb1*Rg_Oq-d^9XQTm66ELq#|_KVOXSWN`v^>
zHy{P1R5tWbNmjjb+!o_Cv7)Bd1hm@yCPeOb6r=Ny>yZdZPg=?;YMDtcPT>uTu0l1X
z;IO32L-oP<Y4wdsq4cA_p9XeI_*h*>cHYZVc2xi>Z&3>7E&!7za8)XUuXJFIfRC0X
zj+rc18Zt7Wmnau{kqc6&pRl0bh4=~WXIwy-B}LvvIiwxM8p=#In<2sX(u`M@ma{@e
zmh834$OP%mGZYz$46=#llU3*aa|jRWs7lLgtxJF@6^Hd@*`N(uCZwM9GgVDk<~<J!
zP7{8R13nOxK-OnNTpkC)Wl3Hf$>`cL@v2!t>mSyV+E@gfGT6l8ayRp$z}{lvPFZbH
zoKtd_^Th53%Q-4MJp9iE*n3F+g>NI7V9H+2mWn1}Pv*lgce^DRnU#`4kkdp#d&zcU
z3NOFXY-|)h4mOM=O1mA#;d=08fd9t^Bjt;b@ybCOL&K=skmuZ?i38bxVy`En4W(;w
z(w<E;(q0+IiQ1-x*;$YJi95*47JC>*XA(BY_P*XfzK?yrGj?un*+1(n$WZ4U(5)|j
zoccXT1BX4vQegdEUk<#K*7}~>lUB*2sG)n(mdET`L)YWfR&y+EY~XPKXC%ULp?gSs
zpvr3MQEj!Zp41lps3A7tX`sash9<Wx2reT{6vRk0HMIcLBe+PBZpUwr4lRu^Z-Aj!
znh!zN;pY$3iM5r*o}q-clKZ;~1Mg&=Md%IrHa{8MNEJFw6#5x&dRH{|U~HDY=m)D;
zb*j1($~^)updsZ`<V1P!r5q-i5oVu0K>gXDVJ*m$*<bwWqLd}?4JwL!8d8#D&Z?oj
ztCGjNgC3a{j7!#kn}wBt|K4#D*Y%v4t5Z2wVZ*#zjY87!J(8+cVSy2Q4>L;siGqA2
zzsINMpONm6VFg#fa7GfbLZ}P+7uob>P2u)e5+j2^Mw=O_U;y!F{dW+V3UV22mcQLK
z1nt*r5O@!~*RQTEnTJaxy&em$BoRNkFsYv$M-LO)*EN$LPkJ{^EYxMs#~C1#r#sNf
zCftny-Tg{bQW!@cYYN8K2r3CwO@Jdn+B()+@UlF9n?~wEJ1Z0h9*B{lV1&z7INr#L
z<LC>*ErUukXWn>^(p46kd%N02U!U*6KnLwB&afQ}54q;+S-rGoc5kNQjFAhgn|&YU
zkV4;Ghw!zmW_8bEfBT9jNSn*iLwH^7=CJLL!M<bt#pL;qk6vT*Ge|hTN%FlV%W{cq
z6F0C+A%6q~d6&yaZ{aF1NNI(ll&TTo0AWZC^8|dhlx&3+LJ(3`*2^8Q{mw)UlvRs^
zJ_RwlxDZU=F3s+=c+ah5Lm&5`!kVB%^=yY8B%ROp&7}QLdyR)C+PC*ESM74gEj*wV
zH3<6{h&d;M2)Y#b^L+tEA;(dGQgxz=IDq%i@6B|BGLANfkRKDn>}+^_?z<A7;%u5-
zay~jpejKNY>;9JTuPqC_qkMaW_5c^aBr3TbrOQBytPb8qy2IRPjRG<^Dk`+A++TB>
zt!4v8Yl=>T&W~)DK=tEVQCkUU3vA0Opd*f{I5q=0ky14H5Qf{F3%`p1OI$Z12ds_6
zF~5R0NuRl_z&@u22YokV7O8O?(~F)t>MdygHQEE}i7A1JiT0$8GSrW>dz@Cks?*U-
zlPe$zT}Cb|Y{1mrY7M_G23M>)=eBNbaQn$Da_Z+lq3Sz-xSP}5PuKHai_0MQ%IF{;
zqqLgC*R}z8ZDZwFPd?BO^^7SxN$)y{<D23wq{P<AKy)F>y+bVGrJFwe4d6>$;lbXw
z`nYeY!st^!z`Aip?p(Zm9&~`M1=n}-qKdu9(Dc@Y4dxG6hMfjGST{O4q<Z2`mi|Fr
zOn0l)h3&n9rqk|r>Ef5@<wYak78<G9zr0(rL8sV&pBnjb!2+Y~_!bsa3-d3<;Le_u
zK0}H+)Yr~9m6%fY;tf`;K*##bTj&J{ZDTBBe%1Pu!OzEs*9EQ4{8p8H_AZ(U<$**%
ziT?U_lIL+f=daz1=#>6K2&!RT>@_i}A8iqtng&Ry{(tCT#wY0Xw!!&yn8WScm)O1b
zoN~wV6$A|Xut3eEU4j#@HAD!dpJ%)0<Xl&KFn9D2hy|#PeACuE$tzv%HQMTDDX6`>
z1$yYN$`CEy%Sc|kr;*2Zq^&8;i2&n=65hQ5;X67J938b4CEg_RQ(g)Qyx{-Do_C_R
zb48VB9bAQrS&xK7U4LtZFS$Y?=KG+BoU<{r!T|?3Rf8R0HOLjn(ORZ^@1Vxg15bj*
z3z91-Jr>Min~Muge&YdjT%weAMELGbWP*SoI+sqaQlTzy&%bYM<fZ(GmB?*9uI|ZZ
z$WV?NkK!ef?vArwjbt-BLqiEh!`Fd|4pDuOO&5T?JoUPDDGfKeeFA36wnYAn$Q;7k
zrStw6D2rce-HpluAxF*R>GcRCzbg@4T=~d(fLdB}7$lay4qW_Dio)Gh;5!(Lc&};3
zW`O5JDpo8J+AU#?eT8*-=Kv7tI84i`>J9ng+eg9Jem>)Mt3pO2!Vq;)YThCf_ZbUt
zc>nDvCaw+aG`=AY+o>rkQ@%8uq<kK+N_rl6z+kRXANr|Ivas1n(5+|C&c_#PVPGdH
z7K2qwLKTrj1&ayF$TJxL(@ub;)st{r8q)^IJyv<<rbv6AvKScDYE#lpo(_*BB3zkt
zIOmslhTI&Y6VA)?x$Ye7=Lk(T{OfJ72QI7y9(sT$X#OFLMW0AUfR8eo9gJ7)exg)L
z@*=v#?wja<dBgbJh~`+O)5tU^kvm_`%O}%`HbOqdSyS#YpPPRM1ag2_6(l|_BS<8W
zaDx7UfqTpqGK?<&YP2zEpcy4LczbGY#9k3Zq=`Aec+s6dPtUs6P>I(51lWtE_<FzA
z_08zDP43KJ?VYaOQ1HGAi_&6!i57TAP&x=oOK?Osw?V_)D2e^<%}AJ|0aS}!PO_iv
z*2qpZlR!^{ksTmuk;pd&5*3btPL;!2Bqe64am~ZJ8_bx{>d~=ZtF^V?$(<xf4EU+a
z<8StDcca~~{1#qgwS9{V*Rhk_;wSO_c_Mk**1PI%4hHSWf5b;jGYA=sTic)aw@>W0
z1KCXEZ3ebOT)@Q^cVBqjd%y%~%mS(N+8oqTs5{9O2~G^LlvK+T#gAd%7Z;h8oS=Uh
zjlC>rf`r@qUidlkY1f{x=NawG9q4I=4!m7)Sa@Lc>U78ia5Q#qx;W0708zSe?wIAC
zK1&|H4<roFDLuhlo}}(BOh7N;Q|pq+gfBEHri<a#XXJU*_^XPE@NxiJk#SV_4AFw_
z^g(jqnDfAjbL|RWjQ9r$&f@CPQ1h`2>lmjT=(RJ7Fzmdu^N`!2?yE2vijI>K+Q;&t
z^gqIgy9zjC^y&4Z2Oq%QWcVY07qhMy%JKsrgv;@kC*$$MBYp4gU+&376_ahLr1$6l
z+H<@>MyS%crIgbncp1r`Hi)Z>^;u%6B%!CMOHrHIew6S1@zXN_v8VVlwD`Qf1Z&^k
zSPW?RHa>p-?61b#YS*!b$-_U>10^sAM0NiXU&FpPeuY0HxxyQpHnXWHG~m8Wug&u}
zpL$aHT!?h-v7vO{47q}fpp%uR$&_EZFF74}{9qXS+f2lA8#t`yJui8BwyZwIB?|DQ
zdui=s*JT5PL+e80EI!1qW^Oa8mH5Q@gmz@OCLl0OQ|2}|o32UB-M&^q^cs=;uEHGL
zh;bpaqd|!pEp2f#s(Lt}O@e~#N{Qv3fOUa%$5K34IYkQbyBbfcq#O0vmgBLq7~2G|
zTo1OSj#pM4sixP68TR#W_JR}kIcnn-JtgO{fLxXg@aRrI>VUoYJTGmZ9PHQ+bsv(r
zv9iV@D<ci~+i|3nA{bqQO;kE{#thld{T2Cp+*<9)b;sK13ntZ1vtYr`>42Po&VxcS
zUWgmmVvG@b7{k%J$RVF|Zo&ZIcNovJ^TOkEqbTk!y$m0cla8;oe+&Wy_cK$uiyFru
zK#)NK_lZy`O%^raa7kLdt5M6Mv8sXtlal%3j&N%}{0}oM;l9H^5{R_~^g(RH4Khj(
zcRCjEmd<W$MHnM-SV+M_BW+=EX>mM~yPxuFI$neyJM`Zc5BQqyg>NYJyLCe;@UdCa
zTg?;AA&l=?bp=HDIrx$6zB=FQOS-~O4R{hhZK_J&N(qpN_}w-%(QHFS$y0vD_GJ?1
zh=csqj|-ff`H$#8(|aGmR1K8RGQ%eD*f}a{u(8jB0hcC(FAcfVx)V?YqKrD5Xvs^6
zf$>@jN-OsSk5(v!JI#jC<}^KOr-B;(w$E-(Y^f`9W`h2<Dfe6$<uO1IplyLzgnG;=
zzB0eJxKvmw&Xu?re2m>3s1PayY5y3=<fM*fQfKFS0a@qSlL7@^<x^K=bOIGJh1^pt
zT}evT)78&v3ry|6c=*sOw{|eQZ*PI|G~Debu^1j`>f=@fGdDN9?5ZJ^mzOI-{L>VF
z4LLg;UUG)VK>GBLzB}BI-QTObYyG=5!@jP>I5Di3i@D%=oVpu&CX<)P3VZAHrir5Z
zzW3?w2&9-PltN^M^-YIi$8$`JwRbk(N-1Ulma5v)I1EQ;Kojf*OugEUV@QotgB9`9
z-RpOoMq9(U=8@J*<IP#KGuw?0)Ym>%h3-SEWVvIm#4wE~k)yvcsZUr+dzXaL#cvCm
z9+iBZ**ZG$?F~LsOY179JZC5jBhS$6=Q;>d%>@yZFCz}lWLWtZ#{f!zc!%8Vj=RfV
zenIJ)@eGcruG1T-A4d;6zO53+ls<1<v~ceIs5tl&2SZV8R}*W|rxl$gT*>sr7sH)D
z^F-pq><Lzm6URDWk(5GNB>c351+owQj}BHJ^TDHfirfVkco*70fc024YxA13U)0J`
zX5yEkn;V+4*>e#`yBKHm9>kqE@Q*@Qte2+qe8Dr(Vs`;#8ac3SwCSP_alDpVl8L2Q
zE4+jIHZ9J2RMxr>DVZ23>2<1T{!L>d&Swl@TxLKds{tazs9Kv0NFrQ?;cUi&Y5sh^
zNONz3W7@mHa#*jelp{se?b|)6SihtOAeu~!fi~HH9kr9eCp{W1nS0H%3($)mndjm(
z96?q-QX5(@pY2DSb=Z#_plo<F0h4Icwv(ZguCMpfd-%Q3?Xxok+kZE~ddV&8tmkn>
zHk+*oGQ)`|3OP47cfQ{oYl^%+q{>cpJxf68x715%NB(6#g#DV&F~eX_n!|8CT&4I)
zOXL3nZB4C~ST@=F-?N^*@PXA!Zl~zY;dz$$$A=*wBqT@X%kKhLAlzFmE!A;|gd4p8
zuGflE#Pj`(M$`6>c*i4_t;9g09OBi)QW7e*TEkih5&Cv;^Xm4NU;XIj=0${iON9V(
z))1lPQ#ew0S-xX)G9pp~bl)^WtYlA6hDvQp%}tPY15GnmgvhR?;oBQ;5TKu_e1x2e
zi{N>G2&p&4i^_<V5I9B692%h87Me={1l>Pz5NR@(o>}M1X(+D|!$qu)sf{lD3vw8&
zE;E$$yW2Ii&0HGCd&iF_QG&<7H1pudz&S>0pa;1aF57tGHJ@KbScz_e{j7k=z)weZ
zfOWpbtDAb5cbn!DW|T8JC8%7IlAEySx6PQ$CYmBY_=n;1O;b6=#5xnpiFb|sx03)~
z#|t%mUYL|h+Eb{947_*m-~*7Y-QH!W8=^Ftcy$s{1RV(qah8<P$>bi7Sj$o;CIBQH
z;e0aNNBPjR!N#Q~+3zol21%c10+19)!`Ts^ik3jG(&nVZLG%m^)Am`Vk)wh2`E$x@
z;Ry=hx>RK?<lHNgka2J%jA?t{b!Y`3dcax_+g6|S5a((veR?J+i=ASwW-?Sqd>xjn
z{5`wAa(7*7*T3mlbN$eR=e)%#otgIqnS!ou@&t2?)n;8_YaN>ugL@AwDMmvo<?rI1
zqu2lZRWY5`e8Q}d(%O>c7eN`6jJLE(J#>;ATWN1E8zL+B$aG9LTQW2`3$xjN&d1!7
z<Js^Qn}DxNev8hAOq3h(xg6qo5DSO<LAq<tWYic}U-yhfo|?rFP~7HQz8V$HflP(W
zR1ue>is<50q4a<O1aFzD095R~-0im!D+p)jYoozeGQmoz=Le@O9fhfksu9dO<!tz`
z`TX&!()r3h35aXS#X9?T8OlJOonMogne(d~Z5?O<)IqGWE4D;xFP*<o7{xlOv9|->
zuKt$XAJ0XuK?vV|ts*HP%hce2BO0G)>Jd>LeCBn!yBiCmF4dQu6~JFB?ka|3dAuvG
zzU%6`%CtM=ISLzeKfwb!aYI5JN0}^JNZR3%58HPFcQ>@wg3xBMjI3_4Kp^+~G{8tG
zndf4*8DSu7jJMAwi`{ri*8Vn;SIn4+m?9lCm6(>O4pE@v^a{S%<R(qVDq&-6eK@DB
zx`?EY;Vr<x4VYll+P5f!_?%+K)Y?vj`t~1=B3A_;$)ND%Gn#+!ny^%TH#hn?@$vUC
zhKDN7F~T)-nz@_XeM>vP<43czVY<Cu$GgBLNh22nb!27HztsN)eJ*?n7eFgt-2k=+
zcalPKHI}ITw7K2NIV!d1<P0f|*Lv(?@@dz^wqhdjb!!^`NkqWuS`y}NN0mHdJtCKi
zno$wf-Jj`6L{_LD8ISiqla(P&utjVq$i&@lvXQeam_&WKb7tVx^w^zy)o&#)JV2eL
zpYJbRt@Fw%r6}-I#Ow{t3?}fvnhc=p6(GD#p$yXRMQi2p=*ab5IrOng)-wb5Z={*b
z4-%T~%laE>cE9{F(0N&!KE6;&>_C1DCq2_<_9%iMYl@=uqECy@$DGd4gTO10h4`-q
zLMJH%)LEpo4h&RXUDfVC!sad3oN?ezrr{|0t2}`=xC<uj>KAjs-gpDxvw=Lj)Vfaf
z9dRDt-T1qu2Q@&*h~j-8CHPJ_zm4>is642bD$0*YcvJpZ97y+I%_Y+i$`*Y*nDtl=
zuGk4RSDzlY+vl_2*_~{Ok#RBph|8ybg(ByHxfkG~us7(LffpWTq5>5*Xr8@1AhaGQ
zj=Z8TQ}7OMlz@8`_0x^0pe_}`e61DKHO;LW@jp>O@)1w4!Tbr!>H?atG^H0#F$Xfk
zd8>N67I{I$Lzf5qsMEekiq$deCU2%4a~LfGL3A+Gd1C$p^hsQ==tIZ98X2&_G`0GU
z1_REg7gON1RnPY-`LqM&HKn<)!g5<ehyfLc*u42W9}m*um|xzmcKY<NUM~&J?BJKz
z&6aZTHrVCazEP^zf%pd(s6JpNm)w2t+JZ@nk*QrkTMPwGXYgbFwQew}z73O^uJq>7
zg@;k^NWe-NA<R{|l$XqM2gcck5xyZjXn6pO;gI_J=v`0%+%(LkeL~*J7v~L)h#Fr_
z%2(OTxFxYcgS^SKIdXeBPb<Pco&WW%0^$YoXI$)$81%7<^K3IIEpInL%1uWqw_#4h
zSJ=}ErE#ymmo>GH-hl8M7Vw`ejxZ6QzIHhxXK8@!*X(L53r*bF9Vk8Gq1bX(*|}KC
zdCX-9f8_p=B%4;|pUO+ezPLcQ*u0CSh5tbCmvcnfVLK~BUxUXE$qCzGYlpon`czYk
zQ$W6~0e2U^m;O@xGc7plA5e_}AXVDhBLlN!g7`$_=9q}~R*Gm|Yed*;A7^y*B4^7v
zdSeAdMV~zDD;ZBgW9%}nVEscjbsBEk3c~0g?!2A2Q$G}(C<?i0+#I$!zc3o#)y0l|
zE}9nL#HK*R1WBSYSN=4j?2E*+`_4WjAo-V2g`_`Y^%Knqzn!g&w`4QV9(CAD*TKXB
zE+7V|v-_yS!pAV(Gk?B6{Xq`Q4Kzg!U07F2?ji$_u96uM0IP@gk6-O6f;BGHj|33|
z0jX(-0tZIUqhSYi;l)p3Bf*YCtMs3BoI0F&TQTDelc8ebV9ru$+sFAe4fGnbCulaW
z9`@5}OK4b?+z;RA>ZN#uzS)p;T&Cq2!ea0xx7%4=UPsaYR64b^wlrwx(1Ar1Pvd0$
zhdc_eI3Xp4lxD7z13LrZET4*FZn#7?!~HoMZ>QUMD$+8Mxt`dgn)C>j+t23C7{>rm
zTYoibcBP$D=Zh>n#6=LA-};3#okPw9X}Qs_`99_^=SpX|JAU)dt{bpF$Iwg$jRwGN
zIpzet&ksCD+h>YNrScUXmqj7}G3Dj(8o9e0LvM3UJt^P-GqlzP+9vPcvi$m<7$5AR
zb0ie@`ODLukdtY3`d*iBkVW6iM#X%x7eu-_NL$0wzC;dzxPfvTyqbAHclY1E>xi}W
z2$1KX2q2OG)`gU^3X*QoyIOAJchoRYiWSr)m|5YI&@l4)NpT@8plR%H(s$xeMfPeg
z3@2g0Qm1CZUU~bMrFOzu!ROrMskbEfWyoS!^kLyv4IOARS%3gV>b;RYQ`s=;a2}y&
zvQg1zBhB^bus@1~^%%sZ*j@LoI0torQB?QbZQC1SXghC{XsO?$8lrE2pfIP!EmNz8
z|K?l>ZyEy=Q0)cO*^`rzFK;R_`Q7B%8%Fo&aM!0e9KBhD+0eSQs&)_aE?!3_3DH3+
zGItyW&Ol!`C`>*yE?L8KAzUK|q`w+yK5^nh<X(InFQh$q6c&s3Z-<xROqyt_#TSc>
zB_4m$j@tR=SFbyyM*T0|6ScN;4`cA#N$!yDOZDJ3iF=}V`Id^q0ytQKLztVrgCk_i
zg!{O`m9ePm86pKfh+koP^paBh&I46?zCN0JjQwjSAS^swBM<bJ()ryrH{Na5z2=P6
z?!|V00(ze_`X)0oFrAmm3lfxc7P<@EBODL2n;nN}3*)T-L{i@`@!DKkTDs_mF*F7D
zA_UcW{1QE?dJK+ya2;3<HF6-z;_t{t6rnvhI!x#;ru3Y!plict4_but%fpG4^Td(2
zLy7dSMutsVPt%6(lRn1+fLoO#?H&s9azCD#QfVJ0KakA}!5H-XRNalOB30*O43LDM
z@db&#kdreidF=MN+h9Qf$j^Xh?B}=C8nKS6z0@`B4~C3p)wGJM1d==(Y%M{fP}D@t
zg4FKcB_L!nTy>c`Q2zJcMLsyO!0u}|L~!~|t)mMepc?f#mJY0ljXnTNkpu)EGzFu6
zRgWH)kbCM2BP#_X_Iead$=Z2}obOcsQ(8BBQFN~?r?+81s8ov;TGH)7rVtv6-~tH#
z`$9BQ<M2dt<mO=rvwb~6D;`!#Dm&@gO(FS{cedDXWsq)Atx)XG1@J593<)g2IIaQ|
zlc7~zzZygIrAwC-IW_i=6UfUnPSn~|JScp2t18p&#bh#6z<blLy}RPsp&4`?+pS{K
z1xkQ51d+n35IqA#Tng@%?+CbVBwJ8x0AwV-z2KC^n=_E0ra5>a({$5ucPwNgYIj?N
zxxrAM6-Q5J-hG}SsI<1WT)66loFP{pzYdP!o_hf5oiH8`6LT@fT10*t94-}I`#q(b
z<3B)+`1&u_a0$~Qg7)k3C*dMM>elb=y{iI}{l)VdM;|(QAB^0ZV>^S5i;9Ua3m{$>
zg6rjvqv{<Q5`kLeTr);vIt~tERh=@mWjRzMR_12>KDR7zGCN0wX`3j!mwOR$_E4vl
zzyZ_idHG~qiHknry#Bf>S0Lr4e402vU;d5mOrEE(9%PyQ!;QZvI`nDe&_Hrq(_~9K
zuv^E1!t7{(2CW?&{C$A=-0BjKl1cR{(03W@r_%l^m}7Hl+v2ffpq0!^R{~s17*&FZ
z%_swjz~I`#P9owVcP2n9C%_nI2pW>8;Z0cNO61i;-4e<k?~b3c#NA`(RO>(s<=v7l
zo|K4R2=>XMCNBk;LinDZbY;mfkvcs2hk8O<9R}SgL<3~fXI#MhThO+R8x?G#Dk!#m
zfiIIivkwZ=<r#PiUyM#q_dVwab$Hgdv2VXF=t2v^lLQbb%KVkGk6M&lt$(uk*~0mu
zOO_af9=`cxI@ZyXY~wsayp{8KN6@rfpzP55>w;hE>py4IT+lHjqKb_L(i{{xO8o~B
zdV;0`5FJwsIcbWf<P)@8QmPt3S_bmtUyt2(k+*2cCrcpq`FjIVEDXuCBoh2hp-S(Y
z1E*&&=ttuj4mE@J$ezqV1p*{tydl;9?WiCfq&kMZjw=>(H66*TgvSe1KjUWsvku?`
z4vwO?6p{h67R4X4+b1&7H3?iAmu`e{H+DSg4N+&nmjLGH$Nj#ez!OZ#Y^l#+tb?+%
zuG7cOx{6t>JJq|hE2EoD7kMQtVX<r&t}PZV{lrL&NNW9*`ixx6Eg=~r@0tc)34+`}
z*8Upc2!u_284|KEeeIC;=h`qLtU#xgDB=^S&q#ax<WSQxuKw0J{1E$T3T{$5<RIhm
zy8^8wexv=zWj}FGy_qD$T<F_w2asX>B3a2v!IK^eU0&t_0kaMv#v3^#ne17DpQzQ|
zS=-|Zt|%gr+8w0YJA{;AN2#}(&h)v(oqzLo*YhrnE-e2BSf|~YWrp&RQk~>h*Xpkx
zaGYs|ze7gny?*|s&8V+`$;+7!92YdrbSGLs{;GQTR-svOm$w+e8sqFhjQ+P^>CJNS
zZ+Gf(NTxIiJMTcB6N}ybJ2maz3>2=}*$V`9#p=MrMy3{n?1aFxgvr?Jo^rE=C;D&+
zvfT>=8PL>5uIj6M5h}5d@AH0LJxxdEU?Ygh)5e4!z?3??Ra}}D>KdIueYP~V`@Aq4
zc*oF(I>K_z*q|7qgUD!<tr#f~LXJ~Ao+qFjQ7C<z!EYW-3yr~c>8!b5->?!hMBlmx
z;z5qSP*b1n*@bp6%ii(3b3zJ$cs9gZ+{x=dUwsUFUS7>o312ee(aR?VD3sTtz#2sl
z*^Q)-T0|t23y%xV-J>PR#`O1JyP1b8RcKtRMOuOFwE+>emVz6<y#WMT#`+&nS?``Z
z?=3r{2i^WXHO<Ok!w&M275{;}iGBn2+!UfHFB`bLGzJXx<z)yb0TpMzOUH06=~S8r
zQCs`c_8&%kICza}ENah{U3pxc<?>X$*h+>THZUp3WH4fje2>z@m?bYH0+^}^p7o<&
zEUW=7@YfTg(cppmG13CvfWHvyU9nvsy;*M+w@Ky!;Uv;$6%Y8{j#JhH+lEbVSnKAm
zz=j%t#<I_`d0#`BU5p?t=OU6xVVQ094FS|7v27nhZz7mfomlqr?iI{C4Nfd@K9Qj$
z_%z?aFrKqR`!#q@QNCdP>Cg%eJu%1X(A+cJ3o9KYav#TG<tnX=|K0W-MG7hPG-+ch
z9wRe_Iz_pknx*7iQBH|Ht}Bf0Ec^FKRoy}n4%-96tiT<FY>5!_HV=CaxF=k`Yj1L!
zt9eOVyDL+!G+SxBuCI~ycI!`>m9^OOGLvWKFKRuA3JeXC=|&o;KdT+NT?g)uwDdn=
zPNv0fst->i+F#lIU(6ss1@j5yyx&GHDb@AmJ8damv8v%;BQ7_{l27&_IsgoOPinwf
zlGP)nlfXm_MXlkVrwzsmv5r-ULpm}~36NsEg$qdj)Y=0mdru#m;z=@YP^H2O%Xuht
z<<<`}k^+TMNM6F8^7{Sg2k}iUC(UJC_~4ZGU`yQZalnC^UJh07NEvU9l;Z<R&D-sr
zCdSOxRD1E0Jnc)`IXfF%MM715<x<H_h9*@!hD7!eew<hQY2*sB<P1CZXa#A`^qEk%
zYq#@0iA({U@ikTD`@9WjRriz}tI~kDEAEYJ45}C?lRYf=v&AYaG7P1uds!Zx(eoAo
z8Bq9F>EN4@6;TF0^pOD+cvLV(;3oQ|A+`j^%wTlJ$4KM^j)&=i?Ms$`4if+iX2WEq
zMRcP>Lh1oht|k1C>kE=~d0H*AX=SA>zRoM%eSK}+-?1EZj%{H0VOSSQw&<~)$RgU7
z<l7?=d>)$4%$3;FlxE{+yBz0<=J4&`^YqOc1+9NlaRLYnL3=q?jqg24U&Y6lcIUJM
z^m{^wg4=-uV)ge7V;TfvlobCBb+}e$a{7%q7U6m={}cV<H94C$r6fyfEvws-hz^Ym
zSEVG-r-n+mR4{iXW^mE;m-sIAt2pR6tRZ3*xZA8JPv{nrRP<LP%hJBb=Jglv{pr!v
zKs-JB7)9sT&g}<_xT$|>MO#~cBea0$g6fNb>U{7%=MXpvT_!9WSXr_R%dZC}uGX&-
z+vb7-I4^)c<J~rM=k4xP*Aluodi{y^?q62_;1&Hhsj)C>(jXm@5(O`^DV}c7M<K5W
z-1l$R)P904_O6FO5R_DVnOxMh@i0mNFtI0QFetv++fimRcNOxjMx1PiRQ2D=bEJ0H
zx)`As>{lJ%(in3QvAF;4wD?Fl3g09FRcOLo=T2MrP)K`Ef?=^D#oKW(%udNe|M_PO
zwsuVT&_G~l)Xvsgg6+l~SnRdKzAMVo#$;5<P5PI4s(uov3N*#sMc(Wg9vMI%!SuH6
zfxuFgL2peD<V>gte@h71RXsPi@A83^kP^>croX}AZWn<&ol>&f{u_m)St5!c3olvr
z0uNRqDj5*>BrSyj++t~y>g9&zRsjJQCFfoQW1=yMIN9`H&1L0O8t31p5a5m}$uc)U
zu8#Bn*RQ1B7x-T0r}ec*wlOf|%|3G3tg8ayPOp+;Je%$AI0}d%G)#c#rxE^l=jns^
z+Cg5OA-wNst9<vjs#*8v{TV1vm64)13Y_w^6r3o~N05bFnt~{*@U!4KDv);^M_t>v
zp#^Eh*|-(Z7-Bw940#8rneJzbtSrOgLR3ty>$cwiUH0=KMmWkTos@bZynbTU(U~<S
z6N9H6;J6Ywjl2-{H~0d%A8p-ld`K;T4%Hb_;Y+lNwJ7CN#?m+dgN7|2W*jv*$LDce
z@!9ol028H%t16PjPBkVg+e4?~`ljubTEj!F^Lx3*0-`u7NqGa-ju-Sb9>S~s-HOoY
zXAv_S#5c1q9Z40s55)>VBqKf`ZMx1MA;VHFXN%cg$Dd8g&os*$b&Ce5*xM*&h6LN)
zM&8Ww&`0d+GPJ-3m_~0+0pa@^=e9_3``llODn{^dGo?s{I=oO9nU@jPN<P3G1v#{v
z#6QbrpE4u%7s~U!x}je#UFsmY>D;9F$J%LkrQg}^xUJw7g6!E%!RxJJqqKmO@N+L$
zgz{$_o%&pC6^-I}bZV1D+JA%VwCG7-Wd1-Y=XmE-q}Bmk*1?200fY>{%%KNo;$GUF
zV~9m<|6XRX`zM+=sXp0tUbF5z`LT}&Rrpg;_Sk<T)VH=+n_>18C4#%&t(X}yxTnOz
z95|I>GE~y9=2hklk))Hf+329~-}EvU^idy09UZ)THX?!Y1NXU#$>t)@mftzkq_L!n
z9k&1HP3X$TA$A1V-Ayg+Xm$ObSvSHO8l<`I?H~tY4WX^0enRP_z?QmaSzwv%S{}SD
zNS54a<y?fkaZu3ly8pi6mVICd$a4(GO?LTvkWS5t9|ZK%*j47RJJ`|EGo!bPqf>F)
z+8_ziy1Slnt@6Z%3;bb{$bBUM#3?Lt7@OA(jSA15eFPE3;C&eq#l&I!a<%f=!}6$3
zShGc`-*vVdG7MfvX3>r<E}nSk0&<~KYeAt}8b=CXCDIw+8f=e1wD_b-Z=!kACv}ZW
zVDh#L+*n8^b<DS3wKP+S4hPNJQetk1qLb<HJ(Lb$8Ss^uW@z3!QtsI0Z;FIrihd+{
z0kW1LxXHkENx0Ui6>sI+J=W?+9E}HV$f;k$c~y)A?=I9&oJx0&T35B<zp20UpqiH2
zo(}Z01GpY~4XLU$w6yUuJrmNVz9$C$AmmQ&KS+=Uahq!lR=!>HiTH>-=zrYJI5v=S
z#;vBuhv$!q2`Sb4=@ogj`_i6*qP^g8Bay(S24uAZTNY1Z=`?f=QJGgyc8<I~ld(Do
zSX<UPy{nfBtpE=P;DzVzYRWV0{a#_a>wE=}=owWr<|f>Afg&3_SZpp+%T#z=VYSv{
zLQ8S@LFDa5AH;x6(bD2#BkdsOi~1o9g54YUb+K;y{4Gimj$+kgMOD1D1lXN)Y^<WK
ztMK6Pam)J{q%YCMK@5}b6%e(OdlZPf=C>}}Zf9Ah$I3|ym`kUn>PY;<CsPo+OqW5<
zsF-lPCET<Ku?hSO-Whl#f6ZqTmXfJa+q}u~hbBn)!CAPHYi-9D@>$>Bk0IH87u+>Z
zB(HESi@Q|4?4eJU>2L@(JZu0W$h2L-1ep|)AXpHtD3iF4cf?*s=&^z@J?57;P(p~>
z!TOMTz^NgTKJNolXAD+$I8#im?Vi3@!ev?Mv?8MaNsORE==~Q-C9P{TP6mDkXCN)%
zj+JZ6gryp(nw&=H2)GcN+<Eqrm&umDKm2D935iP(k^1jx*{t(%8XJ2W>p1Z?{EZ?(
zf@Oa!SWS)im@&2P$_~phW70{H##S%;?(A-?_?jZVO6waLJZjShk=Z22As-5q1cL-(
zddmV?N3R_#3(pmc{Y%k^wB9TB0ewWC;A9=OTYtB(&H;bh_TsBQ)-JY5n|#0L9rj|N
zY;jTKy{9PN=B;bCzMi#RmCxvnotGq^G%bKhz$)J(=V~BR`9*4$P_AW(S{BHt4IfCb
ztqzvCKcDH(1aK07QAk%}#@!vX#DM0-k6?`_y#mxh=M#BUFKHqMK^CewiV;k~VS|C)
zC*~tA5nso&qmVuUssRebs19N(IDaCVZ<ykq*DVk)WH0F7%?fZJM@EF33Q%AK6jR;Z
zR*XtSH2@s%Tc8IorMrg%yJXZ`Jc0<^9Y)9x>LE-V2CUv;0kzC~@e;aF<_P>W4EGA6
zgMD4~V6?9684UB%JT*W=xE=WELI~pale^rB-AxecDUbpz`>er+H#8qCK$dM@tK9iF
z<Za}e=xs1+&4q@CN7*losP>(|EhEilJ+)9M`iPaSn6DKIe2O_N&u;`0ndMYD&r9xB
z=W}mGgzuc<^BYaZ$x<-#{buosPs}p7asE9Gw~W=2xs7ieZt=fA-VaJ513|I!l1+xI
zM=VlWAco{!EdgrR8b+5a!Ce9#*n^xASIb^lir(GC+g0FkCGhQ;fD>JY*zuaN^MB>8
z2PV^B;)-Cgfy4N|sdaTHh4IZ+sVMIBlX|s#J_}t>q%!aVD5_Oas$)oDt$xQm6P?%J
z)xNwQBu~*PK>owD8^VKUo%lKHQzNY|iis@5mV}$mkS4P(62|X&Ix-CfFUznU^q$lG
z!=tpm*gO!EErKFk6o_0poF1xL2e(9q-RFIbI3d#2S&4x%!&q=+XK<4{F(;WV`?^fc
zf}V-9_mXtINdn4k`_vm6XPSMLmDvig#-qM!C_4BpdG5OASz~^KZuk4H3*I${Q!`SF
z#66VD%N#Rm#Upn9Kr?mq<$snJlJ3-H(^Uw#-l1lwUnFz=zf8_JXk_Rl%if2HO3Z+L
zG^=ThMtx?+BDvlq53%0elhS=D^@_USeT^h)38qQPO;&BJ@{%Rbw}lYwm|D^u<7u+?
zaVD!7_MT7#ejZ!X63%8T=glP26LLzOW$(|VkjPIGe#&}%-IVPIEru-(4Zfp$QA4iW
zs!#vme@{2}BLmyw@=34BpWU;-@;syS)c238K7cZgE7fo=NiHNm!)4*HF$M^Z1rSMz
zk1To3mCrm&cfBQ@E*87z9d)iKmEm2%bvUK~_q>9{hlEVj=1e+#$f`+^@cb`SehMnj
z>1_ZOQ3|P(mSEt#)g*z^$OyplHP($Stn%M@?!o_#kN_-VR#2QEmC%rU^E4%2T#<0T
zXN|ffbZ~R~g4@WQdh^89ouHngEI#-_96@|bC8z&tDz|L2$#+8OY=6ew-G&IXWNX9@
zzJ4wwJ6Eh)a^q*a<~fj6X*Q<cQc(Y>k@GO+dvh<8H{Zk2wR6da*=BvzT=0HI^5az5
zj!B|(!BJ;P*`o73PCIoF<h3Mm1gZIab>-Is_OCi3YB9TP&J`EuIH>_^o08AF%NcKL
ze2?k=M%)EGbZoLEh%uATvr}-qC$-jCbKto%HIx-dAUJiuN6xV3&hT}Wsv!wtpF_46
z`u6j{i{~O*)sGZ?ZnTIE&_^T8Pac$EG}`(g4<&g?AXqA$q|$HBKEOt&)!Y~z75)bS
zdq0T3ly_F%neYQpRr{!WWEh&dNPZ!=7qgE#6514r{)PM9i&71Bf2XO}(t=+HAHT|c
zGno4E1*Kf4-<~p0dSEW7m~?H1k_adYC_3IQQrH@r5@0_VM!tGZy5JoeZ$66`#cK37
z)>+$!F9AOHcKzGUQE*b)Z!Ya_Mn{K5{o1q<ykx>%4*E{^!Q)3vxgBvoH-RPD726g^
zIyn2QoZGg?yf8cJXWD11mB$5ZYzGN^kUHZk%UF$5#t!t?kOPs<g3tn^`^PPzi@@ip
z9)JK)Iru}t<KM~G-2YWihPc-ci(#o=tscL^ukE==w88cGA4gJfC3wfUADi1zxn2HQ
z>G-D@mmYo2(bDT8J<9>P8P5+kjxB_Yn)J*86=_{5oUQH#(Mxn4NftZorhYtmpvltR
zPm8ZZ%x#uf>svi`yOGvJ?hm%}w(-Q;L6PeFP9oMu6oxMDo3Vn5My7lY@>RQif%w?n
z?m&P1$e0p6({;cQBQfl^F%~SRUdedW=aGVAa>XOX9LzCa&jerrJ{o;$=MUpv1s;n(
z#jCZ3J-<AjrwEob05GYC$v>4<M6$MaH@56a#T5|-`EG(^#mZCwB_|zs1KE}J^g@4i
zY8L2M{=*ckP}@%b^%MWoF;UDRF{)|8t0K97pKn(D<~^zJ%R#y9;fuv!nmmB3LQ_bD
zH+%0VPuogs+D=;)ogvkkfc8^H+;0VWPO7VwJ#zS(yyE>>37#A9aD5)5;Bhf}7Hzq$
zxHoQ6HAfFR9@0X3u~$$Kp*PwA`vvlC=gi@5P22zD=-dOD`u{(EXgDr$hBD<7o5Bzq
z(rCHNr7V{anJ6rTR7mcZ)<R6glFLx8xuslEZnZEXQ;CXP%4JD%OG5aa@9+Qqv7L6_
z@7L@3d^~PT`ZB(<QYOAfKK#)gj=Y84U6~_pcW;Sl=X0LV5-TYvfDeP~@rtZ~j1!}C
zPwB8a{>k9Lmq4IqY`zF)fn2<re3Xn<^dcYUwmbLyb&$GeAK24uX>Z-}{7Nfqo%@qB
z@^%&N!GKL!UMMZE`5h<IaOr!#$kCRq?OtJ*vGKU7(ZlAQ%s|diyof~|ysw9)4=bD1
z{=*llnpe?+vInR>!H;77k_Ab2Ty#51G2>0(pVsJd?H87vsUnJD-?P8N*`JP7E$bu)
zvVc6?Hm|9~(%nwKzEwZ6SY(o5U|P14bGjN^&@iiCMc-XmXY2K%C2DzkWHInITVpv=
zKcyT`Ut(R?qG+A!G$^dIyLEhkWJ>}&?7(7{*27k6+U(S#fQZ1kU^%91P}hBF@`bq%
zfmphoQ6#eKw1PnEh4X5lym0Zr8?lp+Hc*sV?$RnP247-%3}BM=#rNZYWH%&=Hr+dN
zt!fz_hNFlW-(tq#@t1lwfU2w`nR0(Jv(ApJ%ISJM!@&3Wro>aZ$IIDu&aYAKaCi&k
z%MGZ{d;1D8U>7kj6R~8VQ}xb#HD^#q_3k9CS1*7|x{r-xF2|>&*w_dK)T_LI>W@;i
zdzW_WUE0Xc@-jH}hop)vpAyMn+FNIdK9POOdZJ&>mF<JU^9EE4&8b?Sy=b#;y<$3y
zW8Ag*Mxd1bJLl`xpA6yYpX+~T`WI)i0`@iAlVM4~QDQUP%W#g@+dwsbQ8{2}2$u~0
zg+6zI@tTjv<{F<CGJD}uYQN5H`{Q&-iBQ}OM>=s=(1CGw1wrnv8I6nZ_V94S;X|6A
zF1ViVCEtQ3rnEFPay1<r3+ly@*IyDS8qX$U1w6=ur?ss<&tGd;4Qcaw$iQ<@84T<R
zwn+lo;X2_f%m46R{+s-d)mn`mUWfvej2ZK>$nBAa!GrRKTdD-Vjn>J3-|*es=gUNm
zo>!>L8(!acy6s63x=)9~gCf%HoU3}V{j{DMIqTALTG^)yRV%)8jn3EyqhIn<;x+ru
z*{M2XNuS<cR-4KdJ-Z^_qu*fF^!e=34<`*-WO0~WG%|UL6>Qi2)Q>MABQenKAdKPu
zGwpS`E^B)6DzTl3f}IRr>gc%gYww&tih8o`Ozz$84B92-IQL~_uF4KpgL=ANKl#SO
zV)W^7@3rqkZ~>Fg%jpWm-yA`fzUM!7&A&R?*>Sew(-16*4x^k_zo}hZtRb8H4xWtn
zj%>C_Yn5Ephl5XBX?8AE#dITM?Bv$8HF?H_#e-dA!4$hh7{*(5DiOcWehFP9Ejy8E
z3qCqDeDQGN2^o0LeUBByh4US&Y`D);aoE}Wzv&MWQN4WfeclR2s4@jBmHW)bY}+6~
zauIGI&1rKU`@r?6XI)&1W{;QveVl``gCEj{9Cx`={;A&C!|bM%d;`ASaG2;WjoXRB
z$f%PJL_fg)yB4EJe+-}R?a6dWtbo8C(=I2GvvVf0aHu8`&;J0gL{5Oz1>TgMS20j$
z*&fJBYb?^TI>8kHb4#%8rl`GfY?buoY>|t)dr31hS4BoKUf8KiO}z<d#G6c%aaX1&
zcH#F742{E=#-FZ+fs0K~|6@8@^-Gys4X@_RE&jdedY@_G6UgQ2*%_TA&%O~$Ol3(=
zY^pxHP-lTc-pSXvhk8HP`$)QmJ3Dh#$tqo2qd)MuC3?CiObk7V3KS_`X!Crpdev11
z&xlvV{QG*?FpyI+Q?u@%QdPg(E{$RJQ4q!pk`ATx3GKznkRQH81#*q5$4klJ!@u?X
zM#2vmyY4T<+>CCyAb#$BmYRMZq(X10Oq=~^r*K?#dLnW7nVlFJD<JHCcb-oc`#?Yh
zeKO2NR$G9qf-QDx3ZM7O!xq;8S)CnW$3Q6(hxNnxxCmNrb|boB%%0RrJbH`e=WSmY
zLdKF6yw%&sBR|avm9@jO$;GR_O}B$6fSt&)mK3S}JqJa8i0{pX3<&aa(hpC0acTzR
z7PDVD_~Gu5?YcpCM$l{|H&ga4zw`yZZ>*i8k1FJJVJG-r>nR=0!AE-3eiUa0rlzmC
z1gin278J#+vW3gUGfkM?9MfGxs#Ww<yg9E@;<{v-osv-o!}Vk9*7W~;B5zk@`<4Ch
zO0U|l?kn|l89Y=H856S&qk~uHV_F&Z))v_M;W_%r38J<+yb9|b`}e0Ib_zyirLVfz
ze!duTrd_g`2U{=V-BZao$E)glw9aYp!t!!T!k)!^x+sJ_0Q?oXz4FND^Cex5j?rIF
z7KXo9035J7Aw0z*!E3;!fwFg*ac~X)5>uKTAEv+lJ!E*~v8-J;^L?)uVRQT8dbL+i
z+HXs#pt;uHgBuBNu_qoVH0JbAitY63EeY5<*n6S*!INZ@r<B6$f_sd1I8g4YYPH`v
zqcJm-D-(^mek-XtA(1G%OZm^z)YxikUy#ACeex>BpciKY(}S4k-#@h<|H{*$1I6X(
znx$6Me7bZmza?B8|3-w5@9OCBGFnpLsnSPB<irP^5rE<+mEzn^=*;Cfc09n)MNRzX
zBr9C}(Jh3#tw>VoS+1*7&b)#yFSqn9CaTF!ak2wYu+XnZ+DDXKN<1^<C0oH4i;)8F
zxM#+>FOB*wx9=wJQ3b9Kl$TW`^=5m!Socq~Z4#`4DXVa6(lvt@W{cBkUZs!wWZ#Z2
z3>J?ysM@hiRHV0qv{jJGQwMOu)%D_RA-bEZL$K=+@7zQEMO^>kRTF+?kDtBoV89`&
zKzG#1B)~-@dmx-GgcmD>^kM-z!Ov@WPN^@cU36dkb%hUac<C_rW9X>CzCX&`TJ=6V
zjGW@B41q%RyZ;a}n7%%qZf+|YVTmhN@6k5@zIOGCyu>CU4ieN-+S&}Prvpl1I<Z~h
zns;F=GQ<8lLNEcU87wY7rldtpUeVq8wU~ud!kbfd&^otthJ%iX-3Y#I+fAtF?5UE7
zcMT8h^RjZi8`r-)^OF_y_15b%r|&Me9?dVdW4e(qoJZ1+<Kn<G%Gw%qh@G8)<bB!y
zr&t0aRi6?wwoaEgyYNNuW3IBf>;o|*4chtcCQ6%aEEZ^o=I2lxnG_0+Q@_Z2x`K9b
zIUDt7O(tpT+pj+p(J#^`>P8;vR@{8q{7BUgrc0?7qhy|sL$Lv<z&rkq73J#huYmTu
z;aeRq?5yK~1b4a4tw>#GV^H`Nh^ca*Bc=9d;vh*eE%r^fA<pXjrLInijifoRRwslX
z9>Et%@0w`))hRzZYWTCJc^q>CdEo#bm~CWkTUkCAepJ)XvMgn5qBq&?%`w9ni!-?i
zIuM#%2G4;$KAn61U!4$`1~jj*k0k(cK=IP~p&RIgJP#UpQ#q5P{CNIF`{B1QVjloT
zU@T~l{V7~{$EzJr0!WOqp0r2j(1PQ}i3%8cq@|TP9TK!rcJ-j}fwr`y9gr~RHD)JY
zAWZ+N(q`IP%NEmv@@`eCiu&c&a)iz~c)EU^dAd(=;eYzAL+DWQt@`O_W4V3adcGM`
zSW!vpDOJz6M|q5(JmEXIGRR0Hl(O4I9$Qj|abDBc6yeatMW`=T1Z;v&@L0wmVjgD>
zAQEjIWMEjbIk@$=%_jX6%AMoXe=5Q4&aDS~ArecJ0~46!l<Dt_pb{b;AgrCo7E4Q+
zXRWf2l)~v{4?3#Q1>~Ck4nVE1MLM2#sB!kj`%<N~M%OILSLY3zBUagZryT|c2G)Q4
zNDF#U_gQ`)fN=e&N6lS^UY&~TpvRW~3p?zSye}RGoXVwiGV{vIe=PmL%&9`A9F#l}
z24GCWLM!(e;^yyPN7+THWYi9-80k2Urc&JbV0kw>bn~ZknU9>ZJW1@)Y#c;#9q8$h
z<bTM#%C#ji>bZJ|I}-cMQq$nSgD#Ii(oqVr8=Iq)HUc-u8Vj>>!Zes^==O47jYpc(
zJyTOt#fM7ftlxz}@aFDkqx<(`^P3lS)e_Ti1@*Vl&b;n&)gBQbRDWM*fMo%^{NSnI
zO~jEV6gHw^o>-M#TveGGeX%mDj~4{Qo{}_mnyxSA9Fg}(wX^PL)(C@zDC6Jn%!Kd&
zbUuu$QqNJTy4h7AlzGz6Dr3qlk0WtoaEPL9k>&URb`h1=5DS?~rj7Nwx$QH`!;eJa
zKz4VHM&kR8D+VDcDG_>Bg)EoAXC_P+k$%FSc>u#oBWh-Uz8Li6ng)^hYifV)Vwl4R
ztKvFYb77A<u7l?xKQEAqMjbfrm+ncN>JdicI0^hbc<)ekb-hp!LKVs?vX+0h3y<zS
z5dSoIP&&*YN7GIV*B8VVDE{EoAJTFvP5XS6J^5y8cF;QkA@u%K<pa`6bo-3SLtOot
z{eS|$l*xX?#&~?|eB)WNY_rYmc5~M<Ms0~`n%_JJ#lmhZlpla#XcY_b)QitQad9(!
z|2x5sgr(BX%)ipnn@1Efk$yWF$m;;`l1bXoIr5dEXo-7}<h3<DZ~~g>l#pk>M}7m`
z7T6Il!{Tr`je}=S;JTBVvwD&4AabG+c<I}+mTtJ2?%kI^5NsiweYltKhka)L#;F^Q
zx-`%O3j4}}XEpG*1G6m;k=Jz&nE@Q1#|G13XhH}?hggT^7AKo6JpAS(LHBGWf2#~p
z<A<vng7P%^xUX%!8)KWlQYPoOHs%jkH1Ya0f9QMDxY>cvomwIn#f*!X&gz2<ZQ$g`
zYlA*?WA5vYYq_F%PZ-`9BcNQLP+BR7aPniNaD~WM=*MNft3vqDZ<_lHEw|NTA?4)L
z3|xDH;la}PgvHl^vYVw#j{o$o{`HAp!zxRRm!337D1^&<{5BkWuRx#KwbuyrAX!|G
zS@T`Ac*ZYTB;r1Q)F^*T4d*4B-@99ZW<Oj)aa@0>Ob}7Z3`)eHIt{)YVXs}Q)g6M!
zOX@c4Zx+^r*(k*cX;X&66>j_Y(@ATxDN%;L#gb>)6gB0UT+s)y5Fdao0RY~?zV9n5
zyXaj<3@zK%J|No}*Wib@pC4&NgJ|Zk#~Nr=tte2(7B4Gp|4TYN+dDls95SK9XglQ)
zEV4_BCaZ>8`+QA4-aWGf-$RFN9F9@vaAf+)JYH7Ud*%I_ODL0!Vl~lY0L2exh}Jrv
zFQ_ETzgoMx@0hUBDf3bUId96r;gHo1<QvmCe41V3IGppm)^ut3&*Il7M;kWZq7(Q`
z0e%jF^S$iCt#QKbC_^LAYniez8h_FO9_;No#z^f7&G24)Z7(1RrMX20KM?@zESrOF
z-~og=rT+tH?6>m(A?uL}8clChkNQpCdtk~v<jSB@wDHnT-$KV8PuJ6^^l?_bMh^yq
zFiQPqj*;ToZ~P5~A);rhvd8^xy#^l=HP?NPk5z+4dvuF@k!^L4R2eEV9)fqeYRb!R
zNbsH>m9zH$7zg4wa4I+X)9@oJgYOWHb$<OpvLMo(F8ly`RkT;5!gugZjk;9U6>f2R
zHS7FXh{>1<LZI)xp9@&_K>#v0v&Atej_1`+qz#NyDH)(7Nx0bZb(vqhg>@S)biThy
zoQl=yA&K?tpy_-GpgR}E*&lkj>V#qlb}CZ0Z|RG3h_B7u^Bm4=6gB;co+;;@GCr%&
zG8AQB{3lfZxm%7`tFy*2d@ygfdnn<*dHH;Vk(n)%vd<WyFd1eIT^scZ<3qn4!iZ5c
zKsM-3R?)b!z!s`zGT81Y;|1ES0u_jq&cuh3RN6tKpZ(O1jc622f@y1cCC?Kgk)p(G
z99VU!aQgpm9wY|9zYDu>zT-HF8d+u?rQ0r`+f(<MqY7)@cpx|g`n$mT%)?f9gp~+|
z?aEot24UNpg}w}bf)6JAlRQ{#AlOaG^JKoS!*QTNhunf5av6&H{-{(pYRb&PncBRp
z$IEXCYjM_PeM1!<_6i~T&gLfRej@gqt|$(9I<HSx*pEP;&s|osBsW24U#$I%Szq1W
zg5Noa7=j#*7Ip_uW~On&5Xe_=Dcj4vlS`k29R_@yO6s&)IGM)_JB_ut>HCk;E-?43
z@1y=qq7}b@UyO9Dw}R=s9F5IqXw{|yM;Q0h6*{9LVzyyTjG>EbRqNkxy|W)RIqsh>
zX-02WRImS0ziXk8k<=p^iy;AKr<n)npfO-9?P4U!(6u{J{APdWgE?Ek)78^K0IgaP
zcQ3f;Sd(Fsqt8U7rP^75&om!uf&R(G#$xY(-@bWm2V{!m<kuz`Zi5O+P?Xrq&hI85
zet(Dd)>V=)w?9j&`LG2V`OiLv-$*`6Bn8OK@Lj-y{0inM8VF(NxZSdJAB!#QDhmG!
z7dwfkXk>tGlaslV$8ll4U5W_g`OMJECJT1u1?qW*5l@{zFF%h8{Sp#&Ir$b6Az!D3
zum^3Oo2j(o?8^GkTbfU0$XWd}I#_I}DN+0N%b6QO3L0L!q<WtEPwnR|Q(Olaw+0$j
z6PxGs>2If|I%clM0aW_X(_-edC)yO|DL5Y(qhj(&%yO3Sdzi`<OY_NQ9HOUvtUg<M
z>7qxGGj%9vw#A-&461tKX{&LsXC(SE!=ebdyy+g2z{m7-*lm_4<4IW`-?;X6h?~8K
z>M}5(V+CLAo<o?a5}din1Ec0;gF6>dJVbi_nA_0PD381A_mkK4qOu9e%v6{;v)gRp
z!F>FqSr0n?@A>WWSzm+v(%B`Pn16(>xf!6vMw+7Am!DhB%a=KygkuOqdcw&eT$qMR
z4}y-P;lyv$p0%*R9zf^>AIkq2#zD6SrddbG7rf_CL=~<M`W7ea_7jv`zfKKIb#&Za
z{o0uy-T0!(roXF3{YOavcQo?f!#Y6#vQ#kb&^m_7hg16~H2t_oT{7OzJpBQoPn-Qb
z=+9Nh==pwo_0L+l^(YKC3#lt5+ZQT^zpq1&cUJSZbsZor1f;r-9Rn0d&T8Lgbgbgr
z(7nnyj@j9wf~yO6AOza-S^Oy<vKv_CIhNu&mOOF3P5=VYTwZ@$Thv&vunw#84*-BW
z*f65FF%w%AAD~~)L%{}N=O_c`Ls@fEQ;|<plD?eru98*uNxlVGr;ClA_O!t1+TBF`
zDDO_f;Omh}%#*SALktk{cwu-Ew}0va%nawho|uN`f+fO7F43-|EPP;3FMwgN>H+@l
zAC~78ruI?kOa&l!6=?r(_4i8}s$W!tCr$d^e2>iUMu!+$X$STqgSdo>s06l<-+!D0
zyWKbT-erIahyn@W2eHDk94!5$BdnjzE@_=tAaRcXm{?gV({`F=rI1pp-sDF1r7}eK
zbwKFH@HcT|we~XuiX_NQeJbH8({b0mDi28uqiAXA_4-A6X#s%G@hqVSD&^<?EuDr>
z4JTraLkv)JQCIPT5d7NtJfH}G?sNro=o2S5dYt?5snr0vVis6TI`4@_O=&#yc{_+I
zte0RbnhFm%C#g>`cA4GAi;otJqZbfxBm(@mQX$imjK=gF?kzbYU-!rLNc6wZJ}?n*
zIeRH)Q~BIBZ+kxGPGEPUh)4Uw`_g6-aL6T`-#x+IF3tUYZqur~;g@;&%7C1yz^1c(
zyc4RSW2@<M!_imE8Y~~*QJqqUK-hX9Ov6<%?Gz@?MRo%S`RKF$#cc;U+qrkveDH#?
z5;HqBh_3Hw$1>gMAQihY8}-&%sfW{d`bnwcFJbwAzsIF$RC*O`hAck7EAUYTQiIQu
z?|y&8-ay?@L>NPL^Be4l)n)quls{t^=0v`4)!h(lkc^#c5(kc}OrtL~oS~ID`tCVK
zPs=={QWg1|Cd$io-1F@j<uZ@}4=x=B-gs#@)lT4`S1O)(bC?N<M4#;YA=8(aJKUdQ
z*gChdoBv8Mg@zhy-So+vOyX%quT-2H8y!7r@Ut`EG54<L%x1rC^mo|c^N?u{UZ}F^
zw9i;9R~0^xsW2b;3K2Rlv%GeBd(!5`R>x_0Qu<<hbDrb=+W^N=If#p6yoU<9+W&l`
ziMlh`2_*{}0IqKtMRgry)c$^y>P<noeUF$wdSE{y-#aX0mxHTkm^Jo>f)W;Xue0HE
zZZS*A5XpeA;!BYRIojqnyr8k*%yFAk9Du$$Sb9d--08|4g9c|m`N;lLaF8l;>)&Ub
zSW1e7a8^LTlS4z6b`GAx$VK#3SgLNJdV(is3eX?IDhY`7K_FF98qJEc@JmLXt%Aui
z7YCDS#wna@8zWz@T+vXO0*ts3{>&h8zE~tb5^b5b`4ScKN#5E&A|$4|h#v?<lMoZQ
zE+0&0BJqqyMj7wa$NFt)Ih(NO!moXpWHOk45+wV`mjtjl0jKQ5^9jFbLEWfW6scW6
zvqy4&cIzMqHBa=4WSKMI!??aTeEAeNsjge(!Z+=KUFL+v4~zB)VoTul{p|(x^I>`A
zHSulKu?FXvc8mg(21-s2IjdzVFiD_7BFKhls~JH<AGCR{j1=w^eDH$WRLtFy-`O4X
z?lE(ypQET5$)=~D8j6KBT2l<@<8_WnYXz!-WXiW|8~{Y4&8Mgd=sUty^)q)dab2Nt
z!g!9~c%1u%rojjyWVVu?l@$8C!MP8?$qJ|7uFFj~rW@kQ@$0<tDm>R2F4mw(Z~>oX
zl5kUj3om<^c79gi!9-$T5MY#-<lCaf`Mr6cSp!#eE?0Un`BBsb)B{*^c(KJbR6b-Q
z3VbKD)2JQ%J5D=s^lI8wYu90TW*-NYmsgf+B^S^kJv4lKi~Ma<^`kR><4<JbOHZ0D
z8J<AMnjpW?PU{JsJ)-Aar3gS;{+8|1<~MLrYwH6mwY=^NbmO53qvNz%q!Gy?JBB#G
z1(8`GOlmy_``WrtQG}l`cu9Sl9fq!%io6XqzIZWa_)AHzR|byr>*Z_Wlo++B!W2ID
zpH;UUg)WnRLt4C+j}PXLEy-_9oI88=>^~TjdpmxI)D+=I;Xbr+&%n>A9`CVA-s<WR
zzZI3@;5-kPf#>&~%m_QBOi%80O}J=~(-OL<Rb4^s(QFtlsa$^`Z=Q8`%A@vrNJz@q
z_W9-pwg6y^@rnhhp}uO0AOHff<BO%*Gd_MRK5~DTtJMDcx!oxD=R@c6zINqP2nOU9
z3r@4#;aY|3$7`-KhC3w<bMN5qXs}C%_q}@P<F))5#^f)r{bWt3@1}TFnt;h|DpS%X
z?Whb-tk#``o|68Fw~Au+4(U)gDv8|*_LKvgQag>Ur3w+oI)7T~uP>^%r}DxOFH!fj
zg3;!F2MZZ)n8L}-i@LL)FK)rq3$ab6LMG|}l<&rnN&pp^_leZyFnfnJ9T?i@a`x<@
zm7l-1rmyOs)EJHV8~VH@e0drklP-=l7|MwYc^^f9ik0=F^DxPJDhzgrqo}a#1!V^A
z#++040`FT`vxSO-hgabL%I9KMrUIY5H8&&@nP!xMU|VVYdfndB<mCQ#Ss51~HgX{l
zj{PqYt1GfkrfIRE*i#__AW6Qm9{l<_lKZ@CiP+Mhr0*$)m-^h%VRm+MW8@M_P@}z;
z*O%vdPKt5q(&frSGv7dTB3@9wvKD+W<-ryZQjnyj$Gy|*MCJ5OCv`N>Hf~0j){CPh
zY4>^#OuW%g!m$(h!IotX1Sk|ENGf;REhrF442SHGT6jGd5gcs0D>y?GRS3}~7ye%P
zxU%_HqQ9t<ZJNMG1OATxM`c{e7}u&@Q<tJpPT8sXTPoEM)vyN`c*ul-4o$(4`%6Dq
z&aYeh%`sN+``eMUvK0YNfMt{t4XLzWfM;ie@2RO#pRUzo?|(jLf7S)QfGD8)ylSPb
zo(__+wd%E;T$tI_WV3QDdx~N!jV0Vh4ssL_NRL|sUUf_8{Wpolbt+omy%~!=idS@v
z7&jpcT_!`@;2ElU)!52@_$j{8M^4WhZBBYusfy!grIo8pCmkqwa}$8+dB(R312ljO
zjKx7aQhB)++3$cbdv>L<Ts1#cmy3k`aHB)xN73y^K89>Q_jg@j*v6&?Z$<i61#u}^
zsR9UX4ru}8BVSN5b=bwQE0jgVO1@E9=1R0ZOjgypBYe&G48tfql;U4Gh(v<d@w)7B
zsrwh?j*%@K95l}nsmwfUxRZITY9|~klBEeLFeaP^JnbE4YSc=xrq%V<jFE9y$~65g
zWRl?-Jb<HZJhJ(fRAl&jki<x{MtVRj6Tpim1aE*RaN;JXGdkecN}o5ars#Uc1q_Zm
z3tM4(5hRGhP15p^M8{>i8pdq(oZC{>6!o`cXb5(qlHTq0uBvygsB5{#B_+I<UxAgD
zZ@*$RE3I}C>Bu<Xg5KOd`HY^a@Om+W-)$!c92#?`!{UT(o!6lL5)dQJ=!D|NO9LWT
z7k^ERD2A>42!B2Vk)qdU=As)+R6^zF<*yAV&GpE9e6d+;D7)QrPF{Qt=No%mb#sy?
zg;?d#-OvfL?!c?Hz=2r32_pZeorR+A&p*GJ*52FP^gQO%e6jG*I>`6;b0$m23gXZF
z7BeoDvGDp;<|Q4&bD16t?Bl<5rqJt<ndo0J`@4_-u%N@=IY-~Ws@592^L@^!@(A+y
z%iDh^2_o<?^~UN1AtNj=K-^oKjC^ps2Z{Lxe~lLow4^vVz^_c{1K?S;;(8JG8E;m@
z=x|MCdHEtt(M5Q(G*pTaRNkD<ZF*_|Yd!kB<;iZDq^$9ac0<nImBIh>7m2Z+^oH+h
z7Ui0Nqy_&7)gG-rQLY)ruvkk7O?QihE(~W#28KO}TKndiw+q%w#jO5{vX#D9(W_%U
z7PWd~@KNQD@^bNwHUgQW47THb+o{r1AvqvbCXBM%KjPQMKBLTAfzOMII{ZH>3vM@s
zz@G4L9Wv)47iZ6H`u<R+x`xMbzBku17S(%jY&=39L$~!idRq}V*^Q*8dD674RXPw6
z@*W7b-o4f+d=c8eyOxeh*8|*5g}>7J-o80<dSJ!3zPlDa1cM~o41m3P$?!KdF|~I(
zh;}O=!#>+hqzxA?-(6ET=s_EPx;Zkx^-?7c6h?a--%NAM&0(5%*?ogxr}?~z{-t6T
zU<wFngbt;XKqR7D#Y(}GfVc+<G3@lj<Dc9uKL!ZRH(g>WNnw`W*^A7UU;I&0wg<TW
z#8(auo&7P~R4K;L&zspw?>YdJLu4PooHu$`t-Adn%zM3RCEtlH-rv}|I%<486qvEw
z=b?Txv)9|q7BOG^egdabwWO}rEr~v#%tlaYm9=(QfR9HDx(y9FC%2_~{Tvz!*z^8L
zuF9Jk?Z2n~#e-7cN@hZdZuFcIv~+Sm^JgD>W6C!yD)^3YYv$$tK(0_MK5Bg-nlu}W
zlGxZ>e5|NKLV1W$IPi_m+{)e-Tw?4B<AM_|toX%W?je`tCkf7lRoe7-TGd`wb0#=D
z3Xi}$eeYh<XG^kwRWRCVLhd(b58$S%=vP>u?Qx~df35|7e>hjr58Y=NA?RI67%H8E
zB1k#7y3>{z=PrKip|r^!&Am-Ev_qsQOVRv(G^#E|qoh(l9=Z4aLm>;*K{}<t&&xgJ
zxOXQEBi~zP2=lt1Mf@Bd4U-VQ4nv2T)5{Flw@fO_i_A06$_qqYp9|ldu04%n7cnku
zeEqt038l<<TUl07PL|LE6sQ+jOgUu|j-Fu3@+9_|iisO{X}|l>TbWn2TeRv;)1v<s
zjf$m4Xd(=BL?e=#)qm_cY2eel{Nr%UQVz@l=|ELwpasdWooKK2?=~H=eu9HThk%VP
zhV$uHKuAPL5>!5$ebSk^68Jn0uP!56eCmDL89x<w{IiZo$B)S&qsOD(Ze03~<Rndr
zAiR>_2QGx4M_>zo$%o7vevz|2_&nxW&g3(v%B`P$F@6j^xj(!=F!p7%wdpJ7+@*}!
zxup-+7~1p;;QiUoXgM5RAx=vw$;|Z@t+q0M^2zKk`CndUEtD$5?WGf{-<whz<S$=}
ziCq2q;Wn}#<^J!3W?a5_8UO(}V*S&6(`i0B6jcKdcUf&s9s41t+;{e`sfjR>Sl^oG
zu}6smNd+jkLnfvKK0@A`3_ern{mQSI)%0Ii`w0d#LVj;Xk{N>QY=u3N+tj*us_1=B
zC4M^6;07uP-XnG=eTm2c4h{}MMl_*74j+t})6ztE(-nY1J9uY^ian+OuIiZ^^c5-$
zvX-URS*uJGl-Ib>QB3B{ApBe9dYFNiCsG;8%So%`>_<nj2XabU8yk1SAjy>Yk=u6q
zViPbcz$xPZui3wEf}rnR-YC~u=HEA%08#tj)L80RrDxV!*&mHrgt@E)p8ZT9$~^rG
zhNNKHy1lhgu6m9b5xxHHU*Gn>CTfIr<uHoGc%YIv;0w=Id9R#vTmLh&?Xnmh9TAvE
z(<yleWlm^l&9ZbQO`*t_r~r=e0l=uLKIM)A%M+V2>49L?dg-3fe<&aQ@5y{qjjE=G
z27E6!qN)Q%rq;sO*j5v7BZM#>Oy^mb1%`IQeO+_EqZU|%W>0r~7pf)tPxGUSx-hcC
z-l|Ufh<%Od6$dK9!NHaW(P=@9`x5R3eu*`)Anb`~2Y|v49~v-lTr&D?DYYSjI49mB
z!3;1)aZM>qfd^g3`oO-&sey8F?t6}L%_t7clLznKzAntwi&uawLHD5xdRDJr8{yCh
zn;Y-VMd7bW5UD>e48P#te`21WAVH)=436B#vv2Gvp%@hJcgR`BP+3x>(g6L2jnZqb
z3^e_e(0OJ;WB2kGR8_qo(B*>w6d0t0KUh9QiBFoZe!jIGmel22?^Ja<X{w;hOsNKb
zowzR;6uos^f9dP5LS?4;psuAH(X(RJ7N|ah(1H~QsaJK?%Hge6zBO-bdr@NZAv6_y
zv^vM!Z=k{3Z$F*w>{mB9WCS618T{Q`e0Wy`-H3u$IYD8z(qViL4qITWtX|!;H%A-E
zDk8k$Ap}+XvSY=8zQ2{1Za3Y~eX5d3(U!`3*j?kJO1|G<eez9C&2E-Gk)a5m3QwOl
zJO1n|gnwmV4n>KWmkQ;-Yg7;9P$<$|INs_3hS^^57u3`9GXZB;!t0%<!{D+;<5_I_
z=pzp?u56y??0`p$OM9w8USPI4&D~+L7rry}33J*A?Kam?-kSt#5dG@mIIPIVi=Zc;
zXZG^oAMsh3sy$>soHvZRQ5(4jF24>81?EkenX9ZoCrRg8BLl+1ZsPgy?Xil0LU`Hv
zvHI6HGQaJI;C^t1C)c@Zr8!HR4FDW?b_@UGo5(k{{~nJu_*dV3IuwTjA7mv>5M&E+
zuUrx=x|>z-Bu35mFBe=t*0c8(zq$0{WA`Zsv$~9?7N*ymo<RJ$=B;m@fac|~vFMGt
z5W}cv+r8CiqyMbUj{JhvOoI8B)VnV1FdAo3xoGyj*p0SJAqK(e2)sz%3F;PHib=PP
zpO|@Un4B(x43k$h#bY2Uxr79hQ?|ov?~3mQVr|&(Q2_<(8vqddwsh7hTd!onJ@Lhj
zn1Is;=vb2k?ve4lw+VO}7`f+a5nq?G6D|RHzx0_w5t9Q=5{gfpqljGI$!~<yEhtkD
zq9+6!+?0z8kZZIC^JFNL-OC*e$%|D@FCqx}eV&wQ_|x6P`J{7u2*nZc>m}Sa-^+dY
z?&-X<3_m?V%UrkvA!UAuUDhRvfj;Z@EY)S55L<$pSM%^R1soS*;9t$*ey$2J-onEa
z9493Ob~d@|R@Y(-_i;HCMu6&$j_>^oxpw?`NrqixI4Mk0%PO4-<A%9Q7wesqC!WO+
zkLz(z6)-p{8?Q0&dtqTgVtYYC^IYpo)al@(w@nxUEzY*^fnK=%5<>uR_Ddv|GQ7hC
zN$rSyYvsb7wBxEXv=Ch<Zx!@Rb6*k*kK{8#bON9`9>H^gS-lA4epJNe;a`yk|JjuG
zM>jNWhPLSmLHazB*sEIsdpZMj2G0|FmgL+gsaMhm$$Ii0meXpcpjCaT<z!Mvpj@X(
z`7CDt)D<|O?0%K&PUm|_pgLFh)D31gcX>JF5}(h`>F+E7UiIUGODEQ@ns!y+OLnuI
zb!UYGn2v{Kv!4sT@+%^mwN=fj(sYQ#jH{?!qXq4QH$E}M|N5@%XMU|i88-Y^$dJsq
z3n%>SNZ@PwTk|A!&7OjztK2Zovmi(N9UiDSV{!+|oh3yxRf1lbQLb>%3=>0flRiMv
zOc+4LMZYlIXVVGs8>@6ARIWm37$r=%b-wEf@)_N)zceiFWQ2=@9(~+P3mC3P|1D6@
z`OLuoSzqZnU48bZ+rWqUkS3V3_Gr~W+Vk#|o;M4>Tk-q)*Q@&OV2yeyjN&>xd&vFY
z75x#zFQRPl?3tr@P6{vRuvo9hrgzNZeDqq5VOdYNUdxLYo3KC~=8mt=L=yJ;sZ8m~
zWR3@*R-0Pzdyqd|a2z*URejV)J)Tj*%sboZHLoVP+kryzD&GkjDQFpkjm?drTT6O{
zuw3z9e9Ga&9DR*XN%#6H3hpB)h3d@5U2%^N`|@)U-gRQY#CcB-y7L65)$rOuKw<7i
z{h(d<_>Hc1vC`GY-x=5*N&d<r)A-Hjvv7ID;|TzK88c+r94ouZfWe7)3=>E4t`&^^
zlxJh6k>~=N6DF3|B3~HZ1dYH1ShEFJF8TS;bP)}bn9BmJNxyLRd{~Bq8C!O7aGykJ
zXU=fbh7e%u`f;?WpOQB;*p&K_A9GwbH%@r@=i0Kjl|2dm+(+ZVSP3NBDWjOjz2Ki5
zCh5laun2w{^=_OZTT8-s%C9sbp8URjQw06_bRCBTIqGCf?Vw7P3Wt=?(d<sK7WQQB
zxz)1F3Lqm6sP@J2`evtmjWa5y)?lxoIBZmhL9ubmKUXRavv}tsS1SJ0?TrhF2o4K~
zsy7tlHiB0efPyKuka1$RG(PILG1;?O0%CgZg0d=KgspCVjhrOH^v76Gic8=fMW~Ei
zLL(rwCBy-v_XwBhn>hy@h#l@HF3jW`Uapq=*FN!AuMDMlJiI5rf;wXvfjPhrpvi}G
zN+%U#6-pjCG0I+wQAzLY6NT9Ty-_vk;&tHz>Kj*rgn;hESD%ekjKM#y4P$eRyg;_(
zYdyj|OZ43dy|8)wb+Q`d!QeC2`Ry8EuwtTV>Nx3{Asl=Ey*@oNmI^77^}4mZbA#G+
ztX5YgdO8Zq)wmRg47xqS4S1<1Q)mVX%%`UnX7y_5=9FZ~i*IypQuDki8F*gRGF<km
zf)7n@pqp05WEfYQjlBsO)6oFdMgm>HWK1A{)O5KiI^I?N2SyWH-2Hu=Ncw)|O9M}P
zf`sk33Bv7x8ub%nM|eNiuBMSOi0l$cl9OUZ<N8!}>(fCRP-Hhh)-o)Ao4L4OPLd{N
ziSneUAwq0qH2tN;kW%d`xA5!Z%=c*el-DiLV4wAb&fKaH2lZfoUu`o4oN`&H^J9Z&
z{Fooi_sa9bDLpc14hd^qX&k}n-tNB?gX(-u*vmR?XMx3=)9x{}G@KG1%sSH**eFv7
zU76!m@9bPv8t`@q09r)hQR$o>!T-a;(#rDT>M{E9Mr(UV?zEcf!&?2FEcaL+aRwZD
zcA9N^>Iu_v3Esi|gjtf9C5|LaLtq{<vc<J}YWBBzGL8XuwqB_9ag%OKXFoJa!5n-b
zk$}%eX&_CxdTc^?wF(VtgoYj^E97zgKL$JdphImllGcb)avnM>H*^PuY=r&2b{0#M
zbbiBf!_hF_(N~#n#GJ#y_F*wFA~~N}{Gnkin#U4$EGZ#!G9|O&>E<5lRc_okD>y7e
zb9Zge9OnpJ8Hd-LpH}9c@5KOc{qXbb$IA+ghQkd&EF7acQw7_7dU=}Fu#xQg$36|i
zh{d0)FE)5Dw&P<i^!d{;0HCCYEna^h1eqN75R-fBv&UqCg}ZZWz_;aW^>mV1zcjvG
z=VXBS4nP<otyonuZV5w<wE5YaUy&wPo{Xq_TQRQNW(Rl?mIu4t?=y8AraW}es6qv!
z-pWM-f_HgMSB<6WRApWte17@byaTwB=gyGadkcRj{1)l_%@q3E{+S}?k4A%bBix@+
zIC{AZKduZnc9!Fh$HO<nl93Gb9+}(~4hlO3)1glE)rj%MDnN{mFG5fdyQf^;)bZmK
zOv3>Qu&Z!j<v?Pd2LZbST|gHbd002WJY*Fd98BS6bUsZ=IFAY9+Q(th0FGA~#E0IW
z_-fBy(qoPX&6bY6T<=_&SVP<=p3eGFVv(RQ|BqlD^-&ST9YFH4J0f+dEUWaoK{_OR
zfKdYi2ToZ_n%+-d8a*;kMGWa{w5wO(sR(mRm0sY2O;cxQzwh(4ufq~;Z#6S|2d0z=
zE`5hztS1?c$@d+;Y#0YXaf5NFT&=-~dDb!1UVp42rjP&phIQ);!#dt*xSBeKj;1R<
zF6-idzbVRZ33B#hTecmV)M8OB68K2cK%32>N$7o|+NK8`q9cIusKAMm;yrN-K6QVw
zC479b?Cqo-Vy`A=5Fly(s44C*fM#bFg(Q3TJ@T?rB;@vbIbc!BvVF`ac2@EB!>c^{
z-y*tB@Y+&%7H<t6fl?0P&C4vuB-rupsEuebh5htqH#x;ngSO>G5WS$`L5ClGaL~*c
zdx#t2@7>}pm+WlM$f+?$gz+Z#YqJwArNiRDYlE`9c=si~Q^C2rm%!SA#_lBF7{c5i
zAycqKOo1efrVGlmmJ`q9O=Z4yJ8EQ8alkTbbr5c$>X7$RA&$c#H9m;Nd*0il#5uX!
zvtNg4tG6@$=znpxYP`zy^hCBIh(u2itoluHufVwPdQ_`g*dSKs$9Z_lJ^>w`6CkKy
z=e#4Kr-poZ1R}zbh_@0Zm_@i<n__@4*Q15n`O2N4zWKDud)h<K?BmtmTTbfjXoJ&;
z0moFzy2~HIXDdR_@rA)pP(hl_{FbM!qeDmLajlm=<fASpux-#@YIDfD9okrVQl59k
zl->DPYoAu5K-~*HVD_P2-<Cv(4z34$;1JYoI&13%Lt~LT92!`;vMO;*{$<5sO}I$9
z95R2o_%`0@s9y2ickhqnY*io3I6+b=iL`6^v5w-Ss`qH2wUQ6=)O)gm_K_-?+02=Z
zU;Qfe)@9tXS(AjGmG?(N3>yBVE35p4nvbajl2JVbB+!mJ+Ly43UI&O*@pUFmhH~|I
z@a0-1w#V0?qFSRtffMlk3c4T!-EPs{)D%zYUr_H^`Z<-vn;`sgl~DxP0^Tqe4aVZw
zDHU(P%XuUuT*z2vNZ!W3ULx|RzP!jiMpvM>>&IM-@U#5565nQPZO>@U>LXC!Va`s=
zoYtJvq3GQbn;VO18+cS!eo2m~pjVye&--rkB0Q2fdh}OiFqZ-(U~g(W?h!g^3cdV3
zThrY5<<V(1GPam;^5>1H)wW^x@AdWsJ<mFO`xh?ipIOnNF*`5-UQ0w%H>qHpH(~wS
z_{uvt_8ZbPqIfeENy9zLmZe>Y`GiDuH-=to5d<6$^gFkK%6eAzg2>m2Uz+aM@d2Hv
z`i#K~x1Iz=#}KUwi<oOrv0VIvkKOH{@B{iwqEr&aVRx1rOW_mzim(+m2l1Sp=@*Bi
z(;DTKCKwQ<|CAZ!WP+9wSQ?zGY@>hsj_G{W=+RMm>tjc(W=o=%*E?Qp_E+b-z8P(4
z`uyLJd#cG;XF_?&^G}z`?c7jao>_N;xz8K~O^tkR!@Dz68Gev!oM1cf!OS)m5|3}&
zS~SMK*3<lvh$FkyeA7E@L9GuGxBuCa1TRTX&mQu8bHs+8Ko8|X#rw;Kj6QH~PsHNi
zBzb^br~lAZ-rBsvL8<30Ey~gu#MFLm?jsG2+ogXZ^{<ZbnI~xsHZ_fObo|-={o<eI
zcFy{C<o4VdiPTg2N<)_rXh5t$0KX31wH<^FT_>pL&PUNz)*ibR^c(~DV^L_DaLHu)
z2=Q;JwxBuGVq;tl-w_+lkN$TGd+`pp1o)cnyVgIdHuC(v&+2dOudmfa?ciJwN>L0@
z8uUj1(X?=gW`EnDUR<in!RJ3?Cw0dG26k>AEUAq6)XQ7$@ah#%TW;3qsLbmH?tVf8
zK;vZ#Q{z)!qIAhu!an||bLzX6<zvE+Vas}Gk~H8))34AkohUqKaL@c)pq>myzEru+
zbs-uauOWk+)FjYuj2)aB^$@fmwcZf`5}vFgG0s!Z2OJyOvNBYhb9sP%ZXc4KwwtvC
z73dD>?{G^{Ab2wdfmqao5NkCaYeL_l7GJERnb@pirzzxaDg0#XR*w*$DahrmcpW{a
z63*jXJH;`Lg;!@@P1CGDQHH<&iv>EmP{WrrYb+r~onzCcIKPqXA<oY~@d4S4Yi72Q
ziybmeBNOl%-HHUI4e3o36}>{3fF=mgXyOY&Y$$uYklF1-Evgy%U{&j)S<y(IrPrv)
z+7w>9A$*OSHynhx<JRcH{n2QS>ao~&dk(&7`pLL(v|%s<$2)8{)s10KPOo5`-1ESj
zruZ^@E<=HN@>lnTIhLcK@pz?eKLYbI0R!q4b9W>pK<6{x%#F0ItiXfm=Pgkmh7>Fn
z+AR~lKU}`;R&ZD3O%NCKp`G|zd|c()po8ZVmxBPT{Cph|54D3@WR8yrd$*Z6J`uRB
zaCL6&^U9R>i+{zz*W^k=<VRaJ7Y{yQ)-;!EA&kM%=Pyz9+;AS&^L{L!0^PPf0410z
z-qGmugAD$v`Y-;hALZjtw&`9`e4qx4d+dQ0$H`$AjOAP&)tk88Inmph0<oJ_s%qN^
zq}cApyhCu1H;syT37;-WnRI}G-z0?1-|KO3y_?cBa%uI-^4A?eEXGO3Gf$Yb7>Tb=
z^7!5a=TK{J);o-ZJ$;z1y76~w#Ku<y%8T$h*cIs5_hYT{b&@`v*)I}xIC6R8vf(IE
zfrRjWKHisqu6kO#J7MMQ>n9OEQ%dcd3~j>s5s{lKA788wRBr^WUfF($lERoEBqr{2
zko1N1j-I)Wj=7P|n2nqldbz{W4JIf%Ms`hRua`kMXq(VlzSM|K!kp89vlOnIi>opJ
z7JezqPk7)(kwiFNuLD;cj5wb_w+tEdtLX2Hr`$DA9cFvqeuttdCIpgq*Zvr&;DOhD
zP{(iH%UM~j@Pltz*H3y?H8sVY|4C%*fcYRzi0|Jn4l<`|vY%jffEWBr0_wP5@~lUX
zOnkZYLJ?f(8><`1O2y?5D46=a`nu>=_l;3~I%g$(7vJ|z_vfvlYoB@QYVX_CC|Jqu
zF4WvDo3P6eO;^`+?9Gn@XD7`rCBq9P64Nd#RJDKtSHsYameRovx735+Qd3e5KT>Q0
zq8g!0ND8rXXzwb=?#@^hPsF|8nkZx5L0AxMcBfaSwy_-L*%Msr<mhQ3zwcUnTfl|B
z!*HkVe*^B>v8^oK#SgFZM_U_4H<WvYUxGpl?Zd(FlTbg%2%ujPdl@nR#Aa{x?*s|F
zTLO;5l(c-fUDrzH*^|0TGwoG}+Dm1>mfEYe+Z@{yhbzk8uG_KW*6+K44B>5QDr3QW
zNm`7rNcQxVtr^iMG6{~_Q-$66(+?qyK$GG104=MLhZY#1A!h}?4ZcPl$+K?SG`@`q
zC6EtUY4N9e>P>wS>XKDUYA!E7eEIC|Fh0`vp+6F1|DyK@#loLDG2+_hzqMte=CzH@
z&CQnj=oaTox@xbU)lY{KttO<=?U@2V<qE@go-`Z`Qb}DNltf0uT)-ZE@^QpT^1{~D
znn8_wq6}Oi0{w7uk~PJ^i`;~jVx+9vvwoMWZbBT?QnMTLfDDoj#eQd=s5@M&BqLYo
zlHfvKgC9ga_#B5q7Z>SeLa=uZw9TN1J(^G17}>`c8N6btCQ~>Fb2x3B%l@~0+58p?
z4^B#bPMT4Wmhl0{zg{F`&25Mo<1KhW>SSf@pe(hj_O*NVCpQKDGUj`-&rW>hY`8(b
zT2=k~Hk^|m8i7iO4Q@~FogVIpG=q^B4e=(hOkzx}*oZ+<>Mkf^s1{G@U=LE^`>^@U
z+82FqVNX~Mdfe3!M9Megb=zZwZ)k|OO|o{pyPgWNxTxBD>A~jU)6vS=62#m2Kb_;Y
z36J-N)3DUK@+2b68a!|SaUJ=FpsaIm;)BFd(*5%6B2x@vPC^;s?(mVTXAZKDgS#W&
zXdC0iUmgy==~g#-b@O>vZ+Uq7z0cwG1+Z(T%05ubKjhcY6M1K<v`pmS2bc4pWH+j`
z^3jj)XY{NQNOA&;6Z9KJ>h8jcQ!-2#iU9rkX7r2Ud5`0zt}8z~YGpoZ!UJot-d_li
z#xy87`L&30oM=y0jnBJG-aO>E=g3S^Wl}9=^V`~8yC1*)3|baG(Vb}S>TZ0b#m))q
z*Qu}H<8=Du7imh!)6IvWcLsL8PISKcNqx6W->2b*dGXS=C$nD5R(JZUA61{8y~%2v
z2<XF(T+(ETQj`d6*auVEZ($CYfG{*`>noqJ*10#;=ZM=slRnl`CXx%8D{c43TH;aq
zel2)pd*_I?-$+V2v)r#KTzBJJ1-uY=Dn={ONFtUp*CCD%G5?JG4%7SPYUWNM0qYC>
z!%zIYtycbZB$bq{#@l}k-e_BlyeK?TTl_K;1xxsR+`h3qrR_=G#GF2_oE2wKLu+55
zwxB3_canqaF6U&|;rdkx?uI)GrFdCa6Inu@?Ti<ieZzybE@5k`ivKo8dKF5?|Kl|~
ztDhOX=DPR61DI2TFYbK0GJ4AFj);}B8iXm`F9W;Ia)rR^hT+d123RShc0>nca-|K|
z72wA7px0~daUO4}k2|T~w_Dm>rXU!WU}67%t1NOSdE)W#eANJJw?b^|tgL4y9)T=G
zP$jRgzW%lo?d>uf3#E~LuzEa{yXA#}H}tMbTY6AIJqc<FUz2><g&oJ__qDokdGMZi
zbJZ|fqH1r%p{8eBEBTupiys<Bwr8&p!{+ULq+g*-+7Zg&#Z(c@#oQZkQ3oIHd-dqM
ztN8_zD(ne(msR4iGXYOwxE;UNed@gU7B4|4w7}>(Waov8Y5@)PUTqI(sS|w;x?4I{
zNq>Fm6LBYcCw(-Y`92Qd2Tj1tRAQfZZ5E_wjMGyXdRlL;2$=lt>F{{&{5s$t%q*OV
z%q~K3xBpJqASOT(Jr<uK<C$@q(}*%2(Fvw_kVwCKn~^jbhpj1@bN|-zw<u@N%Ns@$
z|C8{vbvfrcGBy?ub(b%WMhNk0<M8iS`-_Te_faoaiK12fWs_-1C|bzVvP69<M-Q89
z+2~lMMl}K1$E<zwgsZd?AoI*5R^V1q$aMi8@xPJLjes=B$S-+Sqe|e$D(3<TnvkX*
z$3Eqbj*P&Y)SN@JD@zsUBCQ~Uygn1dU)NU0qC((~Tt9)r)6{tSnxL%AD@*|}B%Hha
z<#{HPp=Gx$tvUMNAKz?%0Y_#1zj@n{CDTLXI(1UW9NzB+O#A)2B}_P0Twc}^WMT+!
z>Fg#5))Pg^N+{*NpYUn++qZ5MJo0#3?{E0~wXc=%BHM(aKsP`D#*UMa5id9B66zR;
zM!b#KDVB(Kg5Qdj-8GM9)q8Ho+mG$EQ0fGQ2zxwfAD#V*E%TBJDph(wBt{v=ST4e4
z=av4VW#YN2Y4{IzO|aPdRci@Al>zw10thzK_;?LwMzi3C_!VN_BaLc!ltZv~jqsB0
z^H$>u07iZmAf@W*xu{J6^S@h*U;k`r2psZGqTKOzaJ34&ID8ut#Qbcm#Xp@J8Xg`l
zP=A_}MAWRj<vfGpPk6cK;1nwNjC!`=+Ddgr`LpLOhX+ID8>3lFX5gN<eHhj#!7O8k
z#4=H>#|~RjC(U#xZ@_KG(ntTPbdVkwsA7QbPd<Bcml1849t~YTv5B_1DhGSYB<o3y
z%G23i4}8iyTe60Ru6fzlD<{45fBs4qpV;4<D*&G5ddo^NYkR!=&c0axx6v>bvjIB@
zJ54%Q!dnjZ@bit-FRU+N`V!;gb2GXH_WqYPaV021+geglPGOvJ`Ke5P)Z$ur(>3ns
z*cdhT<Lp51-2d=4wU~&lpN$&iZl?vV)3DC(fr`@=E7L#L*ORJSoz!_c;bMxhMiw(B
zD%~XP+TygATq*SJT5W*&Iak-SXZ<?^u9_chY8|?CxM_rsf1Gg>mibKKe5*V_2HM;r
zW4WaKTR0zIS74s-*IfJlRPQyfmH?$lc&lV5PPSsbWAwujLW=80m-E~(qt~<d`eqv&
zTU*O(zRgJ(5=X}b5NH}q(o^*_m!-$5%JRJtcwO^`GWTC0^ColYo`_j$62opf@&o5=
z%d^I3t$}PmZ`r=e;Enj<<+bm*XR3Brs<%j>fmty|o^3Yo`Jf=;u(-XU7@U(d&6^6#
z;5pn97GJzIwG|o@qo2zhG;`4HRY%z1YB-FEOmB)3{T*kI5bYS`PBzQfjC6g%K~aS#
z@}5M>9iu-!R6g4{5NNx5pu<bsYH4I61biQTDrWg~%(CHj=k|j4_MfrQ5rcod=Qe{;
z|3}ezI70pZas24wlyamTi90JwID3yXicq$bEy>Ex9_d1y5ocs(m-)@iUO6E<rwG-h
zOK0!x_x|-4#GTLQ{eHck&&T6BB{TwAT!7A}Q3cg5hOG-U5ooO>)fX^2wu$-+s!2!r
zC@mnG74u6nlFzkegxv91P|5P>yxx5l2Y2|D_t|>wlFzvi`DY>WN8``V=1o6Bme}Ns
zb1wRC`v`JFVw9ymLu%$O97AYie_p~F=&;kqK%P=Wf*LJE3z(LhQgyiXom`b7#I#sx
z02esstP`vdj+x1I(7+m8(aevplI?-B3k3?C9F3eT0g6+2`0=0AMbN^2P#ACP_C7-m
zF(1C?dAP0=;rZTD+V5J(0o`kDJs6KJLMsDP6va%0s3G6Bn^$jG)*bJy{dcvvB8wf0
z6xG2ykAIcuPK&!t7S^y04G0NYYC#WfwthkvqlxfmyIc;RVOI(^=86|@LB<d_pf_V_
zX(bsqLgZZ?<DUZOpJJXqh);p<>`okjLkEX%?uPFssvNBqPUae@y@tFJEJVEo+k-#@
z9#_Aar#h}gEXFsG%>hc9c22fjtNFCS<39`(OJ1D#_h>DEje~{+i3*kLz`T@m>J76c
zh=E&n<MNE^c-5;)7aeHb37TV{9|4ZiNSs}=_BA=i=hAdMcVPAe<_PBQ;iiX8dJ_K#
zD1fqLVbY2*%yo+Ny7Jsi2kV;SSK46mkKsJQ5$uLRT4}^R?J|fa0f1BgreZhHV+bas
zi0FuFEQkESd#P1Q^?X&2s52Fdvb{9|rTDg4+)6*s-Mu_JWEXC1anBGH!yMs~IiLvs
zAlW0~mlX9U_I)?zJVYBrb35+qfInNHE?V{vrf?2wn1Fw`uAF~`hIvhVDW5!G>)5jp
zI-a-(Q0N{2sCm<&LZTfAYy^QB<1;f$?ZKd^a`^E<s??vA69XZK3WdB8Qj_j)KkTJ}
zc%b1TpFG4b;AtQgK9T1!?8RTOGI}g(*^Z13{{??%r|^8#_M#PhnOhg#<Eb2FE7Fiv
zQs$&buF!;iv;|FP)`DTpTCBd_I+#`WG|Ko7xdr+KYWn0-EMKeAWKMVOC5l$rgd8u3
z#ul8+bR5$|-qz||yw(*14+&6|y?TzgH98;GN{vb|;DJ&qK#opN$+bBQ2EpL;`*@vz
zFNWkuZtKH|qzw)|HA<cQ!qi7rySn_m4_B2|KADMxb{y^~P#D!mT^3pkaf;zPtra(P
z_FbB2AlmPg!#?cqk!QApOicM9=*cs%$W81qc>Nt89R!8DFi}wE3e`7y2~ttN_Wpg_
zrkaVyDxRP7KMoB4dY4k@4>n-)N{l^4sRMCtO_@@)e-^d@<Ji5$8B}kdbMiYbl~=c)
zEY-^Nh@EVz94BrrYQT>I{9V;;v+oMUzqgDjD-)5noinbzp|Z4E3T!4n1D3eBxa0i^
z6BPq!ezC|P>F~{OY&?U_23in|r+PKm|5ut;TX&Q>O^V@luEJi`lgz&v58V(Y7#148
zG-%HlVRs^$3rTf)a+E$C2X&KSG0cP*!EsDdiuV59$;R3B|3u5|l^;N0@zO8|f>&!4
z;a>R?^4Ou||8^^5pFiMHBY(C2;#c)bt2E7luUR9?3b`YtWcEu>75zQJLr%dZ?~hE!
z@$&Bck=NtH0>NOp;*Z?;QD;hPy@&?Pu$c*>itfdM;145+MVt|k{0QMar%e3Ylk$@o
ziAcFEHJh7j{^GQ3gE2RuA_?KDBHH}xx}PIn?3fXAFSGK?`NQ;iJ^P71)-%sgdoY?V
zD9>N@H58)qb_^1N-T9)yz-+Hu)4F9LVC}#Nakzn?AC+5`8+5occg>L0`?OYQVB@FP
z={osZG0{c+aSv$QXAfT|ei}3r-LU+$BV4Mj{mrj(GKzuA-dr@Dx*$JNJJd@$MB>bK
zC)PcKG?LJ-Ug8avw*d1^6VzHSU_N0(<z2{@F*J6Tzd13VWO)e~4W1Yjt&W@)pR7te
zSyBflMxj+JNPS(x!27956|N34f0qA2Z&IgC^;shB$O+X<%W<E1X#C?`^nLU4xsJoN
zfR4joY#sn&wND9OXRCd*?b+BUA~sNi59Y2L*U{0@%uSD^AdEovg#+1%H(dQ3))5>M
zg70Fh8mOs=c;R!A_7h^HMsdXZ6$8a>e!FjCOzs{$S1R6r*y0R)>EH@&hBzR*{Ae>C
zR~SK%F-777`@YJ#E$yonz<Gq=VeE{GuoE1NglF6Tf_Htq9fL|L(*1Lk!uciwCRhxI
zGQ9rkcda{P^Si}8A#r^p63BlSd_KF7p3bI;#r;J^Lh^Tf6_@Uzf0g9!!|Re$2o~Cv
zeqh;K)MvK@I49QDR#p`JwVghdll8>J!)qA6k<F6am%?n3PJvGYz537k69XtM?Kd@~
z^F}iFf6X2O(y&bEylZ8pkA*9xjdNj?30!)4Ak1JXtNR&{3qnHuLT4q1iX;UdFGX*j
zB)LoD4Z*%4Ln2a4+~Rcac=xd9XTXY@hT$S^)-GE@0QnJ05BqP|+LgCdC=!O+P}T*!
z<o2VZIPHNj1RUzvfBjuWJ;@nr;Cs>REaa8#2a6$_G*RzYgI@piDebFq;x@x1kVwqt
zEiB3m`f&_yL70KLwhmrghef+kET?%6^S|h|*+!4k6OU8pwwyBv0taGbb#>l<S;^@O
zLT2Yi)LQ~$7o|zi+gfIOa`G}6Yc&!QgWuTvny4-cZTTwfWWt;3GLs+?+y9xPuzn@_
z=gNxy??{8E@|^WGB3f5l$HwWaOWv#GwQT~l^~#F!-sn>$;)mBGFFp~S-?I0@fJ0um
zF#Fp7=gc2exHL>9^&}F(SK$8lCx2j_?cABz<NYj|tnFq&Z^kZif>C7M*l^$BVKmdK
zN0hZ;tjx?dB`-8Au*py7cx&RG&b#fPi7yNBO|vS>Bu6#)^fFhfv3KUhJ_ilYCTn@u
zIThtkUcW7cGrv7*vKV3W8-AX-B->EBIXg>ka?fF^byRvFLaT)}u~rJVHlE~5ZIJeB
z(<HVZZN&YxgnBK}<Lq)B`+Ulabj5IXh!W=-BEjS=Ql67(f{*=@pHbzMD~tvudn+jU
zV}zlFNj;p$`GPd{GXF<;YR+?bphrtP(_9?>3<3pHS5XEX{9XFu3wDYpKcl(DDsrtt
zwP20IJI@E=ko8xvC*LS}r<37_Gho83qF-}$d$JZxG>1Hz`;JdHPft!|x2^%R#uas6
z5@^#f$OJ@)ol~ZRpMj8?w$A0yvc3yiC}iSNW$GRo5P{rhoZE-;;ZM);Tqbvn0w@+N
zpO5M@7fi{ejlM_Wj6{aX&tPDvv4x2U7FtG#s@MS>f+HkvRUa=oCxAp74-{@k``^V!
zNrh--2fVFVd@?*OeN;iTo<!uq=}DpkE(#Gd3>xOSVfi+p+a@MM38g7g76biqg+*>N
z!w+P%_t(kvLJK`$3^ltf`N<2xz~6DQ4UiIHwHB3uL>dUU%3K7Zk-EL$;#ZF)`K35&
zHrISafCN8)`~9bq-BPt0ZQwE_$WI7jt23l`xOiB^zV&kRZ^(U%+YoR`>9x1Rr|RU6
zIPv#^qf7xHTjOKJuV>(X2%(<&2wgAaq&-K=h>C*2W0Yt~Jgz;X+=m_1-IfU_59L!C
zaAxJ)DFq}7d=CtTgFSWtJ#*8NvP`T5FJVhPuNfO?t%}=def;kcB1j0Ftj@AA*nOAO
z6R2tZL~S14T=JC%^wKAs*WqeYPlpeMM=sf%LE#~iMA=m&uD>#uO^#xDlmGgn`hU`q
zF$`J}YLyrGiv%~mg^(mKYrrD^=9t`j;HV0$TBd|?FGRH6+AvCkW%l}r1S-Hv26z9R
zqd^+J2A=J65Ta}pE0mvs!J#5L<*w8F$>(sdtUgUVET~8ow*nvEhUK1Zz4$P_!*@wv
zC=fhMHXcV$tgoDeWzunj;=A|wFdbguo1`ItZp_`46A}V9-;VulkM_Nz&7QE+XqBU%
zO!J{C3F=m+X6a_&v(Y|>)_H9G8{SP`l*AW~jb|+OwCt>}gtdJWPrDqbtamB$zX~0l
zJ0_EXuU`C6UkjZFy{C3F4ttz-31Ov~6g&byt9(O94F`V(r@`P@9!GA9AQ_t*HoE{u
z!)YY<R{9|Pc{Xu#9jIJ<h&hQCLPE1H%7?vXc^x57P5#&$|J&Ii>9ko3U(u7W8NQNr
z8P>3>@XgP(|HhynRHuAmaqEN1(#ZcG(2^LOVl-I0_$RK0g&ew@#FaRUG`Ql%_FMa6
zbPJn06mk=SiKFG|)u2^vBc+#`)HQs&a^c-_#V&R7#t5#K?!|w>9~5#G2KL;fogiDG
zuI~k6gwxC6mbsw)yQ7%ha^=$ZFBeK+&+4tU3onyDKlV9Y`&-f0+AI?1l3ld>^WQfU
z2^+rFe|F(}Cx_hO2aBbBvm%3WQy$hHGHmQUbDe|Be}KOR?0<Ca8OMT^w7b}G9|X&q
zU(2mBIKn1od!C&K>J6JpnYec9a9>YI`-kS{?2$*Xf+3gf*<F=S+RESOy+neh?pE5g
z1xY>ljOv<|y&t)`z2RN21u<@VyutYFP04$Y2h`9h{{f5Sm*s4tRo1OZLB9g!Y=oJ+
zOkcxuiF1zDJuv0g!?nGD4)a?gqN4Th2|q&Ke|^$OKvt%Xm#qd1T`pwAe3A5l{`{Iq
z7G#Sn5%i{ptW{BiiXTpXw#O$j9e!&a-r%9lu<pgdHPfE`y+#54!DMay<@cexC&zt>
z{yJ0$d>>o{WnjO#l{qVoh}}Eiqi?UCX>D)c@){@oc6>c$k~Y;f9exrWe#CV8Q}lEe
z_=yr)X2TCokDX{~x<5YFgP%ip-ht@51~Boc!Yy1a4O@zn6uKb4d`<b>LIS+gRbeXZ
zhRBfdcOmk|o!2`>!9aJ-R{#aW#5SFLZqR<joDp>Le}3PcyWt2L4Jazc-jhaIPnq`;
zCy#w)7lI97fE@oB2d*+h(R47l1{X6PJwCjYcw3qg5&_{sAs~q7_a?3v4*m4h9mh=u
z9d7h)mb?S77XK#<U4}ZYdc?E5TOlj$$7Gr4**R<+vSwP0s0u~oPpl(TF;IGLQ0gcj
zzsZnQR-q%5YRI9gA@mc^_*~M$xwsBZ;wU#UNyRfS5-wCxiLeL~Q7-TIJ%-2KauTN6
za(4Lgnx$#eer5+&HQB~eF9cZpX#iRu&!9I3807mk7`qU>y8lT4au)g}bA!|VJ&MZ7
z*3O^(O{!?e(Gj=_9R(;K?;R&nj8x*wsKMn!E9>SalRNgjfa*%ew%LlrxyfWL*pJmO
zFNR=X&JHEcbwpZlLeddpeC7W%gDxG}DpI6y#f}w{gcF_HFS^M~5F^$oKDhP&=<+H!
zlq9`Wyp;V$`|2P}07e7b6waFj!E_<F1m-V9!FP3Ijs^m&zy2?{eJ==Dm$^=Jhlmg(
z8D<UcPzbXcvbfH)?{ED$exgjc6|hccRUWt+eIar1F2sm!41fC}h(hnZ2S6i|`MbLF
zXxY3;jO%uvN|?d`>JnDs0ps!8VmY7zy9fUW*u9c|XT@qrl77b<z2I`R2~;Yl2WF>!
z{)*;Q6wbuN%G%Pc@d1SVq|wac3eOqHhnio6EBhRsAA$oAT1_n-??o_}XlL5F=WtcL
zPI+JkFBFIx5XHW0UAjh)<oA*2D9a+~^S^D|aAxosUE%p+T-)nL+C6^f-nNTKX;7G*
z3lWTUYq1(B`#yk@bqXETapg4ATi^J$&>O^FalF#P1Wt(#NBax4%Px-bym(*t78OOY
z$g^rlEe$c+wT%t$_urRx4io)>d}V&+Kc?!cs;R7aape(c1UT>SEe&NAR6JRE(&iJG
zn)uv#CJa$p9#n5W9CyP9q^g+uo|E*iDhQ=6^lonh(24vKDEzMc*#QF1O9E+G7t}tu
zkN8P|n{@Av{RMklLJ)U}_C1|z`lUv}b*2gqyb!;FosaNV70$$r(VDkqk1nd|-aB<R
zw3-j!U2bgj_eVtx{qb(Hp7iTsZ~5w`JAuU8uVro1@vyde%slyf;GA8tOFS#l4_7SS
z`uDHQ`Hu<KD7NF6?60LaWo9<BoW>laoLk`o%cqRk2v(v?Ji>#5{rz{h&A`ZUdusS_
zVjrsZAuO!kN<?((!H@5YCtkY&W~qTDX`(httJ6PAceyL;7ebmWD>rH$va>mk#e3h6
za+JfEYs<y+LgB=}&mVXI=Wes$;3$*c1ejbjS3#fw-4n7>dW8ZerjwhLkrdzEot2%Q
zXr`7Nu0D|2v`^AGfMd2OJBs*?o(m$eJR?5TRzeWRYG(8BV6xNe_bUNUcB7A5c2{>3
z72Zijz*O)2d$c)d^V5IzWIp^<<z!e+@gfIiV`sLd`Q3n_7)H2ru=k+1HodmY*L3-b
zhp2VO(ctN}O2XXnvC0W3<spM~z536?{*%_eWXB6dX~o=U><VUrf5Z_X@N;V~A*YM@
zNVo$l=IG0j4jzActy9LT+mN>B5Z<cod?7<$3Koh-5!VP6as3aI|J)IJ8pH4i;jrP$
z>qEl87|8duSXOIwEv+pv&^EuWV3;ioVvoO!A9H1Ey?2}c(<UC1^d}t^i3nwce6+17
zhN66rR%SdvG+rD2Z{#|q5*#E<(=Vfbe{Yy;E{~(ih+6-DU^l1YePf`k^*CU2XZb;~
z@2{elia_&BT@berLWF<ls3kFXulOepH#tp>G})@l2)={yy!c@vP*o)w$CFi*Z0Nj9
z@R~-rFD);J&NM&*Ty<eps|qo1YOAWK+b(ndrTMEBW3a-Ks*1tZTAcxBMEp1mg%`p=
zkvQay@>Z8enq3u>L`Y0hc7^mofGnt)C@ao|?oYC%r>EzIp8R17-<z8j%M-05IOlpQ
z*QcjzPJ96zx;nJ{t-BkFyIs4Xd7P<aF`}}-dHZ=kX03N$WG|v{5L(=uD%Y4dhBqke
z5L-l_;-Jw(-jAvb>vQ!KABn76e|&^eH>(73B%*~(zxK_uUQod(fJA{_8qUZrs+E>4
z{Q|WQI9bAC!P;UD0+Gowv6>2<SNg6}AzfZ`W5m6*K39NU5!PiCJ6Q8oINegrVM`Jq
z^!s2&r~cCAGUblBW|0hFY_<U$TC**U?o+Og|C$5Wd$YD5ealU3Y~(Jh`R*ni3Nm-4
zUIa*|;vl<I_IceqCcj5)6vM+pt;`mHw~M=DUmgFx>IRjQ7XB<?a~(v$X6@0UaTxij
z`s)si#Y=-DHpe@S9p=@K0FE7qbB;Yu_wIGjCsFW!BK3M8DB%HG2f>Am%-OGMkO(F&
z2U=cQ<Zz}u8>?yrx&SZaFunPs0<HB_Nz&9uB-h5~l1?f?XK*`c{V3?JkimFs*tU1Q
zrJNJE!mh0UT-kZ*mN<Xp6a00#qJG<c&T?`Na1qOE2L9EAerRe6&zRgVs~Rbt;BG%o
zFq!P#Ob;{&G3^N3{8m1DvL3M830pbX@m(qKskXKZ{?WYSR@u?fbOz>`O9RgEdJ>|*
zDP%p8**6QmJ6f+n>Li`*|9{XjGhjF10JZXvxlEZ|uk+s-y-;Y3&AnO)B!K{Lzk4_t
z)P%hiC(MRXjbJJTr>W=S-qLweeiu;LLmXYbb{!KI-1nlZR##S68OYgVS-bYq<qY|h
ziT!mi(c;%5Z0TlTConDVR5v8fBP%nkyz*2h%MOuB3YE#$FKwsRZxbzIA`(TQ^MIq=
zw!gRD6YZ}OuP!P+=U<o>CGIVs^IO)~Z~NaiH3qO^k5$5eZ4MZlR#mw4OUtRimTg@{
zx=X&$bzk4zm`;;&4RJYOeFK~86n;DvE}@&^(_ZB=EaC9oUe5K;pjWnj%j~1zenDzW
z8eDvfkG9j>;PMNfcBMvmN`WA|6RFt_QE8x^ETTc$Rf-TCl#uc*onYqGfD0gaA-p)O
zf#gs~^2VR_s^w;9L*m!EU+kp@H%(0cPpIOROXv~PsVp+Z-PhF|PfhCB7d;JavIsxf
zcRK9Z-|aY^Gmk*VK|)$Pc*U+nnhj;Ybm5qM+T?aFY>r#RTIqqM`zfX4NJS-lohQOG
zcys-~`FrFd0^J+VBE_6xVjxitXU=}D;_lu`68tSpz{w%jD}36UKm8mYH5s30nW7SG
zG%kAgeZroQ&)y;<6jlJw6W`jKNUWRErivldVrXtdRdpw=Ajs~=?O%=k0_Ahyn0__I
zCeX&MoYJ{KeuJmG_g)(>MhAaDLEVv2L>O$XFqD20JcAz4o|^})w8ohW#Sb&JoR-Bp
zg~=py2g%Z<g<2;ap>jPdQzWfJj$sXV-@aReWz|60??@+}^Qgz=z$HR3r4CuVX6XL#
z?Mc`^)kc`DJPT9=mA}g}U5vi%`lb0@tazg)iHUr?B4&<F_mj#x{|emo>=5#+7aWYs
z*EzE{S3E<62MemszvDxJa>m2JgTk~F#hiQ2^HjGx^<r9Hxjz@YFRCsb3^xH)YT2g?
zHwQ!BXBZ{yCibm-2aBZ8{e^VOgtm}Rcrdra=b8$@5*Ci=RFKgV4?IIKB-CuV4{a(A
zin}g}F#?S=*dY80k?IJ0YVuoi%kJ-?bMoA6q3bwL16G0oB!5g^ySpe`KqwT}Blg%`
zn(Zn+pL7l&ugc1T)YHhay>J!+pB9uR7R`boQT_1D5kGRMhgUG8l8&%fERo3nNEgXs
zy2o+pqTsP!mj+{oy)vakWjIEP>kH-M{scB;REGuahGjpT_$*BYYj)f@TCX+bOCZi*
z>{3xKrvY>5KAYm5U9x^~;8Oby9$WtiOxTbtuRv@>O51&_X_*04KA8;PIGhjnQREtI
z&J_KV0cW!uZ-;Upto+W7qWaC}0HrBm+|l3<`|t@<NA8ICu*kycI>?1I2ZL5A4fBee
zI=sZ&s{m9F0F`Gy<Zm#%;5}+^-+S(k)Wy(2S<SZ(Sw|l6<K8TXz%F)wlDj!r^UgKK
z*5nFxIhLhaK~cYl57vLS<LeU<(&mcy=6~2rk<m}ghaPESQEDY#w2<F{E3GYUZB6sD
zMp_OrNDWjCd66^9z*sLgP-|^aM)YK>v4hmJ7U1K*(zpDgOk5hj+?TbR`2Q9|+Bn{D
zsiIcPk$>jPV0yK+J!6i<c16DiSO^n%bm0Pl0!X~6D<dSvwq-W9U|fU;6N-13N)P<D
zecUs9hEZ8K-D(O;W+t*0s?SSkr44Ehl-#-^p=AKNY~iHoJ5pzh*?GIlXR;Cr8<U{U
zyE#&^K71CnQR9YinsY@|zJDs)pXf2);2wUu_mwGS%+&RgL!F3fOeVI;Vpyn0r{AOF
zxG?!(e^RQcIpp-`v1|g(N3U$LE=QA`>CGOS)A+=(mc5Pbs}624W9I4WZqj*nvGAzu
zH*fuJaz|G=nyhd1X7xVun_hjg`Xt&8nIZh=Zshu+?;|`IlbH}q42|SmWd*RW%uJY3
zb-+(Xsf=e9d*z6wL#-)uCeq{X8$5L8W<qfg*F%(V77N<-g2+9i-QTt|@Q3W&2VOv!
zdhs?jd)Gh+u-c{4;JSjYhwA|=a13BUqE+#Rzaw{*4T;FL-#_0lm2!vg500Enoz8?K
zalpi?z<-9U7=b9fR!sL_cv};=Yu-;$Y$#ivK3;Y=O`X}cS12DT?ZF%E{(6C)4x(p}
z76CwT{pD}wKC;<Q4^z-P6Y)34U<^2jD*T60uiQ^Rl*T=w-Mo}5kdT3<GZ58CDZW5+
zXs7aa8l>}OVhjsES_|&rIfsgXV`#vq37Mk@Wf$c;{P9xo2t89{zhZ)X-Y_=J?>GW!
zY$znu=bW5U2jsgn-Ow|!Lj0_#jEA;KJUW6}f*0<3ao9#dP@Z!gp@%8Ppvz!~&)&s*
z!PZvRO$QcTf|>0<;%xEK)cE&5$wRjzK7j#7dO6@Slx51Vui(cJA9uN*#J}%2=w-ud
z<@g{93o7awmftBYO+O+@nz%FegLlWbqNMthe;fAo7vL;pjLAZ^jfxTY@1&b($|-T(
zJM7Wd?@=3!TL#w-uVFbV82K3#-YR)mZ!PE4;?psc2TacLx0E#O>TT@t>T8?(6=enE
zTCh1uJxutu#J;TA9?5Sb%BM>Q-|BrtM)m-yKJu;b-Iyd(Eg}P53^I!Ok=WhWqsC`>
z5mS1gbBQhiucd79G_XKt)4a$#ssm2k+;ica(d|`PgXQN3KJ11y&NvsBz?-Wa2$6`*
zf>9!1!*V_QguqstV$)s<U$XoiD4pOCyT*fi&ZKjb0<g+{R(6azK8AW0?357PdnzdG
zi}^kupRnuj7%8n0mUWt=t2Ehqb|1vB=6WMSJ;8<6UO>ZY3al+%mBU(1bgu7Py>i5{
z;J*f0DNqO`BwZv}Vm>fk2pLm_dB3Wbxr)Dow<U0(9uJ=bpZJH4HU)8)I>2{;ip4^;
zFpIEfackSu^sMbzF=VM%lW?_&8H@!A=>{PG9v7jF8;8p_M=tLz!&(Dd+rH(Ev{`%^
zkaN)82p@1(g<oO!C>xO6meek0?Eb`oz5K1BA4FHC;g-a}icKC0(L%>F^;Min4%=Ba
z?Br+WNJD*DI;~N$i?iH`V`bFkr>0l7>}Pg+HugEy?g%Mt%}VN`4>rA2wvO}N?^)f2
zdO^Wf%%`bMzLBYK<>&g=(hUb3_`+jf$vE!sQ>s(bR6=%VXNRJh8XFJE{bzJFnr5w4
zp*+w{Ju6Yt%(9l}bk0Q)V%X{1;;OP{!JClxq!MnTumiRCJNu=OJD7h_j?>sxUd^|{
z<;~73E33(YW><aW8?tL*KSPv7Hx2IwhM&S|RAIN+HV4b8@Z<G>Gz*#9eS5Uch^v$~
zg&fkZib$N0WaPy_j3VwYP18WkWQJG@s;;)rAA>WN9=?-42Hk~_0PtfItc|smnTbH@
z$=-+Hz~h6)@Pkszn8U2ZV`Lk^>nOm;Y&PwIC%;}BAB3!^;WFMR4i45>R;OnT$%~4U
zD+xweR>{HFn<dm*kWtJ0aeN5K0(0LF;I+b2hM~K&yO2n%1fYVWBfSup!S_t2_Vp*9
z?$=B-%-x7>l7+*B!e7Y*VutQ-EDDbnfyUmK%A3@aU{qgw29HcBVCptVa2DJANd`Q}
zzoB<-T%=sfmz3%R2uj+RH$Ef<CS(je0D}2{(<OU17tf*my=~qts~QAZP7lKO@9LlK
zEFaRp6UMxKoof2JCu~HvCG6=7r0?8t!vku_;mLDDL#CX6!CJML-<vm>M!9Di1T)P<
z=L0A!s~+K}LyZ%H@nh0-pxJnD3I(x;pfw_t$NVNtmMC0PBbz-iWR!F2SuvC>55NfF
zOs5s{+JBEc3yNWec48^uhKP*hfkGX0VSB8^#x8YTYAYYEO8P<Ja?kN|67jJ@0*e~q
z*O$5A)idB$tN|zXcOsNx;5>wyDV{+ZTBu#5K}^?!Qi2pQXtfP2t;TM1Kfj&fDWcyu
zvH#t*SoOnoDA?hb`s^Mjs$AX<d2jAEFu)NkpCjjB=~+yyDT@_%inJ+k`>R!MWnB4Q
zU{HJlNY+8*jftLKy!=85RAM^eTBm{Dzx8#`mFR-1%>B>u=hc1_e%B5(|KK5v`yR)@
zZ^aH^6@1UAK_RwQkWrK_5`*{RuJF0Sqai0TfN(%-k&(NJhr;Bpz)l{JBNiWKFE-Ym
z^qXDx9UlPLjCEi65sHL}EPiDJWVEhEYhUd_T;8YUt%!d*EbqEd)V7PW$0fbzaJU0w
zX1^EiIzj;(Z?cLH@lTvU!0z7K=(ZK~nU5B_A5lGEsPmHf>521N)@4l-vkAV$zCzO*
zWbi{Xyt2c!)<SSa>yG#{ka|;+Mc3JUs4AQXi3d|lD6>}OGTBQep>S9K;b*~P5X1*m
zW!Hc2xl_|%4#Nm1Je=j3najB^|39+<z8vnpLr(I9*wRSfC9S#&9sPaD3xxcVcK|jv
zAOHpBV<N~V4pS~T=nJ{mWfvJw|8Z<GZ#_rChtA|AW^Mmjh+~7(LNSc(?!c>gIN>Rq
zEB?yL%H|>a<B^<+r?uviljDMZib^Lv;in3eXq99CaDfj2o4vIQE?(28ltT6|vRkH_
zOw7o+4^{5qf|~-`dQ*`U9#JGuzfdegh5$IXhaIg)uY?eB5Qt`4wdtVkR%^dXa67i5
zA=+s7!wj=NT4U@?m#o?cmAvqyla}=m^C1fS=g#+gzD3Zy1j??=Zp07W(arT&%*{=$
z0)&g?$MwoNGc}(ngOgPz(MF;*2?hdvi!B}}Fr=|JQtW%{p_l(D;^o`S?O(sV+p|qf
z7-%Z~eLMKq>mAzvT6L`L_#`VUan&BYjlhzZbC~kvFOh*~Ym&_-Ko!yj>w{a$Y`y0W
zOk;PtG<@-rVd}Qnmj5mjOcf<t!Z+wW@OHO-Re@6RWvBY|kH=XG`k>^7bz8t|$B?#J
z2rNP>q>Y(}jc39cTdDP~h%noC9N00>%gQ$BDB@js0ro~XwmH)=Fu(qFQ6wKNTu9*c
zCsLM)*3{o2aKD}rHX|uSig0r!fH<4_{L@IITMLisyK%E2A5>O$hAM2S7vPM+3VwY*
z#6N$n$nApR>mu1UzX|h1R9u~2FKy&b=#$=i9MRr(`j-Y||HDI%kIu4HxE^{TXd-DM
zYbtm#a63k?Ad0bJ?mZ`*XMsB3FF{8F^Pg=0xnLoOn2h_w43YP*-ijWh_g@*_1tU5c
znXbTBK=<&HF=vD_A-M+=6G*a!*lkJE#XuP0VE+dNL3>XY=V!2fo8#?yar={*`C}t|
z(!|mbr5b%n`QuE>OqD{wHFjP+3p`}<v-HgNLanR7+s2&$#iO6$M?FL2*#c`=;rTpX
z{Mo^fnB|jB_ISW;if_Q90R|X?i$F#)<9xsKN!qX7YbW@wo<m2*jHQIL^FuG=P;{=h
z96p<~71j**!r=O?1Eoc%HHc~i{AUI&Y@F>G<Rcudb#Fleh8FOfko3Fc04&<OT@**V
z_;@6TlMw^MB|JkE<T~WC7K}FnO$#=fwZi384pasH=`dJ=ViF;6oew`if#cKScUm~x
zX7A5{ppbx!B96loU-|eR+iVky@Yc}B0_Ra8o8tNE`jkz<+dBQz-{4tMKFg8*%F`dh
zTa{M)LA5o|$Y~5Ob_e&LK{ABroWYb{T3p=n4hR>9Z`+*yNen+C56XB91W9K}bq0$B
z1g|{d-tlK@vEyXmhe33}xE>d4$9VuozAtSJO&6Zl{<PFBOw)1V$&!+|bvg3>!wSLk
zkZ#X^^?n3TV#XvCMa#1`T-sfYCO*D^i9xEe7FU$JM8O+K0i~O*V%8w(>OK>+D(9Fr
zr1h|yV^G$>Fn&T35a9s-IOK0%`|5*L|LEIJl~VEneTc^4<{xKbBI9rccK9Ku+J{4e
zRBw*UsE4~BB#5oyyEK>pKfiuC3JMa@(vVeTNR)Mj<FEplwH`6C0k{^zp|C=l$Pvd}
zMQs+`DEk!ZNy-2VfRj_*W3v=TY-UOHt+s|Mw_6xrzcP5uA5_xf#6djI5wCSd<S}tx
zr{`LH1bZpHI)u~L-^9$+(eaVS2W1PlHh{c(>C!1Wd)%E^Uf<!-wz?Uhqn*iT=;9h*
zOO*d5m4x9a9Q4jEVhymxZw`n04T`U1vps)2V|q2`QSW2LfjHs2hA60`lXlwjA)Uig
zn}JS%hXC1LIVX5^MLAra^WxOh6rd0@25@MyMh-oRhx24RX<lSV#T;(}wkkD<2={ES
zYTh@Oh($*Pd7f@`919H&GI7t_J=AO*;k672h!>(qpgFd3IOH{ja)wLPJ~2bHmggSF
z!|e&d_JE@Hv@z9?1y_h980wd%fTb#!9*`8QxS1?;ZkcetINwc>Ht}OAh=%zYR>_nn
zF;jeJNQN_2+}(mNyVri6tnWKgpEWjfr+%Ah^eB)$hssy0vrw5=YW1CkYiYlx6!uqq
z^7(yDjnV%5)1KMclq*&m>%aa<{jzKNU|~7S=Zt?<h$szwb%pnir<qyS_Qr_HQ9nY)
z#uCBbm=&t}b?H~x6N_S#<VVx;e`zqCK`M;hfg_QfXMVKqcA|CScv{+<KW{36Waj9O
zyVca;v6>4*{=MXw`?D9+=cT|$>c!%%#W1_N(}Tn$)K>frZkh=X3JCHX*6>Q=6M|&G
zS)#gvn%t-&+APF<kMP5*sE^<oEuAIDlvy>qDm}R1-JvXEllw0y7%f!&$cT4)(es~1
zOHi1ciQZI$TIZ%>JyEekal4HVW@BmX9fdcT^m6i3;Yz=x$j|zKKjQfu7N^P@mDRK2
z*koO4I&yF1d6IKx>cqD<l6F6%JvwrRN+9Yw8OGCyoR18}Ti;1aDPDLI2|*MrWK??#
z#1xtaci@qK-D>N&zDCevGif4<2&@t53xi<9>as1)7|HV?Llk5qedXgOd~3g*g2g6S
zeXHU+U-RzPFT@EJIZ4c80sWeYpkeBNQ$PN1H+;|e??Q`=N1{pntjKNhdP}3(B--7@
z>ucWcQIoz<#Qn-zi8UG)OMTJy{b8hSX4(GM{=`4W7fAiM$Y_IXKJCmeCO#GVhB}E6
zQlv9?!~jbmIDtL4ErN8sy%h<O_k47=t9qu#=Y*@U+ThT~g%)y_wHtOR7EWjLf?(q$
z1M9dU+O|tDz`rroado)%5u_PX_)IOZ4pI5k1M91Uk0B7%Pr2@;kN`X?GU<LuI~P4X
zH~isnlaE)b7ah%8YMIN7PwG_~RiPK1d1@|N)UvYahPDqMJg-!cTv52O1~yvEeJQJ%
z9oQw^*T&$AtyU(;SZVqkko<L$ABwXZ4U{*iO#0!SGl?4!hak9NA%0){@hk2iy8@hz
z&3poppUE6Px>#pyC{9QHw|BI4w33CmR92UyV?oKTcc0xdyb`F)o{H#^yeZ~LeAc{3
z81EL+jTz@~NakZFezKrff+5sle>5|lgt}F=1Lf;koFNcQ2099kwg6S|C{(dzEU&BH
z3<*vvw7t0ZqxCjSJwo5)*2hu59JGM;FTQin8YdUu)?r{m#fMe&T!!5UCR#GQ%N{ZO
zTQzQ2+4@IYDb7%r0Y#h!@FqS60C#uFaiu00if0Xrb^BQj+64q+X<(vP*eDuIj*3t*
z#-8dSCeH-cf13iAAfwXmGxekl*<sMWe5eTS4(d1g#iz4v?sE1L9<N^eIBMZOQXhZ=
zx!OoIM!i|VO28lfA*(&?)3kwnx%W`|W*=EG?oTbY6M@Qrd&_ghOQrPlB{;zBChD}X
zg|{%si~D3U*-gHsy%pDK$SA9}x&Ago7a5QGh++~Vw#<T2vx?O6t4q1WKvsTdT+Pyk
zX&Zo>)0ku#C5!)N6$>?D@*;>27}&D)e*MGhP*$;$1%4;O7yl96(gr7kWEf`y*Ys*D
zgk!0LRIziE+WpCeg@@8RoySKM?$aZRN_cxXhu$OY%$u&vw9aXgdW|7L?=H5Cl~bvM
znk<?vJwh?|3r7==YUMd!kSrDz<EoxKc@($rQm<ruA>y~E?3Zolwz(oN;mX0-^`bj*
zlA(b?-}B0fgsbO6QgDhF5Mm8x(n5oZqGFLyd5<ri@{4L5(U~P61y7T|@cY0XeH(6V
zb-pm?`F~drwjY7ikdh~!MeDEkenC;@a0z+rZHzd@ZDNN@^+&&5BNbeeVwV9V#vK)f
z%>)S&0uNP^%*+(o&Sqs*ZIipId}=vW31{IUIIf2@#E>hN5Bo|>jr`65)dZHM6nd$+
zE_oW;;bP{T*%o@(z<F7l<puNkZe$EhEe&F(!ERr~W@411Y;t)))XN%9jIvf;6&1Z=
z@c<RozWN4|-D{R!D;V2cMQTpRJT8=@)b=9`o_^=f%gf!RZm%pf$t~B@Z<arBaG%&F
z?epk?wZ~b8M9_faxX?2?NW)9`h$_LdTpvI{j@UDv)R8Q%7NR5Z_LS^h={J&$vY4+y
zx5dt#d2$UFndJTS>$SYxyu`kqo@1+-FR*+AQyk6RUpPZLh~zYsrlYmh*h0qYuFx@u
zgzBE|DXq=fBoZQyLD>WQ($89miScN?w3Pf#bO0*hh8G4kkVl814}LHB%JaiGcUv0V
zxw*NNG@K1Wc8@&-YI_gR3U1P@T}kwMywX&q3pY5t2bz+D?u#>?(TJf}6+Fus0SmSr
zi7uz_REI`9?^KUL;zoO_CmkSdn{#KN`C~-6ICK{@IDFU@C?JZz1f5shbNx%Uq*F%#
z?nh&1OSPbo!-Wl)+TYg8-3D5%{w5}Jl+nPP;`+@MP9Ay)<_??|0)aw^d)X=YLRm(2
zA_Cg(s_?GJuw*$X0$G@05DRAvfS@@wY;wv<+y@KrzXCc`j7`XG62xpN@ud4gx0wFh
ze&Z1w19K3q?Z>f5$8=#YBY5d1{gkwmYdGyHPd3uIr~FmQ+Sw1TL>%o*)>Gd5D&|>!
zW;25nG*?!<u!w1C8S&rEE9NXCuP?3(#T4ed6O1*{v1BnlUY&klLji-gbx;0H)5`j`
z`EYgD-eRPMJddEa>y&#(HJWN5?e#zO4Gvz;xn!tYB2jFmBIuC4t96cD(+d$5*Bc2h
zAh=9}W+q2vUrI83+V9vdTz(Z-SxYkKTI16OXMZ<&14C0iJ=Q9!)(vgJNsHCpXk=`2
z!wU-gcL&`?tmZvf{{|0?hx#4)!m!LavwgJ(357haY^l16lWn;giTtJZrYIyZ?At1n
zO)!;ck4cub%k~~dMdHRM*1tB~!X+7&)SK#TTit_^@$9athR@+TH#}}dB#C2SJ-ElW
zXdnm_*V=wa)yR=n<;eGEVIVpSNIYT~<WP+)K1En0R&kKmX)<V=1z@S{2~Vvktvzr_
z;8@ujj#m35$e#iFsi+bH;?^0WATcn_Fg7@%TQV_{HTZY-OIhV>(xtN`?^yUF0v(Nk
zp=@-j-{H31zHju8AFXu*0tS);7}t>bhHMRk?rRYR?Ej~=9t2cSbuRwd-(Lq$J`4kN
zEI|BZvN}vE?^Ye*^x#nCXg^v7+-=Rv(+AT_s-7d;4>y_?gCa&l<e3WJaBJu~dwHP~
z$e`1jpI+b^U3Qf&9|Za~nR^4!+&06v7v9QB)mv8I;*0sQ9<cFmM?ib;Nq9HTQ9D`d
zSsJaXd4j;g<9}@&MaYZ)A${qL`*?XXRBw+q{r81dx6Ei_Djq(2*UW?D>U4~JIP1$5
zfYDZA)SJ-s`aEpL2+=RCZT*C=H+Ow!Jo5$eOY$W>{-Eki=Usk=-ibvwNe&dW(ku{Q
z*Z)DX^-2<VcY`B=lPARuz*nqp;*_P_(YfuJLscHtIrYOc?W<pqQEG5q43{_&$LcpB
zY$jT`{>kFj#|W|SZZg!#@pC!X3-|Ya9R$%;ZVliJx>7#VFtsUhHee7s8T3!;TdNaF
z1qp>?57EG)dtlcgmu}idkNH&Vje6;zi)4T??2o=%{uzT_q+@{CtG&hR7d~t-SA`Y9
z@hmZ3F>sWzYv|5jdwl(vp-^F}1V_LR#39Fr=bhMSxIGR^!%b(foOo}Rd=e0X6}dNQ
zaFK;pg)8w%b@lbaWJA*K^Ca(ji`B%wcJ4aRGQz{V5s*am!Th0VxKw;-&HxeuNmA2Z
zdd4zmaXKiP=*Zr7x+Wi+B5JeJNX<e2C0B=S4@G-yNP_02Aht18eE7_~w23S`noAyb
zeRHw**K{x~di!gAf!w6Ylyd}lqhqni82!?Jo{E0%t8mX1YWWMu8E8eL4Tw^O{xG9k
z?Lf&Y5|^?EeTVIF>?j0qxf##V?0V$+bIsIApOk+bN5MMMGH<ORvWsoh;97R(CZ+AE
zq?@kSF!x<~I~v}64diEdrc%nyUW`rS*Z$S!@voU`0f~_2u!}`+K)ks;br7XoK`><3
zTftb#K`pWuI92zof6k=+WT*R!kKsAQ*0M3j*vae#;Pe~5HNZk?m<suHSTHf9s2~;g
zg@Pw1er-<<P7Zgu+xOR`MeQ}1e+1rAH6#pATQjPQLOgew4natr$=xVb^P+YVpLRB1
zCss?a7-4^~GkX*7$r26iIX?Ms`Wm55=Ru?`HqD|O38WZ?H&){jU|z2k6ZQ6!#1G2}
zNg@ZRYYV`)b=WBp{z%3i!B`k-?y$<kzok+NP~D2P;Jg}w#cGJ`yt{P<lmn$M7|OOy
zFGZH|`r+Tbtv$RLF!=Q#>la(1;PPCF%uys4RR#g@>x3;wg(0Hyxrd|El6B6}Kr|3}
z`y#@ODUufozbeev`5BoFJ*5YzKO-IHZoLUHt)8Sl?c`!->=g@ks#~YJ?wS~5>BLK0
zw+cC%=QJQPMtt5}RZpUnvIB&9jA6(LzZ_Tj;i0IB2>=`IcAH;{QbU~8%vGmn#$DVi
zaV}cAce!_zw=|UixSlc6v4#y{*qr(^g=2#EY9nf%BCJgnAfwC6m;2xdG9PDi8k{vj
z%L{AxtCHkywJbdtlbl6$t-KIn{WpI`a<DYLAr+o1cyUm1%UW=<H86DZXVz|DHfd;W
zt2Go`Y5w&ROOzKu!+9cm1?V@%7kqR1Kz3Kw`w8vEp0z>IjykIcc{f}?kEonx6l<py
z1DBj+?gh-2p&_QZq2qAl>|-CPoXgr8&t9T6I&-9{d8=F&`4isaxx**{1J?Vr`jcF$
zMy!ij{?&r#6;+|H)*llt_s?HB%g=~?;BGeYP1ZlaI3d0sCJT!o7&56(RcHICb;37*
zHRVlxXW-UGtJ^;gyBHV_wvH6Q=@dh7B%qy8ia17sNIBwBpaJ15%3S*ex`6}%jA5F>
zl(9P;?v=dw{)zPfMM=Gh20eo!#os+|h!f5%FG|T_;x|CoRIrdb^r(KCgI-ibbgn&^
zThl<44xdE8Xw&3GLsl9NMe{l!Ft*LO^@rmPL$9BgreY92pRH%VYY2Zz@iP2vr>kiu
z15~bZUoD&+Qe`<!O=hhvzgt>aAs@<?j`IHaJTXCjr@Z~VEbB_Ze1m<5iVMF@Tl4W`
zUm{?(Egn8?oaIWG4XT}YYsB1%@SK%AI9Xqov18So3v|S-xD!4S?n7Tm4xFJ>m;nB$
zp!+j_3n6#OsK1O)9ueeSuSGezO;Yldl{$oHs%G;N{3ht^?i2Pyq#x)yIp*^INNvcd
z*$Mc%ymQ!N;Oc;Xfw*@)r9ikaW>@b)z25w850VZIi9kj`nPc97IFUAvb(EHtT?64`
z?!R&Jq_^WG@*!;f$>Y@(ZXw5XHlH6>EH5U;-Yz!(2?*HR@?kkX*vaw`c~aeQPc;1C
zm?_#_DQAch=Y<u>9*tr&g!Qdw&18we+P}{-j2U4v!yX@UK^T=h!!K~;I&Ynto<^>f
z^L{&X2gcA7V|XKqh9^j0%y3w+kccgv?a<uJ@wafUl)}JKOG*%H9w=JCDJ<GY<VVFP
zaY$RO)AuHR9GzOsd)m&N!nwH?pP$obBG7-bueyQ(woigjySb$*l<6>3ln=!R6!Hnp
zRkKeTbQo5Z@`d$k_IqE{KWUkbQ?td>L~wP=_WzRo2n7{9V7NK&erN}IldD@Y9r#Z%
zpGM%bEFc8}ZrNO~GcYj^WA^?`b>ySU32?o6hVpooYl$RsGke2~o=lxzy_f09N@GW%
z@V6xwE~44gl=H@z3x6?-j*F{MSIXk22chzpSLk=Q>V7j)0-2%j4K0ND7i{Cb=a@0S
z#0Dl<dA~FaJ#F+Ng=t<E<HFt*yU?9s_;l_zFD`zpkG$?F%MuC4On*(=fpr%MZ&V-&
zF+dY?1^m!F&|ue(qYS8&-v>kG4=6`LZGOY-PI}c!5sJ2owoWh8lW)K>{7*b(AGgW%
zu;gsd1yL)<Gm&`sIfOcdPn#2ZgeNc};{Nz(EsBA6y*UMi1Fsxs3_<{L<_A033p{n;
z0>r-(zyIJRd7;rX^qn{PiwUI@@2D+{zE}E~3-i%YwgF}3l=Sp6mqx1IOk6v6#X8p>
z@*I$6r}CqUs6nohU9xy{`1=U^>->Yar~fv%Yx1)ias@z-+~f;kGNmpP{y9m}&F3Q@
zcg)-yU}R~v_D~5wP|O($+hq#Rb9yzQUvAg<03_?qLPih^sOJ#wi!)@e$z6BOM^iG@
zS_>J5M;igBYipKf;R)Tc`+Hs?feeL5`{DcP|1Sr?YO|<WgT38YB^ygxY9!zBkO#Rx
z<ok|NgsRZ-(H0o5+jRWjyc5nQ{twTdNF=;(74AlEbt+KA@}qm6`B*F`4*zYC_k<EY
zP4MO5y~$4!az1IL$PN|o(dh45P53S^{uh@VpqQmsavriidan5LM_K$O{X1<lng#*E
z!7WPCePrdBZi6c?Gz$*YgwEzGaNTQ}edQC2e5CldI4I2eM!x8u&B4Jqv5WA0J47H;
zW6L!Z&>J$ynJjN^8pbH=jK}pVGCl{rF&Ux-#14A}j&Tiz-$togP3>68z{Z|h!RZj>
zhD*D<BgAh{;Mu<v6aKJmD#kGjc%sh&5%cS!z!|@pp7vFF;s=Q<kbCeVglpk&Co4n|
z#fRs?-BDFvJ$~9Ep<|ekBd(J%<P|OCYw_jKVaA!0@1knCqe&&rN9>f`>g_^<Pw!Y<
z>aA52+l}=k1ShvMje_21G_e?H(Y*`X+ssSlXl-=PrHQ;2p*O*V31&={jw>GvkRP*H
zeX8)zIJ-Clo{M(%osxv8t+tvD1O_yy-w}$JI>XF*2@=sedShNPrv@PY`bwkOZq<8D
zTd2IoJJl`@ZEu6o6aqE5==*hb7i=qir&j@_>?$?KhcFgbK3G0DxE{cH+s-IQ`f%1O
z{QL6m^ka8PaKbOBTpw1TyllJ@fMZb0K<c*jmYRkfY}I89T~CP%`CayJYht7WfSr?^
zg=8tXB+y;^?olHgc)C6#s~6`z-naqMo2%g`M|YF2tEmc+Nf8YmoNF$R+en&e#Q88)
zVe=;q9U%vXQ&KpbZ3q#So5%^v_t{~b*=xM<HE@an^$~Zpx@5(O)`F=~lsvCM81qd4
zi)|#Doj7`$ic3U(1hxR6Dx)9@t5&mpoHDgoqp-QOkP4}}XA*dNvd@i7@or!#e~{xx
zbQ0aes%bI&RD<6hllw&zQ9FDm-*W1zqayAYnb}OkW$wzrKlgw>|I2`Ob^HSUKC$i<
z%#QfnTe5*D>8{6KPd`zgGj`75Yi?%6Cj}H1ZaUz0v2^(fZxJrS7D7Xd@0*TjpyDU7
z77wwCIiFKL=+?UUSJ>p{#R)4nj#kui$$os^uJlM-QY59ZT9mNF)HiU|z|0<wIk%9l
zkg`55Ihev1t1g5i9r%{DZhHMHyo1(%PaKsOdQmS8+BPrI&F)!cWr)JS+~C#B1|Jd&
zX87$57>a_QNWNb|pdF1ikR@*|><7XNgm06_9ybC(^L?T!q*gdfzt0Yl+i&Z{r^3&w
z`Xk4m^wf}~M@D^0mefz?OGuFwV2um&oZD&p8#SlUSs<pJyy)8<_}@1->6vTAzRGzt
zu!DvPovV&CNs6!xC)Jz9b+s2&^E8<AS=E4$C<;6i!3a<`7Xl%SOX@|);F5ZOZJhl4
z38FeH^j=RA&#FNbA7<aXJ^`L{bew9Ce3@iVeTZj)`vd+I!KF;Cb7yHDQl=JWi#TV$
zTQ>UD&aLK5Z7v@DJBXsUW2HlvYHc&^`h)Sz=8h7-@A1a>+{}FH&W!skCt1*&>jmz*
zb~0ssxm=So+5mCg+1hOOz;UAPhPD<#p8E1S=L-XU@?!4-ci2%kCmk%e?2_*=mYn0U
zqhRL?kGdGS9poc>jau>^+`dp6Dc^+cKL&rW_8`=2<_0<mp;G#NkHCW7f7L&0J00M}
zrYj4CV{3}C^~0?Gcoe<B-_{14e!PpE;4Q&OLk6-Np(w_izwU>(VpgB#+_rtkjG8+S
zMv2~FiLuBelyxYK0Fu5)M%ljXOEDM>mbH|)Ke6KLLcbJ&2H3yf$|E)3<(w)U@QK%X
zG5wEf+rxqH_~P(E6&08$zy^RUxb|_|ehom8#h@`_#q|Vm%YO>5-7gT_;7g#{ik`LW
zP|zb5%2dxia^5=llg#|JUT@0(a5k)EUgYuj3smmY4f0Rlk2o<F4KG%oVRi=DQdS#F
zk&RXeoTO(Z<JZ|E-2bo4dz#NHvnZtAqAY)WI6)u~`;pxL^&sWHw6g|s+#St1T!}+S
zAaf}B$gU*oNGhKUsx={P8Qxqx+t5QEeDh_YSJ(N0e_JgF4@6$mON_B>IhaW`^AUM*
z>CYdN%-?zw7~HsGVhV!iZ9LjSfr4=C0b|t8zzjYijR<d!lnld2X1V8rbSSR8T<#U_
zW;<IPvrgUp6!Fs?a-UiClfjy;Ig>u}(-x~r(-!jkvKtN+K0BYSJKCF$Cac@8Cidyv
zEa#*6qO%@pvs`Bm2~Z|o=O=7WtKWe=gR_K452$;z`42mEGb5u*=xxK_XN<6&Cq7S4
zFFpQAxQsP2Ap$9SBtu(AID8}AyuBh>kP!irPK_Q_r+KHxr{`_W6UJ+V$%1GLpV%Cd
z8Y1=EN0jmTpB4E8OibHhv!G)1gX~($^;gwe-)d2e7|8H{f|R!B5FS{?E`nDFNXJ7=
zBC(Y3O#&|%1Tr8HcqHLIiXKIWaqeU<WVDxz$#=DYsAhf)k579e7%Q0HV+IrY2n}Aq
zzqeqTQ_zU1U#Xb#o^!yBOXlc;5sHBpnihNIk+UQav0I;;+%BjplUQ85>CtVD*S3CG
z=>5GxZ=Q)nvr6N@`ag=!J)Y_RkK%JFQ({VtkRegZCS@2g_vMmnDEHiwTkdzU5OWDp
z3?Y<zu6^AvxkQF!l3a(mB$v75{(Jxa>Cft6KA*i_uXE1xK*EqRS=R`!49zrFKU~KD
zz+sC^rN}&3_?Lf6cB^qioc_GX^LT<xA)1Y^9d{k_=YuFE(N;?=AADnKC2c<e$c9p5
z&GSNxrJmZSqMBb~v#4K1Z-g8iNv~{!?cHb6L?HifJrGNsKAvK~__Y2nQUB@t6fm+q
z;&`ElDd4!C*XT)1wqAUZXqG|Q0L4~AMvkRW>q$9KT3+VLv&JX%{u}MtqibRr3qQOk
z{4n+ZHLW)ABU0>~c%u$F+Th6rro2&jtGwGHZ*J*8?{uA3f(ke)dCF$iTWDOt@(Y8h
zQjg<wilCaNi9JrUlo4xjX~H`O=IE6rvb5U2{TOqHj1A8yO2uWi*<xb~TtM^Cw7v%3
zD&5X0)r7Kh0Fra92TIXOPzOHSZ*o(+%CD}1JWJlt+~2>)L4WkR;B1@Ws!F_tPDv{K
z|8m^&6V1m>@@n)6*7Wis8awiMfp^6FMRTxIO7ItXk}VeB{&;abyU(XPFWMi2qR^>^
zDLt$UR$S`uUv<Qo%imaAQ#faS3IBYTRha{iz<zLPK(bt|@-SVP1P?1{m~HSeaWRJE
zxiueUoloR4u#5i4)SkPNF&cWXhKEVUQ3}e8`_krV_eOzVOov5}!+?}t`x=K}95B8r
z7S-(hY95v=H4uPci|TkP`Bt}^I3zlUuaF8!Ry!P);AOZs`)|;@yM8MB$yvRm0!`vI
z6pMKTdv8)nPgNtY95hl%b_G=cI&a?nsRz?0qmP6{OVEGL_#5_T^ZS9uK<99E*kA8)
zUS-hH{%u9Sy1RVE`~Ee-0U_*Iy!a#6+6i+{57g042UFL7HEP%Q{{8bSDIQs#|18IR
zC39OJC)PQTt)P81`Rij|Rk}#zy(F|IXk;G``7h9<bQBV{l9BNhbUU|&wD|IWqtc|;
zKt|ZxlayO3-B2W3>jIcc@8TNju@UF>&*0gqM3e%*u6^>xc4zo`FZ7@M!J|QCJZ1~h
zp0oC4&{?b_u@jNQF){f_?+eov6D=|K^DJbjn7Q#BMUS=7VohgdwbnIbNjQ$7DCJr#
z=K9s%gS7dDC8yH3SGYA@Z=woC@AUh%1E2dvqKdqVi1#Tm9>+5sbwI^c$IDJfAm5ma
zBiqPkYrObG$yEEFGNV7VhilXxsyV$;vF2dHgh8zEncmvNA;*V%X}a0twSn6U05TCA
z1u?m43GbACkmkf|nqS+{PZ_4!H$8`|S*t+xaHHO<)x?2?a?2Ln_h|ig8ShlNyav~b
zE-jJd=Cmewiqxbjbm*<R25}p1I?wx@6*0>jHmM9*$+2U4QnI>V=5k=sGw-8|yw>;K
zmLxN8y?eMN7?_{mTQ&FpfIsJhswGudVoQC9Ij7ykkHKGKegT2rO>ZQM>ir*;9dD&&
zuUT3Y*}HQpMpv^R7rTDHWQ%~{^$D+PEZtg3+mBDS-BqF15Z-nurRw}QC9n8Ezo1lm
z<wbdS(zSd#i0HrN?_Z#qXBqW-z?A)+`j(~LbF^<aSRKiZt=6{9v`umJEbBE@6Qg{S
zDK15kv*OkWRbfm{oI+5#Ee<BQ5&>pQp!Jvtyp|;MClmsF=BPHuTOEfqF0PFJ)44(b
z)uay7N%~@PF#neoG7SuzX{u#-chc`iq;f7#l?o+A(UAx8&tchpHiS-sTppi0lwxCu
zUYSKRkk}%YnQp(;E|-vq5l6I6hVIR3D|Wnd(ZyS-y-78z4sVULe&$&dmUz~Mu9e(T
zdx;fni(ZEM(%R$RHB~&20nrEu9i;jW?yD#(tgEYDa^Qs_7Fv3xE!qmRb=2jEgPz5P
zzfCGy0bB|?cMo{hsE1PP>sj2#0;go*)BGA@%n1Lisu94>h(usDu4#(RrzoFp$|F4&
zU?BpjgHK2O%I$m(!D!fKt9S5KMkug88hOu`=u#N-d*rh1xgDtckR&sr&c(2wa=HNj
zt8pbi5ds0Kg}0FrNH133{$*DD`X!KqzePY0VCsRZl7B`;G41^IPNg0VuKEN<PmcyF
zuu;&sWER2SY{4zXrN$n-!X7DKfPk!$rAI({kp!b(!_23;_3khDr;GRl;jDm*i#8Rq
z8`2acvmN0HLzmaf;@TgZL8wJ-m*Wf58YF@ZUU-<y*(G)9aR{+krrx_^2Xn_(+%EWd
zK-p-GfFnd-f2$O}o^<3|H))s{X??zc`+C*kxbK=m=zjHhNopSzGz?|(tnh`y7~Y6{
z^}0i`7I^*VSBNCPxurP@M@cjwq>v{*$@0eQwH73^3kkD+zNIEwoLSSVu9<WN#oRt|
zJh<~6{jR#Q;E9g6H)dk$?w>lj7|kU6;XNrWits&$D%I3EEaqxAFeXE1(y~L2vyV6K
zp8TUZ+-o1QX@e)ZOc3LBi;rv+LW7;r<)yph6V^4r(r~p@8?GL`zJ2V4amDkZDXi$l
zq2a#1PemnFSPd~1_zS%h2xPfnV%5joR|2$5jty?(-+9Q)>KTsq17q9AvP)Pe*cA**
z6{cKgq|-(RkU@|8S`I_%Cho_)yRFY^E?wQ!L$g_A0?z!<-Id{`$MWJa8zUqAUKq{U
z3OC)pk!OKueKiClPBZ@nSiLSs1);J}6O~zFF}tTH<tHSvFIPy@+1q7xLAd;hS8`wi
zKx-QE@fGd++;cY3-Z(<1!Q4oQKz}-4pYtC&GhW1YH7E=k{>_QPH`4ngzj4<Z*bPW<
zPbhs@%rh1EYaHDgOkvnrUIk*JE8744j}}WsmtZ)ufc&dt!|x~H4DiQy=cQchr-2;7
z&h~g6?i~5yl>k7w9_HN{-}Ye=(5QNh;lIVdmASaD!@}_15yKPLWs6TV4A--X+vpO2
zHtsSaaG!Shl~j;;#!IW!wTvcg5cgIwyH~bA{%bVC3aKMGa_dNd=(!Hc;7x_d5-aC~
z(9>;?A0-L==GR#{QpkAxppJHOa6)NmXb8+fNAfRP;Ms{<g>+|&ZIAwK9qugwQEusn
z*DpUP3c^Z!5KcO}cr`TMRuLx#QsOKE|Lziu86i>Z1uRk48ls=bO3`7vR2B+%`E*m7
zKHQn4&hrHpe49rFnybVD?-=?}P2TE&RUo1CYv$5h-OawN0(juZq_xt8!Bp;p74#YD
ziHiWE9*0L46XI8MM&8%|-l;>?pl1+7PnYr2))@W-=KSF?DV^Ew?x^}Qnt((ML9!h^
zFkHgLz|MmWN4YjwINiX%V|1$<AKrke4i%5DH1hJT@F>Bl+|Pdo;v1v*1)?E%7y|pB
z<adVbd2V-@@^T!72zPj&LY6JPh^^qAfAC;bZM^77VmBfI-br`0^^$GMC|NnLTAGzk
zv2lB8?9Fc<=uB=;d!k1d+<^>el$muT)8cauEE@i@`~oVVB-5q(eydEL)OyzWKTTDv
zS(oCv(i3R{)8zCH*Nl9pR))1dOJ0~v9cWJdDojOxTr|F8Fbce8KN;Ur8_;Q_ZHro0
zmlYKh*XTPe?fAEQl9V0jRQ$7;5c$r3GPAV`_29ndAl}g&bc|=-hKbq1)F97m|1Q@X
z<FW$-?M&ge?>R7!^FE+YY#@*rNh)TszO3jPbY{C>TW?NcV8A=LM5iz+3N&UG7Iv$P
zU2OXTE_HMe4VkS7C_O(T-Mu`)d$E}Ll#fbg#hl3-j*lRWVBb#D?u5n<OR_SAKT6*i
zdzy0b<=Ok~)1Q`>3kJT!-@g4*ROV8`v#+!IEAw?gYQlFxfO61@CEl-`bZu~TT&Z9C
zwf9suyI<^Vvg4!7Cm*B4UtXo;mI42)&c#6!MedO8GZv2~zhEkGO&EngY;ufBGGdVy
zXcxCKw~zzrmJAXXekf;UyzCr3Ni4ae6`HB54jQEF{yPlv9T2;NE2`DZMMG^NUvr9#
zGwjZv&a*Lmz_%r(eJ^^ycI6#-qDnyGamW4lgF&Gd*&u(vn<AYp2fLw{UN8%CU^$RJ
zzg<KBQP@rnTaIW)6T#(8`A^27cr6mbq58tkhF#TEb`qL}qN3r6HjGGNyny<k2I_TX
z*>sGK_Wqu2cQ=ej=g~Rn5uu0@GO;pDFARx=Biu3knY*T`Y4cFDZJd9QpI?Z-(>OxA
ziY6#KO(*gg!`H3%v+n<6%3A2jfP_S!cpL++zBXBZfXYRB-XLM=_k~@_$UuT~bZxQ;
z_=RRt0*W6xr}fQGbO7g$2hR&@m?|c|Ln^)(Gcap%Pw)12z|pgxMAng)&(N6!`%bvc
zO}<`rTb;2EN*>uB!=vp`6&uTK_dzC$HEO}B$rALj%p^=TyD@DvS`=!YS7ld&Ac)jF
zVX@38vSa@1-T3onqcw*%1vG!-RD!2Uv>^|c5!85OzA$`~h>l?oCwk_`rChT|fhw4v
zCt$=O7uS4W7+H~aN;S3Z8|q<@j&74-fS~zz3Mn2{rpn=5X<X8E<+D;o*i%Z01?;V&
z@q+l5HDvf|iO?IX=o&683)$@CfqnfapzFaz7`gMHl-~idVYf*Bc>?bV_ch><g1;&W
zH*A0VFG3*vN^B(_ry)67<t50}>4?+<r{&PS;en+O{~B5j4(i<E2Xu=m&0P>#c}1%h
zOpt3jxTd%y)YQO)o+dI(W(!3@1^>N1bqWDWBG=^cjMgmd^w2NXJq%^9YsmtPO}nH3
zs8SEZoOSDmPsYsot9u@%nQRybzWg7`Oii{af&_#L9!7doCwuQg@A{JAb-RORE`8Db
z_l5xUsFJvKEj|y;yea6sY+t?uo4FO9P^GAW+UjVNgDP>HhJ#y~qJxzXt577Wk}P@y
z0Wsx_>%M{oiJ~V8*+xbazVZ+#-Fe2G#7!Wnny<*CA65q<8gG-H7rDAxTB>h+D3;47
z4)t<(h%$?vniOIpND_vmZs5-oy9tuePtyE;D`5<+(k@Nou5@20OIr&Ig7<7l)0QEJ
z|G7Lho?8`wcBH(b<xkUSW*V?{!%IJ3wwZLuWTg08%vtx4u7I>#LXSaA4d`sra0!dD
zf!Bw$;x(LpHQyrRJ^VaScZJq#KEc9-HN#LTuh6WS-UPj9&7?2p!z@@n-PhheFx;6o
zhO94|9H)sM?Qj)D8=Ow~#n$BY{D$U-<)huhUjyAq<f|UUdG0HIi#%#hsaJ^?2wWZ<
zG8BN<%2;)+nRHsEvw?mRV96Ce51NqHU|UY<`A(6{ZO|HRD$)F#gDQBx5%frO4(6kE
z-V+-6?E;X&Iv2jHGb7uoQ-HQUBT>9vd*RFlR)KaHmhtr+GgIo*R061SI)mQZJlrb0
z{Fm8!avu{{C3=SYf*G;sS0EgjRiHX<sQkdvTp~C6`cQpyjk!Uoblm+|CryNEeQAvr
zeEb8aqpzoDMx<9$h|>x~JFArzlv|hc=$!zgsl|C7R_xv1ccD4Qf7gzWW^F1h%6%FN
zhaP9;zo^|grcuDp{t?G1yJ{zA%E)T*_|j9H=%xLcy*)AQVT-NcsY3L*2u0l3kD7Dz
zjO-39e$*7Ku_u%}2Q+|>uMWv|iyG+rMnF;y{~oOAcuPbn^hvVKJQ+XS+$(co!uJT+
zxj|-Mfly2Y+y}O0W8zY0XZl!klUCSr7<#gC85mog*>bBjI`H{Rj*mF~8A}|~Cj(dz
zN&0kaD>^ktgUs^KJTEFS3T_MSKraw%IJ->s;;%l1r`N9iDib~9J45cP`Kh^rx$-Y#
z*NvTC3HFV8YfTYWA<B}Sot=rYikxipm&?*Zgdg-5C=9tSzJ~7{Oi)0lN`>yQaG$}I
z(`-R0DyFY2EG-CEWpW-jjmazAc=4I7Lv#+KJG*M{eKKz_m6e3Qc%`MuT&s{0_A2aD
zYuo-0%y0R>J_j93P<BH2+&(yRohHuIDlm_zXj<5d0j7;Ix9OaJQUQ6fAu6rA;h=uG
zNM{R|YNwE?J(-p_-Q9vwkUwdbjq96z_6dN02*{9t0Kk>?HV=eE<{WJ<Q8%7`G7(Qg
z8sg}Ro2qbUaL612><@+oxW7u@3lgIAimLH2UI4`HrcN}U$(`#c9(9{)QiU?3R6Nkp
zlMlI6UopBq0Lh`=O0Rm8Z=77#L=)<Brj+y0oI7TkN>KT!%hoaMYIJt+w%TOUp0d#@
zTQ@D#2`jI%$eRG_5v*RJHiO1=kOiR2r}{bg1Nnu~mcEqM`jC1OFbP?)9zRPJXY3K$
z+-ls@J^6Rn!GCk3Lz;2;Xvb35s3SVqLP6$ihK?1F38eIH6?0Am1j2@4T)h5o<<Oqs
zLk@&A(x~Zao+O^Yu7A%s@2iQn5jSbSm7!g4J}5Hh=opREWZqdgB;DHCC43s~S}Z~^
z`a3k;(m#jMKrot*^EgAWAb9*Hx!1ouWr^@7DyK|$aJ*4m>`GL}E7YCL{zlIuB(kgI
zldi5rs~kkB)fDC)<DmzhVW_pOH^(Wmnf<z-p;q^NkOv(C)XiTGVEj7f>a+7yL=!7^
z1uqDVBfA*Xm>(Q&{&~j_{E!%<iRtI}hVdTZ(9>k6eB^WCIie(!z&Z>u+qkk%ma#1<
zT^@c`BlU}*c17*YsS;wsRCH`%Hxoh&z9FnD=0I1d_XQ6RM>e)R+|Go3mzj2``s(+n
zizq~3$G=ewA6lw#&KM3t=_W@D{A9C%zo(ISXtxOw9*Rq#=Ub-`7J!OAI$mswqu3A~
zz_#32#d?1hf;Q?8X^{T|b{9IEZ*aCe(SfSIPRxM5L#8B15M|bo#jYfD8;tI14|@ei
zG3yMcAboBcii*8vbeTXGXJ5{K@h1+zd$fA>vbsEY?M(ms1xTOGm*-|oxQk&MZkCoU
z%}s65e|(s1Z*)l!1jE@23~?7DG4WUz5Kf}!#Ul_bh@ZRiLZSl(*u+mhix;m*NvJ^u
zq0c|5UC)2L7|8CyFNtpwfiGzO^gaALV+vJ8V9HY6ET-P8b|D_?ec?{dof6OHet{}f
zHn5&M7(QM~(~Xmox^X<85K5qz>~BZ1URavN5z21@to7@<2%lVMXG)D^$o<-)ab$d%
zgankbt1Hnrug}^6xXuo=rA<5Vi@2q0G#auyQO@3JvxpQa{d4z(b{Vo+KJ~jXfVQx_
z8N)O)7>i6wRD@cO-}W9izfpFR^DJsBJ9N)ZHdCw6z;^Y?fBSveC&CR2&lEZ_&tTkC
zP%}B>B;%Dyvyy6fr}S7}wmYD>ARxrwb$`+iLac)^dGxj;&-k}O*?xWyf6obC?`)%^
z(i{|O_W<djb42w%2i+McwS2NUJ-$noQ_hV)+akVj!K+9fva-GJ{~tX+yILj<X4C#|
zh)3YVjn8n)#^7Ch*_N$ATYTTpzlysWVg<<_S?_~yv1E#F{+Y-aHO+F&n|qg)WeKT}
zmIjBNIe^9<^*;L01Klh%*xuaw$r?QJLtkT>F2~U@exYh9L%LiPVRlXIR9+C~T9x?w
z6ogi7L&rx~^2&U6h<lG*u_34G$@=AE1-^W}FX#^Pa`Wo1dWF9$tNRt>{L2MZt(yWp
z-}M0dEmdQTuc*K^{pGgHxP`@J(7%y#*vvrj$WhwB!O={`RN$Hk%$>;$0oqa~jwVi)
zmiMJwj&CEj0L^1$Ln>Q4^Y$-i>fA=ftuA}hR4&!?SMruHng(k`{qMju8r=>f()7DF
z6K|1F2q?RbvMs;_|8`An=xAejcCZY-mH?hE4OgVJ1)*o-ReNK_%UDvnh4k{^u{5^;
zNr7(Dq3v8(9%sIGYIxbH!H1M6B`Ym4Tc`lryX{t4%i9xueF33p2n2kKBzsO?6i|fV
zcL;V^5LZxTulURB<V(&}Erx{AykTf2@;UsX)*b|kb*`zK<ncaP-@1F8aPpz%EunBi
zAl^GJ=gLp}&S>?|PO8#SN@i3TY?f}Yi>Rj$h@gbs6#aI8fiKgU!+^2{QwyH*HTS(~
z5oMkDMH7C?Aq9Fx<yxnxcJdF|%bgzS_TC0;5$oWz&)iPEcnzuJI945Q!>g(^;YHpn
zy99yg!P!Z1w{76<NDk~?$oixrzHxV@{}<XfFz^yy?{2izy@q<3?1C=17Ci60Q^v$!
zMWh<c*gMLSYz=qk=diQj{A!xT*X_~jKkPjjm{+qK3421wPvHV}TAn24HW;M1rsskf
zV^@W9)bMfj6#qh%e^wzx5GP}+a@{VDY>ayeoQ9nc_OF{ntIBa#ON%VlgChYZEfWrr
z$$h=Ig3&()P1l+a{t6X{<7NezBbB&|^S}oPWo{P;4frKSS%@`7U2F1fMdRPqkWDW|
zDy7>2(YM;{=G8oQheN{R)i+2iogibf5)MD)Cq+Mb50PF_mhmg|(zjNCaIAV`CG<kd
zeo;;=0B?2vg%p$`90$79k{2IUx~7NRC(A--1m2^JS#q5oel+6f^nazN)>MV<QQC4D
z2b>{&xN`V?)5l6Z<UZ|SbX@nm*PVp+6dE*Q+tpWvlZO$!;Z7P__RQgtAo{2M(E1DH
ze(;pK2g`WbcOHaa$RSI6G`p<HQ1!V^VHPy?`EAAP=Ng3=7o46n6|d|YW)es6)51Ej
z!f&+Iz(Xj0R>D5n@@V-;;K$|T?WmLa&?C!(BA4-nr42-Rd+gp*!+4oIZ=)N%kMHm$
zB>%i8Rsz<-(Jga-kN(`pzG!qfovk(8BF-V%_cBvAKCyC5!mVtyuX`e-Wg^RXTzNb;
zw)h^taR{ez4hb4nH^4W2u<r-fyNC0-*_YRPD_JP548{HTA6tKkII<t@qZ1^i+474_
zD%&2-xWjNd1>lBN`4JS$%~U*i0%?_XLpuP)p(cPor52_<9T8@3(Q<}TU9{XF(egC)
z9%QI<<Xq%^&6hCD?NxBq>YKTbz{}*g3^V6PgsE8@b1prVeXG?~we+~jV(Ko&)OMv_
zL^y~xbM9P(ao_M?tLLui7q)6Y!V&MqaXJ<344tTsvu^z%p_`K_{%>Eaj{0;{+5&cW
z`irH@Ennz;o+x4M&RhJo;g?P|HP0q~DtC3OxA&_L)mH<k$W>N`>GXQ1Vpd30<6A8e
zySFch4AA9a&8?;p9_;<c+`S&Ws@6U75TYb%arizHQMZtmD+;Md7}zy3lGdK`c1cxp
z;)To1neHr5qP{wQ6Mb!bF1SICuVhFMerc(}_<nEUus@Ln|B6OC8t$MVxf6QD<G%}k
z9hOSgZ)z;jaoJt}ci2bOJveIPQHEN4qds`>0Q)OjA=ty%R*Rqi@73g`Znh~r5xX|k
z+^n7ZZjC1f!8j@Bn_7Y7wCX~1^_noz<qH#LG56+>8l{<4bZm{DlL62nfynIu+EN;@
z@blTVbz26XY&5LJ<QjIt$HlJ9aBonX5<B{aDLyrGgm`s|ux9nAgCl-L-tL#TDqSAh
zN_D-F4B&Kf$^-nkk=nJWf|cs;Q9u;Ez9vt3USoVIP>od(stIR=K%8%PStfhiz~c{u
zAk&AB&y{XZa+k(^A{}^69+mu}Z_O*G<ADfxKol@HPo_>fLeKaO)`SZ@!BnsYsIW71
zZR=lO`8^6@c83MsFcDfKzTh--zJVtL(6(HZT!}BE;J?w_H^w-zQ|uLF2o>b^t(SfX
zy#H>-qAwN!TOcY!xk3L4^m&SYU~!xbwc@3S_V`1l6n*H0uxR5hhRf^GrwFj6l8$Hy
zG**0p=wp7eM>B`5H;T*H^5IkT#ZJ3?n;%i-?M`50a~Xaj#BJw2jEJsxh308E<&T6h
zzn9XeYP|HNP&fPk=3ca#wSMZAHU|=8JJif}`ageG-aKJ#H4ol%*fQf{a7!v%Y4$(K
zx*v)B!Ku8qU%NVfXV&mqX&i#!>k{nm-zoD+EjA|)PojFV4O+Kc-nS2y>#NKFSsPt0
z(CL?(oVzkUgIN_M_`#=|xlegC2brqVb+i&a9)bNivT;Rrt}Ww1IjCL^M7_lie}&r$
zzIJK&YbXuS6pwUVGwRoXK2RkdOj|ITkTM^KUr9pqUIfcvAw@~_6zVbECEcq^Fc`M<
zlYSA90z;?ZpvO@Op3s$C>(o#tO*V5jZG|a8h1l5hQed3w?2yl^vUcKLtqjygUlIez
z0ZqQCEox-IM8~4PwFF<abcTqByWYuW^r)WTD>-kMYR0y6U@$J#0j$-^Q#CHJh;N&X
z;_8eBD@0G7a5{K|0EDQ-)QGYu5`eNlo)jIUKYKW0v)|v~R#sMyq*sfTVs<TCV6;sb
z9GO4dS<ZE13Wv~5Mn8G<-aoZ8RpS0_w=#PtULYY^@c&IOIFsZa;339WlHi^Cnx?6z
zqQn~oK`oiypv!yCuFT2Q0l<QNeV0Q{A|yli);%_k%3Jz2|9Q!t@`yp*>}Bz*lUvOw
z%lsw+D0~^aQ;k&OKmnytBKyIBF1$nZOtI}1AV|uq!3FgRIBF0K|7LK=@rSR==!1FD
z3__?hbfYrJ|8wY>h<CiY_n^h~KKzajwvp<|-wz%P>v1zS^2YtD_c)E$`#kwKz)2xw
zx5;pSY2(HDKg2e;DtiaiNer<L!sG~7H#caG_VAW>sI0e`O5K{PYdII`;VMRn#c}Z5
zHJ0Do^Rqk4z5X9+K-I%xVd%d67yQPcGF;Ml!O3D_G%d|#^!HS->vGq_2xL6PG06X7
zWD1JsRHF>JaQM@b9Fw`swMr=jjA1@JEZ<2(%;t>}HN9Y^zF;NLu7m-0f#ucOzVS80
zi=yKCm2qo0R4KMINH$Y&F^)S)7~~rPXgoG{&kw^Y%Gwd8XZ)al;z@7`;CzY7Sm7(B
z$}=Wq(3uiyb}wQ2YS+cGv$96VuexX@3Fi%QKz?r9{8vBqbO!)58>RxMW!U(tOnI+*
zG@6#?*TnA4?eAY3NSfcAkGgz}eyi1T^3VJ&)s!E_lCj_=`;>^naSz4TuCgpjl)23c
z50QIw*xFlVeKA=`#4nQ-UN}dd7ktjXDiDNCTy`TClck#f3-&%(z7yXpxc`nR#fq^)
zF96oj*GJoMhsUYqRaSI)nXdtV{Apm<)f;`1|GKJ622=|B!5Q_2Yga~mTE>C(+@j}J
z2b<Ct2G`PCBvD%=*f7@UP(WGemsyBH*%AeQ*GzDKo`$%So~*`scbPhi;}c;gqsJ%}
zsLWZU3XI-CQO=Q8c^2JAJMUTWe)N~Z4QW02^_6hZB<~1V1qre?fKqgr@UO@N<y;5N
z(xK*XSt^FnA^J!r)4FsuYLe(&3l)kj^*%xpsjB$|{ayw|C~NroZ-PQHXQnsVpZdIN
z(r*p%_w-PmyjnCWz~YV>=<I_CvOz@tiMP)w3E+RN?h=NSdTwgwp&yBJdYgL(XWK9B
zg}-9TxgGgT4appyh~i;H*#7gwC_%1PiG?}VfTE1+r37DyZM$gHKj-pQmS>oVg2p<i
z67&$cbtFKi;*?(FazN1yogvbYEa-?<RE&g3vgec!{50Z46$@mgDK2?BV|NWrB}9>*
z3qu2jc-f+s-*j1j9&-a`WS#Ag7fZ=VXfB|(+KmkshahopiprW^(`9?~mR1>a^Sf)^
zH-7-l8Lg{>DE4abMn+qezBUvY@FeYw0~nr}5I&|-EBWyw&N^vuOyw(H_5w9VpX*ID
zP>1UsY*>~Q5B_zel~a<4UO2m+83Tldk-Ob5AYAYhK6*!c6f`D7tauBdZAGQ|u+|!n
z9`#N%6#Ma94h#L9{NR%?5=w0t-J67dNgoYk!xA)&YElL}tEYLD7dr>#?>>z;QfBW`
zs*vN|_niFxU)jn*4imqyX}j1`#%XK{m#Wzu8J=%6>P0dmJZ(ir5MjM=UF_z<6XUtD
z)or`1eAD#nLc0yxsVay(rB>vOWW~wG+DR&6xpef&V*EW4Y^zKw&tBo2%-X@s%!X59
zd*B1#i2LuBjc&1+5`(Y;344z*xk|#~xl132-`K-i$B^%)norjE_xq=gCPI@JI{)mS
z_q$;iW%36B>#zbB8nA~ndy<3pHbGmiJ~4l&voUD*?`_3KS2>5EhtecD4J(d95<2{)
zv21*v=$Xc3!Yfgh?^4{q6y_TL%7$%>*8xX>e_ly>`d?b1E-BqZ?~(aU0{e&uZS#<K
z9oZi<5ppu~;Iex1*P{uBD~`Z3nEED37)QgLnv}mQe-1lms=yOBi@rg#Y5MI_&s7?C
zTcj1r&SMoM`@;g{JHUM)@b?QnL%@mC<fM&tyR|5#ur%@F;Vr?pGqoeoypbVws<C@g
z@f6yPm%b$Uuy>q6ku0xRCs?TevNG{MYj-q|HZBr{cISnwMOU@Ty{rDXT;Kv+wyg|W
zFSiv49a8<Uu!e~llM(H4TL3tX;8>YB4GHwD`S-UWfh)ET9TOg=7oM^I*%Snl42*6U
z+8RA5O`%6P#yS1UhU-p-(v-v=-p>5A;3rGeF&eLSGS;GUVT%mROH%<{^7A-fy{qF3
zMi}7SVqe1dg@M6jsZjUss0)ZLy579W<uXckX)wF_+OP`5#7NwoEJF0pc~84F-L}vO
zE?Ewe?e7)*c7BTO_e0!=R9p`t-sWRUVmL|%X<A+f*vh~e%M$J=l`dY!IVvq3od`71
zI};4kDD33_eYV>HAFJz$60SET1i&V5R?A1BQ`vGy^Xpt>B=Oi0JSGjah?jjDIw-^U
zj7BFJqQz9yxZ_YFL{m0`XV)9ri53$tB<zPbfDdqFKK~7;pjns^DTB9ue+}Gya8J1M
zCjwtmLmH!<8+)*L6HoGoLS9H+SEoE<ggnjIamTRQK-;bEQ};9<`bJ`&dp_+5Oh5|a
z@z(He6r&@2|0JTAn>f==6~$0lY@f#QmE<rgV<8ZnGm6D>{u@M=4YH0~qAgsY_sw!A
zt)BvDpg}W%7sh66%DMuQsdoP0K0XZS$kfI=JoFm=irf_OxkuQaj-|fi6C3VlunPRI
zJ%H7vf_!Eua5`ByKKFxYpSU&xuis-07ueDxUdu0y7@wF~+BoIM%Luw*)?V=KMOT*Z
z$*`4-7|TVnUom0rSgIH%qjgbOAfIgej6ZSiqA=%s>2rdw{i9B=k|+gDo3%eP?uO(q
z9vWhkC8@+u<15#@J{#g<yx=y}xipDg<Dv<BT~W4n0Tv;O-e{kMC4Yjq5MHl0zW1w*
z0)=P|4f8d7TehO9>j!(R8IBZuqX-r4!f8%8+p=Kai}wyW;$h!$KH8zEjDG3`k|S(A
z@m6Ai^7v`Oj=4dmj$SJRE<8-PAX!-ZSo>Bq`*WysUUq6<H#M!_Ug-3YcN2;gn__R;
z`WAJXzE$iTS{-)Bd%Pxsd|DA6MTduV5d~YJt<WEZO^bd+5yGj=#Y}0Zbb5)n9lCr*
zp0N?Nm|R|eCQFqm>Dwxzntgh&#9W{vqoXvb{`%VS%WMs+AypQUKtQ7_{2@A^ytA-f
zyyW%DPA2cs7N44ec1N>IZ84YpR%|Cht4rJbXlA!XA|{FUt}G+x@_Q*lg#Zf_&N@C~
zUFR~M%8ucW>kwGZH!Wk+cea&+==O%wrV!ntFa5GQ#bY{z6Z(Mn$s1!FwXN^%owY!W
zA_;N6pvAuZblzr>-4|j7SVd69>7mCMCK97{qOj9>$;a~}BT_o&KMqZ#c~7ufE0&BZ
z@yaB;KHvJ*dhxlZk|GUr7WB(N0G0LaoyqdLH`CkCY~v()Z46U6gb=E{?vd70UXw33
zgv*Qn+U?CD_;kB8Q=jQTDsnGF+JI7HjpR!h76&e5?TI;6I>^&>Gf?5AS@|Z?t8U!Z
zc2kc|Z0P{;{&~%l8Z!XWsYY=D;Vz9!kK>|hR|nu$Q-8lbDyWvkH3w`jcSx`0iI%OH
zu>_BL<HV-_G&{lVs+Ll$4*zX_{w>b!kZhB}I1_Z|?KhE9$DMB5sh1ry)nkBki0m@Z
zi4c7T!7C3|p9}wK##PAvg(e&`6-=|2x(1_%xAnz#Zg<+jBuCiBxA*;i^RqGKSKldc
zI%rOdNSuriATV9UFIO9o5m7y0-U(?jP@|)E=b`K#`(nf&6puU$md$*xPkJt@llSJ1
zOWh+kKn&$z!5Vw>;SsHKhrjY+xSqb^;?5v)?MQa$znKQh;A9bx;c7yqly)4~5%TFy
z)Y!H*0C<=}=xh&JXlHYw)>@+T5Gz9VWy_G`d1}Y8UvThI{{fr$wD_4^v_9~8u&bCk
zubHQJ9ZVemrCrtf>WInGcF^P4H7|VaT3@yrReO1-@ItuXos2pEGmTu>Xr&+8foR@+
zB^#}l@OFm3!t10!NYm~wI2)vW-Ff#6Wb;9<gd&vN_E&*N7Z!>{;I2C~)vzMGM8t>#
z7i|fjII(z`Fy7>BBU5x3nnj}grn;SnEGY1^9sWU<>54#VwIhG1U2@*=cZ!7MOF_|W
zg4nN?luHF`0l1H-ogf=M9CO|v131PH1`6b3p@NLuv8eWll#W5==mCV<%J02tPm<~N
zOd64Wt?|WVU;^y@n}BJ$uwU@}+ig<!Y*yrLgD~7o)<EN4uJ-`mX*kuIgS&Vj*1%aF
z3%4zK4MtSx<&H3QO?dZ4-Q|`XybKlD2G1#QO0QLT`0fRVQ_`IO#@0XK(;sU-u9P`F
z{IRVBpZ)sg{;d4se%YEV?mp~WxBy4+1Ebfqk>7;}Dh1Ans=kWN+|An%?u&gOoi@Dm
z$M3hy6_Gs2flA?*y54rxG0BTNBR5_|3z%HK({R*$Nf?2S_fO^Eugagi<Pu|pd8><>
zA1K~VJD}F{O*RHzexb+Psn=S~VLl$XwqjtLIZU>Wj?v>*eWl)GTDU){)Cxa`rU5_`
z(HGh-{xVMh3XiAfcV}(9T<=q3c5+CQ7~TK_eXfSVnD21qw>}Tpsqdm|;)pM%fgopE
zI6Pz0RAQWQ)JB(m<tq@RBbENJsY4p?3&zj-Ql#HZ*OtyBiLBkXSf>KZ<Icz&t*(W5
z2BR1z`Pnk4-C&C;NKu3?*Pp?X7*bV;p1dBDb=7|Zn5=kIaVxQ7DSEl@n?E<!H~n`}
zf7{&PG0M(R#5z^YE?q%hzQ)D*qvazB`{A!2#M{HXv`0<ku65I0CC(<IVC!RE#4Aoy
z+t~`Czt((M!<OEIrdRy*>XV~^mXpz=zi;gJwnrQdANwn4#(bSfYuNWcI@)b%3J$Gt
z{-GOstk;KNeH(>e4}*oXpP>*{hn}Jw^qxK4pSRzmNmwo)*;fU<GFOLQ*0z_LR(%!M
z#mxqf?L)?h<?s6LMs?i0mn8KPEFgN$+UH+zapjG=+3Urm>Gl;K+&=d`tPq36`9uXX
z(cLi%T<Zp~T7ROu+Q&Oj>LxB1Kl$$wuop}1PD*530&pkoW=wL%N}n)m`q7=(j)bWe
z28cWc!xr(@{87`<!g%e9&Vk;jp+7OMVAux3v#)gh6x|mnI~HExaI82g9V~&yD#0tz
zTUhVXEfuOMXOzddH?8xx1{nErfk`ca{5iJxRKfeAc=DT(Wtr(2)A(yqA+r1kd_(Qn
zs;_ZI&RRx&Dt5=^WNS+*nvTvdUhbml&hmc2w@RHD>d1m0`b<udfsxVnl2<r9-}E%|
zkeSg4qQC0@7O#nhT}FyEj6BvkpDQlOkDLzf`TXih-6S7d!;w$m&+>d8h;S3kd%~cd
zn)bl%l{2NNNRJf=huhW?O|NkQgnD}@x|`*?Gia?F?yGbO&|;8xTnUB@&{f_VMZpsa
zRkPUiHN-M%0|NrSqzG$1^Y8#1hu$B-a@FSMm&+x%;Lnlq-N_#Z8|Hl?Iz;<o#Rt;M
zhP#ji<|_~gW%Vu7Svr$)>ANSJAIpAWK*3&5ECy%u4GL+6YeRh|pWF~@8{b}DTH>}n
zJU%*DdK!bl5aSQ}NpP;np@1s$wcR5DMEI`{wFAs;F5?fP@(g<!G(?pFTDs8}*e;*L
zu()}mfqtO7f=cr<rmz1YgE%QRz@>?W;+WSh?%5%}Km#D61=O*7Vi!U1sKm((#gZip
zYBDi>l|vs06vu{f`MY<yZIL2MFudNIg8w{8attKdUIEdV**_E0)`PxB$DiFY$d0SL
z0{8+k`LueZ8r}n3;vJAtcWwWm+N@#jU9}Ct5{w8&dPZ0c*?&3@8I$V{k1Q&}gsl_x
z=`0Inw~Xq-(YDZBB(B<6dsuci9L0%+Vc`@!4EJ@(>qERc-UG&H_Au`T;%DHyIQrA@
zJSG_pu|jwtQ%b7-HD`h$R&%~&;;)Y#YX2|Acd{eoK5l9F;~lafzO<%qzW0f?D5{Id
zZN`89<NIAsfx{U}U9hz1xk@SSxJ^ObqN36a<7>%Zn7jV3>G<=694;ugIvS1qT1K+u
z4;>-z;~ET%G~;r*TAvXLssQag`Rh*^MW;_=f8rvdZ1bMhjPVJ)uN$RhtUMIMr^a0Q
zWVBpeo+Kp`5uAs%C!R&nlMo0JH{_8b11B;XsRoWh@(K!}$D7=?LBaots;l;hwGBRr
zMRvjBdN=zs9z5WpZ9f|!_je~ukrvV-I1VTwdz+g;M(jZH#*etxrRlTd`fR-Xy;0c%
z?DTAGx{QL<M?9TeX|WrQDp044Wk0l|UDz(}rS)PmGU()QKe+R7ck0~J5#dVSm@&y-
zwh#SKRAg2rOWi)%KG_T<eRn)Mlw^^#`n}Y#QE|O@C={4^a>VcB?3{GC$4lyzFGsPz
zfc$V#XFh|@XH?VM+TSkrb+fqY#7J~59_C!tEnJviV!z-O?9Cl%ohmZ<AGN#tWGmrh
z@9)eE(6@ZoCc8ZzaRSCkqtwX9viP_=_IDco^}B^unS7(ex;0N6rk!kB9v7bMOC*2g
z&_CWkipp_8`j|@)qD)P(5Z|V!1a^p!nG0KYgM@^+oS$>9D9G-z2)cxQYIaSZ2pMAw
zenY5y%F(SkE^Q6h61gzV7a7ft?Qm~?(-h+Qy1=T%{6=;Q5=(Eh+)-q?n<+U-lOoZC
zulIq+lpT2{XVgs0PITTWJx)ysuL*N^xS{m<(*Ds7l4WAMvqa;9z(L>m-r<5x)#NQ4
zyA9AkqoUI0pEQ{c6jZUTO={cI>|t^Z+R1nUoA+w`SDbL%F6pQJP9}UWt2wbUkBu4y
zqOD2Hg{rB4cN+?+OocN~LtYj9_g!Dpzvw!XIQG8NMfX%;=C=0_%ne3t65&<imS+%z
zBwX}%z-ne+6^2DKsS{3V6fFU$&clyI%dS6AEZL(g+xAo56Ud%rLv`za{oP|rUb!hG
zhvbdNjb`Q0F4~yzJwd{ZpZ<F%0`uusAxD2$N%n)4G_G){l6%Wksj+jZQM$ce($%lg
zp9W3tLOa9VF&Uc9<4PK(fa-&XmkfiIl~dJ%14K(!{9p_7KTO|Ec#iyt;22cub=Sxf
zzb7*M)k~6|wCV$vA>cr}do-UmVD~mbJ@2dI7icGbyCY<;aDKxpUFRz4zhEQ2mGmg!
z5gGw7n8+C-M>JfB(K)}-GZDLXwB)tF)hCexe&eD};N341bJOVczW|bUHxv;@9K!Xe
zy`1oQIO6?^>XG8dR8;>YJ@S+ito3LH6zdfiodRQ8L)rLhf;Id+w%WUwC=<>q$i}V$
zyX4%&1gBK-U&!&~J`+gItb*Ex3*e#tYXTTcX9OX!O6L-v8oP+e_0qTk_$B8|6rJtH
zgJS4AxO2W3B6dtl!t>*gi6e9K+pu=Dj#wuIQdr8%2ttu85rcAP+0REj%^>-+C0>YF
zJfKt5fX=6E2&{)TaaNL8Z6IW80(&^sCEqjE3Rsfik!{C6w%g^R#cI2r<P9_}B3Y7!
zpTmId9lQ86aew9Chte7%3cD_&Ln#?n%M>$)36hrf_O+vxuICe&q4_D>!Je{N&^A4e
z%u_lrh=chQTSCGBFcg18sR7G91Y-5(k%hRco23$C=<yPjV(mashCX}7%6lGaJwaAm
z5g?##J<gjt*Nz9jEd-o@z!wXi`jDf0K3=aE?lZ}kp#D)xWZmXY;O;_~nI~pj$7#(x
zrL*%f3$u*t?%ks$bvbYykctCVPzl#E8He%yg*07XWrx%Gr-ybcrO9+*R9a_$zw?ce
z&80Cp<yQk$T3bz?S+r?Gqb~RS+qU2Q-AZ3TB-QYaYnN*|!-TaDT+#e&X4m2r-%>+q
z8J2G3;v=8Jnone^e?V}c0mpymv6q@Yn^=Xf)JH<OOi^APuAxuU$+8isGxQ2Px}fGu
zJ#WLhWhtupW5&jN@ff|VGy?FZR==W+0j3uJv*x>hGWO@<=q4v`cIkK(CRdP&p^BQk
z0Zi%j{lik4l;X}S;qG-4E@Wrtr*FA9J}Jc@W73B;uLXPfYra}5%H!u@QBnk&JtLS*
zNQ-L<A#bz`op>1Xc2{bW=;^th735XOj`#y%$hRR_Z=w5WCiM8^XtcYK+8Ko68F>HQ
zgPysQI8*0lfg;5|!@MOmpB&r581*$*6j5)@n+_K2eE-c9W>5Wo6#8%dxt<~4?)cFV
zme*mduW!>G9TyXHvi{pU^Nws^;pl^MON+n3Jh~#u0)*v)wO_Ta37nsEdkE@geMS1J
zrpHky>vhQ+bAG?eEwZjwZTPnA`+0jth=v4!@%V0V!Ox?p&=Y6po|fYc_2^mg1W1kt
ztXQcmRf3#8=GBEBQRB`lWw4W(#r#^`X_z268qI-^aeMYzS<O=YPSoAw>(4_1WKJu!
zc0euwFX|^bL>v<+B^S}2-43Yx^(R79C;6x8l;Oqj3h{f?iIBR$fDj)1;=%FG<~7_b
zj-`GDb_Mdhz9^$8Yv3k5zfx?E*KZ$fqs+)=dHw{md!+yB^m>#_18oV5ozI&1-+%tE
zo?o|%Q<31F^`6x)j9pQ{iik4HyjH5+>fibgxz$bC+1Sf}k`K#WV=5h)AL)$RT}u{b
zgh8fXbGS;j!>LY~Yf0!hIxn*0ZLbs$Qr>XLZsFZy=Lfzi_pi*)R760U0%$h}&N{aV
zj6AEESl!V~?Wb1Jnhr*`FXsF<ZI~!ch$3@0Sd>>MS@%>|nq9(IdrvZi3H-pRTEnk0
zcQIaghT&Ho?ebt0C`fU3d65XGblS8<toxzGEmJeL)6Wy<=%=65O^tX<q}))?+p}TE
zKOgxe>Xaf$6oNL@!LfNM@clF@S9s4ZXo;NN%w3fQN3a1FI&CrK#9D2aluB~7cC0{1
z@PVwvGswF{L7gf+8GU`1JL3i;qIPsDW)DI5O2diLrik2bRgRVq!)oT&Firm!r*EZO
zWT&UIrv5k;-eXc6cl;|C3*EbQ;|l|oxU(OW${wDF)K~G&^dw;+&@LczFaxM6ue-vv
zUNF|pLLd&+TO@Z9i5FWMH^k%TR+^vkuH3hfEHh3A8L*7H#g8V*bTQH|Hm)`N<=w?X
zO&v--5mcyk=?zG$<NWjMF+gM!1DzBrOjKU*X+*JCl6QhzEZlk}6DXfo_HQXrN*Z6l
zAfB#hh%?{1G0SO0N&y(+x_A)6kcCGtcKDIu!@|#@yGKYC20L(y71k8MJABBjgFvE~
z!(mb`cGcjsHTCfJe-8UrDHW5~aBwic*$MCCrYd>F_c1A{h|2J@f+2QndwT@AHNHwV
zy~o@JKWp8@WIi0<8*WuS1zejG-skDP{5`h&58RTFKi62FEmn$}X%)ig8O(t`l==k0
zO=+0Ok=cgNWQ_l#QKJqD!9S9}^4OH*5UVPCgIh?G2-5;pGZawseJkd*es>4I@|*V{
zj2k?v*Grl@$9$2BxpzTk>)>erFKyz8VgTy~cN-A%JNP#<UQ`x(xS<|7B-;0GcxAo{
zmZUD0`we|Su{;K*6ZjSVqHTLm%xze!6s}TofOa^tGFmmJh4cQo%529)Q=K|z%1iGk
zcJ_OtR?;`w(9qC2GkIIai4T6X1s_zlBD8TR*uXzeCf|It2_rGyhD|8B4MVfEz2~*G
zc*JCl%uFa5V$20Spb(}P_mf{Tw3d&qP&OP1V=k$;MMwuo7H$WdMNvfZkZ9cOpI}Wf
zmtXr!$8^?juDk@slt?5F#u(Mnyb@Ks^5Xt%cQ*JR|9LODHW|u*cvSWPmZQYVv3ISN
zF;^n{FL_-4tY7fY%O&!+T?$i_;fjYVQ%C)@BXdAWgTg5_^Z4?esHu~UsI(EEJK3F@
zI_`;GKg}Md<kh>Tt<>D9JbyGh8-U5U;ac8gVEV1gZl@8m)ig~sXvcH(_6jGQealnM
zG!n@2ew!apSyBzd|MAf-ElJTw%>JCqg-m*{GDFTIyZj*n9Lj-3IuJjI^*Ael<`0yN
zI3>B0Y<PT?%lBO|nI^Y2N=M$cqcgRrgBR93crUCS&UghJK9k5_Kn%pt+cpOHqq=@S
zvb6J`^bH<1B%o|pkKG|%>}t24slo!Uzu*?N$=mRf%yj46(q74=DLV7v>8@{r+n=3`
zAl}499AZ9|Z@STa_t@j8a`_kQ5hFogHC@db%3cjMwGF$a$>DV*@cw4s$4|?4HAa==
zCDKM$U;og@;XJ%CygPDY5oi{GE~uPa-n5a;)E3DnSN<x`sNE!y?5m`-j3snIV0$Zj
ztp&riC#o$8X<O~j7_$+XTVLu+8~(%-S74_=`u<&LAOfZ|z>p}+O?@@8zp(tQCg6B)
z_^PuDToSk8g}KoFv@R-#D`impUPM?z%p<3XYQFVWs~<c@qct^UmxXPuEmF!Yr&8=b
zO<ndGUq!NLrrMcuioB(B9Mmzr)TM{ImWNb<$U47H#1*sBUt$D5>O?XSFpLU^gPMCb
zOUkZO9;FtN)608rGBW2w3kD;$pIpKB&}L^hB_*D@#3i=#6y{vP`$OK}>6L}Bhp{S7
zgZsckrjlC@cv<YA|8$tK+CvA1X|a6xtu>6uPZ@_e8ow15)<p)DlDiO-;*xDLoXUF|
zX8Wg6Zs3MP9?F5`f(_v`$fe5I67;p)&v!y{hnE~neMk&zrMCgG4WowEVt~#JvK<YA
z{xlWij23e0%PO)A8>V8==|*V{m3_VOx1rg?HL!XKlDKV4TrTDR@TW5wskzMrK~kT>
z0lOmT-~}ukrYWXdxE$1D>O4@$1!)B@ntTG?S`FEu-?K+1|B%;1WwH3%Dcc~OO0*lD
zBCK8bWjk_~D1;Y)m<fflTA|rW8DKWiNj51WC`PNMu`hb?7i~Tbr7N=9$wQ93>lIw>
zJS)$E1wv~UZHx<C+i#ZcB0hN6#3A!djgoZhcxFN_&pS(<fO1}?4fh7Li}K+==yo5u
zo@&Z`O=(_*(8Fx)%s)yYwtPcrYjfjn#uB@QMf*X!|Cat*H$6-c$y;kaarZu|hbKUl
z_0O4)(?#JES4QmX#zlve7Y}j*{T%y(9^V6#A1I-2a5ZtOojeGu3LCEH2PI>0<&%lf
zgU~8ggjQME%^E$9w!OzsOZ|fa7MFQs(=O&meg3;|-*5mbpNsZ6P6xpQN=t5&;Jce!
zl9}-*Gc9}IubfJwfCRSewD@!i3cIr7hyEiV0@Gs(+qF{2C~*2+cUvipJ!PKRorfoJ
z;~V#IopPSAa?itq#!0+2@}_6IW>&l`!mrh>yu4{xdFql&)?a=)6+I4Sl)IX`9lJ_S
z&BDfU8$-pL0luwPo`hHM_ny-g<1smRGvi^cjiRq5^`GQvk9rfL$}J}D0M)FNt^4Rn
z&6$X3rspUhN$z0UsyZat8@;34XF5wLTfTsZK7aV{-{-Ma`Sz?&OP!rlCkLTikwM#A
zKS!_rIT|h;fQNNHeU<Qk#1NhwJjK?bh?+?ORirJ4|0wn?Ah6jf+SfaKFm=2?+Op@u
zqi(j<<CJHZq*Lud{x#t$7oByhouO+{b+mS6`-~^Qohsb!>xeU}ZUDM=^%YZz!Mhbk
zw<qN_yVJ8a3fJyAm(mnP8^-vny}O4$Dj3oFUR0$En8f!Y=ZJtp>B28}<Mn!!avc8y
zqU2f_)8gM_>!z_wOtl%MDVq<bLNYGyQ3tZu*4CP~J$QG6FRRY@N$Mm?_0wvXx+oqz
zDp*Z*$bEgezVJcEhaCR!PMu3yg8?-ki#h;Wr*QZyrK4kW-oIheN=1xKG;d-sD?Tb0
zesTW8W!X}1UNy|L5*z|$y22#Zl<H!@08yuC!Z#@&*AAN3%<%4<)TFP%FYPI)=kF%h
zN@Bu0h2wjrrRxZ%u;~p0*{t2Av;e7VB2}7M1t7$_clU&Pvbnx)a}FOS$=~TAUftXh
z3fK-GMTVC~=3VsSMeT5XN?T-^9K$Ie^I;HUq5@6#u7!mKo3a~$v+^q5aXzN8q`1K8
z(phnp!D~(_03fOMa_AS26>*0js4y3W`0tm3;qNhieqQvtnm!AJ-nMDc%;gcROY^*?
zxp{$U4Dvhfo){C#B#&+#BsddcHZ5a<hquq3W6vcTkVGG=IXh&^Q7bab$AM)8g)1D^
z<WMCy<g67JeAh4VNU}SX^QVzEY$n6jY8r6rT@{%Dc0ZE&3OEYLZ~f4xmiqFeAp&8G
ze>5s4GncdnWtiv$^jhf{&7C3ZFvUeW#X)~x`&wLnX%*C(B7ap_^8)T;m9f(cfpEwV
zZ=rmSXJIhn1~=RW0zlWE;9SrJr$@64{;#4lk7xRi<M`JR6ERYagozwEwsOppn@o-@
z{E$e?mHR$&#T;c+a{bH^LOF96LS%A{%n@>@VJ*i(?%((4Zyr26zI%T^pZELqem+B-
zD_E${#JB+O88`6oHq%^HW=y#5EeyDQS5;ZA@GOso={aP+t{xk<zFef1XQkENC$?dC
zjWZg7wQ(y-8-`u<AZ&3n8AxxDnILU(N&;W?LAaHE>Md@#wl_2;NFKth`-GnvE~LjR
zp^M2+3mo$yr%2%;fHO!by11NFPeKLhXIRl7U69?|^<JxjfOqx#@4rp&+CQR#L<Dzb
z$;|K5(?K3F$T36fMK&LO*-20-z#7FajL`oW(0urqAxbD%d0hvF?~qYygc>y!?uKv%
z!*t*+T2O-d<W!Rbo<Bq%GVR`&>SKa=k>ScOvDBY3=Uv_*1qz9}d;A3@-Xd$zzv0ow
zcg9O}R5CM`ey_}LY|$6M<oM;ZQX;pG>|Vsyfmp!t>Z-50{J!1q?dBy)hRhH8kbT>S
zR+xl!s-Us#g94gzSv<Am#@d&zfi0gl4R@6x9gZKqjb2S)HnD4BX~4}ZsW#C3ER(av
z-6~7k&v!~Xl2er(Qbr-UPE5%U%Iqw6^1$Zp-#>41hy(JM7$IiK@;@nvlY6a5OGbh<
z1h&yUy4ZJ^(V8~n05O1(#>YANu+mD=pRH;&JRuSkMM<bXHf;A#NaKVL3DngMZj7Sl
z4F3#1dq(cua7X4CwT3W$*UcVo3?8qlr50cfy~g{VgW6VEu0j)4_(!I&N8g0F<G;H2
z2Lzm9`!2J!x*3{!W@<WF&y0VF2=c7UIHf<A_m_jlFRdVlR3}-Pr&#Owi*%-6^}4B|
z+<mTS!H<!92OlQr(l=cH)yW#^PJb?1HyNyyIWzxCfS;!$@x=am84pJ2uB~iwEhgO=
zlJ)z2M$WrlXh*HTFY5pFG(|jgXYUdO)NjAUGUMb}H4fKxu<2f2_Il>3?qv=xsp(NC
zTn%!%bMJn)m9Tb~3kwMiwee)5HJOYUAGb^gPQNOW4YJ^`Q9B7MmKlU}&0N#2uD3Ke
z^=N;cQET1-tJM%X-sRB<i?CvP2H(W=&$66?f=%*CiMx_goLQAP`1z!#;sc1!56H-E
zeQEt3#^%z|k??8X0KcNT753`PzE01!*<Wp8yRr|u1#(S>)fU6qUt?o-kIx0z4ZIMy
z4h^_q^IDgg!{TAF4FPd}>e$QeX6;>H)gQikL8ckNEr8EJ0Nqi~fM<1vC1r%2<Z0Wv
zM!l_H`qkzoRbb;I-Sf^!+E}kDl=W!u;C_m;(d9u#DR>Ip-u$__{gaJZQ>xBJ<uj*R
zn$vBe8jP_0pmS+?9iLV%KI8{-b8m~`W6y2q^Tzv&C2eVoc<0%>=E0rgJy3!Y_T7au
zGtYq2KB&9hZShOuMK4f)6S;qha3$(3vcaRQ_-*W*p+FQuf=fpmYKavjAV?4j3eqWR
zZip9Nc3qRrC0mlBIyd~wX($kZ?h0PMqB`gQHhH97N#KjBP;3yHOk`|LO{wtg%qGX5
z1`<TMF7Rv1NQ>%HR<@RwEMW1`JludsTK(h>j`c--x`WNR(Rpve!u#`~3V+TyUp9Ih
zzzt`9QI8$dslp32wVrPh$JEH)9R9-Wk-D1YD|7&In9P?vBh@S<cZsK2ZW&!*);9oC
z3g4JmnBXuQ_Hw03muQZTTLRax`eSH72oBS9t2jY6z9fJI#}E2xj{JHgm^2>gWb`M=
zxMvbZwf(_PxKaf2uvl^0t|h4{CvgOX;9F^fSM?+tcKADz4K8wOM>Ihegs#)Re@S;O
zO5+3PGH`tB4J2t%+NB~5%RGm;)N3=4hBeeTg9TG2KD!jwvq1$aiukP4jU;ayNkzSJ
z6)Zee^qF9yL)Gu-|74aOi_nIAa(E8aeunIodZ%fapLz+eXiSU3;ddXOi-p5k;h$rF
z#o3*|&;{4wcW}w^Aw9(9)RI*z>PeqfFQ2I@#kwKw`>D06pWK(@D_07ou8_cSp$?Fi
zLZ}8K&`A9$<l5yT@j79@H)Db{0uoe_5LQByf%eQX<44~wyiPs)3M4u?%(YL5pZs4r
zmSbygcU8Ofdc^+b)=hB!&#|mI$L?t1>Q>|K`{)tT)Hz<qETYx#%i-U}Xa9`<Q#(;-
zloAPsIYDlVJhu)PvT&xEN8|u;rGqO#PQq(%Z=QDUtq}LZmz}U^Cu?vrvM;(W+;*_}
z*IR^iQzN(G?#Ee=zD1u8O-74~kdOD<_W!I<8r~G0@X6IPJKJ5owws!=rV-K7WP7bF
z@wSiJ1@LD}@q2qKw-t%DwhB4QOWVKKn~t}G?MJ)XV?}gVnuEh59z&U-b;gvP`WSL%
z>TW-#H6&thiJDhOYRy0j0=FC~)+xrYj4!AX2O&yW3YL(UD;%E>N~xp+8mypCi?C3%
z8yEHPElOM0iSf4k4f0Czq2fsq5tW#cyBfjYrfuzW6r3DKTh-dU6FX2#y{u+psgv%?
zW3UIB?P1aH#sncIP}CVoqg(uJ`6Avx>Mh67nd@QKgcEBS;6lRrpS!u|MD^RIXY)*^
zX2aj-yf&+z_7uq@AjjrQN=gb=A5&UXW0~x7<di-W*vghxH#hG$WeS3I$TbU`(!JR6
z;3suegIl8N_r4A^U!E<YB_@8^X_%U(d0zY1NC3{ya?hQT`dsr_C6y7(XxYs;-WaEw
zw1(`I$lly<sMS$fuUna<4$!KOw;HJ#hxa^$sOp?$HL`tdOzo%I?5fD=<5XIJU>q>n
z;C6$o0`bKkj8FeA$vdu<P4<{=bALAcs^@=Vn2&BfpW#i~FL&j(Cc@z0KN!BhObh`g
zVBZcJJdaQWslXWMr9onbedOVvzXu0D=~G?4A;;eKX~yG!nmhApw8*23<JMWP!l4Z8
z>9est{5YAkRdx=JF_M`AB>Elgku*vq*hej``!s$CJr$JWr;NesS3|$=EKjNYGc@lx
z=fE#J-iJBM-hN^(eTVMUD96)W2+L04-&Mr0xU;{i2|L@h(g>D1a8yL!a4nSM>KH1N
zC6e=5@UObmObersvca=FHT0#xws<nz<F#xGNjs`VRB=+>BbT6*DK<Cf-thiCiak~E
z884UW+Uv1?9W#Ej)&2f5V(HsEtY#@rt!r8UvGrP;RZd=2w75`kn|d3e5Z2yZAO#`0
zW}%%Q`$6t63Y{xh+ZQ)h)CHnNA!^TfwUd;9_GzD;^>qM9D10Y^j)Vvl$#T^x1KO&8
z3@_uM+6Gd*dOl$8_*xG$yU>tftOG=J=E8P>3csU7y_t<ebolH}c-wilQ+O;Njv0<1
zd4DRx8eZ03vC_U@f3;0-iT;f<0r44D;(Bn@`A95n#O7i`aqY_eRu|CoU~1#`fH1~}
ziXY0%4Pp^_odU)_)2O=uP<V=zc`KS1e2CpZk*Ky|b-jt<*DXMME~*nEaIc~or>SFI
z_F9ny#KRcU@Yc0hhkH2}*eds#nAp4dZW7?EFfr*OBzpaNbag7$(1XCuZ124iVsNL2
zU3LAF4Lk5Tn017twCME$ihoC~7X3c~<KhNl7_AV7;9sHb4T-v+d#j-QGPoX(m5zn}
z1j8o96b#=DZWJ|p%*jYu8z`?IvmwXUhF06QM(%hD?bI7ze9f=@S0{?skm4Q?@f*o=
z8vGVRi-4(s?~E2A?L7XKlBJC5pPT?oNQhcmiH>TXabf%H8V|U9?2Q(1>qMZ43YN|c
z***n@vD%)`JBQQTTjpo>7ll!rj~QX<g9}bBU`HgUcaq6lrgw4PD*a6U-+|^JKn&2F
z+GsdW9#lA*SBSJ87)CvSX_wWl?NeM%@T#SjtiF%8Dtak<@{nPTCN9aEl>cBT4pRQ@
zdg8->8sg=Z&X~CWVLRF-a`GP!52l2I_4^G8!*#R}vk{xOeOu@!i^b)3%7glD`g6po
zDK8$=t@d+fe}8UQ{MtYMyEmRe%@8eCX&mRir;)IL5^hSrB_r<DT)cLNFgMzz`sw|7
zR^107fzzjKyj@<b5oYL@t+q2opWP#f!tp$OC}=m*-}8goORVY~cXteSSRk!yera^?
zet0s+2|%kPw>2E8{Skb7gdJ*UYS_yvfS+n1b4RmL`byo@*$N+*2xU`>k(71%qmlt}
zV`77PnuN%EflzgX^!sZ0CkyS)kWUSzLHi$-Nvbyy$>BV_!wp-3&DZ!J8GT*MFjeog
z{Bjsr6$dw#{66I0)K3u_;Ct3#nb1IYq9E3<-y8V7RWI^>NbodXdh4a45yjFaY3^pV
zSLEAqnuX@a<{Fu@&Fz8au1)vG3B?xp(EVtzd!+?o;!pi5YoB9s>fDg|famc^rik#6
ziC=bQHNX3~p}#^R=6JcLyC!}OY@N{;HH_I2nVGS(b8-k)6JA?uvvylh3}Q<SKiu5g
z%}}rptTJU-=l--#SxP-IuHrR2GZQQvXL$<iX}>epgb<pDoK>X+Trc8h#v}7{MRdPe
z5luFq%^ttg$TeBQoPSxoLh%b%yU_KjK#>V%<doP~rZToV*cK5H@nKO=v>)05)r6nu
zK=fe^4N}B6eE%g?kCw{FxGG7wU)ho;|5LW4{bYU@oIg!fD#&{&R8hq*GI3uAbSdY<
z-JAE!pHz0WBi)pQ1GJU7IvX3u%eK1u7)Mo+hda|W<e!4tT1J@iO-itGPB-R0fw$nF
zmUAgL9pCeNNcPr51Df~`bNh#O7^NJ!|C#a;&>W@EdbBAe)hy+6bxO)j9~mMQ3<#SD
zpDIKJaNmh9D=p8u^z^CJQ?jM==N-$kS{jiielvAAn*pL_C@Vud7v7EGWFgE{5K5mp
z%la4!hcM}WVW^Q~Q<-r!Qy$>9B6@v(brp@d%0A~?y&eN-{wPneIq!@VTU2wGtzL+}
z#tkdX#A<UCm6w(_d2;LZ6!B|=NNSYuq4)1bnHwtGXLJ$}B$A-VsZaQO3*{1cG;5U5
z8IvJcVVy*jDER2-4mhzg!`Skx<II#-1Eh-xAvSL04;CkZc<7}}!bJlJQ}h>AcHWa>
zP8H|1n1Jqw7=X71kShF;ip5lnCK$7TC8*HS%uNU4M1c1~@dT_P2Wu23H=$bpJT6Qd
zQgjx;@pi#b?BILB;B66<w2T0+knvFSfa$=PuTBsCMe5^kX(IqlL&pjM78#sl+}eI_
zcexT4<1FXh4x?qKVfWNo3hV3*rPJoVqhHl+!SpT$1Nq7%#<%x(88PjG&KESHaMl#@
zO;}HD|4vJ3DISV_=$r$6>h}aWY{UL|5T*kS?XMk!z?&+!!ZnPgqrW)ZfP8{sKx$zX
zsN<UqWbuy^b&-9QTC5cu@Zo4WJ}7bGvvY2(3uDt4^{cN`Xzx;A%8EO4K8_&&lcAVr
z3X-B<J>Lvgldh;&f<fUw+T)8oP4rar6bhtxI)RIyr2<IITrhaBxu^}5{p*BG0Q3rB
zvgwx2RiY80{<G?r%wobkaC&Hz744E1f>xi_09SBZ!Opv4UXAGFh2#K`9XRu@-M(s^
zf;kEfI&PZ4w`Nq3le-W5A|z%1_DyUa>{;ga(zf)ztdCChp@<D1gTtrQzt8I6$z)4{
znL5|jTXVy0k(mHTi?pJvE@=DEm`Bs;7Sk8+)mO_xoO@Wc?$t2^f*uQqTybl&bQ{;A
z5&tXmhNx~RGey?AN&&`rR#NbGwbE62@L6-lImuD_5}FJ)c~YSVfBvr1=S>98i@zk6
zZBYV`w`d>F#epZM+Qfi`U9)9Lhd=4haXtrIl#&ekn3YOTTD>s^<>oP^IGow#`>%M<
z(o$$ZgC;)Pq=@1h-U{_y-Xr0}vt)~3)^=`7aCdrqJ>FaSXQrMqLLAsErCoC+Fy*i1
zaNoftiRd|Gse<o*MY0y&8fx1XPP^)*{v_&Z{Pc`qP{1yDEZR97l9n*-Qs?f;JNgs<
ztnJZFP&%P}eY|#e3xpSK`v->|J$;g6lHT5gI9cUc%)V!%BIoEIpq%G;`bVvl$uQ~^
zpT0oE`~8|}cd%t8x}_d*e7)$XWWPDSIlg;<@ya~?LEG_Bv&Qj!H;VB7c?3+-M;9uU
zi0f9x*-dfrfc);kXN;}~%0!de5u1|N(q$)k`EK}3)rSw7c3~d+HK6TfGw{P0b=Ot>
z6Z)lyK?aJ9xSu$YQtf?Z>aZ&E_}QR+D-Z=l;h)PUewYhMH{O&rPLX{Ae|N6TV)|6>
z8oQPzM_6cBO&p^&DU5_Lk9I5CcZ!Np)IKTAZSggYu9z@WIS%_k9q^l?x!0&Y^Ssn!
zerj)On$7B9ktcFHJQ7VjVKx$Ear>Nwg~2DJcCG7gN9p<Ax@^Oj)Z0boQ>XNN)XwO^
z@Z^aVah=*0Dbv2CgXXo?Mcc4P?}<NZSKR+s1qC)wGqf<SEu#H3T)11~j<chLYE&{u
z5%3DEXbm3!c0=<?nGLR@QDvh7RF+HM4$nHoKnlXp|B{_&z6r7Bk7e<ESG_l|zS$HD
zoBlrYbN4gz@-A3PvAqSdMc7=b7AsP$okLmzWP`=5JSa^9cMqKjY<Fiploj+WIE(NI
z9O?Q~v7!-#HyrQd6ec25)Pb;)&(Kbo-jIs*CMUUiL%x2)IOP<kGS87-gSayox&!8z
z>s)a?H`&oTw3Qt$$oT>eq+)k=Zf3Fs;tkZ!vd;h2?}iAtO{4f?0j<|b=~&l_`Y~2G
z9Iw=Fa}(l#OzeO-yiQXlX|i6>Y}>*z-%E;X=Zm||cOKkl0j!*v{0sA0Z|MWTn82b@
zE|r$pCwlT}0ZPbJXYkYUu@<S^ZzL0J?j^CF*Y<{yhyvO$F+Ac^VT_12>Y-D4J;VY2
zdH$V>{<U}xCXx$|%dw8$T>7?9lu#%I(b2!I`3YsOL^57bBZQIOP>}?VPf`$O8h}fE
zK3iPa-01@qTMx%q&)G^l63js5LQZum9}tDKyj)SIJUJF#gomi5OYq_m7(YV(NND5K
zsN-W`dFr?M*<PuHjWFfi!@Zrc<&1IT?)cv;W^NCJq)fUO`~}&PL6_1JMOxQDW)U>8
zeBo(osyL!YwzagW=Y<_3Uvb@LD<9RDcAPA}dKTsZWdm_2(?g9nB!*FNZhE2&`sScQ
zRrzw+qML=qUCFgP6$e3E+e#hK68_Hh(M$ki_s_us_A>o8x=-@!n)O_DgUrqpU7_s=
z?c%p}bVzUBv0NTl^rM((2j#Uzusvt~g@GfrC}GM(BdE6JP3hp-dSbV8QPGR=DQ+!T
zylaPzTyk1^OJ%R<NoR(d$<?ROi+E&8*S~`;vc{y>ZYkol9tmpr-3-#>>1_}%-Wu_d
zg6%SU^?B}g%Fl_t&A7j2|LaMwp7U+6wFZWJn5&<&m@CF8H_*{w9J-jG7MVpI!Ce<g
zwe-nG$8VP~izP6%11M(_&!{*C@CRFpw|!(bdww0p?@-Ga1t?|Xilpx%eOeR#O$pL=
zcO&12QRt0TI(}lX{-23u;`XxX{7m<ZbDf|sD_w2WjfyF?qbWCM8tLK#<iWvC;CxqE
zfgoEenrE+4fq43y5`48Yo~GhOer2)Wx#2BCbNbfGT~bh0O18lRzv7v!yc=xvtgj3f
zdyLaf+2iJycCDfD7E`d%9PXvXMV$3neOJC<vurvKSjecL#E4d)#R~ez*CPKOb&l@2
z=N93IrO9%dlQwUYbi|EongqDxbu<(O`XHZJ;cx5w^v~Z0cc*x?rQj$$RP*@{FaT`<
zlW0E7mo*_zEQOKOM8u+_+=$#4>ssI2%9G`yu8R;J=TgBl--DO7Tr4=*q{2~O&!i10
zoG_;!6?msV+SRi&;roHBWk<Nncdp7*0kkRP%ZDc}i26fL#uPjtBS!fm{vLKGs(9t8
z+==S({{U;MTnV;NhC+?58(C9!=tCK!ytw{H69EJMV8i71jv2qZU66+)=so4pC^vE3
z0;4tH|NF(-|3cv>KWhbmC~R;;<_Y!FEs>!9FD%$#m|s*%B5#lyk5f%(qJ<6?*6!kK
zp4@QAb=-Y$znr!|A5uHf+5&@gAX-|RR<qDg3ut9G;kp)<cf0Gy^v|Dt*YkicF26od
zmg%RhpxX(mqcs3T9CV6QReHUjD_MMke8?09k;F`bkq&M&1jz<)+3pE*c8X&7jCwrV
zXuaT)!6!!*ZyKP@74-?72p7h=VNcSsS-6>(LuTB;6^Pr~m{}f@3-H`;2o;kP<6TbC
z{h|s0y6vi*yiXU0cGQ@8Ye?;F)IZ=w&6`X&uy<gEv$Zn;*}^B96R^tBjbm9GKbOlp
z@uIA!5H@bOKg+uTP{Sg)CMZ&XT!J|eIF$8SROH|G2j^Y2WY5jeJ(9Z24E#A9I%RtY
zLAKg&y=r;KEqrir*d3iy<T}Q%lTPB*f+VXxa=wrsJQ6EaL;$M{M3j_?MQ6PsywE-g
zwEu-@->c|?-R?b#V8a@+Xm|rxS3}fwV99_$2LN`nhHWmAguJ2f<?I{Q{M|l3ZkWAF
zRr?j>F}rIQ1^3|=)eW1)5mcS)&Gf93^(M#OzYtm8|76Xn`ugHH0VwRh=X?qlbB&Cq
z5iHwzqARI9ot%Z=Bw;$|@e#iyw4m}o{Ssa+h+(eekG!FHWP#uis7*iBxz7<Ahsg@<
zr+}i)5hXCd62E9xx23jGpJJvMlpsZ}UjX+(!$l<D##r9eg4x(*>l99wia}ke|D$$`
zD`}LYU^QPjAD{dn`Da;d_m}OZB3fMKA_-NPw01&&Wx;zLwD6(UtsIT5E%2evi3eb=
zz!h!JWz6t8-i}m1q_$;j@rEeo^jlIh5_hi{*NPkF_;syF?ZLS_CLY{Ry8g>Fpju?|
zQnKFVz><OuwFTs?RWIypfrik7JL9gYEsq$ahj%Tm^t}XEaq0$~7_w=Tp7!eDdqm~X
z@anTHYgx*FVs`z1?*H&d{gI2DY;mIeJH{qfW0lkGYGWQS7+a4Xl#D!LD4Hy`<C@3E
zr)iAFsvQT|57*E@pH{o7dZzuuYN?ZerqGQHDitG0#TZG+CC7JDsr0^DEGe;|v~sFp
zUMHhi8lKzm>YNN`xGn}3JK<YzSG7$n8EiYcvMx`Q-^3{Q-3Bv&?LqOKKLakO&3FV^
z(!`Uh>nVyRotkedWmoB-CCM|^oVFDL71RCE<#@Cx;wdV0v<F&Zw&~(Z&nGXKRt)#^
z90(aFKI~5Dd1+)Q*BAQHw&ie|-RSwtljq%%;>dto)Cp_l4<B2<PM!_0upD48Zks8=
z7I*JAS;-%-mw+;+kMMS7i~PC{ylO%;LV1k7UGo3$T|-=&JzT%b*X09a20(8Y04lIj
zx2oTy)|d$e++_#RLgj1hc(d__2xvA4{OD5alN>`yEDhDev{e5XPw6p%0ikecoP&^k
zr2-gYZE1nYL6fhMLAO^_#^KStYo}_E;UnRr-$7m+V#asZ_QzbOx#{!XX!B|Dp3oT|
zIP}vC>O;Fmy=$s>W?I^|I`Nz<kR|$1L-^Dw<0+3_UU;M09mkMW#BI`r|3~i1o!+}m
zADe6>eY6v999jtkp@q4pD)@C$^z+xQM&~HmV4g<(IP`4x0p!p6!BeX(4Fqm{4QtIb
zAs!x$C{927zr%|@ci)A0g}bmTtcWm0-zNMejvG(cGj@OZhUZc-{QAO<hTgM`zoI19
zf3Z(F;YC7i)3ML7hBS9@NZvj;r~oeioMU~g_E!8RuWsfLf-C8(c0#c$>5#k;TuSSe
zkfuianLL)nFE`IHNxfcGRbfL>L)BrpL(z3vMayUVvVc~~rHpel)RQStq#9^pamTG9
zK>u>Ws~7;aPmE#K##bW^AFB3rieoxO#bI!Kb$uXL!n2&mSOWmA>-{5JUz*Kvr2Fw&
z2<#$`Ab2(P;>6xDTNWv^?V17z1;;qcGGOpr>Q%@Kb8MZqs~$&wYEpc$A4yQ|wWI8|
zRBYS8Udv~6;p1Bmm)Qx`&%X=0fttV8-f7R~+Yedm<VgI$gSzT3A-Ni48F1ruc;n42
z!{m(fK>L?RNmMa^&(t_30CH*4g_XDCn<{m`l_e7yi5^%)@um350|#^-D_nMf8DCQI
z4_OSX6F^BXc{X0|eTw|K&!ms%v`d!&k6QN!o$*7814(1phY7E!ym)}BGs{DU&F&yS
z0n7lCzSI+)dvdI2bzHI;99pTaZWW-$F!s}Emq5Xf)on1>d0-%Q_{Uy@=TP&lX=Y?d
z0Pzw?_?GD_uB4!>^Xyq2zMP1h_zi|WkR%Tk7*2ckhW@RmNtO`Y!mdg0!{w(7Z<LH`
z!<3(d{A00HO=pk02h}$orw7L8-m>Gi*NCir=#qF^LL9Te=&5^o5W+Y(pi5(FXf!v&
ze^0avBtfAuVPOa9MNhD$w0#7oasiV{?OS37{_fROtUd%AakY3ifufh<7)V#Z-H)Ei
z;Zx_`YQ<Zvs(A6F$)$<irB(OlkkVC?b@ivwL}Fq>ezAPrFc!R(FhIjEaY|*(?&I$u
zaJ%_-mU!yd#gznz>aS#Q8|VU$Sl+H(mN>ry>l`BC(1EACXH)uIjq85bgtan}1@ff3
zMQTlA=Kw)=7EXNhwZ~yypmV6@wW+Y9h&niH`(1G^_7I__(e@jh_!$XFQq(xZ^LP#^
zi=cW0^=mtIzEPzy6Wj4+B{+C5P=w7P`$q?}?sbDV$e&bujWfmydR6iAHy0bny;8!i
z<d~eT{k3w3LEm5!T?omY8<wt}b}!HT3$fhy_(zD_ctZJZ2%2d*m2cA9W9|XpL8rZX
zx%}{cv)4{gWOMVH9HIV%Z6d1I8Pwg6Xwk1~^wruyKv&C$$8&VLF<%!ke!sap9`OY1
zd)5UK_9!bU_xfYYZxNlOaq55phd`_>$Mu<doAm|%a-xl3;e|gj?^oie-AQX96ikB}
zkv&)P1p7T%5{?LRJ_gDHxq)T&h^$Mn#Q|Ee$E4Irr^E?=bhV8@1+C`FB$fox7sfr^
z*}wZK_MOeN?Okze5UOeelM_XR&jQO?4M>uUM{eu&W9pzrkjG8Lze{iV4_DrgJX}X)
z4W842O0FpOTS6Z<586l7Tc_|DZ6>aab{XD3SepN8;97pBSJ8GXCU~NyMVK2v<A?HP
z%Il5$l1^lE$}Jh1sx<4U_jJE7rHp}`&b+p;E?A|)_+!(OmmHVOZ>s+J$ihbGdIG>%
z!)No1ifuQWTn|JE9a8Jz9{pQeGJ_Y9ucEZ1%`MWy6-?r_FB>C?reIijecHErE<4C0
zXlE&LjO45i4B4>2!7d;di#Eq6-OGBAbiOgi+v!~ou6zt<#39CbSfUWgID#}I6Pe}b
zKoE8`fxVmvJF5o=xIq)lrSv!l{_dJ`VuMGw_tOU4>VOvhg14bGfwySJU%mO`hu)O1
z#@P#|lImQLi}-gV-%*J`9E8wUxbZX=I?RUXC8s-QWSk4hJxz_`@35lnw<?jC5_k&;
z%s@8F<njiZ87`K})@sUeCK}{vATQ3Ymd8K@U6Q`qxF+b|gGs&BFZy`87bb~fhcZ*^
zzNUh+BYibDS<}Z*Yua8bm4zLE70Ll)%#Kl5Lr5~xiGY!AuJp?CwDotuPcQof0nDgX
zWVw=!+bqs-waHVTv!LEnQ3V~JR9)!bse12JJIrh$$#Vf<B5>#qS+m1`#pS!o6yd9{
zf>NhR)mKF|fn0H!untu6^$y4<omnm&cKF=k^Uu{ICWcG<AQXcfVx3-y2X)<QV2t%|
zds01AlRcZ?x1R>8ork(%cW`k30;+JK!<=d~@=`zJ6aWz;sa2q&8t9+9Mi=%)j^8)6
zGLb#=?AO|6tZ@b6cJgHKhC?jwNwM-}A~>dwbiI8zHNc<*M#50eBP`HI>Y!yj*zXYs
zQ}EvdJr+pz)taU3pbkvYV@g&~xWc8YmRt0dfQx(bd2$rQNDiuAspiVM)t7I+A6kML
zWZvgt+!Rz_ZI2bslK+f1Aclt%3L#dKU_B2Cj1Z1j)zxemn4XQ8RP!XK_tj3Y<8~=O
zPiV>(1&67j>E(eow4MhkOR9xN-ZP0zC~u!?^5Tyr$0^3XC`YYg*5{c}rLxjchVK^&
zihH*8(GK_NgT1{y%Akg`bzoq_I}#hj4=KtXEvgG?Fz&u&r=D|~eX3!>*2uY*XBIqp
z9?+(ztKT@)f;HY;3I7~nB2EpkxMiBLwDNwo(O9W&+8^yhvU0Ad$3lAFh}?qzzX<A8
z8#2+>D#Y{XQ@qs47}3nZ;*i{P0e2PkvZ&+BV<iO6!@YxHZwut=#dp_71v{iRoANa@
z?86U!Uo*)3+9JgXnO<6wU}(GX7S4nr|9Kd^w;j+H?#O?QeMb-EDaJmy(z7V_lzEn6
zJNLLp{A`Py$sHWZ@e5o~Z-eNX<@7Q{ecsl5wB_yof<M~w#HN9dnwr{m_3zVn<p-Z*
zvw0?DBX^EE^H)oY>63ORAM^IU<SH1$<@$kv`%#aLzus*&^7UV*=$?$t;V1dD6YV2X
z&h`5n@!$~pzyKn&r+o72ASa6kr1QP&HSIT}GLZ0DMy+h(;y3i2G%z_Z+ZtX710NcM
zzfGgDTNLrf3VkanJ$pwkYKXDokM7FMehSLIJe#M+jh)MpV-7}j$n3rv5CqflxA7;3
zv<2l4>$PWyeD~WySXYIU@A<UO+E$UP#xq_j?i51AyuY>mwzHF<+N@}mmR{&>x;~op
zSTq?tkz9IN=Qc&9V<0T$i7D&!{aJgvSxuwnR<Fs3Mg?Kv+^ph|<D<J-68qC`*5ft;
zU}pC(CdIWvd9VuH`H9iYR882#2xnD$7>4jutv`p{NvrY5>iq3OSd39U<7ll!vf}I~
z@_4C7gpCJ*MgzQP&#)0>rTUe~YpPk%)`KjCf1D{R{Q|dDPSuik=Un7~_03J0gn}l|
zY!;XUq*~-}JBXaQF;!Kc4DHW6$9fxQe%FHJk}ocf8mK7+Q&T!-=3~s-yzkPlplh(v
zJlQ;{!@FVI;H?DC6+J-tNgHo2K%LMskUovQLq)(L+Q6^O3hzbbel&Oo=mPnc&V5%Q
bzm8AbuVoE9qaMlyfL|Cr6WvPfd(r;^ifYg6

diff --git a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/utils_image.py b/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/utils_image.py
deleted file mode 100644
index 0175f155ad90..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/image_degradation/utils_image.py
+++ /dev/null
@@ -1,916 +0,0 @@
-import os
-import math
-import random
-import numpy as np
-import torch
-import cv2
-from torchvision.utils import make_grid
-from datetime import datetime
-#import matplotlib.pyplot as plt   # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
-
-
-os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
-
-
-'''
-# --------------------------------------------
-# Kai Zhang (github: https://github.com/cszn)
-# 03/Mar/2019
-# --------------------------------------------
-# https://github.com/twhui/SRGAN-pyTorch
-# https://github.com/xinntao/BasicSR
-# --------------------------------------------
-'''
-
-
-IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif']
-
-
-def is_image_file(filename):
-    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
-
-
-def get_timestamp():
-    return datetime.now().strftime('%y%m%d-%H%M%S')
-
-
-def imshow(x, title=None, cbar=False, figsize=None):
-    plt.figure(figsize=figsize)
-    plt.imshow(np.squeeze(x), interpolation='nearest', cmap='gray')
-    if title:
-        plt.title(title)
-    if cbar:
-        plt.colorbar()
-    plt.show()
-
-
-def surf(Z, cmap='rainbow', figsize=None):
-    plt.figure(figsize=figsize)
-    ax3 = plt.axes(projection='3d')
-
-    w, h = Z.shape[:2]
-    xx = np.arange(0,w,1)
-    yy = np.arange(0,h,1)
-    X, Y = np.meshgrid(xx, yy)
-    ax3.plot_surface(X,Y,Z,cmap=cmap)
-    #ax3.contour(X,Y,Z, zdim='z',offset=-2，cmap=cmap)
-    plt.show()
-
-
-'''
-# --------------------------------------------
-# get image pathes
-# --------------------------------------------
-'''
-
-
-def get_image_paths(dataroot):
-    paths = None  # return None if dataroot is None
-    if dataroot is not None:
-        paths = sorted(_get_paths_from_images(dataroot))
-    return paths
-
-
-def _get_paths_from_images(path):
-    assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
-    images = []
-    for dirpath, _, fnames in sorted(os.walk(path)):
-        for fname in sorted(fnames):
-            if is_image_file(fname):
-                img_path = os.path.join(dirpath, fname)
-                images.append(img_path)
-    assert images, '{:s} has no valid image file'.format(path)
-    return images
-
-
-'''
-# --------------------------------------------
-# split large images into small images 
-# --------------------------------------------
-'''
-
-
-def patches_from_image(img, p_size=512, p_overlap=64, p_max=800):
-    w, h = img.shape[:2]
-    patches = []
-    if w > p_max and h > p_max:
-        w1 = list(np.arange(0, w-p_size, p_size-p_overlap, dtype=np.int))
-        h1 = list(np.arange(0, h-p_size, p_size-p_overlap, dtype=np.int))
-        w1.append(w-p_size)
-        h1.append(h-p_size)
-#        print(w1)
-#        print(h1)
-        for i in w1:
-            for j in h1:
-                patches.append(img[i:i+p_size, j:j+p_size,:])
-    else:
-        patches.append(img)
-
-    return patches
-
-
-def imssave(imgs, img_path):
-    """
-    imgs: list, N images of size WxHxC
-    """
-    img_name, ext = os.path.splitext(os.path.basename(img_path))
-
-    for i, img in enumerate(imgs):
-        if img.ndim == 3:
-            img = img[:, :, [2, 1, 0]]
-        new_path = os.path.join(os.path.dirname(img_path), img_name+str('_s{:04d}'.format(i))+'.png')
-        cv2.imwrite(new_path, img)
-
-
-def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
-    """
-    split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size),
-    and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max)
-    will be splitted.
-    Args:
-        original_dataroot:
-        taget_dataroot:
-        p_size: size of small images
-        p_overlap: patch size in training is a good choice
-        p_max: images with smaller size than (p_max)x(p_max) keep unchanged.
-    """
-    paths = get_image_paths(original_dataroot)
-    for img_path in paths:
-        # img_name, ext = os.path.splitext(os.path.basename(img_path))
-        img = imread_uint(img_path, n_channels=n_channels)
-        patches = patches_from_image(img, p_size, p_overlap, p_max)
-        imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path)))
-        #if original_dataroot == taget_dataroot:
-        #del img_path
-
-'''
-# --------------------------------------------
-# makedir
-# --------------------------------------------
-'''
-
-
-def mkdir(path):
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-
-def mkdirs(paths):
-    if isinstance(paths, str):
-        mkdir(paths)
-    else:
-        for path in paths:
-            mkdir(path)
-
-
-def mkdir_and_rename(path):
-    if os.path.exists(path):
-        new_name = path + '_archived_' + get_timestamp()
-        print('Path already exists. Rename it to [{:s}]'.format(new_name))
-        os.rename(path, new_name)
-    os.makedirs(path)
-
-
-'''
-# --------------------------------------------
-# read image from path
-# opencv is fast, but read BGR numpy image
-# --------------------------------------------
-'''
-
-
-# --------------------------------------------
-# get uint8 image of size HxWxn_channles (RGB)
-# --------------------------------------------
-def imread_uint(path, n_channels=3):
-    #  input: path
-    # output: HxWx3(RGB or GGG), or HxWx1 (G)
-    if n_channels == 1:
-        img = cv2.imread(path, 0)  # cv2.IMREAD_GRAYSCALE
-        img = np.expand_dims(img, axis=2)  # HxWx1
-    elif n_channels == 3:
-        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # BGR or G
-        if img.ndim == 2:
-            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)  # GGG
-        else:
-            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB
-    return img
-
-
-# --------------------------------------------
-# matlab's imwrite
-# --------------------------------------------
-def imsave(img, img_path):
-    img = np.squeeze(img)
-    if img.ndim == 3:
-        img = img[:, :, [2, 1, 0]]
-    cv2.imwrite(img_path, img)
-
-def imwrite(img, img_path):
-    img = np.squeeze(img)
-    if img.ndim == 3:
-        img = img[:, :, [2, 1, 0]]
-    cv2.imwrite(img_path, img)
-
-
-
-# --------------------------------------------
-# get single image of size HxWxn_channles (BGR)
-# --------------------------------------------
-def read_img(path):
-    # read image by cv2
-    # return: Numpy float32, HWC, BGR, [0,1]
-    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # cv2.IMREAD_GRAYSCALE
-    img = img.astype(np.float32) / 255.
-    if img.ndim == 2:
-        img = np.expand_dims(img, axis=2)
-    # some images have 4 channels
-    if img.shape[2] > 3:
-        img = img[:, :, :3]
-    return img
-
-
-'''
-# --------------------------------------------
-# image format conversion
-# --------------------------------------------
-# numpy(single) <--->  numpy(unit)
-# numpy(single) <--->  tensor
-# numpy(unit)   <--->  tensor
-# --------------------------------------------
-'''
-
-
-# --------------------------------------------
-# numpy(single) [0, 1] <--->  numpy(unit)
-# --------------------------------------------
-
-
-def uint2single(img):
-
-    return np.float32(img/255.)
-
-
-def single2uint(img):
-
-    return np.uint8((img.clip(0, 1)*255.).round())
-
-
-def uint162single(img):
-
-    return np.float32(img/65535.)
-
-
-def single2uint16(img):
-
-    return np.uint16((img.clip(0, 1)*65535.).round())
-
-
-# --------------------------------------------
-# numpy(unit) (HxWxC or HxW) <--->  tensor
-# --------------------------------------------
-
-
-# convert uint to 4-dimensional torch tensor
-def uint2tensor4(img):
-    if img.ndim == 2:
-        img = np.expand_dims(img, axis=2)
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.).unsqueeze(0)
-
-
-# convert uint to 3-dimensional torch tensor
-def uint2tensor3(img):
-    if img.ndim == 2:
-        img = np.expand_dims(img, axis=2)
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.)
-
-
-# convert 2/3/4-dimensional torch tensor to uint
-def tensor2uint(img):
-    img = img.data.squeeze().float().clamp_(0, 1).cpu().numpy()
-    if img.ndim == 3:
-        img = np.transpose(img, (1, 2, 0))
-    return np.uint8((img*255.0).round())
-
-
-# --------------------------------------------
-# numpy(single) (HxWxC) <--->  tensor
-# --------------------------------------------
-
-
-# convert single (HxWxC) to 3-dimensional torch tensor
-def single2tensor3(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float()
-
-
-# convert single (HxWxC) to 4-dimensional torch tensor
-def single2tensor4(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().unsqueeze(0)
-
-
-# convert torch tensor to single
-def tensor2single(img):
-    img = img.data.squeeze().float().cpu().numpy()
-    if img.ndim == 3:
-        img = np.transpose(img, (1, 2, 0))
-
-    return img
-
-# convert torch tensor to single
-def tensor2single3(img):
-    img = img.data.squeeze().float().cpu().numpy()
-    if img.ndim == 3:
-        img = np.transpose(img, (1, 2, 0))
-    elif img.ndim == 2:
-        img = np.expand_dims(img, axis=2)
-    return img
-
-
-def single2tensor5(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float().unsqueeze(0)
-
-
-def single32tensor5(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).float().unsqueeze(0).unsqueeze(0)
-
-
-def single42tensor4(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float()
-
-
-# from skimage.io import imread, imsave
-def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
-    '''
-    Converts a torch Tensor into an image Numpy array of BGR channel order
-    Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
-    Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)
-    '''
-    tensor = tensor.squeeze().float().cpu().clamp_(*min_max)  # squeeze first, then clamp
-    tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0])  # to range [0,1]
-    n_dim = tensor.dim()
-    if n_dim == 4:
-        n_img = len(tensor)
-        img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy()
-        img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
-    elif n_dim == 3:
-        img_np = tensor.numpy()
-        img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
-    elif n_dim == 2:
-        img_np = tensor.numpy()
-    else:
-        raise TypeError(
-            'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
-    if out_type == np.uint8:
-        img_np = (img_np * 255.0).round()
-        # Important. Unlike matlab, numpy.unit8() WILL NOT round by default.
-    return img_np.astype(out_type)
-
-
-'''
-# --------------------------------------------
-# Augmentation, flipe and/or rotate
-# --------------------------------------------
-# The following two are enough.
-# (1) augmet_img: numpy image of WxHxC or WxH
-# (2) augment_img_tensor4: tensor image 1xCxWxH
-# --------------------------------------------
-'''
-
-
-def augment_img(img, mode=0):
-    '''Kai Zhang (github: https://github.com/cszn)
-    '''
-    if mode == 0:
-        return img
-    elif mode == 1:
-        return np.flipud(np.rot90(img))
-    elif mode == 2:
-        return np.flipud(img)
-    elif mode == 3:
-        return np.rot90(img, k=3)
-    elif mode == 4:
-        return np.flipud(np.rot90(img, k=2))
-    elif mode == 5:
-        return np.rot90(img)
-    elif mode == 6:
-        return np.rot90(img, k=2)
-    elif mode == 7:
-        return np.flipud(np.rot90(img, k=3))
-
-
-def augment_img_tensor4(img, mode=0):
-    '''Kai Zhang (github: https://github.com/cszn)
-    '''
-    if mode == 0:
-        return img
-    elif mode == 1:
-        return img.rot90(1, [2, 3]).flip([2])
-    elif mode == 2:
-        return img.flip([2])
-    elif mode == 3:
-        return img.rot90(3, [2, 3])
-    elif mode == 4:
-        return img.rot90(2, [2, 3]).flip([2])
-    elif mode == 5:
-        return img.rot90(1, [2, 3])
-    elif mode == 6:
-        return img.rot90(2, [2, 3])
-    elif mode == 7:
-        return img.rot90(3, [2, 3]).flip([2])
-
-
-def augment_img_tensor(img, mode=0):
-    '''Kai Zhang (github: https://github.com/cszn)
-    '''
-    img_size = img.size()
-    img_np = img.data.cpu().numpy()
-    if len(img_size) == 3:
-        img_np = np.transpose(img_np, (1, 2, 0))
-    elif len(img_size) == 4:
-        img_np = np.transpose(img_np, (2, 3, 1, 0))
-    img_np = augment_img(img_np, mode=mode)
-    img_tensor = torch.from_numpy(np.ascontiguousarray(img_np))
-    if len(img_size) == 3:
-        img_tensor = img_tensor.permute(2, 0, 1)
-    elif len(img_size) == 4:
-        img_tensor = img_tensor.permute(3, 2, 0, 1)
-
-    return img_tensor.type_as(img)
-
-
-def augment_img_np3(img, mode=0):
-    if mode == 0:
-        return img
-    elif mode == 1:
-        return img.transpose(1, 0, 2)
-    elif mode == 2:
-        return img[::-1, :, :]
-    elif mode == 3:
-        img = img[::-1, :, :]
-        img = img.transpose(1, 0, 2)
-        return img
-    elif mode == 4:
-        return img[:, ::-1, :]
-    elif mode == 5:
-        img = img[:, ::-1, :]
-        img = img.transpose(1, 0, 2)
-        return img
-    elif mode == 6:
-        img = img[:, ::-1, :]
-        img = img[::-1, :, :]
-        return img
-    elif mode == 7:
-        img = img[:, ::-1, :]
-        img = img[::-1, :, :]
-        img = img.transpose(1, 0, 2)
-        return img
-
-
-def augment_imgs(img_list, hflip=True, rot=True):
-    # horizontal flip OR rotate
-    hflip = hflip and random.random() < 0.5
-    vflip = rot and random.random() < 0.5
-    rot90 = rot and random.random() < 0.5
-
-    def _augment(img):
-        if hflip:
-            img = img[:, ::-1, :]
-        if vflip:
-            img = img[::-1, :, :]
-        if rot90:
-            img = img.transpose(1, 0, 2)
-        return img
-
-    return [_augment(img) for img in img_list]
-
-
-'''
-# --------------------------------------------
-# modcrop and shave
-# --------------------------------------------
-'''
-
-
-def modcrop(img_in, scale):
-    # img_in: Numpy, HWC or HW
-    img = np.copy(img_in)
-    if img.ndim == 2:
-        H, W = img.shape
-        H_r, W_r = H % scale, W % scale
-        img = img[:H - H_r, :W - W_r]
-    elif img.ndim == 3:
-        H, W, C = img.shape
-        H_r, W_r = H % scale, W % scale
-        img = img[:H - H_r, :W - W_r, :]
-    else:
-        raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
-    return img
-
-
-def shave(img_in, border=0):
-    # img_in: Numpy, HWC or HW
-    img = np.copy(img_in)
-    h, w = img.shape[:2]
-    img = img[border:h-border, border:w-border]
-    return img
-
-
-'''
-# --------------------------------------------
-# image processing process on numpy image
-# channel_convert(in_c, tar_type, img_list):
-# rgb2ycbcr(img, only_y=True):
-# bgr2ycbcr(img, only_y=True):
-# ycbcr2rgb(img):
-# --------------------------------------------
-'''
-
-
-def rgb2ycbcr(img, only_y=True):
-    '''same as matlab rgb2ycbcr
-    only_y: only return Y channel
-    Input:
-        uint8, [0, 255]
-        float, [0, 1]
-    '''
-    in_img_type = img.dtype
-    img.astype(np.float32)
-    if in_img_type != np.uint8:
-        img *= 255.
-    # convert
-    if only_y:
-        rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
-    else:
-        rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
-                              [24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128]
-    if in_img_type == np.uint8:
-        rlt = rlt.round()
-    else:
-        rlt /= 255.
-    return rlt.astype(in_img_type)
-
-
-def ycbcr2rgb(img):
-    '''same as matlab ycbcr2rgb
-    Input:
-        uint8, [0, 255]
-        float, [0, 1]
-    '''
-    in_img_type = img.dtype
-    img.astype(np.float32)
-    if in_img_type != np.uint8:
-        img *= 255.
-    # convert
-    rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071],
-                          [0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836]
-    if in_img_type == np.uint8:
-        rlt = rlt.round()
-    else:
-        rlt /= 255.
-    return rlt.astype(in_img_type)
-
-
-def bgr2ycbcr(img, only_y=True):
-    '''bgr version of rgb2ycbcr
-    only_y: only return Y channel
-    Input:
-        uint8, [0, 255]
-        float, [0, 1]
-    '''
-    in_img_type = img.dtype
-    img.astype(np.float32)
-    if in_img_type != np.uint8:
-        img *= 255.
-    # convert
-    if only_y:
-        rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
-    else:
-        rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
-                              [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
-    if in_img_type == np.uint8:
-        rlt = rlt.round()
-    else:
-        rlt /= 255.
-    return rlt.astype(in_img_type)
-
-
-def channel_convert(in_c, tar_type, img_list):
-    # conversion among BGR, gray and y
-    if in_c == 3 and tar_type == 'gray':  # BGR to gray
-        gray_list = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in img_list]
-        return [np.expand_dims(img, axis=2) for img in gray_list]
-    elif in_c == 3 and tar_type == 'y':  # BGR to y
-        y_list = [bgr2ycbcr(img, only_y=True) for img in img_list]
-        return [np.expand_dims(img, axis=2) for img in y_list]
-    elif in_c == 1 and tar_type == 'RGB':  # gray/y to BGR
-        return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list]
-    else:
-        return img_list
-
-
-'''
-# --------------------------------------------
-# metric, PSNR and SSIM
-# --------------------------------------------
-'''
-
-
-# --------------------------------------------
-# PSNR
-# --------------------------------------------
-def calculate_psnr(img1, img2, border=0):
-    # img1 and img2 have range [0, 255]
-    #img1 = img1.squeeze()
-    #img2 = img2.squeeze()
-    if not img1.shape == img2.shape:
-        raise ValueError('Input images must have the same dimensions.')
-    h, w = img1.shape[:2]
-    img1 = img1[border:h-border, border:w-border]
-    img2 = img2[border:h-border, border:w-border]
-
-    img1 = img1.astype(np.float64)
-    img2 = img2.astype(np.float64)
-    mse = np.mean((img1 - img2)**2)
-    if mse == 0:
-        return float('inf')
-    return 20 * math.log10(255.0 / math.sqrt(mse))
-
-
-# --------------------------------------------
-# SSIM
-# --------------------------------------------
-def calculate_ssim(img1, img2, border=0):
-    '''calculate SSIM
-    the same outputs as MATLAB's
-    img1, img2: [0, 255]
-    '''
-    #img1 = img1.squeeze()
-    #img2 = img2.squeeze()
-    if not img1.shape == img2.shape:
-        raise ValueError('Input images must have the same dimensions.')
-    h, w = img1.shape[:2]
-    img1 = img1[border:h-border, border:w-border]
-    img2 = img2[border:h-border, border:w-border]
-
-    if img1.ndim == 2:
-        return ssim(img1, img2)
-    elif img1.ndim == 3:
-        if img1.shape[2] == 3:
-            ssims = []
-            for i in range(3):
-                ssims.append(ssim(img1[:,:,i], img2[:,:,i]))
-            return np.array(ssims).mean()
-        elif img1.shape[2] == 1:
-            return ssim(np.squeeze(img1), np.squeeze(img2))
-    else:
-        raise ValueError('Wrong input image dimensions.')
-
-
-def ssim(img1, img2):
-    C1 = (0.01 * 255)**2
-    C2 = (0.03 * 255)**2
-
-    img1 = img1.astype(np.float64)
-    img2 = img2.astype(np.float64)
-    kernel = cv2.getGaussianKernel(11, 1.5)
-    window = np.outer(kernel, kernel.transpose())
-
-    mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # valid
-    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
-    mu1_sq = mu1**2
-    mu2_sq = mu2**2
-    mu1_mu2 = mu1 * mu2
-    sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
-    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
-    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
-
-    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
-                                                            (sigma1_sq + sigma2_sq + C2))
-    return ssim_map.mean()
-
-
-'''
-# --------------------------------------------
-# matlab's bicubic imresize (numpy and torch) [0, 1]
-# --------------------------------------------
-'''
-
-
-# matlab 'imresize' function, now only support 'bicubic'
-def cubic(x):
-    absx = torch.abs(x)
-    absx2 = absx**2
-    absx3 = absx**3
-    return (1.5*absx3 - 2.5*absx2 + 1) * ((absx <= 1).type_as(absx)) + \
-        (-0.5*absx3 + 2.5*absx2 - 4*absx + 2) * (((absx > 1)*(absx <= 2)).type_as(absx))
-
-
-def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
-    if (scale < 1) and (antialiasing):
-        # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width
-        kernel_width = kernel_width / scale
-
-    # Output-space coordinates
-    x = torch.linspace(1, out_length, out_length)
-
-    # Input-space coordinates. Calculate the inverse mapping such that 0.5
-    # in output space maps to 0.5 in input space, and 0.5+scale in output
-    # space maps to 1.5 in input space.
-    u = x / scale + 0.5 * (1 - 1 / scale)
-
-    # What is the left-most pixel that can be involved in the computation?
-    left = torch.floor(u - kernel_width / 2)
-
-    # What is the maximum number of pixels that can be involved in the
-    # computation?  Note: it's OK to use an extra pixel here; if the
-    # corresponding weights are all zero, it will be eliminated at the end
-    # of this function.
-    P = math.ceil(kernel_width) + 2
-
-    # The indices of the input pixels involved in computing the k-th output
-    # pixel are in row k of the indices matrix.
-    indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view(
-        1, P).expand(out_length, P)
-
-    # The weights used to compute the k-th output pixel are in row k of the
-    # weights matrix.
-    distance_to_center = u.view(out_length, 1).expand(out_length, P) - indices
-    # apply cubic kernel
-    if (scale < 1) and (antialiasing):
-        weights = scale * cubic(distance_to_center * scale)
-    else:
-        weights = cubic(distance_to_center)
-    # Normalize the weights matrix so that each row sums to 1.
-    weights_sum = torch.sum(weights, 1).view(out_length, 1)
-    weights = weights / weights_sum.expand(out_length, P)
-
-    # If a column in weights is all zero, get rid of it. only consider the first and last column.
-    weights_zero_tmp = torch.sum((weights == 0), 0)
-    if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6):
-        indices = indices.narrow(1, 1, P - 2)
-        weights = weights.narrow(1, 1, P - 2)
-    if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6):
-        indices = indices.narrow(1, 0, P - 2)
-        weights = weights.narrow(1, 0, P - 2)
-    weights = weights.contiguous()
-    indices = indices.contiguous()
-    sym_len_s = -indices.min() + 1
-    sym_len_e = indices.max() - in_length
-    indices = indices + sym_len_s - 1
-    return weights, indices, int(sym_len_s), int(sym_len_e)
-
-
-# --------------------------------------------
-# imresize for tensor image [0, 1]
-# --------------------------------------------
-def imresize(img, scale, antialiasing=True):
-    # Now the scale should be the same for H and W
-    # input: img: pytorch tensor, CHW or HW [0,1]
-    # output: CHW or HW [0,1] w/o round
-    need_squeeze = True if img.dim() == 2 else False
-    if need_squeeze:
-        img.unsqueeze_(0)
-    in_C, in_H, in_W = img.size()
-    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
-    kernel_width = 4
-    kernel = 'cubic'
-
-    # Return the desired dimension order for performing the resize.  The
-    # strategy is to perform the resize first along the dimension with the
-    # smallest scale factor.
-    # Now we do not support this.
-
-    # get weights and indices
-    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
-        in_H, out_H, scale, kernel, kernel_width, antialiasing)
-    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
-        in_W, out_W, scale, kernel, kernel_width, antialiasing)
-    # process H dimension
-    # symmetric copying
-    img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W)
-    img_aug.narrow(1, sym_len_Hs, in_H).copy_(img)
-
-    sym_patch = img[:, :sym_len_Hs, :]
-    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(1, inv_idx)
-    img_aug.narrow(1, 0, sym_len_Hs).copy_(sym_patch_inv)
-
-    sym_patch = img[:, -sym_len_He:, :]
-    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(1, inv_idx)
-    img_aug.narrow(1, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)
-
-    out_1 = torch.FloatTensor(in_C, out_H, in_W)
-    kernel_width = weights_H.size(1)
-    for i in range(out_H):
-        idx = int(indices_H[i][0])
-        for j in range(out_C):
-            out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i])
-
-    # process W dimension
-    # symmetric copying
-    out_1_aug = torch.FloatTensor(in_C, out_H, in_W + sym_len_Ws + sym_len_We)
-    out_1_aug.narrow(2, sym_len_Ws, in_W).copy_(out_1)
-
-    sym_patch = out_1[:, :, :sym_len_Ws]
-    inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(2, inv_idx)
-    out_1_aug.narrow(2, 0, sym_len_Ws).copy_(sym_patch_inv)
-
-    sym_patch = out_1[:, :, -sym_len_We:]
-    inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(2, inv_idx)
-    out_1_aug.narrow(2, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)
-
-    out_2 = torch.FloatTensor(in_C, out_H, out_W)
-    kernel_width = weights_W.size(1)
-    for i in range(out_W):
-        idx = int(indices_W[i][0])
-        for j in range(out_C):
-            out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i])
-    if need_squeeze:
-        out_2.squeeze_()
-    return out_2
-
-
-# --------------------------------------------
-# imresize for numpy image [0, 1]
-# --------------------------------------------
-def imresize_np(img, scale, antialiasing=True):
-    # Now the scale should be the same for H and W
-    # input: img: Numpy, HWC or HW [0,1]
-    # output: HWC or HW [0,1] w/o round
-    img = torch.from_numpy(img)
-    need_squeeze = True if img.dim() == 2 else False
-    if need_squeeze:
-        img.unsqueeze_(2)
-
-    in_H, in_W, in_C = img.size()
-    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
-    kernel_width = 4
-    kernel = 'cubic'
-
-    # Return the desired dimension order for performing the resize.  The
-    # strategy is to perform the resize first along the dimension with the
-    # smallest scale factor.
-    # Now we do not support this.
-
-    # get weights and indices
-    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
-        in_H, out_H, scale, kernel, kernel_width, antialiasing)
-    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
-        in_W, out_W, scale, kernel, kernel_width, antialiasing)
-    # process H dimension
-    # symmetric copying
-    img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C)
-    img_aug.narrow(0, sym_len_Hs, in_H).copy_(img)
-
-    sym_patch = img[:sym_len_Hs, :, :]
-    inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(0, inv_idx)
-    img_aug.narrow(0, 0, sym_len_Hs).copy_(sym_patch_inv)
-
-    sym_patch = img[-sym_len_He:, :, :]
-    inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(0, inv_idx)
-    img_aug.narrow(0, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)
-
-    out_1 = torch.FloatTensor(out_H, in_W, in_C)
-    kernel_width = weights_H.size(1)
-    for i in range(out_H):
-        idx = int(indices_H[i][0])
-        for j in range(out_C):
-            out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i])
-
-    # process W dimension
-    # symmetric copying
-    out_1_aug = torch.FloatTensor(out_H, in_W + sym_len_Ws + sym_len_We, in_C)
-    out_1_aug.narrow(1, sym_len_Ws, in_W).copy_(out_1)
-
-    sym_patch = out_1[:, :sym_len_Ws, :]
-    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(1, inv_idx)
-    out_1_aug.narrow(1, 0, sym_len_Ws).copy_(sym_patch_inv)
-
-    sym_patch = out_1[:, -sym_len_We:, :]
-    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
-    sym_patch_inv = sym_patch.index_select(1, inv_idx)
-    out_1_aug.narrow(1, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)
-
-    out_2 = torch.FloatTensor(out_H, out_W, in_C)
-    kernel_width = weights_W.size(1)
-    for i in range(out_W):
-        idx = int(indices_W[i][0])
-        for j in range(out_C):
-            out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i])
-    if need_squeeze:
-        out_2.squeeze_()
-
-    return out_2.numpy()
-
-
-if __name__ == '__main__':
-    print('---')
-#    img = imread_uint('test.bmp', 3)
-#    img = uint2single(img)
-#    img_bicubic = imresize_np(img, 1/4)
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/losses/__init__.py b/examples/tutorial/stable_diffusion/ldm/modules/losses/__init__.py
deleted file mode 100644
index 876d7c5bd6e3..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/losses/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/losses/contperceptual.py b/examples/tutorial/stable_diffusion/ldm/modules/losses/contperceptual.py
deleted file mode 100644
index 672c1e32a138..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/losses/contperceptual.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import torch
-import torch.nn as nn
-
-from taming.modules.losses.vqperceptual import *  # TODO: taming dependency yes/no?
-
-
-class LPIPSWithDiscriminator(nn.Module):
-    def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
-                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
-                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
-                 disc_loss="hinge"):
-
-        super().__init__()
-        assert disc_loss in ["hinge", "vanilla"]
-        self.kl_weight = kl_weight
-        self.pixel_weight = pixelloss_weight
-        self.perceptual_loss = LPIPS().eval()
-        self.perceptual_weight = perceptual_weight
-        # output log variance
-        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
-
-        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
-                                                 n_layers=disc_num_layers,
-                                                 use_actnorm=use_actnorm
-                                                 ).apply(weights_init)
-        self.discriminator_iter_start = disc_start
-        self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
-        self.disc_factor = disc_factor
-        self.discriminator_weight = disc_weight
-        self.disc_conditional = disc_conditional
-
-    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
-        if last_layer is not None:
-            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
-        else:
-            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
-
-        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
-        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
-        d_weight = d_weight * self.discriminator_weight
-        return d_weight
-
-    def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
-                global_step, last_layer=None, cond=None, split="train",
-                weights=None):
-        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
-        if self.perceptual_weight > 0:
-            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
-            rec_loss = rec_loss + self.perceptual_weight * p_loss
-
-        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
-        weighted_nll_loss = nll_loss
-        if weights is not None:
-            weighted_nll_loss = weights*nll_loss
-        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
-        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
-        kl_loss = posteriors.kl()
-        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
-
-        # now the GAN part
-        if optimizer_idx == 0:
-            # generator update
-            if cond is None:
-                assert not self.disc_conditional
-                logits_fake = self.discriminator(reconstructions.contiguous())
-            else:
-                assert self.disc_conditional
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
-            g_loss = -torch.mean(logits_fake)
-
-            if self.disc_factor > 0.0:
-                try:
-                    d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
-                except RuntimeError:
-                    assert not self.training
-                    d_weight = torch.tensor(0.0)
-            else:
-                d_weight = torch.tensor(0.0)
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
-
-            log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(),
-                   "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(),
-                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
-                   "{}/d_weight".format(split): d_weight.detach(),
-                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
-                   "{}/g_loss".format(split): g_loss.detach().mean(),
-                   }
-            return loss, log
-
-        if optimizer_idx == 1:
-            # second pass for discriminator update
-            if cond is None:
-                logits_real = self.discriminator(inputs.contiguous().detach())
-                logits_fake = self.discriminator(reconstructions.contiguous().detach())
-            else:
-                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
-
-            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
-                   "{}/logits_real".format(split): logits_real.detach().mean(),
-                   "{}/logits_fake".format(split): logits_fake.detach().mean()
-                   }
-            return d_loss, log
-
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/losses/vqperceptual.py b/examples/tutorial/stable_diffusion/ldm/modules/losses/vqperceptual.py
deleted file mode 100644
index f69981769e4b..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/losses/vqperceptual.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import torch
-from torch import nn
-import torch.nn.functional as F
-from einops import repeat
-
-from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
-from taming.modules.losses.lpips import LPIPS
-from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
-
-
-def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
-    assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]
-    loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3])
-    loss_fake = torch.mean(F.relu(1. + logits_fake), dim=[1,2,3])
-    loss_real = (weights * loss_real).sum() / weights.sum()
-    loss_fake = (weights * loss_fake).sum() / weights.sum()
-    d_loss = 0.5 * (loss_real + loss_fake)
-    return d_loss
-
-def adopt_weight(weight, global_step, threshold=0, value=0.):
-    if global_step < threshold:
-        weight = value
-    return weight
-
-
-def measure_perplexity(predicted_indices, n_embed):
-    # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
-    # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
-    encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
-    avg_probs = encodings.mean(0)
-    perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp()
-    cluster_use = torch.sum(avg_probs > 0)
-    return perplexity, cluster_use
-
-def l1(x, y):
-    return torch.abs(x-y)
-
-
-def l2(x, y):
-    return torch.pow((x-y), 2)
-
-
-class VQLPIPSWithDiscriminator(nn.Module):
-    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
-                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
-                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
-                 disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
-                 pixel_loss="l1"):
-        super().__init__()
-        assert disc_loss in ["hinge", "vanilla"]
-        assert perceptual_loss in ["lpips", "clips", "dists"]
-        assert pixel_loss in ["l1", "l2"]
-        self.codebook_weight = codebook_weight
-        self.pixel_weight = pixelloss_weight
-        if perceptual_loss == "lpips":
-            print(f"{self.__class__.__name__}: Running with LPIPS.")
-            self.perceptual_loss = LPIPS().eval()
-        else:
-            raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
-        self.perceptual_weight = perceptual_weight
-
-        if pixel_loss == "l1":
-            self.pixel_loss = l1
-        else:
-            self.pixel_loss = l2
-
-        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
-                                                 n_layers=disc_num_layers,
-                                                 use_actnorm=use_actnorm,
-                                                 ndf=disc_ndf
-                                                 ).apply(weights_init)
-        self.discriminator_iter_start = disc_start
-        if disc_loss == "hinge":
-            self.disc_loss = hinge_d_loss
-        elif disc_loss == "vanilla":
-            self.disc_loss = vanilla_d_loss
-        else:
-            raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
-        print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
-        self.disc_factor = disc_factor
-        self.discriminator_weight = disc_weight
-        self.disc_conditional = disc_conditional
-        self.n_classes = n_classes
-
-    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
-        if last_layer is not None:
-            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
-        else:
-            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
-
-        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
-        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
-        d_weight = d_weight * self.discriminator_weight
-        return d_weight
-
-    def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
-                global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
-        if not exists(codebook_loss):
-            codebook_loss = torch.tensor([0.]).to(inputs.device)
-        #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
-        rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
-        if self.perceptual_weight > 0:
-            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
-            rec_loss = rec_loss + self.perceptual_weight * p_loss
-        else:
-            p_loss = torch.tensor([0.0])
-
-        nll_loss = rec_loss
-        #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
-        nll_loss = torch.mean(nll_loss)
-
-        # now the GAN part
-        if optimizer_idx == 0:
-            # generator update
-            if cond is None:
-                assert not self.disc_conditional
-                logits_fake = self.discriminator(reconstructions.contiguous())
-            else:
-                assert self.disc_conditional
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
-            g_loss = -torch.mean(logits_fake)
-
-            try:
-                d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
-            except RuntimeError:
-                assert not self.training
-                d_weight = torch.tensor(0.0)
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
-
-            log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
-                   "{}/quant_loss".format(split): codebook_loss.detach().mean(),
-                   "{}/nll_loss".format(split): nll_loss.detach().mean(),
-                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
-                   "{}/p_loss".format(split): p_loss.detach().mean(),
-                   "{}/d_weight".format(split): d_weight.detach(),
-                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
-                   "{}/g_loss".format(split): g_loss.detach().mean(),
-                   }
-            if predicted_indices is not None:
-                assert self.n_classes is not None
-                with torch.no_grad():
-                    perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
-                log[f"{split}/perplexity"] = perplexity
-                log[f"{split}/cluster_usage"] = cluster_usage
-            return loss, log
-
-        if optimizer_idx == 1:
-            # second pass for discriminator update
-            if cond is None:
-                logits_real = self.discriminator(inputs.contiguous().detach())
-                logits_fake = self.discriminator(reconstructions.contiguous().detach())
-            else:
-                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
-
-            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
-                   "{}/logits_real".format(split): logits_real.detach().mean(),
-                   "{}/logits_fake".format(split): logits_fake.detach().mean()
-                   }
-            return d_loss, log
diff --git a/examples/tutorial/stable_diffusion/ldm/modules/x_transformer.py b/examples/tutorial/stable_diffusion/ldm/modules/x_transformer.py
deleted file mode 100644
index 5fc15bf9cfe0..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/modules/x_transformer.py
+++ /dev/null
@@ -1,641 +0,0 @@
-"""shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers"""
-import torch
-from torch import nn, einsum
-import torch.nn.functional as F
-from functools import partial
-from inspect import isfunction
-from collections import namedtuple
-from einops import rearrange, repeat, reduce
-
-# constants
-
-DEFAULT_DIM_HEAD = 64
-
-Intermediates = namedtuple('Intermediates', [
-    'pre_softmax_attn',
-    'post_softmax_attn'
-])
-
-LayerIntermediates = namedtuple('Intermediates', [
-    'hiddens',
-    'attn_intermediates'
-])
-
-
-class AbsolutePositionalEmbedding(nn.Module):
-    def __init__(self, dim, max_seq_len):
-        super().__init__()
-        self.emb = nn.Embedding(max_seq_len, dim)
-        self.init_()
-
-    def init_(self):
-        nn.init.normal_(self.emb.weight, std=0.02)
-
-    def forward(self, x):
-        n = torch.arange(x.shape[1], device=x.device)
-        return self.emb(n)[None, :, :]
-
-
-class FixedPositionalEmbedding(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer('inv_freq', inv_freq)
-
-    def forward(self, x, seq_dim=1, offset=0):
-        t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
-        sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq)
-        emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
-        return emb[None, :, :]
-
-
-# helpers
-
-def exists(val):
-    return val is not None
-
-
-def default(val, d):
-    if exists(val):
-        return val
-    return d() if isfunction(d) else d
-
-
-def always(val):
-    def inner(*args, **kwargs):
-        return val
-    return inner
-
-
-def not_equals(val):
-    def inner(x):
-        return x != val
-    return inner
-
-
-def equals(val):
-    def inner(x):
-        return x == val
-    return inner
-
-
-def max_neg_value(tensor):
-    return -torch.finfo(tensor.dtype).max
-
-
-# keyword argument helpers
-
-def pick_and_pop(keys, d):
-    values = list(map(lambda key: d.pop(key), keys))
-    return dict(zip(keys, values))
-
-
-def group_dict_by_key(cond, d):
-    return_val = [dict(), dict()]
-    for key in d.keys():
-        match = bool(cond(key))
-        ind = int(not match)
-        return_val[ind][key] = d[key]
-    return (*return_val,)
-
-
-def string_begins_with(prefix, str):
-    return str.startswith(prefix)
-
-
-def group_by_key_prefix(prefix, d):
-    return group_dict_by_key(partial(string_begins_with, prefix), d)
-
-
-def groupby_prefix_and_trim(prefix, d):
-    kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
-    kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
-    return kwargs_without_prefix, kwargs
-
-
-# classes
-class Scale(nn.Module):
-    def __init__(self, value, fn):
-        super().__init__()
-        self.value = value
-        self.fn = fn
-
-    def forward(self, x, **kwargs):
-        x, *rest = self.fn(x, **kwargs)
-        return (x * self.value, *rest)
-
-
-class Rezero(nn.Module):
-    def __init__(self, fn):
-        super().__init__()
-        self.fn = fn
-        self.g = nn.Parameter(torch.zeros(1))
-
-    def forward(self, x, **kwargs):
-        x, *rest = self.fn(x, **kwargs)
-        return (x * self.g, *rest)
-
-
-class ScaleNorm(nn.Module):
-    def __init__(self, dim, eps=1e-5):
-        super().__init__()
-        self.scale = dim ** -0.5
-        self.eps = eps
-        self.g = nn.Parameter(torch.ones(1))
-
-    def forward(self, x):
-        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
-        return x / norm.clamp(min=self.eps) * self.g
-
-
-class RMSNorm(nn.Module):
-    def __init__(self, dim, eps=1e-8):
-        super().__init__()
-        self.scale = dim ** -0.5
-        self.eps = eps
-        self.g = nn.Parameter(torch.ones(dim))
-
-    def forward(self, x):
-        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
-        return x / norm.clamp(min=self.eps) * self.g
-
-
-class Residual(nn.Module):
-    def forward(self, x, residual):
-        return x + residual
-
-
-class GRUGating(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.gru = nn.GRUCell(dim, dim)
-
-    def forward(self, x, residual):
-        gated_output = self.gru(
-            rearrange(x, 'b n d -> (b n) d'),
-            rearrange(residual, 'b n d -> (b n) d')
-        )
-
-        return gated_output.reshape_as(x)
-
-
-# feedforward
-
-class GEGLU(nn.Module):
-    def __init__(self, dim_in, dim_out):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out * 2)
-
-    def forward(self, x):
-        x, gate = self.proj(x).chunk(2, dim=-1)
-        return x * F.gelu(gate)
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = default(dim_out, dim)
-        project_in = nn.Sequential(
-            nn.Linear(dim, inner_dim),
-            nn.GELU()
-        ) if not glu else GEGLU(dim, inner_dim)
-
-        self.net = nn.Sequential(
-            project_in,
-            nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out)
-        )
-
-    def forward(self, x):
-        return self.net(x)
-
-
-# attention.
-class Attention(nn.Module):
-    def __init__(
-            self,
-            dim,
-            dim_head=DEFAULT_DIM_HEAD,
-            heads=8,
-            causal=False,
-            mask=None,
-            talking_heads=False,
-            sparse_topk=None,
-            use_entmax15=False,
-            num_mem_kv=0,
-            dropout=0.,
-            on_attn=False
-    ):
-        super().__init__()
-        if use_entmax15:
-            raise NotImplementedError("Check out entmax activation instead of softmax activation!")
-        self.scale = dim_head ** -0.5
-        self.heads = heads
-        self.causal = causal
-        self.mask = mask
-
-        inner_dim = dim_head * heads
-
-        self.to_q = nn.Linear(dim, inner_dim, bias=False)
-        self.to_k = nn.Linear(dim, inner_dim, bias=False)
-        self.to_v = nn.Linear(dim, inner_dim, bias=False)
-        self.dropout = nn.Dropout(dropout)
-
-        # talking heads
-        self.talking_heads = talking_heads
-        if talking_heads:
-            self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads))
-            self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads))
-
-        # explicit topk sparse attention
-        self.sparse_topk = sparse_topk
-
-        # entmax
-        #self.attn_fn = entmax15 if use_entmax15 else F.softmax
-        self.attn_fn = F.softmax
-
-        # add memory key / values
-        self.num_mem_kv = num_mem_kv
-        if num_mem_kv > 0:
-            self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
-            self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
-
-        # attention on attention
-        self.attn_on_attn = on_attn
-        self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim)
-
-    def forward(
-            self,
-            x,
-            context=None,
-            mask=None,
-            context_mask=None,
-            rel_pos=None,
-            sinusoidal_emb=None,
-            prev_attn=None,
-            mem=None
-    ):
-        b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device
-        kv_input = default(context, x)
-
-        q_input = x
-        k_input = kv_input
-        v_input = kv_input
-
-        if exists(mem):
-            k_input = torch.cat((mem, k_input), dim=-2)
-            v_input = torch.cat((mem, v_input), dim=-2)
-
-        if exists(sinusoidal_emb):
-            # in shortformer, the query would start at a position offset depending on the past cached memory
-            offset = k_input.shape[-2] - q_input.shape[-2]
-            q_input = q_input + sinusoidal_emb(q_input, offset=offset)
-            k_input = k_input + sinusoidal_emb(k_input)
-
-        q = self.to_q(q_input)
-        k = self.to_k(k_input)
-        v = self.to_v(v_input)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
-
-        input_mask = None
-        if any(map(exists, (mask, context_mask))):
-            q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
-            k_mask = q_mask if not exists(context) else context_mask
-            k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
-            q_mask = rearrange(q_mask, 'b i -> b () i ()')
-            k_mask = rearrange(k_mask, 'b j -> b () () j')
-            input_mask = q_mask * k_mask
-
-        if self.num_mem_kv > 0:
-            mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v))
-            k = torch.cat((mem_k, k), dim=-2)
-            v = torch.cat((mem_v, v), dim=-2)
-            if exists(input_mask):
-                input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)
-
-        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
-        mask_value = max_neg_value(dots)
-
-        if exists(prev_attn):
-            dots = dots + prev_attn
-
-        pre_softmax_attn = dots
-
-        if talking_heads:
-            dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous()
-
-        if exists(rel_pos):
-            dots = rel_pos(dots)
-
-        if exists(input_mask):
-            dots.masked_fill_(~input_mask, mask_value)
-            del input_mask
-
-        if self.causal:
-            i, j = dots.shape[-2:]
-            r = torch.arange(i, device=device)
-            mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j')
-            mask = F.pad(mask, (j - i, 0), value=False)
-            dots.masked_fill_(mask, mask_value)
-            del mask
-
-        if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]:
-            top, _ = dots.topk(self.sparse_topk, dim=-1)
-            vk = top[..., -1].unsqueeze(-1).expand_as(dots)
-            mask = dots < vk
-            dots.masked_fill_(mask, mask_value)
-            del mask
-
-        attn = self.attn_fn(dots, dim=-1)
-        post_softmax_attn = attn
-
-        attn = self.dropout(attn)
-
-        if talking_heads:
-            attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous()
-
-        out = einsum('b h i j, b h j d -> b h i d', attn, v)
-        out = rearrange(out, 'b h n d -> b n (h d)')
-
-        intermediates = Intermediates(
-            pre_softmax_attn=pre_softmax_attn,
-            post_softmax_attn=post_softmax_attn
-        )
-
-        return self.to_out(out), intermediates
-
-
-class AttentionLayers(nn.Module):
-    def __init__(
-            self,
-            dim,
-            depth,
-            heads=8,
-            causal=False,
-            cross_attend=False,
-            only_cross=False,
-            use_scalenorm=False,
-            use_rmsnorm=False,
-            use_rezero=False,
-            rel_pos_num_buckets=32,
-            rel_pos_max_distance=128,
-            position_infused_attn=False,
-            custom_layers=None,
-            sandwich_coef=None,
-            par_ratio=None,
-            residual_attn=False,
-            cross_residual_attn=False,
-            macaron=False,
-            pre_norm=True,
-            gate_residual=False,
-            **kwargs
-    ):
-        super().__init__()
-        ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
-        attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs)
-
-        dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD)
-
-        self.dim = dim
-        self.depth = depth
-        self.layers = nn.ModuleList([])
-
-        self.has_pos_emb = position_infused_attn
-        self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
-        self.rotary_pos_emb = always(None)
-
-        assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
-        self.rel_pos = None
-
-        self.pre_norm = pre_norm
-
-        self.residual_attn = residual_attn
-        self.cross_residual_attn = cross_residual_attn
-
-        norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm
-        norm_class = RMSNorm if use_rmsnorm else norm_class
-        norm_fn = partial(norm_class, dim)
-
-        norm_fn = nn.Identity if use_rezero else norm_fn
-        branch_fn = Rezero if use_rezero else None
-
-        if cross_attend and not only_cross:
-            default_block = ('a', 'c', 'f')
-        elif cross_attend and only_cross:
-            default_block = ('c', 'f')
-        else:
-            default_block = ('a', 'f')
-
-        if macaron:
-            default_block = ('f',) + default_block
-
-        if exists(custom_layers):
-            layer_types = custom_layers
-        elif exists(par_ratio):
-            par_depth = depth * len(default_block)
-            assert 1 < par_ratio <= par_depth, 'par ratio out of range'
-            default_block = tuple(filter(not_equals('f'), default_block))
-            par_attn = par_depth // par_ratio
-            depth_cut = par_depth * 2 // 3  # 2 / 3 attention layer cutoff suggested by PAR paper
-            par_width = (depth_cut + depth_cut // par_attn) // par_attn
-            assert len(default_block) <= par_width, 'default block is too large for par_ratio'
-            par_block = default_block + ('f',) * (par_width - len(default_block))
-            par_head = par_block * par_attn
-            layer_types = par_head + ('f',) * (par_depth - len(par_head))
-        elif exists(sandwich_coef):
-            assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
-            layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
-        else:
-            layer_types = default_block * depth
-
-        self.layer_types = layer_types
-        self.num_attn_layers = len(list(filter(equals('a'), layer_types)))
-
-        for layer_type in self.layer_types:
-            if layer_type == 'a':
-                layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
-            elif layer_type == 'c':
-                layer = Attention(dim, heads=heads, **attn_kwargs)
-            elif layer_type == 'f':
-                layer = FeedForward(dim, **ff_kwargs)
-                layer = layer if not macaron else Scale(0.5, layer)
-            else:
-                raise Exception(f'invalid layer type {layer_type}')
-
-            if isinstance(layer, Attention) and exists(branch_fn):
-                layer = branch_fn(layer)
-
-            if gate_residual:
-                residual_fn = GRUGating(dim)
-            else:
-                residual_fn = Residual()
-
-            self.layers.append(nn.ModuleList([
-                norm_fn(),
-                layer,
-                residual_fn
-            ]))
-
-    def forward(
-            self,
-            x,
-            context=None,
-            mask=None,
-            context_mask=None,
-            mems=None,
-            return_hiddens=False
-    ):
-        hiddens = []
-        intermediates = []
-        prev_attn = None
-        prev_cross_attn = None
-
-        mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
-
-        for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
-            is_last = ind == (len(self.layers) - 1)
-
-            if layer_type == 'a':
-                hiddens.append(x)
-                layer_mem = mems.pop(0)
-
-            residual = x
-
-            if self.pre_norm:
-                x = norm(x)
-
-            if layer_type == 'a':
-                out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos,
-                                   prev_attn=prev_attn, mem=layer_mem)
-            elif layer_type == 'c':
-                out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn)
-            elif layer_type == 'f':
-                out = block(x)
-
-            x = residual_fn(out, residual)
-
-            if layer_type in ('a', 'c'):
-                intermediates.append(inter)
-
-            if layer_type == 'a' and self.residual_attn:
-                prev_attn = inter.pre_softmax_attn
-            elif layer_type == 'c' and self.cross_residual_attn:
-                prev_cross_attn = inter.pre_softmax_attn
-
-            if not self.pre_norm and not is_last:
-                x = norm(x)
-
-        if return_hiddens:
-            intermediates = LayerIntermediates(
-                hiddens=hiddens,
-                attn_intermediates=intermediates
-            )
-
-            return x, intermediates
-
-        return x
-
-
-class Encoder(AttentionLayers):
-    def __init__(self, **kwargs):
-        assert 'causal' not in kwargs, 'cannot set causality on encoder'
-        super().__init__(causal=False, **kwargs)
-
-
-
-class TransformerWrapper(nn.Module):
-    def __init__(
-            self,
-            *,
-            num_tokens,
-            max_seq_len,
-            attn_layers,
-            emb_dim=None,
-            max_mem_len=0.,
-            emb_dropout=0.,
-            num_memory_tokens=None,
-            tie_embedding=False,
-            use_pos_emb=True
-    ):
-        super().__init__()
-        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'
-
-        dim = attn_layers.dim
-        emb_dim = default(emb_dim, dim)
-
-        self.max_seq_len = max_seq_len
-        self.max_mem_len = max_mem_len
-        self.num_tokens = num_tokens
-
-        self.token_emb = nn.Embedding(num_tokens, emb_dim)
-        self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
-                    use_pos_emb and not attn_layers.has_pos_emb) else always(0)
-        self.emb_dropout = nn.Dropout(emb_dropout)
-
-        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
-        self.attn_layers = attn_layers
-        self.norm = nn.LayerNorm(dim)
-
-        self.init_()
-
-        self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()
-
-        # memory tokens (like [cls]) from Memory Transformers paper
-        num_memory_tokens = default(num_memory_tokens, 0)
-        self.num_memory_tokens = num_memory_tokens
-        if num_memory_tokens > 0:
-            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
-
-            # let funnel encoder know number of memory tokens, if specified
-            if hasattr(attn_layers, 'num_memory_tokens'):
-                attn_layers.num_memory_tokens = num_memory_tokens
-
-    def init_(self):
-        nn.init.normal_(self.token_emb.weight, std=0.02)
-
-    def forward(
-            self,
-            x,
-            return_embeddings=False,
-            mask=None,
-            return_mems=False,
-            return_attn=False,
-            mems=None,
-            **kwargs
-    ):
-        b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
-        x = self.token_emb(x)
-        x += self.pos_emb(x)
-        x = self.emb_dropout(x)
-
-        x = self.project_emb(x)
-
-        if num_mem > 0:
-            mem = repeat(self.memory_tokens, 'n d -> b n d', b=b)
-            x = torch.cat((mem, x), dim=1)
-
-            # auto-handle masking after appending memory tokens
-            if exists(mask):
-                mask = F.pad(mask, (num_mem, 0), value=True)
-
-        x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
-        x = self.norm(x)
-
-        mem, x = x[:, :num_mem], x[:, num_mem:]
-
-        out = self.to_logits(x) if not return_embeddings else x
-
-        if return_mems:
-            hiddens = intermediates.hiddens
-            new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens
-            new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems))
-            return out, new_mems
-
-        if return_attn:
-            attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
-            return out, attn_maps
-
-        return out
-
diff --git a/examples/tutorial/stable_diffusion/ldm/util.py b/examples/tutorial/stable_diffusion/ldm/util.py
deleted file mode 100644
index 8ba38853e7a0..000000000000
--- a/examples/tutorial/stable_diffusion/ldm/util.py
+++ /dev/null
@@ -1,203 +0,0 @@
-import importlib
-
-import torch
-import numpy as np
-from collections import abc
-from einops import rearrange
-from functools import partial
-
-import multiprocessing as mp
-from threading import Thread
-from queue import Queue
-
-from inspect import isfunction
-from PIL import Image, ImageDraw, ImageFont
-
-
-def log_txt_as_img(wh, xc, size=10):
-    # wh a tuple of (width, height)
-    # xc a list of captions to plot
-    b = len(xc)
-    txts = list()
-    for bi in range(b):
-        txt = Image.new("RGB", wh, color="white")
-        draw = ImageDraw.Draw(txt)
-        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
-        nc = int(40 * (wh[0] / 256))
-        lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
-
-        try:
-            draw.text((0, 0), lines, fill="black", font=font)
-        except UnicodeEncodeError:
-            print("Cant encode string for logging. Skipping.")
-
-        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
-        txts.append(txt)
-    txts = np.stack(txts)
-    txts = torch.tensor(txts)
-    return txts
-
-
-def ismap(x):
-    if not isinstance(x, torch.Tensor):
-        return False
-    return (len(x.shape) == 4) and (x.shape[1] > 3)
-
-
-def isimage(x):
-    if not isinstance(x, torch.Tensor):
-        return False
-    return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
-
-
-def exists(x):
-    return x is not None
-
-
-def default(val, d):
-    if exists(val):
-        return val
-    return d() if isfunction(d) else d
-
-
-def mean_flat(tensor):
-    """
-    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
-    Take the mean over all non-batch dimensions.
-    """
-    return tensor.mean(dim=list(range(1, len(tensor.shape))))
-
-
-def count_params(model, verbose=False):
-    total_params = sum(p.numel() for p in model.parameters())
-    if verbose:
-        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
-    return total_params
-
-
-def instantiate_from_config(config):
-    if not "target" in config:
-        if config == '__is_first_stage__':
-            return None
-        elif config == "__is_unconditional__":
-            return None
-        raise KeyError("Expected key `target` to instantiate.")
-    return get_obj_from_str(config["target"])(**config.get("params", dict()))
-
-
-def get_obj_from_str(string, reload=False):
-    module, cls = string.rsplit(".", 1)
-    if reload:
-        module_imp = importlib.import_module(module)
-        importlib.reload(module_imp)
-    return getattr(importlib.import_module(module, package=None), cls)
-
-
-def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False):
-    # create dummy dataset instance
-
-    # run prefetching
-    if idx_to_fn:
-        res = func(data, worker_id=idx)
-    else:
-        res = func(data)
-    Q.put([idx, res])
-    Q.put("Done")
-
-
-def parallel_data_prefetch(
-        func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False
-):
-    # if target_data_type not in ["ndarray", "list"]:
-    #     raise ValueError(
-    #         "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray."
-    #     )
-    if isinstance(data, np.ndarray) and target_data_type == "list":
-        raise ValueError("list expected but function got ndarray.")
-    elif isinstance(data, abc.Iterable):
-        if isinstance(data, dict):
-            print(
-                f'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.'
-            )
-            data = list(data.values())
-        if target_data_type == "ndarray":
-            data = np.asarray(data)
-        else:
-            data = list(data)
-    else:
-        raise TypeError(
-            f"The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}."
-        )
-
-    if cpu_intensive:
-        Q = mp.Queue(1000)
-        proc = mp.Process
-    else:
-        Q = Queue(1000)
-        proc = Thread
-    # spawn processes
-    if target_data_type == "ndarray":
-        arguments = [
-            [func, Q, part, i, use_worker_id]
-            for i, part in enumerate(np.array_split(data, n_proc))
-        ]
-    else:
-        step = (
-            int(len(data) / n_proc + 1)
-            if len(data) % n_proc != 0
-            else int(len(data) / n_proc)
-        )
-        arguments = [
-            [func, Q, part, i, use_worker_id]
-            for i, part in enumerate(
-                [data[i: i + step] for i in range(0, len(data), step)]
-            )
-        ]
-    processes = []
-    for i in range(n_proc):
-        p = proc(target=_do_parallel_data_prefetch, args=arguments[i])
-        processes += [p]
-
-    # start processes
-    print(f"Start prefetching...")
-    import time
-
-    start = time.time()
-    gather_res = [[] for _ in range(n_proc)]
-    try:
-        for p in processes:
-            p.start()
-
-        k = 0
-        while k < n_proc:
-            # get result
-            res = Q.get()
-            if res == "Done":
-                k += 1
-            else:
-                gather_res[res[0]] = res[1]
-
-    except Exception as e:
-        print("Exception: ", e)
-        for p in processes:
-            p.terminate()
-
-        raise e
-    finally:
-        for p in processes:
-            p.join()
-        print(f"Prefetching complete. [{time.time() - start} sec.]")
-
-    if target_data_type == 'ndarray':
-        if not isinstance(gather_res[0], np.ndarray):
-            return np.concatenate([np.asarray(r) for r in gather_res], axis=0)
-
-        # order outputs
-        return np.concatenate(gather_res, axis=0)
-    elif target_data_type == 'list':
-        out = []
-        for r in gather_res:
-            out.extend(r)
-        return out
-    else:
-        return gather_res
diff --git a/examples/tutorial/stable_diffusion/main.py b/examples/tutorial/stable_diffusion/main.py
deleted file mode 100644
index 7cd00e4c0c26..000000000000
--- a/examples/tutorial/stable_diffusion/main.py
+++ /dev/null
@@ -1,830 +0,0 @@
-import argparse, os, sys, datetime, glob, importlib, csv
-import numpy as np
-import time
-import torch
-import torchvision
-import pytorch_lightning as pl
-
-from packaging import version
-from omegaconf import OmegaConf
-from torch.utils.data import random_split, DataLoader, Dataset, Subset
-from functools import partial
-from PIL import Image
-# from pytorch_lightning.strategies.colossalai import ColossalAIStrategy
-# from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.nn.optimizer import HybridAdam
-from prefetch_generator import BackgroundGenerator
-
-from pytorch_lightning import seed_everything
-from pytorch_lightning.trainer import Trainer
-from pytorch_lightning.callbacks import ModelCheckpoint, Callback, LearningRateMonitor
-from pytorch_lightning.utilities.rank_zero import rank_zero_only
-from pytorch_lightning.utilities import rank_zero_info
-from diffusers.models.unet_2d import UNet2DModel
-
-from clip.model import Bottleneck
-from transformers.models.clip.modeling_clip import CLIPTextTransformer
-
-from ldm.data.base import Txt2ImgIterableBaseDataset
-from ldm.util import instantiate_from_config
-import clip
-from einops import rearrange, repeat
-from transformers import CLIPTokenizer, CLIPTextModel
-import kornia
-
-from ldm.modules.x_transformer import *
-from ldm.modules.encoders.modules import *
-from taming.modules.diffusionmodules.model import ResnetBlock
-from taming.modules.transformer.mingpt import *
-from taming.modules.transformer.permuter import *
-
-
-from ldm.modules.ema import LitEma
-from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
-from ldm.models.autoencoder import AutoencoderKL
-from ldm.models.autoencoder import *
-from ldm.models.diffusion.ddim import *
-from ldm.modules.diffusionmodules.openaimodel import *
-from ldm.modules.diffusionmodules.model import *
-from ldm.modules.diffusionmodules.model import Decoder, Encoder, Up_module, Down_module, Mid_module, temb_module
-from ldm.modules.attention import enable_flash_attention
-
-class DataLoaderX(DataLoader):
-
-    def __iter__(self):
-        return BackgroundGenerator(super().__iter__())
-
-
-def get_parser(**parser_kwargs):
-    def str2bool(v):
-        if isinstance(v, bool):
-            return v
-        if v.lower() in ("yes", "true", "t", "y", "1"):
-            return True
-        elif v.lower() in ("no", "false", "f", "n", "0"):
-            return False
-        else:
-            raise argparse.ArgumentTypeError("Boolean value expected.")
-
-    parser = argparse.ArgumentParser(**parser_kwargs)
-    parser.add_argument(
-        "-n",
-        "--name",
-        type=str,
-        const=True,
-        default="",
-        nargs="?",
-        help="postfix for logdir",
-    )
-    parser.add_argument(
-        "-r",
-        "--resume",
-        type=str,
-        const=True,
-        default="",
-        nargs="?",
-        help="resume from logdir or checkpoint in logdir",
-    )
-    parser.add_argument(
-        "-b",
-        "--base",
-        nargs="*",
-        metavar="base_config.yaml",
-        help="paths to base configs. Loaded from left-to-right. "
-             "Parameters can be overwritten or added with command-line options of the form `--key value`.",
-        default=list(),
-    )
-    parser.add_argument(
-        "-t",
-        "--train",
-        type=str2bool,
-        const=True,
-        default=False,
-        nargs="?",
-        help="train",
-    )
-    parser.add_argument(
-        "--no-test",
-        type=str2bool,
-        const=True,
-        default=False,
-        nargs="?",
-        help="disable test",
-    )
-    parser.add_argument(
-        "-p",
-        "--project",
-        help="name of new or path to existing project"
-    )
-    parser.add_argument(
-        "-d",
-        "--debug",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=False,
-        help="enable post-mortem debugging",
-    )
-    parser.add_argument(
-        "-s",
-        "--seed",
-        type=int,
-        default=23,
-        help="seed for seed_everything",
-    )
-    parser.add_argument(
-        "-f",
-        "--postfix",
-        type=str,
-        default="",
-        help="post-postfix for default name",
-    )
-    parser.add_argument(
-        "-l",
-        "--logdir",
-        type=str,
-        default="logs",
-        help="directory for logging dat shit",
-    )
-    parser.add_argument(
-        "--scale_lr",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=True,
-        help="scale base-lr by ngpu * batch_size * n_accumulate",
-    )
-    parser.add_argument(
-        "--use_fp16",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=True,
-        help="whether to use fp16",
-    )
-    parser.add_argument(
-        "--flash",
-        type=str2bool,
-        const=True,
-        default=False,
-        nargs="?",
-        help="whether to use flash attention",
-    )
-    return parser
-
-
-def nondefault_trainer_args(opt):
-    parser = argparse.ArgumentParser()
-    parser = Trainer.add_argparse_args(parser)
-    args = parser.parse_args([])
-    return sorted(k for k in vars(args) if getattr(opt, k) != getattr(args, k))
-
-
-class WrappedDataset(Dataset):
-    """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset"""
-
-    def __init__(self, dataset):
-        self.data = dataset
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, idx):
-        return self.data[idx]
-
-
-def worker_init_fn(_):
-    worker_info = torch.utils.data.get_worker_info()
-
-    dataset = worker_info.dataset
-    worker_id = worker_info.id
-
-    if isinstance(dataset, Txt2ImgIterableBaseDataset):
-        split_size = dataset.num_records // worker_info.num_workers
-        # reset num_records to the true number to retain reliable length information
-        dataset.sample_ids = dataset.valid_ids[worker_id * split_size:(worker_id + 1) * split_size]
-        current_id = np.random.choice(len(np.random.get_state()[1]), 1)
-        return np.random.seed(np.random.get_state()[1][current_id] + worker_id)
-    else:
-        return np.random.seed(np.random.get_state()[1][0] + worker_id)
-
-
-class DataModuleFromConfig(pl.LightningDataModule):
-    def __init__(self, batch_size, train=None, validation=None, test=None, predict=None,
-                 wrap=False, num_workers=None, shuffle_test_loader=False, use_worker_init_fn=False,
-                 shuffle_val_dataloader=False):
-        super().__init__()
-        self.batch_size = batch_size
-        self.dataset_configs = dict()
-        self.num_workers = num_workers if num_workers is not None else batch_size * 2
-        self.use_worker_init_fn = use_worker_init_fn
-        if train is not None:
-            self.dataset_configs["train"] = train
-            self.train_dataloader = self._train_dataloader
-        if validation is not None:
-            self.dataset_configs["validation"] = validation
-            self.val_dataloader = partial(self._val_dataloader, shuffle=shuffle_val_dataloader)
-        if test is not None:
-            self.dataset_configs["test"] = test
-            self.test_dataloader = partial(self._test_dataloader, shuffle=shuffle_test_loader)
-        if predict is not None:
-            self.dataset_configs["predict"] = predict
-            self.predict_dataloader = self._predict_dataloader
-        self.wrap = wrap
-
-    def prepare_data(self):
-        for data_cfg in self.dataset_configs.values():
-            instantiate_from_config(data_cfg)
-
-    def setup(self, stage=None):
-        self.datasets = dict(
-            (k, instantiate_from_config(self.dataset_configs[k]))
-            for k in self.dataset_configs)
-        if self.wrap:
-            for k in self.datasets:
-                self.datasets[k] = WrappedDataset(self.datasets[k])
-
-    def _train_dataloader(self):
-        is_iterable_dataset = isinstance(self.datasets['train'], Txt2ImgIterableBaseDataset)
-        if is_iterable_dataset or self.use_worker_init_fn:
-            init_fn = worker_init_fn
-        else:
-            init_fn = None
-        return DataLoaderX(self.datasets["train"], batch_size=self.batch_size,
-                          num_workers=self.num_workers, shuffle=False if is_iterable_dataset else True,
-                          worker_init_fn=init_fn)
-
-    def _val_dataloader(self, shuffle=False):
-        if isinstance(self.datasets['validation'], Txt2ImgIterableBaseDataset) or self.use_worker_init_fn:
-            init_fn = worker_init_fn
-        else:
-            init_fn = None
-        return DataLoaderX(self.datasets["validation"],
-                          batch_size=self.batch_size,
-                          num_workers=self.num_workers,
-                          worker_init_fn=init_fn,
-                          shuffle=shuffle)
-
-    def _test_dataloader(self, shuffle=False):
-        is_iterable_dataset = isinstance(self.datasets['train'], Txt2ImgIterableBaseDataset)
-        if is_iterable_dataset or self.use_worker_init_fn:
-            init_fn = worker_init_fn
-        else:
-            init_fn = None
-
-        # do not shuffle dataloader for iterable dataset
-        shuffle = shuffle and (not is_iterable_dataset)
-
-        return DataLoaderX(self.datasets["test"], batch_size=self.batch_size,
-                          num_workers=self.num_workers, worker_init_fn=init_fn, shuffle=shuffle)
-
-    def _predict_dataloader(self, shuffle=False):
-        if isinstance(self.datasets['predict'], Txt2ImgIterableBaseDataset) or self.use_worker_init_fn:
-            init_fn = worker_init_fn
-        else:
-            init_fn = None
-        return DataLoaderX(self.datasets["predict"], batch_size=self.batch_size,
-                          num_workers=self.num_workers, worker_init_fn=init_fn)
-
-
-class SetupCallback(Callback):
-    def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config):
-        super().__init__()
-        self.resume = resume
-        self.now = now
-        self.logdir = logdir
-        self.ckptdir = ckptdir
-        self.cfgdir = cfgdir
-        self.config = config
-        self.lightning_config = lightning_config
-
-    def on_keyboard_interrupt(self, trainer, pl_module):
-        if trainer.global_rank == 0:
-            print("Summoning checkpoint.")
-            ckpt_path = os.path.join(self.ckptdir, "last.ckpt")
-            trainer.save_checkpoint(ckpt_path)
-
-    # def on_pretrain_routine_start(self, trainer, pl_module):
-    def on_fit_start(self, trainer, pl_module):
-        if trainer.global_rank == 0:
-            # Create logdirs and save configs
-            os.makedirs(self.logdir, exist_ok=True)
-            os.makedirs(self.ckptdir, exist_ok=True)
-            os.makedirs(self.cfgdir, exist_ok=True)
-
-            if "callbacks" in self.lightning_config:
-                if 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']:
-                    os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True)
-            print("Project config")
-            print(OmegaConf.to_yaml(self.config))
-            OmegaConf.save(self.config,
-                           os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))
-
-            print("Lightning config")
-            print(OmegaConf.to_yaml(self.lightning_config))
-            OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
-                           os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))
-
-        else:
-            # ModelCheckpoint callback created log directory --- remove it
-            if not self.resume and os.path.exists(self.logdir):
-                dst, name = os.path.split(self.logdir)
-                dst = os.path.join(dst, "child_runs", name)
-                os.makedirs(os.path.split(dst)[0], exist_ok=True)
-                try:
-                    os.rename(self.logdir, dst)
-                except FileNotFoundError:
-                    pass
-
-
-class ImageLogger(Callback):
-    def __init__(self, batch_frequency, max_images, clamp=True, increase_log_steps=True,
-                 rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False,
-                 log_images_kwargs=None):
-        super().__init__()
-        self.rescale = rescale
-        self.batch_freq = batch_frequency
-        self.max_images = max_images
-        self.logger_log_images = {
-            pl.loggers.CSVLogger: self._testtube,
-        }
-        self.log_steps = [2 ** n for n in range(int(np.log2(self.batch_freq)) + 1)]
-        if not increase_log_steps:
-            self.log_steps = [self.batch_freq]
-        self.clamp = clamp
-        self.disabled = disabled
-        self.log_on_batch_idx = log_on_batch_idx
-        self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
-        self.log_first_step = log_first_step
-
-    @rank_zero_only
-    def _testtube(self, pl_module, images, batch_idx, split):
-        for k in images:
-            grid = torchvision.utils.make_grid(images[k])
-            grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
-
-            tag = f"{split}/{k}"
-            pl_module.logger.experiment.add_image(
-                tag, grid,
-                global_step=pl_module.global_step)
-
-    @rank_zero_only
-    def log_local(self, save_dir, split, images,
-                  global_step, current_epoch, batch_idx):
-        root = os.path.join(save_dir, "images", split)
-        for k in images:
-            grid = torchvision.utils.make_grid(images[k], nrow=4)
-            if self.rescale:
-                grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
-            grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
-            grid = grid.numpy()
-            grid = (grid * 255).astype(np.uint8)
-            filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(
-                k,
-                global_step,
-                current_epoch,
-                batch_idx)
-            path = os.path.join(root, filename)
-            os.makedirs(os.path.split(path)[0], exist_ok=True)
-            Image.fromarray(grid).save(path)
-
-    def log_img(self, pl_module, batch, batch_idx, split="train"):
-        check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step
-        if (self.check_frequency(check_idx) and  # batch_idx % self.batch_freq == 0
-                hasattr(pl_module, "log_images") and
-                callable(pl_module.log_images) and
-                self.max_images > 0):
-            logger = type(pl_module.logger)
-
-            is_train = pl_module.training
-            if is_train:
-                pl_module.eval()
-
-            with torch.no_grad():
-                images = pl_module.log_images(batch, split=split, **self.log_images_kwargs)
-
-            for k in images:
-                N = min(images[k].shape[0], self.max_images)
-                images[k] = images[k][:N]
-                if isinstance(images[k], torch.Tensor):
-                    images[k] = images[k].detach().cpu()
-                    if self.clamp:
-                        images[k] = torch.clamp(images[k], -1., 1.)
-
-            self.log_local(pl_module.logger.save_dir, split, images,
-                           pl_module.global_step, pl_module.current_epoch, batch_idx)
-
-            logger_log_images = self.logger_log_images.get(logger, lambda *args, **kwargs: None)
-            logger_log_images(pl_module, images, pl_module.global_step, split)
-
-            if is_train:
-                pl_module.train()
-
-    def check_frequency(self, check_idx):
-        if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and (
-                check_idx > 0 or self.log_first_step):
-            try:
-                self.log_steps.pop(0)
-            except IndexError as e:
-                print(e)
-                pass
-            return True
-        return False
-
-    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
-        # if not self.disabled and (pl_module.global_step > 0 or self.log_first_step):
-        #     self.log_img(pl_module, batch, batch_idx, split="train")
-        pass
-
-    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
-        if not self.disabled and pl_module.global_step > 0:
-            self.log_img(pl_module, batch, batch_idx, split="val")
-        if hasattr(pl_module, 'calibrate_grad_norm'):
-            if (pl_module.calibrate_grad_norm and batch_idx % 25 == 0) and batch_idx > 0:
-                self.log_gradients(trainer, pl_module, batch_idx=batch_idx)
-
-
-class CUDACallback(Callback):
-    # see https://github.com/SeanNaren/minGPT/blob/master/mingpt/callback.py
-
-    def on_train_start(self, trainer, pl_module):
-        rank_zero_info("Training is starting")
-
-    def on_train_end(self, trainer, pl_module):
-        rank_zero_info("Training is ending")
-
-    def on_train_epoch_start(self, trainer, pl_module):
-        # Reset the memory use counter
-        torch.cuda.reset_peak_memory_stats(trainer.strategy.root_device.index)
-        torch.cuda.synchronize(trainer.strategy.root_device.index)
-        self.start_time = time.time()
-
-    def on_train_epoch_end(self, trainer, pl_module):
-        torch.cuda.synchronize(trainer.strategy.root_device.index)
-        max_memory = torch.cuda.max_memory_allocated(trainer.strategy.root_device.index) / 2 ** 20
-        epoch_time = time.time() - self.start_time
-
-        try:
-            max_memory = trainer.strategy.reduce(max_memory)
-            epoch_time = trainer.strategy.reduce(epoch_time)
-
-            rank_zero_info(f"Average Epoch time: {epoch_time:.2f} seconds")
-            rank_zero_info(f"Average Peak memory {max_memory:.2f}MiB")
-        except AttributeError:
-            pass
-
-
-if __name__ == "__main__":
-    # custom parser to specify config files, train, test and debug mode,
-    # postfix, resume.
-    # `--key value` arguments are interpreted as arguments to the trainer.
-    # `nested.key=value` arguments are interpreted as config parameters.
-    # configs are merged from left-to-right followed by command line parameters.
-
-    # model:
-    #   base_learning_rate: float
-    #   target: path to lightning module
-    #   params:
-    #       key: value
-    # data:
-    #   target: main.DataModuleFromConfig
-    #   params:
-    #      batch_size: int
-    #      wrap: bool
-    #      train:
-    #          target: path to train dataset
-    #          params:
-    #              key: value
-    #      validation:
-    #          target: path to validation dataset
-    #          params:
-    #              key: value
-    #      test:
-    #          target: path to test dataset
-    #          params:
-    #              key: value
-    # lightning: (optional, has sane defaults and can be specified on cmdline)
-    #   trainer:
-    #       additional arguments to trainer
-    #   logger:
-    #       logger to instantiate
-    #   modelcheckpoint:
-    #       modelcheckpoint to instantiate
-    #   callbacks:
-    #       callback1:
-    #           target: importpath
-    #           params:
-    #               key: value
-
-    now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
-
-    # add cwd for convenience and to make classes in this file available when
-    # running as `python main.py`
-    # (in particular `main.DataModuleFromConfig`)
-    sys.path.append(os.getcwd())
-
-    parser = get_parser()
-    parser = Trainer.add_argparse_args(parser)
-
-    opt, unknown = parser.parse_known_args()
-    if opt.name and opt.resume:
-        raise ValueError(
-            "-n/--name and -r/--resume cannot be specified both."
-            "If you want to resume training in a new log folder, "
-            "use -n/--name in combination with --resume_from_checkpoint"
-        )
-    if opt.flash:
-        enable_flash_attention()
-    if opt.resume:
-        if not os.path.exists(opt.resume):
-            raise ValueError("Cannot find {}".format(opt.resume))
-        if os.path.isfile(opt.resume):
-            paths = opt.resume.split("/")
-            # idx = len(paths)-paths[::-1].index("logs")+1
-            # logdir = "/".join(paths[:idx])
-            logdir = "/".join(paths[:-2])
-            ckpt = opt.resume
-        else:
-            assert os.path.isdir(opt.resume), opt.resume
-            logdir = opt.resume.rstrip("/")
-            ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
-
-        opt.resume_from_checkpoint = ckpt
-        base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml")))
-        opt.base = base_configs + opt.base
-        _tmp = logdir.split("/")
-        nowname = _tmp[-1]
-    else:
-        if opt.name:
-            name = "_" + opt.name
-        elif opt.base:
-            cfg_fname = os.path.split(opt.base[0])[-1]
-            cfg_name = os.path.splitext(cfg_fname)[0]
-            name = "_" + cfg_name
-        else:
-            name = ""
-        nowname = now + name + opt.postfix
-        logdir = os.path.join(opt.logdir, nowname)
-
-    ckptdir = os.path.join(logdir, "checkpoints")
-    cfgdir = os.path.join(logdir, "configs")
-    seed_everything(opt.seed)
-
-    try:
-        # init and save configs
-        configs = [OmegaConf.load(cfg) for cfg in opt.base]
-        cli = OmegaConf.from_dotlist(unknown)
-        config = OmegaConf.merge(*configs, cli)
-        lightning_config = config.pop("lightning", OmegaConf.create())
-        # merge trainer cli with config
-        trainer_config = lightning_config.get("trainer", OmegaConf.create())
-  
-        for k in nondefault_trainer_args(opt):
-            trainer_config[k] = getattr(opt, k)
-
-        print(trainer_config)
-        if not trainer_config["accelerator"] == "gpu":
-            del trainer_config["accelerator"]
-            cpu = True
-            print("Running on CPU")
-        else:
-            cpu = False
-            print("Running on GPU")
-        trainer_opt = argparse.Namespace(**trainer_config)
-        lightning_config.trainer = trainer_config
-
-        # model
-        use_fp16 = trainer_config.get("precision", 32) == 16
-        if use_fp16:
-            config.model["params"].update({"use_fp16": True})
-            print("Using FP16 = {}".format(config.model["params"]["use_fp16"]))
-        else:
-            config.model["params"].update({"use_fp16": False})
-            print("Using FP16 = {}".format(config.model["params"]["use_fp16"]))
-        
-        model = instantiate_from_config(config.model)
-        # trainer and callbacks
-        trainer_kwargs = dict()
-
-        # config the logger
-        # default logger configs
-        default_logger_cfgs = {
-            "wandb": {
-                "target": "pytorch_lightning.loggers.WandbLogger",
-                "params": {
-                    "name": nowname,
-                    "save_dir": logdir,
-                    "offline": opt.debug,
-                    "id": nowname,
-                }
-            },
-            "tensorboard":{
-                "target": "pytorch_lightning.loggers.TensorBoardLogger",
-                "params":{
-                    "save_dir": logdir,
-                    "name": "diff_tb",
-                    "log_graph": True
-                }
-            }
-        }
-
-        default_logger_cfg = default_logger_cfgs["tensorboard"]
-        if "logger" in lightning_config:
-            logger_cfg = lightning_config.logger
-        else:
-            logger_cfg = default_logger_cfg
-        logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg)
-        trainer_kwargs["logger"] = instantiate_from_config(logger_cfg)
-
-        # config the strategy, defualt is ddp
-        if "strategy" in trainer_config:
-            strategy_cfg = trainer_config["strategy"]
-            print("Using strategy: {}".format(strategy_cfg["target"]))
-        else:
-            strategy_cfg = {
-                "target": "pytorch_lightning.strategies.DDPStrategy",
-                "params": {
-                    "find_unused_parameters": False
-                }
-            }
-            print("Using strategy: DDPStrategy")
-
-        trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg)
-
-        # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to
-        # specify which metric is used to determine best models
-        default_modelckpt_cfg = {
-            "target": "pytorch_lightning.callbacks.ModelCheckpoint",
-            "params": {
-                "dirpath": ckptdir,
-                "filename": "{epoch:06}",
-                "verbose": True,
-                "save_last": True,
-            }
-        }
-        if hasattr(model, "monitor"):
-            print(f"Monitoring {model.monitor} as checkpoint metric.")
-            default_modelckpt_cfg["params"]["monitor"] = model.monitor
-            default_modelckpt_cfg["params"]["save_top_k"] = 3
-
-        if "modelcheckpoint" in lightning_config:
-            modelckpt_cfg = lightning_config.modelcheckpoint
-        else:
-            modelckpt_cfg =  OmegaConf.create()
-        modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
-        print(f"Merged modelckpt-cfg: \n{modelckpt_cfg}")
-        if version.parse(pl.__version__) < version.parse('1.4.0'):
-            trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg)
-
-        # add callback which sets up log directory
-        default_callbacks_cfg = {
-            "setup_callback": {
-                "target": "main.SetupCallback",
-                "params": {
-                    "resume": opt.resume,
-                    "now": now,
-                    "logdir": logdir,
-                    "ckptdir": ckptdir,
-                    "cfgdir": cfgdir,
-                    "config": config,
-                    "lightning_config": lightning_config,
-                }
-            },
-            "image_logger": {
-                "target": "main.ImageLogger",
-                "params": {
-                    "batch_frequency": 750,
-                    "max_images": 4,
-                    "clamp": True
-                }
-            },
-            "learning_rate_logger": {
-                "target": "main.LearningRateMonitor",
-                "params": {
-                    "logging_interval": "step",
-                    # "log_momentum": True
-                }
-            },
-            "cuda_callback": {
-                "target": "main.CUDACallback"
-            },
-        }
-        if version.parse(pl.__version__) >= version.parse('1.4.0'):
-            default_callbacks_cfg.update({'checkpoint_callback': modelckpt_cfg})
-
-        if "callbacks" in lightning_config:
-            callbacks_cfg = lightning_config.callbacks
-        else:
-            callbacks_cfg = OmegaConf.create()
-
-        if 'metrics_over_trainsteps_checkpoint' in callbacks_cfg:
-            print(
-                'Caution: Saving checkpoints every n train steps without deleting. This might require some free space.')
-            default_metrics_over_trainsteps_ckpt_dict = {
-                'metrics_over_trainsteps_checkpoint':
-                    {"target": 'pytorch_lightning.callbacks.ModelCheckpoint',
-                     'params': {
-                         "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'),
-                         "filename": "{epoch:06}-{step:09}",
-                         "verbose": True,
-                         'save_top_k': -1,
-                         'every_n_train_steps': 10000,
-                         'save_weights_only': True
-                     }
-                     }
-            }
-            default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict)
-
-        callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
-        if 'ignore_keys_callback' in callbacks_cfg and hasattr(trainer_opt, 'resume_from_checkpoint'):
-            callbacks_cfg.ignore_keys_callback.params['ckpt_path'] = trainer_opt.resume_from_checkpoint
-        elif 'ignore_keys_callback' in callbacks_cfg:
-            del callbacks_cfg['ignore_keys_callback']
-
-        trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]
-
-        trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
-        trainer.logdir = logdir  ###
-
-        # data
-        data = instantiate_from_config(config.data)
-        # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
-        # calling these ourselves should not be necessary but it is.
-        # lightning still takes care of proper multiprocessing though
-        data.prepare_data()
-        data.setup()
-        print("#### Data #####")
-        for k in data.datasets:
-            print(f"{k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}")
-
-        # configure learning rate
-        bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate
-        if not cpu:
-            ngpu = trainer_config["devices"]
-        else:
-            ngpu = 1
-        if 'accumulate_grad_batches' in lightning_config.trainer:
-            accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches
-        else:
-            accumulate_grad_batches = 1
-        print(f"accumulate_grad_batches = {accumulate_grad_batches}")
-        lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches
-        if opt.scale_lr:
-            model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr
-            print(
-                "Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format(
-                    model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr))
-        else:
-            model.learning_rate = base_lr
-            print("++++ NOT USING LR SCALING ++++")
-            print(f"Setting learning rate to {model.learning_rate:.2e}")
-
-
-        # allow checkpointing via USR1
-        def melk(*args, **kwargs):
-            # run all checkpoint hooks
-            if trainer.global_rank == 0:
-                print("Summoning checkpoint.")
-                ckpt_path = os.path.join(ckptdir, "last.ckpt")
-                trainer.save_checkpoint(ckpt_path)
-
-
-        def divein(*args, **kwargs):
-            if trainer.global_rank == 0:
-                import pudb;
-                pudb.set_trace()
-
-
-        import signal
-
-        signal.signal(signal.SIGUSR1, melk)
-        signal.signal(signal.SIGUSR2, divein)
-
-        # run
-        if opt.train:
-            try:
-                for name, m in model.named_parameters():
-                    print(name)
-                trainer.fit(model, data)
-            except Exception:
-                melk()
-                raise
-        # if not opt.no_test and not trainer.interrupted:
-        #     trainer.test(model, data)
-    except Exception:
-        if opt.debug and trainer.global_rank == 0:
-            try:
-                import pudb as debugger
-            except ImportError:
-                import pdb as debugger
-            debugger.post_mortem()
-        raise
-    finally:
-        # move newly created debug project to debug_runs
-        if opt.debug and not opt.resume and trainer.global_rank == 0:
-            dst, name = os.path.split(logdir)
-            dst = os.path.join(dst, "debug_runs", name)
-            os.makedirs(os.path.split(dst)[0], exist_ok=True)
-            os.rename(logdir, dst)
-        if trainer.global_rank == 0:
-            print(trainer.profiler.summary())
diff --git a/examples/tutorial/stable_diffusion/requirements.txt b/examples/tutorial/stable_diffusion/requirements.txt
deleted file mode 100644
index a57003562a3b..000000000000
--- a/examples/tutorial/stable_diffusion/requirements.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-albumentations==0.4.3
-diffusers
-pudb==2019.2
-datasets
-invisible-watermark
-imageio==2.9.0
-imageio-ffmpeg==0.4.2
-omegaconf==2.1.1
-multiprocess
-test-tube>=0.7.5
-streamlit>=0.73.1
-einops==0.3.0
-torch-fidelity==0.3.0
-transformers==4.19.2
-torchmetrics==0.6.0
-kornia==0.6
-opencv-python==4.6.0.66
-prefetch_generator
-colossalai
--e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
--e git+https://github.com/openai/CLIP.git@main#egg=clip
--e .
diff --git a/examples/tutorial/stable_diffusion/scripts/download_first_stages.sh b/examples/tutorial/stable_diffusion/scripts/download_first_stages.sh
deleted file mode 100644
index a8d79e99ccdf..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/download_first_stages.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-wget -O models/first_stage_models/kl-f4/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f4.zip
-wget -O models/first_stage_models/kl-f8/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f8.zip
-wget -O models/first_stage_models/kl-f16/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f16.zip
-wget -O models/first_stage_models/kl-f32/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f32.zip
-wget -O models/first_stage_models/vq-f4/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4.zip
-wget -O models/first_stage_models/vq-f4-noattn/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4-noattn.zip
-wget -O models/first_stage_models/vq-f8/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8.zip
-wget -O models/first_stage_models/vq-f8-n256/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8-n256.zip
-wget -O models/first_stage_models/vq-f16/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f16.zip
-
-
-
-cd models/first_stage_models/kl-f4
-unzip -o model.zip
-
-cd ../kl-f8
-unzip -o model.zip
-
-cd ../kl-f16
-unzip -o model.zip
-
-cd ../kl-f32
-unzip -o model.zip
-
-cd ../vq-f4
-unzip -o model.zip
-
-cd ../vq-f4-noattn
-unzip -o model.zip
-
-cd ../vq-f8
-unzip -o model.zip
-
-cd ../vq-f8-n256
-unzip -o model.zip
-
-cd ../vq-f16
-unzip -o model.zip
-
-cd ../..
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/scripts/download_models.sh b/examples/tutorial/stable_diffusion/scripts/download_models.sh
deleted file mode 100644
index 84297d7b8b9a..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/download_models.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-wget -O models/ldm/celeba256/celeba-256.zip https://ommer-lab.com/files/latent-diffusion/celeba.zip
-wget -O models/ldm/ffhq256/ffhq-256.zip https://ommer-lab.com/files/latent-diffusion/ffhq.zip
-wget -O models/ldm/lsun_churches256/lsun_churches-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_churches.zip
-wget -O models/ldm/lsun_beds256/lsun_beds-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_bedrooms.zip
-wget -O models/ldm/text2img256/model.zip https://ommer-lab.com/files/latent-diffusion/text2img.zip
-wget -O models/ldm/cin256/model.zip https://ommer-lab.com/files/latent-diffusion/cin.zip
-wget -O models/ldm/semantic_synthesis512/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis.zip
-wget -O models/ldm/semantic_synthesis256/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis256.zip
-wget -O models/ldm/bsr_sr/model.zip https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip
-wget -O models/ldm/layout2img-openimages256/model.zip https://ommer-lab.com/files/latent-diffusion/layout2img_model.zip
-wget -O models/ldm/inpainting_big/model.zip https://ommer-lab.com/files/latent-diffusion/inpainting_big.zip
-
-
-
-cd models/ldm/celeba256
-unzip -o celeba-256.zip
-
-cd ../ffhq256
-unzip -o ffhq-256.zip
-
-cd ../lsun_churches256
-unzip -o lsun_churches-256.zip
-
-cd ../lsun_beds256
-unzip -o lsun_beds-256.zip
-
-cd ../text2img256
-unzip -o model.zip
-
-cd ../cin256
-unzip -o model.zip
-
-cd ../semantic_synthesis512
-unzip -o model.zip
-
-cd ../semantic_synthesis256
-unzip -o model.zip
-
-cd ../bsr_sr
-unzip -o model.zip
-
-cd ../layout2img-openimages256
-unzip -o model.zip
-
-cd ../inpainting_big
-unzip -o model.zip
-
-cd ../..
diff --git a/examples/tutorial/stable_diffusion/scripts/img2img.py b/examples/tutorial/stable_diffusion/scripts/img2img.py
deleted file mode 100644
index 421e2151d9e9..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/img2img.py
+++ /dev/null
@@ -1,293 +0,0 @@
-"""make variations of input image"""
-
-import argparse, os, sys, glob
-import PIL
-import torch
-import numpy as np
-from omegaconf import OmegaConf
-from PIL import Image
-from tqdm import tqdm, trange
-from itertools import islice
-from einops import rearrange, repeat
-from torchvision.utils import make_grid
-from torch import autocast
-from contextlib import nullcontext
-import time
-from pytorch_lightning import seed_everything
-
-from ldm.util import instantiate_from_config
-from ldm.models.diffusion.ddim import DDIMSampler
-from ldm.models.diffusion.plms import PLMSSampler
-
-
-def chunk(it, size):
-    it = iter(it)
-    return iter(lambda: tuple(islice(it, size)), ())
-
-
-def load_model_from_config(config, ckpt, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    sd = pl_sd["state_dict"]
-    model = instantiate_from_config(config.model)
-    m, u = model.load_state_dict(sd, strict=False)
-    if len(m) > 0 and verbose:
-        print("missing keys:")
-        print(m)
-    if len(u) > 0 and verbose:
-        print("unexpected keys:")
-        print(u)
-
-    model.cuda()
-    model.eval()
-    return model
-
-
-def load_img(path):
-    image = Image.open(path).convert("RGB")
-    w, h = image.size
-    print(f"loaded input image of size ({w}, {h}) from {path}")
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
-    image = np.array(image).astype(np.float32) / 255.0
-    image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image)
-    return 2.*image - 1.
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        nargs="?",
-        default="a painting of a virus monster playing guitar",
-        help="the prompt to render"
-    )
-
-    parser.add_argument(
-        "--init-img",
-        type=str,
-        nargs="?",
-        help="path to the input image"
-    )
-
-    parser.add_argument(
-        "--outdir",
-        type=str,
-        nargs="?",
-        help="dir to write results to",
-        default="outputs/img2img-samples"
-    )
-
-    parser.add_argument(
-        "--skip_grid",
-        action='store_true',
-        help="do not save a grid, only individual samples. Helpful when evaluating lots of samples",
-    )
-
-    parser.add_argument(
-        "--skip_save",
-        action='store_true',
-        help="do not save indiviual samples. For speed measurements.",
-    )
-
-    parser.add_argument(
-        "--ddim_steps",
-        type=int,
-        default=50,
-        help="number of ddim sampling steps",
-    )
-
-    parser.add_argument(
-        "--plms",
-        action='store_true',
-        help="use plms sampling",
-    )
-    parser.add_argument(
-        "--fixed_code",
-        action='store_true',
-        help="if enabled, uses the same starting code across all samples ",
-    )
-
-    parser.add_argument(
-        "--ddim_eta",
-        type=float,
-        default=0.0,
-        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
-    )
-    parser.add_argument(
-        "--n_iter",
-        type=int,
-        default=1,
-        help="sample this often",
-    )
-    parser.add_argument(
-        "--C",
-        type=int,
-        default=4,
-        help="latent channels",
-    )
-    parser.add_argument(
-        "--f",
-        type=int,
-        default=8,
-        help="downsampling factor, most often 8 or 16",
-    )
-    parser.add_argument(
-        "--n_samples",
-        type=int,
-        default=2,
-        help="how many samples to produce for each given prompt. A.k.a batch size",
-    )
-    parser.add_argument(
-        "--n_rows",
-        type=int,
-        default=0,
-        help="rows in the grid (default: n_samples)",
-    )
-    parser.add_argument(
-        "--scale",
-        type=float,
-        default=5.0,
-        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
-    )
-
-    parser.add_argument(
-        "--strength",
-        type=float,
-        default=0.75,
-        help="strength for noising/unnoising. 1.0 corresponds to full destruction of information in init image",
-    )
-    parser.add_argument(
-        "--from-file",
-        type=str,
-        help="if specified, load prompts from this file",
-    )
-    parser.add_argument(
-        "--config",
-        type=str,
-        default="configs/stable-diffusion/v1-inference.yaml",
-        help="path to config which constructs model",
-    )
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="models/ldm/stable-diffusion-v1/model.ckpt",
-        help="path to checkpoint of model",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="the seed (for reproducible sampling)",
-    )
-    parser.add_argument(
-        "--precision",
-        type=str,
-        help="evaluate at this precision",
-        choices=["full", "autocast"],
-        default="autocast"
-    )
-
-    opt = parser.parse_args()
-    seed_everything(opt.seed)
-
-    config = OmegaConf.load(f"{opt.config}")
-    model = load_model_from_config(config, f"{opt.ckpt}")
-
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = model.to(device)
-
-    if opt.plms:
-        raise NotImplementedError("PLMS sampler not (yet) supported")
-        sampler = PLMSSampler(model)
-    else:
-        sampler = DDIMSampler(model)
-
-    os.makedirs(opt.outdir, exist_ok=True)
-    outpath = opt.outdir
-
-    batch_size = opt.n_samples
-    n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
-    if not opt.from_file:
-        prompt = opt.prompt
-        assert prompt is not None
-        data = [batch_size * [prompt]]
-
-    else:
-        print(f"reading prompts from {opt.from_file}")
-        with open(opt.from_file, "r") as f:
-            data = f.read().splitlines()
-            data = list(chunk(data, batch_size))
-
-    sample_path = os.path.join(outpath, "samples")
-    os.makedirs(sample_path, exist_ok=True)
-    base_count = len(os.listdir(sample_path))
-    grid_count = len(os.listdir(outpath)) - 1
-
-    assert os.path.isfile(opt.init_img)
-    init_image = load_img(opt.init_img).to(device)
-    init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
-    init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
-
-    sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)
-
-    assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
-    t_enc = int(opt.strength * opt.ddim_steps)
-    print(f"target t_enc is {t_enc} steps")
-
-    precision_scope = autocast if opt.precision == "autocast" else nullcontext
-    with torch.no_grad():
-        with precision_scope("cuda"):
-            with model.ema_scope():
-                tic = time.time()
-                all_samples = list()
-                for n in trange(opt.n_iter, desc="Sampling"):
-                    for prompts in tqdm(data, desc="data"):
-                        uc = None
-                        if opt.scale != 1.0:
-                            uc = model.get_learned_conditioning(batch_size * [""])
-                        if isinstance(prompts, tuple):
-                            prompts = list(prompts)
-                        c = model.get_learned_conditioning(prompts)
-
-                        # encode (scaled latent)
-                        z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(device))
-                        # decode it
-                        samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
-                                                 unconditional_conditioning=uc,)
-
-                        x_samples = model.decode_first_stage(samples)
-                        x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
-
-                        if not opt.skip_save:
-                            for x_sample in x_samples:
-                                x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
-                                Image.fromarray(x_sample.astype(np.uint8)).save(
-                                    os.path.join(sample_path, f"{base_count:05}.png"))
-                                base_count += 1
-                        all_samples.append(x_samples)
-
-                if not opt.skip_grid:
-                    # additionally, save as grid
-                    grid = torch.stack(all_samples, 0)
-                    grid = rearrange(grid, 'n b c h w -> (n b) c h w')
-                    grid = make_grid(grid, nrow=n_rows)
-
-                    # to image
-                    grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
-                    Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
-                    grid_count += 1
-
-                toc = time.time()
-
-    print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
-          f" \nEnjoy.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/tutorial/stable_diffusion/scripts/inpaint.py b/examples/tutorial/stable_diffusion/scripts/inpaint.py
deleted file mode 100644
index d6e6387a9a3b..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/inpaint.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import argparse, os, sys, glob
-from omegaconf import OmegaConf
-from PIL import Image
-from tqdm import tqdm
-import numpy as np
-import torch
-from main import instantiate_from_config
-from ldm.models.diffusion.ddim import DDIMSampler
-
-
-def make_batch(image, mask, device):
-    image = np.array(Image.open(image).convert("RGB"))
-    image = image.astype(np.float32)/255.0
-    image = image[None].transpose(0,3,1,2)
-    image = torch.from_numpy(image)
-
-    mask = np.array(Image.open(mask).convert("L"))
-    mask = mask.astype(np.float32)/255.0
-    mask = mask[None,None]
-    mask[mask < 0.5] = 0
-    mask[mask >= 0.5] = 1
-    mask = torch.from_numpy(mask)
-
-    masked_image = (1-mask)*image
-
-    batch = {"image": image, "mask": mask, "masked_image": masked_image}
-    for k in batch:
-        batch[k] = batch[k].to(device=device)
-        batch[k] = batch[k]*2.0-1.0
-    return batch
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--indir",
-        type=str,
-        nargs="?",
-        help="dir containing image-mask pairs (`example.png` and `example_mask.png`)",
-    )
-    parser.add_argument(
-        "--outdir",
-        type=str,
-        nargs="?",
-        help="dir to write results to",
-    )
-    parser.add_argument(
-        "--steps",
-        type=int,
-        default=50,
-        help="number of ddim sampling steps",
-    )
-    opt = parser.parse_args()
-
-    masks = sorted(glob.glob(os.path.join(opt.indir, "*_mask.png")))
-    images = [x.replace("_mask.png", ".png") for x in masks]
-    print(f"Found {len(masks)} inputs.")
-
-    config = OmegaConf.load("models/ldm/inpainting_big/config.yaml")
-    model = instantiate_from_config(config.model)
-    model.load_state_dict(torch.load("models/ldm/inpainting_big/last.ckpt")["state_dict"],
-                          strict=False)
-
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = model.to(device)
-    sampler = DDIMSampler(model)
-
-    os.makedirs(opt.outdir, exist_ok=True)
-    with torch.no_grad():
-        with model.ema_scope():
-            for image, mask in tqdm(zip(images, masks)):
-                outpath = os.path.join(opt.outdir, os.path.split(image)[1])
-                batch = make_batch(image, mask, device=device)
-
-                # encode masked image and concat downsampled mask
-                c = model.cond_stage_model.encode(batch["masked_image"])
-                cc = torch.nn.functional.interpolate(batch["mask"],
-                                                     size=c.shape[-2:])
-                c = torch.cat((c, cc), dim=1)
-
-                shape = (c.shape[1]-1,)+c.shape[2:]
-                samples_ddim, _ = sampler.sample(S=opt.steps,
-                                                 conditioning=c,
-                                                 batch_size=c.shape[0],
-                                                 shape=shape,
-                                                 verbose=False)
-                x_samples_ddim = model.decode_first_stage(samples_ddim)
-
-                image = torch.clamp((batch["image"]+1.0)/2.0,
-                                    min=0.0, max=1.0)
-                mask = torch.clamp((batch["mask"]+1.0)/2.0,
-                                   min=0.0, max=1.0)
-                predicted_image = torch.clamp((x_samples_ddim+1.0)/2.0,
-                                              min=0.0, max=1.0)
-
-                inpainted = (1-mask)*image+mask*predicted_image
-                inpainted = inpainted.cpu().numpy().transpose(0,2,3,1)[0]*255
-                Image.fromarray(inpainted.astype(np.uint8)).save(outpath)
diff --git a/examples/tutorial/stable_diffusion/scripts/knn2img.py b/examples/tutorial/stable_diffusion/scripts/knn2img.py
deleted file mode 100644
index e6eaaecab53e..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/knn2img.py
+++ /dev/null
@@ -1,398 +0,0 @@
-import argparse, os, sys, glob
-import clip
-import torch
-import torch.nn as nn
-import numpy as np
-from omegaconf import OmegaConf
-from PIL import Image
-from tqdm import tqdm, trange
-from itertools import islice
-from einops import rearrange, repeat
-from torchvision.utils import make_grid
-import scann
-import time
-from multiprocessing import cpu_count
-
-from ldm.util import instantiate_from_config, parallel_data_prefetch
-from ldm.models.diffusion.ddim import DDIMSampler
-from ldm.models.diffusion.plms import PLMSSampler
-from ldm.modules.encoders.modules import FrozenClipImageEmbedder, FrozenCLIPTextEmbedder
-
-DATABASES = [
-    "openimages",
-    "artbench-art_nouveau",
-    "artbench-baroque",
-    "artbench-expressionism",
-    "artbench-impressionism",
-    "artbench-post_impressionism",
-    "artbench-realism",
-    "artbench-romanticism",
-    "artbench-renaissance",
-    "artbench-surrealism",
-    "artbench-ukiyo_e",
-]
-
-
-def chunk(it, size):
-    it = iter(it)
-    return iter(lambda: tuple(islice(it, size)), ())
-
-
-def load_model_from_config(config, ckpt, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    sd = pl_sd["state_dict"]
-    model = instantiate_from_config(config.model)
-    m, u = model.load_state_dict(sd, strict=False)
-    if len(m) > 0 and verbose:
-        print("missing keys:")
-        print(m)
-    if len(u) > 0 and verbose:
-        print("unexpected keys:")
-        print(u)
-
-    model.cuda()
-    model.eval()
-    return model
-
-
-class Searcher(object):
-    def __init__(self, database, retriever_version='ViT-L/14'):
-        assert database in DATABASES
-        # self.database = self.load_database(database)
-        self.database_name = database
-        self.searcher_savedir = f'data/rdm/searchers/{self.database_name}'
-        self.database_path = f'data/rdm/retrieval_databases/{self.database_name}'
-        self.retriever = self.load_retriever(version=retriever_version)
-        self.database = {'embedding': [],
-                         'img_id': [],
-                         'patch_coords': []}
-        self.load_database()
-        self.load_searcher()
-
-    def train_searcher(self, k,
-                       metric='dot_product',
-                       searcher_savedir=None):
-
-        print('Start training searcher')
-        searcher = scann.scann_ops_pybind.builder(self.database['embedding'] /
-                                                  np.linalg.norm(self.database['embedding'], axis=1)[:, np.newaxis],
-                                                  k, metric)
-        self.searcher = searcher.score_brute_force().build()
-        print('Finish training searcher')
-
-        if searcher_savedir is not None:
-            print(f'Save trained searcher under "{searcher_savedir}"')
-            os.makedirs(searcher_savedir, exist_ok=True)
-            self.searcher.serialize(searcher_savedir)
-
-    def load_single_file(self, saved_embeddings):
-        compressed = np.load(saved_embeddings)
-        self.database = {key: compressed[key] for key in compressed.files}
-        print('Finished loading of clip embeddings.')
-
-    def load_multi_files(self, data_archive):
-        out_data = {key: [] for key in self.database}
-        for d in tqdm(data_archive, desc=f'Loading datapool from {len(data_archive)} individual files.'):
-            for key in d.files:
-                out_data[key].append(d[key])
-
-        return out_data
-
-    def load_database(self):
-
-        print(f'Load saved patch embedding from "{self.database_path}"')
-        file_content = glob.glob(os.path.join(self.database_path, '*.npz'))
-
-        if len(file_content) == 1:
-            self.load_single_file(file_content[0])
-        elif len(file_content) > 1:
-            data = [np.load(f) for f in file_content]
-            prefetched_data = parallel_data_prefetch(self.load_multi_files, data,
-                                                     n_proc=min(len(data), cpu_count()), target_data_type='dict')
-
-            self.database = {key: np.concatenate([od[key] for od in prefetched_data], axis=1)[0] for key in
-                             self.database}
-        else:
-            raise ValueError(f'No npz-files in specified path "{self.database_path}" is this directory existing?')
-
-        print(f'Finished loading of retrieval database of length {self.database["embedding"].shape[0]}.')
-
-    def load_retriever(self, version='ViT-L/14', ):
-        model = FrozenClipImageEmbedder(model=version)
-        if torch.cuda.is_available():
-            model.cuda()
-        model.eval()
-        return model
-
-    def load_searcher(self):
-        print(f'load searcher for database {self.database_name} from {self.searcher_savedir}')
-        self.searcher = scann.scann_ops_pybind.load_searcher(self.searcher_savedir)
-        print('Finished loading searcher.')
-
-    def search(self, x, k):
-        if self.searcher is None and self.database['embedding'].shape[0] < 2e4:
-            self.train_searcher(k)   # quickly fit searcher on the fly for small databases
-        assert self.searcher is not None, 'Cannot search with uninitialized searcher'
-        if isinstance(x, torch.Tensor):
-            x = x.detach().cpu().numpy()
-        if len(x.shape) == 3:
-            x = x[:, 0]
-        query_embeddings = x / np.linalg.norm(x, axis=1)[:, np.newaxis]
-
-        start = time.time()
-        nns, distances = self.searcher.search_batched(query_embeddings, final_num_neighbors=k)
-        end = time.time()
-
-        out_embeddings = self.database['embedding'][nns]
-        out_img_ids = self.database['img_id'][nns]
-        out_pc = self.database['patch_coords'][nns]
-
-        out = {'nn_embeddings': out_embeddings / np.linalg.norm(out_embeddings, axis=-1)[..., np.newaxis],
-               'img_ids': out_img_ids,
-               'patch_coords': out_pc,
-               'queries': x,
-               'exec_time': end - start,
-               'nns': nns,
-               'q_embeddings': query_embeddings}
-
-        return out
-
-    def __call__(self, x, n):
-        return self.search(x, n)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # TODO: add n_neighbors and modes (text-only, text-image-retrieval, image-image retrieval etc)
-    # TODO: add 'image variation' mode when knn=0 but a single image is given instead of a text prompt?
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        nargs="?",
-        default="a painting of a virus monster playing guitar",
-        help="the prompt to render"
-    )
-
-    parser.add_argument(
-        "--outdir",
-        type=str,
-        nargs="?",
-        help="dir to write results to",
-        default="outputs/txt2img-samples"
-    )
-
-    parser.add_argument(
-        "--skip_grid",
-        action='store_true',
-        help="do not save a grid, only individual samples. Helpful when evaluating lots of samples",
-    )
-
-    parser.add_argument(
-        "--ddim_steps",
-        type=int,
-        default=50,
-        help="number of ddim sampling steps",
-    )
-
-    parser.add_argument(
-        "--n_repeat",
-        type=int,
-        default=1,
-        help="number of repeats in CLIP latent space",
-    )
-
-    parser.add_argument(
-        "--plms",
-        action='store_true',
-        help="use plms sampling",
-    )
-
-    parser.add_argument(
-        "--ddim_eta",
-        type=float,
-        default=0.0,
-        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
-    )
-    parser.add_argument(
-        "--n_iter",
-        type=int,
-        default=1,
-        help="sample this often",
-    )
-
-    parser.add_argument(
-        "--H",
-        type=int,
-        default=768,
-        help="image height, in pixel space",
-    )
-
-    parser.add_argument(
-        "--W",
-        type=int,
-        default=768,
-        help="image width, in pixel space",
-    )
-
-    parser.add_argument(
-        "--n_samples",
-        type=int,
-        default=3,
-        help="how many samples to produce for each given prompt. A.k.a batch size",
-    )
-
-    parser.add_argument(
-        "--n_rows",
-        type=int,
-        default=0,
-        help="rows in the grid (default: n_samples)",
-    )
-
-    parser.add_argument(
-        "--scale",
-        type=float,
-        default=5.0,
-        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
-    )
-
-    parser.add_argument(
-        "--from-file",
-        type=str,
-        help="if specified, load prompts from this file",
-    )
-
-    parser.add_argument(
-        "--config",
-        type=str,
-        default="configs/retrieval-augmented-diffusion/768x768.yaml",
-        help="path to config which constructs model",
-    )
-
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="models/rdm/rdm768x768/model.ckpt",
-        help="path to checkpoint of model",
-    )
-
-    parser.add_argument(
-        "--clip_type",
-        type=str,
-        default="ViT-L/14",
-        help="which CLIP model to use for retrieval and NN encoding",
-    )
-    parser.add_argument(
-        "--database",
-        type=str,
-        default='artbench-surrealism',
-        choices=DATABASES,
-        help="The database used for the search, only applied when --use_neighbors=True",
-    )
-    parser.add_argument(
-        "--use_neighbors",
-        default=False,
-        action='store_true',
-        help="Include neighbors in addition to text prompt for conditioning",
-    )
-    parser.add_argument(
-        "--knn",
-        default=10,
-        type=int,
-        help="The number of included neighbors, only applied when --use_neighbors=True",
-    )
-
-    opt = parser.parse_args()
-
-    config = OmegaConf.load(f"{opt.config}")
-    model = load_model_from_config(config, f"{opt.ckpt}")
-
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = model.to(device)
-
-    clip_text_encoder = FrozenCLIPTextEmbedder(opt.clip_type).to(device)
-
-    if opt.plms:
-        sampler = PLMSSampler(model)
-    else:
-        sampler = DDIMSampler(model)
-
-    os.makedirs(opt.outdir, exist_ok=True)
-    outpath = opt.outdir
-
-    batch_size = opt.n_samples
-    n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
-    if not opt.from_file:
-        prompt = opt.prompt
-        assert prompt is not None
-        data = [batch_size * [prompt]]
-
-    else:
-        print(f"reading prompts from {opt.from_file}")
-        with open(opt.from_file, "r") as f:
-            data = f.read().splitlines()
-            data = list(chunk(data, batch_size))
-
-    sample_path = os.path.join(outpath, "samples")
-    os.makedirs(sample_path, exist_ok=True)
-    base_count = len(os.listdir(sample_path))
-    grid_count = len(os.listdir(outpath)) - 1
-
-    print(f"sampling scale for cfg is {opt.scale:.2f}")
-
-    searcher = None
-    if opt.use_neighbors:
-        searcher = Searcher(opt.database)
-
-    with torch.no_grad():
-        with model.ema_scope():
-            for n in trange(opt.n_iter, desc="Sampling"):
-                all_samples = list()
-                for prompts in tqdm(data, desc="data"):
-                    print("sampling prompts:", prompts)
-                    if isinstance(prompts, tuple):
-                        prompts = list(prompts)
-                    c = clip_text_encoder.encode(prompts)
-                    uc = None
-                    if searcher is not None:
-                        nn_dict = searcher(c, opt.knn)
-                        c = torch.cat([c, torch.from_numpy(nn_dict['nn_embeddings']).cuda()], dim=1)
-                    if opt.scale != 1.0:
-                        uc = torch.zeros_like(c)
-                    if isinstance(prompts, tuple):
-                        prompts = list(prompts)
-                    shape = [16, opt.H // 16, opt.W // 16]  # note: currently hardcoded for f16 model
-                    samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
-                                                     conditioning=c,
-                                                     batch_size=c.shape[0],
-                                                     shape=shape,
-                                                     verbose=False,
-                                                     unconditional_guidance_scale=opt.scale,
-                                                     unconditional_conditioning=uc,
-                                                     eta=opt.ddim_eta,
-                                                     )
-
-                    x_samples_ddim = model.decode_first_stage(samples_ddim)
-                    x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-
-                    for x_sample in x_samples_ddim:
-                        x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
-                        Image.fromarray(x_sample.astype(np.uint8)).save(
-                            os.path.join(sample_path, f"{base_count:05}.png"))
-                        base_count += 1
-                    all_samples.append(x_samples_ddim)
-
-                if not opt.skip_grid:
-                    # additionally, save as grid
-                    grid = torch.stack(all_samples, 0)
-                    grid = rearrange(grid, 'n b c h w -> (n b) c h w')
-                    grid = make_grid(grid, nrow=n_rows)
-
-                    # to image
-                    grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
-                    Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
-                    grid_count += 1
-
-    print(f"Your samples are ready and waiting for you here: \n{outpath} \nEnjoy.")
diff --git a/examples/tutorial/stable_diffusion/scripts/sample_diffusion.py b/examples/tutorial/stable_diffusion/scripts/sample_diffusion.py
deleted file mode 100644
index 876fe3c3642f..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/sample_diffusion.py
+++ /dev/null
@@ -1,313 +0,0 @@
-import argparse, os, sys, glob, datetime, yaml
-import torch
-import time
-import numpy as np
-from tqdm import trange
-
-from omegaconf import OmegaConf
-from PIL import Image
-
-from ldm.models.diffusion.ddim import DDIMSampler
-from ldm.util import instantiate_from_config
-
-rescale = lambda x: (x + 1.) / 2.
-
-def custom_to_pil(x):
-    x = x.detach().cpu()
-    x = torch.clamp(x, -1., 1.)
-    x = (x + 1.) / 2.
-    x = x.permute(1, 2, 0).numpy()
-    x = (255 * x).astype(np.uint8)
-    x = Image.fromarray(x)
-    if not x.mode == "RGB":
-        x = x.convert("RGB")
-    return x
-
-
-def custom_to_np(x):
-    # saves the batch in adm style as in https://github.com/openai/guided-diffusion/blob/main/scripts/image_sample.py
-    sample = x.detach().cpu()
-    sample = ((sample + 1) * 127.5).clamp(0, 255).to(torch.uint8)
-    sample = sample.permute(0, 2, 3, 1)
-    sample = sample.contiguous()
-    return sample
-
-
-def logs2pil(logs, keys=["sample"]):
-    imgs = dict()
-    for k in logs:
-        try:
-            if len(logs[k].shape) == 4:
-                img = custom_to_pil(logs[k][0, ...])
-            elif len(logs[k].shape) == 3:
-                img = custom_to_pil(logs[k])
-            else:
-                print(f"Unknown format for key {k}. ")
-                img = None
-        except:
-            img = None
-        imgs[k] = img
-    return imgs
-
-
-@torch.no_grad()
-def convsample(model, shape, return_intermediates=True,
-               verbose=True,
-               make_prog_row=False):
-
-
-    if not make_prog_row:
-        return model.p_sample_loop(None, shape,
-                                   return_intermediates=return_intermediates, verbose=verbose)
-    else:
-        return model.progressive_denoising(
-            None, shape, verbose=True
-        )
-
-
-@torch.no_grad()
-def convsample_ddim(model, steps, shape, eta=1.0
-                    ):
-    ddim = DDIMSampler(model)
-    bs = shape[0]
-    shape = shape[1:]
-    samples, intermediates = ddim.sample(steps, batch_size=bs, shape=shape, eta=eta, verbose=False,)
-    return samples, intermediates
-
-
-@torch.no_grad()
-def make_convolutional_sample(model, batch_size, vanilla=False, custom_steps=None, eta=1.0,):
-
-
-    log = dict()
-
-    shape = [batch_size,
-             model.model.diffusion_model.in_channels,
-             model.model.diffusion_model.image_size,
-             model.model.diffusion_model.image_size]
-
-    with model.ema_scope("Plotting"):
-        t0 = time.time()
-        if vanilla:
-            sample, progrow = convsample(model, shape,
-                                         make_prog_row=True)
-        else:
-            sample, intermediates = convsample_ddim(model,  steps=custom_steps, shape=shape,
-                                                    eta=eta)
-
-        t1 = time.time()
-
-    x_sample = model.decode_first_stage(sample)
-
-    log["sample"] = x_sample
-    log["time"] = t1 - t0
-    log['throughput'] = sample.shape[0] / (t1 - t0)
-    print(f'Throughput for this batch: {log["throughput"]}')
-    return log
-
-def run(model, logdir, batch_size=50, vanilla=False, custom_steps=None, eta=None, n_samples=50000, nplog=None):
-    if vanilla:
-        print(f'Using Vanilla DDPM sampling with {model.num_timesteps} sampling steps.')
-    else:
-        print(f'Using DDIM sampling with {custom_steps} sampling steps and eta={eta}')
-
-
-    tstart = time.time()
-    n_saved = len(glob.glob(os.path.join(logdir,'*.png')))-1
-    # path = logdir
-    if model.cond_stage_model is None:
-        all_images = []
-
-        print(f"Running unconditional sampling for {n_samples} samples")
-        for _ in trange(n_samples // batch_size, desc="Sampling Batches (unconditional)"):
-            logs = make_convolutional_sample(model, batch_size=batch_size,
-                                             vanilla=vanilla, custom_steps=custom_steps,
-                                             eta=eta)
-            n_saved = save_logs(logs, logdir, n_saved=n_saved, key="sample")
-            all_images.extend([custom_to_np(logs["sample"])])
-            if n_saved >= n_samples:
-                print(f'Finish after generating {n_saved} samples')
-                break
-        all_img = np.concatenate(all_images, axis=0)
-        all_img = all_img[:n_samples]
-        shape_str = "x".join([str(x) for x in all_img.shape])
-        nppath = os.path.join(nplog, f"{shape_str}-samples.npz")
-        np.savez(nppath, all_img)
-
-    else:
-       raise NotImplementedError('Currently only sampling for unconditional models supported.')
-
-    print(f"sampling of {n_saved} images finished in {(time.time() - tstart) / 60.:.2f} minutes.")
-
-
-def save_logs(logs, path, n_saved=0, key="sample", np_path=None):
-    for k in logs:
-        if k == key:
-            batch = logs[key]
-            if np_path is None:
-                for x in batch:
-                    img = custom_to_pil(x)
-                    imgpath = os.path.join(path, f"{key}_{n_saved:06}.png")
-                    img.save(imgpath)
-                    n_saved += 1
-            else:
-                npbatch = custom_to_np(batch)
-                shape_str = "x".join([str(x) for x in npbatch.shape])
-                nppath = os.path.join(np_path, f"{n_saved}-{shape_str}-samples.npz")
-                np.savez(nppath, npbatch)
-                n_saved += npbatch.shape[0]
-    return n_saved
-
-
-def get_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-r",
-        "--resume",
-        type=str,
-        nargs="?",
-        help="load from logdir or checkpoint in logdir",
-    )
-    parser.add_argument(
-        "-n",
-        "--n_samples",
-        type=int,
-        nargs="?",
-        help="number of samples to draw",
-        default=50000
-    )
-    parser.add_argument(
-        "-e",
-        "--eta",
-        type=float,
-        nargs="?",
-        help="eta for ddim sampling (0.0 yields deterministic sampling)",
-        default=1.0
-    )
-    parser.add_argument(
-        "-v",
-        "--vanilla_sample",
-        default=False,
-        action='store_true',
-        help="vanilla sampling (default option is DDIM sampling)?",
-    )
-    parser.add_argument(
-        "-l",
-        "--logdir",
-        type=str,
-        nargs="?",
-        help="extra logdir",
-        default="none"
-    )
-    parser.add_argument(
-        "-c",
-        "--custom_steps",
-        type=int,
-        nargs="?",
-        help="number of steps for ddim and fastdpm sampling",
-        default=50
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        nargs="?",
-        help="the bs",
-        default=10
-    )
-    return parser
-
-
-def load_model_from_config(config, sd):
-    model = instantiate_from_config(config)
-    model.load_state_dict(sd,strict=False)
-    model.cuda()
-    model.eval()
-    return model
-
-
-def load_model(config, ckpt, gpu, eval_mode):
-    if ckpt:
-        print(f"Loading model from {ckpt}")
-        pl_sd = torch.load(ckpt, map_location="cpu")
-        global_step = pl_sd["global_step"]
-    else:
-        pl_sd = {"state_dict": None}
-        global_step = None
-    model = load_model_from_config(config.model,
-                                   pl_sd["state_dict"])
-
-    return model, global_step
-
-
-if __name__ == "__main__":
-    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-    sys.path.append(os.getcwd())
-    command = " ".join(sys.argv)
-
-    parser = get_parser()
-    opt, unknown = parser.parse_known_args()
-    ckpt = None
-
-    if not os.path.exists(opt.resume):
-        raise ValueError("Cannot find {}".format(opt.resume))
-    if os.path.isfile(opt.resume):
-        # paths = opt.resume.split("/")
-        try:
-            logdir = '/'.join(opt.resume.split('/')[:-1])
-            # idx = len(paths)-paths[::-1].index("logs")+1
-            print(f'Logdir is {logdir}')
-        except ValueError:
-            paths = opt.resume.split("/")
-            idx = -2  # take a guess: path/to/logdir/checkpoints/model.ckpt
-            logdir = "/".join(paths[:idx])
-        ckpt = opt.resume
-    else:
-        assert os.path.isdir(opt.resume), f"{opt.resume} is not a directory"
-        logdir = opt.resume.rstrip("/")
-        ckpt = os.path.join(logdir, "model.ckpt")
-
-    base_configs = sorted(glob.glob(os.path.join(logdir, "config.yaml")))
-    opt.base = base_configs
-
-    configs = [OmegaConf.load(cfg) for cfg in opt.base]
-    cli = OmegaConf.from_dotlist(unknown)
-    config = OmegaConf.merge(*configs, cli)
-
-    gpu = True
-    eval_mode = True
-
-    if opt.logdir != "none":
-        locallog = logdir.split(os.sep)[-1]
-        if locallog == "": locallog = logdir.split(os.sep)[-2]
-        print(f"Switching logdir from '{logdir}' to '{os.path.join(opt.logdir, locallog)}'")
-        logdir = os.path.join(opt.logdir, locallog)
-
-    print(config)
-
-    model, global_step = load_model(config, ckpt, gpu, eval_mode)
-    print(f"global step: {global_step}")
-    print(75 * "=")
-    print("logging to:")
-    logdir = os.path.join(logdir, "samples", f"{global_step:08}", now)
-    imglogdir = os.path.join(logdir, "img")
-    numpylogdir = os.path.join(logdir, "numpy")
-
-    os.makedirs(imglogdir)
-    os.makedirs(numpylogdir)
-    print(logdir)
-    print(75 * "=")
-
-    # write config out
-    sampling_file = os.path.join(logdir, "sampling_config.yaml")
-    sampling_conf = vars(opt)
-
-    with open(sampling_file, 'w') as f:
-        yaml.dump(sampling_conf, f, default_flow_style=False)
-    print(sampling_conf)
-
-
-    run(model, imglogdir, eta=opt.eta,
-        vanilla=opt.vanilla_sample,  n_samples=opt.n_samples, custom_steps=opt.custom_steps,
-        batch_size=opt.batch_size, nplog=numpylogdir)
-
-    print("done.")
diff --git a/examples/tutorial/stable_diffusion/scripts/tests/test_checkpoint.py b/examples/tutorial/stable_diffusion/scripts/tests/test_checkpoint.py
deleted file mode 100644
index a32e66d44cf2..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/tests/test_checkpoint.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import os
-import sys
-from copy import deepcopy
-
-import yaml
-from datetime import datetime
-
-from diffusers import StableDiffusionPipeline
-import torch
-from ldm.util import instantiate_from_config
-from main import get_parser
-
-if __name__ == "__main__":
-    with torch.no_grad():
-        yaml_path = "../../train_colossalai.yaml"
-        with open(yaml_path, 'r', encoding='utf-8') as f:
-            config = f.read()
-        base_config = yaml.load(config, Loader=yaml.FullLoader)
-        unet_config = base_config['model']['params']['unet_config']
-        diffusion_model = instantiate_from_config(unet_config).to("cuda:0")
-
-        pipe = StableDiffusionPipeline.from_pretrained(
-            "/data/scratch/diffuser/stable-diffusion-v1-4"
-        ).to("cuda:0")
-        dif_model_2 = pipe.unet
-
-        random_input_ = torch.rand((4, 4, 32, 32)).to("cuda:0")
-        random_input_2 = torch.clone(random_input_).to("cuda:0")
-        time_stamp = torch.randint(20, (4,)).to("cuda:0")
-        time_stamp2 = torch.clone(time_stamp).to("cuda:0")
-        context_ = torch.rand((4, 77, 768)).to("cuda:0")
-        context_2 = torch.clone(context_).to("cuda:0")
-
-        out_1 = diffusion_model(random_input_, time_stamp, context_)
-        out_2 = dif_model_2(random_input_2, time_stamp2, context_2)
-        print(out_1.shape)
-        print(out_2['sample'].shape)
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/scripts/tests/test_watermark.py b/examples/tutorial/stable_diffusion/scripts/tests/test_watermark.py
deleted file mode 100644
index f93f8a6e7076..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/tests/test_watermark.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import cv2
-import fire
-from imwatermark import WatermarkDecoder
-
-
-def testit(img_path):
-    bgr = cv2.imread(img_path)
-    decoder = WatermarkDecoder('bytes', 136)
-    watermark = decoder.decode(bgr, 'dwtDct')
-    try:
-        dec = watermark.decode('utf-8')
-    except:
-        dec = "null"
-    print(dec)
-
-
-if __name__ == "__main__":
-    fire.Fire(testit)
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/scripts/train_searcher.py b/examples/tutorial/stable_diffusion/scripts/train_searcher.py
deleted file mode 100644
index 1e7904889c01..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/train_searcher.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import os, sys
-import numpy as np
-import scann
-import argparse
-import glob
-from multiprocessing import cpu_count
-from tqdm import tqdm
-
-from ldm.util import parallel_data_prefetch
-
-
-def search_bruteforce(searcher):
-    return searcher.score_brute_force().build()
-
-
-def search_partioned_ah(searcher, dims_per_block, aiq_threshold, reorder_k,
-                        partioning_trainsize, num_leaves, num_leaves_to_search):
-    return searcher.tree(num_leaves=num_leaves,
-                         num_leaves_to_search=num_leaves_to_search,
-                         training_sample_size=partioning_trainsize). \
-        score_ah(dims_per_block, anisotropic_quantization_threshold=aiq_threshold).reorder(reorder_k).build()
-
-
-def search_ah(searcher, dims_per_block, aiq_threshold, reorder_k):
-    return searcher.score_ah(dims_per_block, anisotropic_quantization_threshold=aiq_threshold).reorder(
-        reorder_k).build()
-
-def load_datapool(dpath):
-
-
-    def load_single_file(saved_embeddings):
-        compressed = np.load(saved_embeddings)
-        database = {key: compressed[key] for key in compressed.files}
-        return database
-
-    def load_multi_files(data_archive):
-        database = {key: [] for key in data_archive[0].files}
-        for d in tqdm(data_archive, desc=f'Loading datapool from {len(data_archive)} individual files.'):
-            for key in d.files:
-                database[key].append(d[key])
-
-        return database
-
-    print(f'Load saved patch embedding from "{dpath}"')
-    file_content = glob.glob(os.path.join(dpath, '*.npz'))
-
-    if len(file_content) == 1:
-        data_pool = load_single_file(file_content[0])
-    elif len(file_content) > 1:
-        data = [np.load(f) for f in file_content]
-        prefetched_data = parallel_data_prefetch(load_multi_files, data,
-                                                 n_proc=min(len(data), cpu_count()), target_data_type='dict')
-
-        data_pool = {key: np.concatenate([od[key] for od in prefetched_data], axis=1)[0] for key in prefetched_data[0].keys()}
-    else:
-        raise ValueError(f'No npz-files in specified path "{dpath}" is this directory existing?')
-
-    print(f'Finished loading of retrieval database of length {data_pool["embedding"].shape[0]}.')
-    return data_pool
-
-
-def train_searcher(opt,
-                   metric='dot_product',
-                   partioning_trainsize=None,
-                   reorder_k=None,
-                   # todo tune
-                   aiq_thld=0.2,
-                   dims_per_block=2,
-                   num_leaves=None,
-                   num_leaves_to_search=None,):
-
-    data_pool = load_datapool(opt.database)
-    k = opt.knn
-
-    if not reorder_k:
-        reorder_k = 2 * k
-
-    # normalize
-    # embeddings =
-    searcher = scann.scann_ops_pybind.builder(data_pool['embedding'] / np.linalg.norm(data_pool['embedding'], axis=1)[:, np.newaxis], k, metric)
-    pool_size = data_pool['embedding'].shape[0]
-
-    print(*(['#'] * 100))
-    print('Initializing scaNN searcher with the following values:')
-    print(f'k: {k}')
-    print(f'metric: {metric}')
-    print(f'reorder_k: {reorder_k}')
-    print(f'anisotropic_quantization_threshold: {aiq_thld}')
-    print(f'dims_per_block: {dims_per_block}')
-    print(*(['#'] * 100))
-    print('Start training searcher....')
-    print(f'N samples in pool is {pool_size}')
-
-    # this reflects the recommended design choices proposed at
-    # https://github.com/google-research/google-research/blob/aca5f2e44e301af172590bb8e65711f0c9ee0cfd/scann/docs/algorithms.md
-    if pool_size < 2e4:
-        print('Using brute force search.')
-        searcher = search_bruteforce(searcher)
-    elif 2e4 <= pool_size and pool_size < 1e5:
-        print('Using asymmetric hashing search and reordering.')
-        searcher = search_ah(searcher, dims_per_block, aiq_thld, reorder_k)
-    else:
-        print('Using using partioning, asymmetric hashing search and reordering.')
-
-        if not partioning_trainsize:
-            partioning_trainsize = data_pool['embedding'].shape[0] // 10
-        if not num_leaves:
-            num_leaves = int(np.sqrt(pool_size))
-
-        if not num_leaves_to_search:
-            num_leaves_to_search = max(num_leaves // 20, 1)
-
-        print('Partitioning params:')
-        print(f'num_leaves: {num_leaves}')
-        print(f'num_leaves_to_search: {num_leaves_to_search}')
-        # self.searcher = self.search_ah(searcher, dims_per_block, aiq_thld, reorder_k)
-        searcher = search_partioned_ah(searcher, dims_per_block, aiq_thld, reorder_k,
-                                                 partioning_trainsize, num_leaves, num_leaves_to_search)
-
-    print('Finish training searcher')
-    searcher_savedir = opt.target_path
-    os.makedirs(searcher_savedir, exist_ok=True)
-    searcher.serialize(searcher_savedir)
-    print(f'Saved trained searcher under "{searcher_savedir}"')
-
-if __name__ == '__main__':
-    sys.path.append(os.getcwd())
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--database',
-                        '-d',
-                        default='data/rdm/retrieval_databases/openimages',
-                        type=str,
-                        help='path to folder containing the clip feature of the database')
-    parser.add_argument('--target_path',
-                        '-t',
-                        default='data/rdm/searchers/openimages',
-                        type=str,
-                        help='path to the target folder where the searcher shall be stored.')
-    parser.add_argument('--knn',
-                        '-k',
-                        default=20,
-                        type=int,
-                        help='number of nearest neighbors, for which the searcher shall be optimized')
-
-    opt, _  = parser.parse_known_args()
-
-    train_searcher(opt,)
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/scripts/txt2img.py b/examples/tutorial/stable_diffusion/scripts/txt2img.py
deleted file mode 100644
index 59c16a1db871..000000000000
--- a/examples/tutorial/stable_diffusion/scripts/txt2img.py
+++ /dev/null
@@ -1,344 +0,0 @@
-import argparse, os, sys, glob
-import cv2
-import torch
-import numpy as np
-from omegaconf import OmegaConf
-from PIL import Image
-from tqdm import tqdm, trange
-from imwatermark import WatermarkEncoder
-from itertools import islice
-from einops import rearrange
-from torchvision.utils import make_grid
-import time
-from pytorch_lightning import seed_everything
-from torch import autocast
-from contextlib import contextmanager, nullcontext
-
-from ldm.util import instantiate_from_config
-from ldm.models.diffusion.ddim import DDIMSampler
-from ldm.models.diffusion.plms import PLMSSampler
-
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from transformers import AutoFeatureExtractor
-
-
-# load safety model
-safety_model_id = "CompVis/stable-diffusion-safety-checker"
-safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id)
-safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id)
-
-
-def chunk(it, size):
-    it = iter(it)
-    return iter(lambda: tuple(islice(it, size)), ())
-
-
-def numpy_to_pil(images):
-    """
-    Convert a numpy image or a batch of images to a PIL image.
-    """
-    if images.ndim == 3:
-        images = images[None, ...]
-    images = (images * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-
-    return pil_images
-
-
-def load_model_from_config(config, ckpt, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    sd = pl_sd["state_dict"]
-    model = instantiate_from_config(config.model)
-    m, u = model.load_state_dict(sd, strict=False)
-    if len(m) > 0 and verbose:
-        print("missing keys:")
-        print(m)
-    if len(u) > 0 and verbose:
-        print("unexpected keys:")
-        print(u)
-
-    model.cuda()
-    model.eval()
-    return model
-
-
-def put_watermark(img, wm_encoder=None):
-    if wm_encoder is not None:
-        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
-        img = wm_encoder.encode(img, 'dwtDct')
-        img = Image.fromarray(img[:, :, ::-1])
-    return img
-
-
-def load_replacement(x):
-    try:
-        hwc = x.shape
-        y = Image.open("assets/rick.jpeg").convert("RGB").resize((hwc[1], hwc[0]))
-        y = (np.array(y)/255.0).astype(x.dtype)
-        assert y.shape == x.shape
-        return y
-    except Exception:
-        return x
-
-
-def check_safety(x_image):
-    safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")
-    x_checked_image, has_nsfw_concept = safety_checker(images=x_image, clip_input=safety_checker_input.pixel_values)
-    assert x_checked_image.shape[0] == len(has_nsfw_concept)
-    for i in range(len(has_nsfw_concept)):
-        if has_nsfw_concept[i]:
-            x_checked_image[i] = load_replacement(x_checked_image[i])
-    return x_checked_image, has_nsfw_concept
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        nargs="?",
-        default="a painting of a virus monster playing guitar",
-        help="the prompt to render"
-    )
-    parser.add_argument(
-        "--outdir",
-        type=str,
-        nargs="?",
-        help="dir to write results to",
-        default="outputs/txt2img-samples"
-    )
-    parser.add_argument(
-        "--skip_grid",
-        action='store_true',
-        help="do not save a grid, only individual samples. Helpful when evaluating lots of samples",
-    )
-    parser.add_argument(
-        "--skip_save",
-        action='store_true',
-        help="do not save individual samples. For speed measurements.",
-    )
-    parser.add_argument(
-        "--ddim_steps",
-        type=int,
-        default=50,
-        help="number of ddim sampling steps",
-    )
-    parser.add_argument(
-        "--plms",
-        action='store_true',
-        help="use plms sampling",
-    )
-    parser.add_argument(
-        "--laion400m",
-        action='store_true',
-        help="uses the LAION400M model",
-    )
-    parser.add_argument(
-        "--fixed_code",
-        action='store_true',
-        help="if enabled, uses the same starting code across samples ",
-    )
-    parser.add_argument(
-        "--ddim_eta",
-        type=float,
-        default=0.0,
-        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
-    )
-    parser.add_argument(
-        "--n_iter",
-        type=int,
-        default=2,
-        help="sample this often",
-    )
-    parser.add_argument(
-        "--H",
-        type=int,
-        default=512,
-        help="image height, in pixel space",
-    )
-    parser.add_argument(
-        "--W",
-        type=int,
-        default=512,
-        help="image width, in pixel space",
-    )
-    parser.add_argument(
-        "--C",
-        type=int,
-        default=4,
-        help="latent channels",
-    )
-    parser.add_argument(
-        "--f",
-        type=int,
-        default=8,
-        help="downsampling factor",
-    )
-    parser.add_argument(
-        "--n_samples",
-        type=int,
-        default=3,
-        help="how many samples to produce for each given prompt. A.k.a. batch size",
-    )
-    parser.add_argument(
-        "--n_rows",
-        type=int,
-        default=0,
-        help="rows in the grid (default: n_samples)",
-    )
-    parser.add_argument(
-        "--scale",
-        type=float,
-        default=7.5,
-        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
-    )
-    parser.add_argument(
-        "--from-file",
-        type=str,
-        help="if specified, load prompts from this file",
-    )
-    parser.add_argument(
-        "--config",
-        type=str,
-        default="configs/stable-diffusion/v1-inference.yaml",
-        help="path to config which constructs model",
-    )
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="models/ldm/stable-diffusion-v1/model.ckpt",
-        help="path to checkpoint of model",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="the seed (for reproducible sampling)",
-    )
-    parser.add_argument(
-        "--precision",
-        type=str,
-        help="evaluate at this precision",
-        choices=["full", "autocast"],
-        default="autocast"
-    )
-    opt = parser.parse_args()
-
-    if opt.laion400m:
-        print("Falling back to LAION 400M model...")
-        opt.config = "configs/latent-diffusion/txt2img-1p4B-eval.yaml"
-        opt.ckpt = "models/ldm/text2img-large/model.ckpt"
-        opt.outdir = "outputs/txt2img-samples-laion400m"
-
-    seed_everything(opt.seed)
-
-    config = OmegaConf.load(f"{opt.config}")
-    model = load_model_from_config(config, f"{opt.ckpt}")
-
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = model.to(device)
-
-    if opt.plms:
-        sampler = PLMSSampler(model)
-    else:
-        sampler = DDIMSampler(model)
-
-    os.makedirs(opt.outdir, exist_ok=True)
-    outpath = opt.outdir
-
-    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
-    wm = "StableDiffusionV1"
-    wm_encoder = WatermarkEncoder()
-    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
-
-    batch_size = opt.n_samples
-    n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
-    if not opt.from_file:
-        prompt = opt.prompt
-        assert prompt is not None
-        data = [batch_size * [prompt]]
-
-    else:
-        print(f"reading prompts from {opt.from_file}")
-        with open(opt.from_file, "r") as f:
-            data = f.read().splitlines()
-            data = list(chunk(data, batch_size))
-
-    sample_path = os.path.join(outpath, "samples")
-    os.makedirs(sample_path, exist_ok=True)
-    base_count = len(os.listdir(sample_path))
-    grid_count = len(os.listdir(outpath)) - 1
-
-    start_code = None
-    if opt.fixed_code:
-        start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device)
-
-    precision_scope = autocast if opt.precision=="autocast" else nullcontext
-    with torch.no_grad():
-        with precision_scope("cuda"):
-            with model.ema_scope():
-                tic = time.time()
-                all_samples = list()
-                for n in trange(opt.n_iter, desc="Sampling"):
-                    for prompts in tqdm(data, desc="data"):
-                        uc = None
-                        if opt.scale != 1.0:
-                            uc = model.get_learned_conditioning(batch_size * [""])
-                        if isinstance(prompts, tuple):
-                            prompts = list(prompts)
-                        c = model.get_learned_conditioning(prompts)
-                        shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
-                        samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
-                                                         conditioning=c,
-                                                         batch_size=opt.n_samples,
-                                                         shape=shape,
-                                                         verbose=False,
-                                                         unconditional_guidance_scale=opt.scale,
-                                                         unconditional_conditioning=uc,
-                                                         eta=opt.ddim_eta,
-                                                         x_T=start_code)
-
-                        x_samples_ddim = model.decode_first_stage(samples_ddim)
-                        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-                        x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
-
-                        x_checked_image, has_nsfw_concept = check_safety(x_samples_ddim)
-
-                        x_checked_image_torch = torch.from_numpy(x_checked_image).permute(0, 3, 1, 2)
-
-                        if not opt.skip_save:
-                            for x_sample in x_checked_image_torch:
-                                x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
-                                img = Image.fromarray(x_sample.astype(np.uint8))
-                                img = put_watermark(img, wm_encoder)
-                                img.save(os.path.join(sample_path, f"{base_count:05}.png"))
-                                base_count += 1
-
-                        if not opt.skip_grid:
-                            all_samples.append(x_checked_image_torch)
-
-                if not opt.skip_grid:
-                    # additionally, save as grid
-                    grid = torch.stack(all_samples, 0)
-                    grid = rearrange(grid, 'n b c h w -> (n b) c h w')
-                    grid = make_grid(grid, nrow=n_rows)
-
-                    # to image
-                    grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
-                    img = Image.fromarray(grid.astype(np.uint8))
-                    img = put_watermark(img, wm_encoder)
-                    img.save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
-                    grid_count += 1
-
-                toc = time.time()
-
-    print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
-          f" \nEnjoy.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/tutorial/stable_diffusion/setup.py b/examples/tutorial/stable_diffusion/setup.py
deleted file mode 100644
index a24d54167640..000000000000
--- a/examples/tutorial/stable_diffusion/setup.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from setuptools import setup, find_packages
-
-setup(
-    name='latent-diffusion',
-    version='0.0.1',
-    description='',
-    packages=find_packages(),
-    install_requires=[
-        'torch',
-        'numpy',
-        'tqdm',
-    ],
-)
\ No newline at end of file
diff --git a/examples/tutorial/stable_diffusion/train.sh b/examples/tutorial/stable_diffusion/train.sh
deleted file mode 100644
index 63abcadbf62b..000000000000
--- a/examples/tutorial/stable_diffusion/train.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-HF_DATASETS_OFFLINE=1 
-TRANSFORMERS_OFFLINE=1 
-
-python main.py --logdir /tmp -t --postfix test -b configs/train_colossalai.yaml 

From bb4e9a311a7a32acb6370f39e6b1a3e4c250b885 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Wed, 11 Jan 2023 10:07:37 +0800
Subject: [PATCH 155/503] [zero] add inference mode and its unit test (#2418)

---
 colossalai/gemini/gemini_mgr.py            |  18 ++-
 colossalai/nn/parallel/data_parallel.py    |  23 ++++
 tests/test_gemini/update/test_inference.py | 122 +++++++++++++++++++++
 3 files changed, 157 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_gemini/update/test_inference.py

diff --git a/colossalai/gemini/gemini_mgr.py b/colossalai/gemini/gemini_mgr.py
index 08961b95832a..08fc0cf922d4 100644
--- a/colossalai/gemini/gemini_mgr.py
+++ b/colossalai/gemini/gemini_mgr.py
@@ -50,6 +50,17 @@ def __init__(self, placement_policy: str, chunk_manager: ChunkManager, memstats:
         self._warmup = True
         self._comp_cuda_demand_time = 0
 
+    def reset_attributes(self):
+        self._compute_idx = -1
+        self._h2d_volume = 0
+        self._d2h_volume = 0
+        self._layout_time = 0
+        self._evict_time = 0
+        self._comp_cuda_demand_time = 0
+
+    def is_warmup(self):
+        return self._warmup
+
     def memstats(self):
         """memstats
 
@@ -73,12 +84,7 @@ def post_iter(self):
         if self._mem_stats_collector and self._warmup:
             self._mem_stats_collector.finish_collection()
         self._warmup = False
-        self._compute_idx = -1
-        self._h2d_volume = 0
-        self._d2h_volume = 0
-        self._layout_time = 0
-        self._evict_time = 0
-        self._comp_cuda_demand_time = 0
+        self.reset_attributes()
 
     def adjust_layout(self, chunks: Tuple[Chunk, ...]) -> None:
         """ Adjust the layout of stateful tensors according to the information provided
diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index a7d79be160d0..5e547059a937 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -268,12 +268,35 @@ def __init__(self,
 
         self._logger = get_dist_logger()
 
+    def _post_forward(self):
+        """This function is only triggered for inference.
+        """
+        access_list = list(self.chunk_manager.accessed_chunks)
+        # we need to scatter all accessed chunks and move them to their original places
+        for chunk in access_list:
+            assert chunk.can_release
+            self.chunk_manager.release_chunk(chunk)
+            first_param = next(iter(chunk.tensors_info))
+            self.chunk_manager.move_chunk(chunk, self.grads_device[first_param])
+        assert self.chunk_manager.accessed_mem == 0
+        # reset all recorded attributes
+        self.gemini_manager.reset_attributes()
+
     def forward(self, *args, **kwargs):
+        # check whether we are in a inference mode
+        grad_flag = torch.is_grad_enabled()
+        if not grad_flag:
+            assert not self.gemini_manager.is_warmup(), "You should run a completed iteration as your warmup iter"
+
         args, kwargs = _cast_float(args, torch.half), _cast_float(kwargs, torch.half)
         self.module.zero_grad(set_to_none=True)
         self.gemini_manager.pre_iter(*args)
         with ColoParamOpHookManager.use_hooks(self.param_op_hook):
             outputs = self.module(*args, **kwargs)
+        # scatter chunks in the inference mode
+        if not grad_flag:
+            self._post_forward()
+
         if self.force_outputs_fp32:
             return _cast_float(outputs, torch.float)
         return outputs
diff --git a/tests/test_gemini/update/test_inference.py b/tests/test_gemini/update/test_inference.py
new file mode 100644
index 000000000000..aec945fc9243
--- /dev/null
+++ b/tests/test_gemini/update/test_inference.py
@@ -0,0 +1,122 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.testing import assert_close
+
+import colossalai
+from colossalai.amp import convert_to_apex_amp
+from colossalai.gemini.chunk import ChunkManager, init_chunk_manager, search_chunk_configuration
+from colossalai.gemini.gemini_mgr import GeminiManager
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.optimizer.zero_optimizer import ZeroOptimizer
+from colossalai.nn.parallel import ZeroDDP
+from colossalai.testing import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from colossalai.utils.cuda import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext, post_process_colo_init_ctx
+from tests.components_to_test import run_fwd_bwd
+from tests.components_to_test.registry import non_distributed_component_funcs
+from tests.test_tensor.common_utils import debug_print, set_seed
+
+
+def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
+    zero_dict = model.state_dict(only_rank_0=False)
+    torch_dict = torch_model.state_dict()
+
+    for key, value in torch_dict.items():
+        # key is 'module.model.PARAMETER', so we truncate it
+        key = key[7:]
+        assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
+        temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
+        # debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value)))
+        assert_close(value, temp_zero_value, rtol=1e-3, atol=4e-3)
+
+
+@parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
+@parameterize('model_name', ['gpt2'])
+def exam_inference(placement_policy, model_name: str):
+    set_seed(19360226)
+    get_components_func = non_distributed_component_funcs.get_callable(model_name)
+    model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
+
+    torch_model = model_builder().cuda()
+    amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False, loss_scale=128)
+    torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
+    torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config)
+    torch_model = DDP(torch_model, device_ids=[dist.get_rank()])
+
+    init_dev = get_current_device()
+    with ColoInitContext(device=init_dev):
+        model = model_builder()
+
+    for torch_p, p in zip(torch_model.parameters(), model.parameters()):
+        p.data.copy_(torch_p.data)
+
+    world_size = torch.distributed.get_world_size()
+    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict[world_size]['chunk_size'] = 5000
+    config_dict[world_size]['keep_gathered'] = False
+    if placement_policy != 'cuda':
+        init_device = torch.device('cpu')
+    else:
+        init_device = None
+    chunk_manager = ChunkManager(config_dict, init_device=init_device)
+    gemini_manager = GeminiManager(placement_policy, chunk_manager)
+    model = ZeroDDP(model, gemini_manager, pin_memory=True)
+
+    optimizer = HybridAdam(model.parameters(), lr=1e-3)
+    zero_optim = ZeroOptimizer(optimizer, model, initial_scale=128)
+
+    model.eval()
+    torch_model.eval()
+
+    set_seed(dist.get_rank() * 3 + 128)
+    train_dataloader = iter(train_dataloader)
+
+    def train_iter():
+        input_ids, label = next(train_dataloader)
+        input_ids, label = input_ids.cuda(), label.cuda()
+        zero_optim.zero_grad()
+        torch_optim.zero_grad()
+        torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim)
+        loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim)
+        assert_close(torch_loss, loss)
+        zero_optim.step()
+        torch_optim.step()
+        check_param(model, torch_model)
+
+    def inference_iter():
+        input_ids, label = next(train_dataloader)
+        input_ids, label = input_ids.cuda(), label.cuda()
+        with torch.no_grad():
+            torch_output = torch_model(input_ids)
+            torch_loss = criterion(torch_output.float(), label)
+            zero_output = model(input_ids)
+            zero_loss = criterion(zero_output.float(), label)
+        assert_close(torch_loss, zero_loss)
+
+    train_iter()
+    inference_iter()
+    train_iter()
+
+
+def run_dist(rank, world_size, port):
+    config = {}
+    colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    exam_inference()
+
+
+@pytest.mark.dist
+@pytest.mark.parametrize('world_size', [1, 4])
+@rerun_if_address_is_in_use()
+def test_inference(world_size):
+    run_func = partial(run_dist, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_inference(1)

From 21256674e99eef3da80b9572238ee2aef04b21a3 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 10:44:52 +0800
Subject: [PATCH 156/503] [workflow] report test coverage even if below
 threshold (#2431)

---
 .github/workflows/report_test_coverage.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
index 167aa28b6b62..361eae8e4b75 100644
--- a/.github/workflows/report_test_coverage.yml
+++ b/.github/workflows/report_test_coverage.yml
@@ -40,7 +40,6 @@ jobs:
         with:
           filename: coverage.xml
           badge: true
-          fail_below_min: true
           format: markdown
           hide_branch_rate: false
           hide_complexity: false

From a3e549615627c7893f1b7189719644a02d0f0319 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 10:46:32 +0800
Subject: [PATCH 157/503] [example] improved the clarity of the example readme
 (#2427)

* [example] improved the clarity of the example readme

* polish workflow

* polish workflow

* polish workflow

* polish workflow

* polish workflow

* polish workflow
---
 .github/workflows/auto_example_check.yml |  5 ++-
 examples/README.md                       | 48 +++++++++++++++---------
 2 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml
index d9063bad9f33..f88b6858e003 100644
--- a/.github/workflows/auto_example_check.yml
+++ b/.github/workflows/auto_example_check.yml
@@ -39,7 +39,7 @@ jobs:
           res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
           echo "All changed examples are $res"
 
-          if [ "$x" = "[]" ]; then
+          if [ "$res" = "[]" ]; then
             echo "anyChanged=false" >> $GITHUB_OUTPUT
             echo "matrix=null" >> $GITHUB_OUTPUT
           else
@@ -54,7 +54,8 @@ jobs:
     if: |
         github.event.pull_request.draft == false &&
         github.base_ref == 'main' &&
-        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' &&
+        needs.detect-changed-example.outputs.anyChanged == 'true'
     name: Test the changed example
     needs: detect-changed-example
     runs-on: [self-hosted, gpu]
diff --git a/examples/README.md b/examples/README.md
index 53ab0896da0b..78facea5406d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,28 +1,40 @@
-## Examples folder document
+# Colossal-AI Examples
 
 ## Table of Contents
-<ul>
- <li><a href="#Example-folder-description">Example folder description</a> </li>
- <li><a href="#Integrate-Your-Example-With-System-Testing">Integrate Your Example With System Testing</a> </li>
-</ul>
 
-## Example folder description
+- [Colossal-AI Examples](#colossal-ai-examples)
+  - [Table of Contents](#table-of-contents)
+  - [Overview](#overview)
+  - [Folder Structure](#folder-structure)
+  - [Integrate Your Example With Testing](#integrate-your-example-with-testing)
 
-This folder provides several examples using colossalai. The images folder includes model like diffusion, dreambooth and vit. The language folder includes gpt, opt, palm and roberta. The tutorial folder is for concept illustration, such as auto-parallel, hybrid-parallel and so on.
+## Overview
 
+This folder provides several examples accelerated by Colossal-AI. The `tutorial` folder is for everyone to quickly try out the different features in Colossal-AI. Other folders such as `images` and `language` include a wide range of deep learning tasks and applications.
 
-## Integrate Your Example With System Testing
+## Folder Structure
 
-For example code contributor, to meet the expectation and test your code automatically using github workflow function, here are several steps:
+```text
+└─ examples
+  └─ images
+      └─ vit
+        └─ test_ci.sh
+        └─ train.py
+        └─ README.md
+      └─ ...
+  └─ ...
+```
 
+## Integrate Your Example With Testing
 
-- (must) Have a test_ci.sh file in the folder like shown below in 'File Structure Chart'
-- The dataset should be located in the company's machine and can be announced using environment variable and thus no need for a separate terminal command.
-- The model parameters should be small to allow fast testing.
-- File Structure Chart
+Regular checks are important to ensure that all examples run without apparent bugs and stay compatible with the latest API.
+Colossal-AI runs workflows to check for examples on a on-pull-request and weekly basis.
+When a new example is added or changed, the workflow will run the example to test whether it can run.
+Moreover, Colossal-AI will run testing for examples every week.
 
-       └─examples
-          └─images
-              └─vit
-                └─requirements.txt
-                └─test_ci.sh
+Therefore, it is essential for the example contributors to know how to integrate your example with the testing workflow. Simply, you can follow the steps below.
+
+1. Create a script called `test_ci.sh` in your example folder
+2. Configure your testing parameters such as number steps, batch size in `test_ci.sh`, e.t.c. Keep these parameters small such that each example only takes several minutes.
+3. Export your dataset path with the prefix `/data` and make sure you have a copy of the dataset in the `/data/scratch/examples-data` directory on the CI machine. Community contributors can contact us via slack to request for downloading the dataset on the CI machine.
+4. Implement the logic such as dependency setup and example execution

From 7829aa094e2835273d7b0616369dcae3d083274f Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Wed, 11 Jan 2023 12:22:45 +0800
Subject: [PATCH 158/503] [ddp] add is_ddp_ignored (#2434)

[ddp] rename to is_ddp_ignored
---
 colossalai/gemini/chunk/search_utils.py   |  9 ++---
 colossalai/gemini/chunk/utils.py          |  6 ++--
 colossalai/nn/optimizer/zero_optimizer.py |  4 +--
 colossalai/nn/parallel/data_parallel.py   | 12 +++----
 colossalai/utils/__init__.py              | 42 ++++++++++++++++++-----
 colossalai/utils/common.py                |  8 +++--
 colossalai/zero/utils/gemini_hook.py      |  5 +--
 7 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/colossalai/gemini/chunk/search_utils.py b/colossalai/gemini/chunk/search_utils.py
index 312d77f1826c..572c3d94531f 100644
--- a/colossalai/gemini/chunk/search_utils.py
+++ b/colossalai/gemini/chunk/search_utils.py
@@ -6,17 +6,14 @@
 
 from colossalai.gemini.memory_tracer import MemStats, OrderedParamGenerator
 from colossalai.tensor import ColoParameter
-
-
-def in_ddp(param: nn.Parameter) -> bool:
-    return not getattr(param, '_ddp_to_ignore', False)
+from colossalai.utils import is_ddp_ignored
 
 
 def _filter_exlarge_params(model: nn.Module, size_dict: Dict[int, List[int]]) -> None:
     """
     Filter those parameters whose size is too large (more than 3x standard deviations) from others.
     """
-    params_size = [p.numel() for p in model.parameters() if in_ddp(p)]
+    params_size = [p.numel() for p in model.parameters() if not is_ddp_ignored(p)]
     params_size_arr = np.array(params_size)
 
     std = np.std(params_size_arr)
@@ -56,7 +53,7 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator) -> Dict[int
     params_dict: Dict[int, List[ColoParameter]] = dict()
     for param in param_order.generate():
         assert isinstance(param, ColoParameter), "please init model in the ColoInitContext"
-        if not in_ddp(param):
+        if is_ddp_ignored(param):
             continue
 
         param_key = param.process_group.dp_world_size()
diff --git a/colossalai/gemini/chunk/utils.py b/colossalai/gemini/chunk/utils.py
index e9a9f84e7a93..883022fe89b8 100644
--- a/colossalai/gemini/chunk/utils.py
+++ b/colossalai/gemini/chunk/utils.py
@@ -6,8 +6,8 @@
 import torch.nn as nn
 
 from colossalai.gemini.chunk import ChunkManager
-from colossalai.gemini.chunk.search_utils import in_ddp, search_chunk_configuration
-from colossalai.gemini.memory_tracer import MemStats
+from colossalai.gemini.chunk.search_utils import search_chunk_configuration
+from colossalai.utils import is_ddp_ignored
 
 
 def init_chunk_manager(model: nn.Module,
@@ -34,7 +34,7 @@ def init_chunk_manager(model: nn.Module,
     if filter_exlarge_params:
         kwargs_dict["filter_exlarge_params"] = filter_exlarge_params
 
-    params_sizes = [p.numel() for p in model.parameters() if in_ddp(p)]
+    params_sizes = [p.numel() for p in model.parameters() if not is_ddp_ignored(p)]
     total_size = sum(params_sizes) / 1024**2
 
     dist.barrier()
diff --git a/colossalai/nn/optimizer/zero_optimizer.py b/colossalai/nn/optimizer/zero_optimizer.py
index 7f9d2fe8fc97..3dd9d1e93b36 100644
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@@ -12,7 +12,7 @@
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import ColossalaiOptimizer, CPUAdam, FusedAdam, HybridAdam
 from colossalai.nn.parallel.data_parallel import ZeroDDP
-from colossalai.utils import disposable, get_current_device
+from colossalai.utils import disposable, get_current_device, is_ddp_ignored
 
 _AVAIL_OPTIM_LIST = {FusedAdam, CPUAdam, HybridAdam}
 
@@ -78,7 +78,7 @@ def __init__(self,
         if self.clipping_flag:
             assert norm_type == 2.0, "ZeroOptimizer only supports L2 norm now"
 
-        params_list = [p for p in module.parameters() if not getattr(p, '_ddp_to_ignore', False)]
+        params_list = [p for p in module.parameters() if not is_ddp_ignored(p)]
         for p, fp32_p in zip(params_list, module.fp32_params):
             chunk_16 = self.chunk_manager.get_chunk(p)
             if chunk_16 not in self.chunk16_set:
diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index 5e547059a937..649bd920d3b2 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -14,7 +14,7 @@
 from colossalai.tensor import ProcessGroup as ColoProcessGroup
 from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import get_current_device
+from colossalai.utils import get_current_device, is_ddp_ignored
 from colossalai.zero.utils.gemini_hook import GeminiZeROHook
 
 from .reducer import Reducer
@@ -81,7 +81,7 @@ def __init__(self,
         self.reducer = Reducer(bucket_cap_mb)
         self.rebuild_bucket = rebuild_bucket
         for p in module.parameters():
-            if getattr(p, '_ddp_to_ignore', False):
+            if is_ddp_ignored(p):
                 continue
             if p.requires_grad:
                 p.register_hook(partial(self.grad_handle, p))
@@ -116,7 +116,7 @@ def backward(self, loss: torch.Tensor):
         if self.rebuild_bucket:
             self.reducer.free()
         for p in self.module.parameters():
-            if getattr(p, '_ddp_to_ignore', False):
+            if is_ddp_ignored(p):
                 continue
             if p.grad.device.type != "cpu":
                 p.grad = p._saved_grad
@@ -232,7 +232,7 @@ def __init__(self,
         for p in param_order.generate():
             assert isinstance(p, ColoParameter)
 
-            if getattr(p, '_ddp_to_ignore', False):
+            if is_ddp_ignored(p):
                 p.data = p.data.half()
                 continue
 
@@ -256,7 +256,7 @@ def __init__(self,
         self.chunk_manager.close_all_groups()
         self._cast_buffers()
 
-        params_list = [p for p in param_order.generate() if not getattr(p, '_ddp_to_ignore', False)]
+        params_list = [p for p in param_order.generate() if not is_ddp_ignored(p)]
         for p, fp32_p in zip(params_list, self.fp32_params):
             chunk_16 = self.chunk_manager.get_chunk(p)
             chunk_32 = self.chunk_manager.get_chunk(fp32_p)
@@ -303,7 +303,7 @@ def forward(self, *args, **kwargs):
 
     def _setup_grads_ptr(self):
         for p in self.module.parameters():
-            if getattr(p, '_ddp_to_ignore', False):
+            if is_ddp_ignored(p):
                 continue
             p.grad = None
 
diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py
index 875b5a93ba4f..3f16bd91e5fe 100644
--- a/colossalai/utils/__init__.py
+++ b/colossalai/utils/__init__.py
@@ -1,22 +1,46 @@
-from .cuda import empty_cache, get_current_device, set_to_cuda, synchronize
 from .activation_checkpoint import checkpoint
 from .checkpointing import load_checkpoint, save_checkpoint
-from .common import (clip_grad_norm_fp32, conditional_context, copy_tensor_parallel_attributes, count_zeros_fp32,
-                     ensure_path_exists, free_port, is_dp_rank_0, is_model_parallel_parameter, is_no_pp_or_last_stage,
-                     is_tp_rank_0, is_using_ddp, is_using_pp, is_using_sequence, multi_tensor_applier,
-                     param_is_not_tensor_parallel_duplicate, print_rank_0, switch_virtual_pipeline_parallel_rank,
-                     sync_model_param, disposable)
+from .common import (
+    clip_grad_norm_fp32,
+    conditional_context,
+    copy_tensor_parallel_attributes,
+    count_zeros_fp32,
+    disposable,
+    ensure_path_exists,
+    free_port,
+    is_ddp_ignored,
+    is_dp_rank_0,
+    is_model_parallel_parameter,
+    is_no_pp_or_last_stage,
+    is_tp_rank_0,
+    is_using_ddp,
+    is_using_pp,
+    is_using_sequence,
+    multi_tensor_applier,
+    param_is_not_tensor_parallel_duplicate,
+    print_rank_0,
+    switch_virtual_pipeline_parallel_rank,
+    sync_model_param,
+)
+from .cuda import empty_cache, get_current_device, set_to_cuda, synchronize
 from .data_sampler import DataParallelSampler, get_dataloader
-from .memory import (report_memory_usage, colo_device_memory_used, colo_set_process_memory_fraction,
-                     colo_device_memory_capacity, colo_set_cpu_memory_capacity, colo_get_cpu_memory_capacity)
-from .timer import MultiTimer, Timer
+from .memory import (
+    colo_device_memory_capacity,
+    colo_device_memory_used,
+    colo_get_cpu_memory_capacity,
+    colo_set_cpu_memory_capacity,
+    colo_set_process_memory_fraction,
+    report_memory_usage,
+)
 from .tensor_detector import TensorDetector
+from .timer import MultiTimer, Timer
 
 __all__ = [
     'checkpoint',
     'free_port',
     'print_rank_0',
     'sync_model_param',
+    'is_ddp_ignored',
     'is_dp_rank_0',
     'is_tp_rank_0',
     'is_no_pp_or_last_stage',
diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index 7575fa292f14..2099883fbdf7 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -126,14 +126,18 @@ def is_model_parallel_parameter(p):
     return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL)
 
 
+def is_ddp_ignored(p):
+    return getattr(p, '_ddp_to_ignore', False)
+
+
 def _calc_l2_norm(grads):
-    # we should not 
+    # we should not
     global fused_optim
 
     if fused_optim is None:
         from colossalai.kernel.op_builder import FusedOptimBuilder
         fused_optim = FusedOptimBuilder().load()
-        
+
     norm = 0.0
     if len(grads) > 0:
         dummy_overflow_buf = torch.cuda.IntTensor([0])
diff --git a/colossalai/zero/utils/gemini_hook.py b/colossalai/zero/utils/gemini_hook.py
index 35569c7172b3..bddc307a0504 100644
--- a/colossalai/zero/utils/gemini_hook.py
+++ b/colossalai/zero/utils/gemini_hook.py
@@ -8,6 +8,7 @@
 from colossalai.gemini import TensorState
 from colossalai.gemini.gemini_mgr import GeminiManager
 from colossalai.tensor.param_op_hook import ColoParamOpHook
+from colossalai.utils import is_ddp_ignored
 
 
 class TrainingPhase(Enum):
@@ -24,7 +25,7 @@ def __init__(self, gemini_manager: GeminiManager) -> None:
         self._training_phase = TrainingPhase.FORWARD
 
     def pre_op(self, params):
-        params = [p for p in params if not getattr(p, '_ddp_to_ignore', False)]
+        params = [p for p in params if not is_ddp_ignored(p)]
         chunks = self._chunk_manager.get_chunks(params)
         for p in params:
             self._chunk_manager.trans_tensor_state(p, TensorState.COMPUTE)
@@ -37,7 +38,7 @@ def pre_op(self, params):
         self._gemini_manager.record_model_data_volume()
 
     def post_op(self, params):
-        params = [p for p in params if not getattr(p, '_ddp_to_ignore', False)]
+        params = [p for p in params if not is_ddp_ignored(p)]
         for p in params:
             tensor_state = TensorState.HOLD if self._training_phase == TrainingPhase.FORWARD or not p.requires_grad else TensorState.HOLD_AFTER_BWD
             self._chunk_manager.trans_tensor_state(p, tensor_state)

From 1b7587d95891e972553ef0e9b06614706f783bfc Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 13:37:48 +0800
Subject: [PATCH 159/503] [workflow] make test coverage report collapsable
 (#2436)

---
 .github/workflows/report_test_coverage.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
index 361eae8e4b75..dc3fe395f00b 100644
--- a/.github/workflows/report_test_coverage.yml
+++ b/.github/workflows/report_test_coverage.yml
@@ -47,6 +47,12 @@ jobs:
           output: both
           thresholds: '80 90'
 
+      - name: Make Coverage Report Collapsable
+        run: |
+          sed -i '2 i <details>' code-coverage-results.md
+          sed -i '3 i <summary>Click me to view the complete report</summary>' code-coverage-results.md
+          echo "</details>" >> code-coverage-results.md
+
       - name: 'Comment on PR'
         uses: actions/github-script@v6
         with:

From 41429b9b28b1e826a13f74cd71c7dfdfcad86300 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 11 Jan 2023 13:40:33 +0800
Subject: [PATCH 160/503] [autoparallel] add shard option (#2423)

---
 .../tensor_shard/node_handler/__init__.py     |   3 +-
 .../tensor_shard/node_handler/node_handler.py |  18 +++
 .../tensor_shard/node_handler/option.py       |  17 +++
 .../test_node_handler/test_shard_option.py    | 112 ++++++++++++++++++
 4 files changed, 149 insertions(+), 1 deletion(-)
 create mode 100644 colossalai/auto_parallel/tensor_shard/node_handler/option.py
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py

diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py b/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
index a5e3f649a345..87bd8966bb70 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
@@ -11,6 +11,7 @@
 from .linear_handler import LinearFunctionHandler, LinearModuleHandler
 from .matmul_handler import MatMulHandler
 from .normal_pooling_handler import NormPoolingHandler
+from .option import ShardOption
 from .output_handler import OutputHandler
 from .placeholder_handler import PlaceholderHandler
 from .registry import operator_registry
@@ -27,5 +28,5 @@
     'UnaryElementwiseHandler', 'ReshapeHandler', 'PlaceholderHandler', 'OutputHandler', 'WhereHandler',
     'NormPoolingHandler', 'BinaryElementwiseHandler', 'MatMulHandler', 'operator_registry', 'ADDMMFunctionHandler',
     'GetItemHandler', 'GetattrHandler', 'ViewHandler', 'PermuteHandler', 'TensorConstructorHandler',
-    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler'
+    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler', 'ShardOption'
 ]
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
index 78dc58c905ec..fbab2b61e5af 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
@@ -5,6 +5,7 @@
 from torch.fx.node import Node
 
 from colossalai.auto_parallel.meta_profiler.metainfo import MetaInfo, meta_register
+from colossalai.auto_parallel.tensor_shard.node_handler.option import ShardOption
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
     OperationData,
     OperationDataType,
@@ -35,12 +36,14 @@ def __init__(
         node: Node,
         device_mesh: DeviceMesh,
         strategies_vector: StrategiesVector,
+        shard_option: ShardOption = ShardOption.STANDARD,
     ) -> None:
         self.node = node
         self.predecessor_node = list(node._input_nodes.keys())
         self.successor_node = list(node.users.keys())
         self.device_mesh = device_mesh
         self.strategies_vector = strategies_vector
+        self.shard_option = shard_option
 
     def update_resharding_cost(self, strategy: ShardingStrategy) -> None:
         """
@@ -181,6 +184,21 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
                 if op_data.data is not None and isinstance(op_data.data, torch.Tensor):
                     check_sharding_spec_validity(sharding_spec, op_data.data)
 
+        remove_strategy_list = []
+        for strategy in self.strategies_vector:
+            shard_level = 0
+            for op_data, sharding_spec in strategy.sharding_specs.items():
+                if op_data.data is not None and isinstance(op_data.data, torch.Tensor):
+                    for dim, shard_axis in sharding_spec.dim_partition_dict.items():
+                        shard_level += len(shard_axis)
+            if self.shard_option == ShardOption.SHARD and shard_level == 0:
+                remove_strategy_list.append(strategy)
+            if self.shard_option == ShardOption.FULL_SHARD and shard_level <= 1:
+                remove_strategy_list.append(strategy)
+
+        for strategy in remove_strategy_list:
+            self.strategies_vector.remove(strategy)
+
         return self.strategies_vector
 
     def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, List[ShardingStrategy]]:
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/option.py b/colossalai/auto_parallel/tensor_shard/node_handler/option.py
new file mode 100644
index 000000000000..dffb0386df62
--- /dev/null
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/option.py
@@ -0,0 +1,17 @@
+from enum import Enum
+
+__all__ = ['ShardOption']
+
+
+class ShardOption(Enum):
+    """
+    This enum class defines the shard level required in node strategies.
+
+    Notes:
+        STANDARD: We do not add any extra shard requirements.
+        SHARD: We require the node to be sharded using at least one device mesh axis.
+        FULL_SHARD: We require the node to be sharded using all device mesh axes.
+    """
+    STANDARD = 0
+    SHARD = 1
+    FULL_SHARD = 2
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py
new file mode 100644
index 000000000000..fda0411104b8
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py
@@ -0,0 +1,112 @@
+from functools import partial
+
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+from colossalai.auto_parallel.tensor_shard.node_handler import LinearFunctionHandler
+from colossalai.auto_parallel.tensor_shard.node_handler.option import ShardOption
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import StrategiesVector
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx import ColoGraphModule, ColoTracer
+from colossalai.testing import parameterize
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.testing.utils import parameterize
+
+
+class LinearModel(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input, others, bias=None):
+        x = nn.functional.linear(input, others, bias=bias)
+        return x
+
+
+def check_shard_option(shard_option):
+    model = LinearModel().cuda()
+    physical_mesh_id = torch.arange(0, 4)
+    mesh_shape = (2, 2)
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+
+    tracer = ColoTracer()
+    graph = tracer.trace(model,
+                         meta_args={
+                             "input": torch.rand(4, 4, 4, 16).to('meta'),
+                             'others': torch.rand(32, 16).to('meta')
+                         })
+    gm = ColoGraphModule(model, graph)
+    linear_func_node = list(graph.nodes)[2]
+    strategies_vector = StrategiesVector(linear_func_node)
+
+    # build handler
+    handler = LinearFunctionHandler(node=linear_func_node,
+                                    device_mesh=device_mesh,
+                                    strategies_vector=strategies_vector,
+                                    shard_option=shard_option)
+
+    strategies_vector = handler.register_strategy(compute_resharding_cost=False)
+    strategy_name_list = [val.name for val in strategies_vector]
+
+    # SS = SR x RS
+    assert 'S1S0 = S1R x RS0_0' in strategy_name_list
+    assert 'S0S1 = S0R x RS1_1' in strategy_name_list
+    assert 'S0S1 = S0R x RS1_2' in strategy_name_list
+    assert 'S0S1 = S0R x RS1_0' in strategy_name_list
+    assert 'S1S0 = S1R x RS0_1' in strategy_name_list
+    assert 'S1S0 = S1R x RS0_2' in strategy_name_list
+
+    # SR = SS x SR
+    assert 'S0R = S0S1 x S1R_1' in strategy_name_list
+    assert 'S0R = S0S1 x S1R_2' in strategy_name_list
+    assert 'S1R = S1S0 x S0R_0' in strategy_name_list
+    assert 'S0R = S0S1 x S1R_0' in strategy_name_list
+    assert 'S1R = S1S0 x S0R_1' in strategy_name_list
+    assert 'S1R = S1S0 x S0R_2' in strategy_name_list
+
+    # RS = RS x SS
+    assert 'RS0 = RS1 x S1S0' in strategy_name_list
+    assert 'RS1 = RS0 x S0S1' in strategy_name_list
+
+    # S01R = S01R x RR
+    assert 'S01R = S01R x RR_0' in strategy_name_list
+    assert 'S01R = S01R x RR_1' in strategy_name_list
+    assert 'S01R = S01R x RR_2' in strategy_name_list
+
+    # RR = RS01 x S01R
+    assert 'RR = RS01 x S01R' in strategy_name_list
+
+    # RS01 = RR x RS01
+    assert 'RS01 = RR x RS01' in strategy_name_list
+
+    if shard_option == ShardOption.SHARD:
+        # RR = RS x SR
+        assert 'RR = RS0 x S0R' in strategy_name_list
+        assert 'RR = RS1 x S1R' in strategy_name_list
+
+        # RS= RR x RS
+        assert 'RS0 = RR x RS0' in strategy_name_list
+        assert 'RS1 = RR x RS1' in strategy_name_list
+
+    if shard_option == ShardOption.STANDARD:
+        # RR = RS x SR
+        assert 'RR = RS0 x S0R' in strategy_name_list
+        assert 'RR = RS1 x S1R' in strategy_name_list
+
+        # RS= RR x RS
+        assert 'RS0 = RR x RS0' in strategy_name_list
+        assert 'RS1 = RR x RS1' in strategy_name_list
+
+        # RR = RR x RR
+        assert 'RR = RR x RR' in strategy_name_list
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+def test_shard_option():
+    for shard_option in [ShardOption.STANDARD, ShardOption.SHARD, ShardOption.FULL_SHARD]:
+        check_shard_option(shard_option)
+
+
+if __name__ == '__main__':
+    test_shard_option()

From c41e59e5adc27d08b17234eada91ebcb3d876b23 Mon Sep 17 00:00:00 2001
From: Super Daniel <78588128+super-dainiu@users.noreply.github.com>
Date: Wed, 11 Jan 2023 13:49:59 +0800
Subject: [PATCH 161/503] [fx] allow native ckpt trace and codegen. (#2438)

---
 colossalai/fx/graph_module.py           | 15 ++++++---
 colossalai/fx/tracer/_symbolic_trace.py |  3 +-
 colossalai/fx/tracer/experimental.py    | 42 +++++++++++++++----------
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/colossalai/fx/graph_module.py b/colossalai/fx/graph_module.py
index fbafd326c6d4..2d6a71f19e16 100644
--- a/colossalai/fx/graph_module.py
+++ b/colossalai/fx/graph_module.py
@@ -1,17 +1,21 @@
 import os
 import warnings
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Type, Union
+
 import torch
 import torch.nn as nn
 from torch.nn.modules.module import _addindent
-from typing import Type, Dict, List, Any, Union, Optional, Set
-from pathlib import Path
+
 try:
-    from torch.fx.graph_module import GraphModule, _EvalCacheLoader, _WrappedCall, _exec_with_source, _forward_from_src
-    from torch.fx.graph import Graph, _PyTreeCodeGen, _is_from_torch, _custom_builtins, PythonCode
+    from torch.fx.graph import Graph, PythonCode, _custom_builtins, _is_from_torch, _PyTreeCodeGen
+    from torch.fx.graph_module import GraphModule, _EvalCacheLoader, _exec_with_source, _forward_from_src, _WrappedCall
+
+    from colossalai.fx.codegen.activation_checkpoint_codegen import ActivationCheckpointCodeGen
     COLOGM = True
 except:
-    from torch.fx.graph_module import GraphModule
     from torch.fx.graph import Graph
+    from torch.fx.graph_module import GraphModule
     COLOGM = False
 
 if COLOGM:
@@ -19,6 +23,7 @@
     class ColoGraphModule(GraphModule):
 
         def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, class_name: str = 'GraphModule'):
+            graph.set_codegen(ActivationCheckpointCodeGen())
             super().__init__(root, graph, class_name)
 
         def bind(self, ckpt_def, globals):
diff --git a/colossalai/fx/tracer/_symbolic_trace.py b/colossalai/fx/tracer/_symbolic_trace.py
index bff2f6a10fa6..5c04eeace0ad 100644
--- a/colossalai/fx/tracer/_symbolic_trace.py
+++ b/colossalai/fx/tracer/_symbolic_trace.py
@@ -13,6 +13,7 @@ def symbolic_trace(
     root: Union[torch.nn.Module, Callable[..., Any]],
     concrete_args: Optional[Dict[str, Any]] = None,
     meta_args: Optional[Dict[str, Any]] = None,
+    trace_act_ckpt=False,
 ) -> ColoGraphModule:
     """
     Symbolic tracing API
@@ -49,6 +50,6 @@ def symbolic_trace(
         This API is still under development and can incur some bugs. Feel free to report any bugs to the Colossal-AI team.
 
     """
-    graph = ColoTracer().trace(root, concrete_args=concrete_args, meta_args=meta_args)
+    graph = ColoTracer(trace_act_ckpt=trace_act_ckpt).trace(root, concrete_args=concrete_args, meta_args=meta_args)
     name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
     return ColoGraphModule(root, graph, name)
diff --git a/colossalai/fx/tracer/experimental.py b/colossalai/fx/tracer/experimental.py
index 6fee5f5d061d..88b65b6188fa 100644
--- a/colossalai/fx/tracer/experimental.py
+++ b/colossalai/fx/tracer/experimental.py
@@ -1,7 +1,7 @@
 import enum
 import functools
-import operator
 import inspect
+import operator
 from contextlib import contextmanager
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
@@ -286,7 +286,6 @@ def _check_arg_name_valid(names):
         self.graph.lint()
         return self.graph
 
-
     @contextmanager
     def trace_activation_checkpoint(self, enabled: bool):
         if enabled:
@@ -316,7 +315,6 @@ def backward(ctx: Any, *grad_outputs: Any) -> Any:
             # recover the checkpoint function upon exit
             torch.utils.checkpoint.CheckpointFunction = orig_ckpt_func
 
-
     def _post_check(self, non_concrete_arg_names: Set[str]):
         # This is necessary because concrete args are added as input to the traced module since
         # https://github.com/pytorch/pytorch/pull/55888.
@@ -385,18 +383,23 @@ def symbolic_trace(
     root: Union[torch.nn.Module, Callable[..., Any]],
     concrete_args: Optional[Dict[str, Any]] = None,
     meta_args: Optional[Dict[str, Any]] = None,
+    trace_act_ckpt=False,
 ) -> ColoGraphModule:
     if is_compatible_with_meta():
         if meta_args is not None:
             root.to(default_device())
             wrap_fn = lambda x: MetaTensor(x, fake_device=default_device()) if isinstance(x, torch.Tensor) else x
-            graph = ColoTracer().trace(root, concrete_args=concrete_args, meta_args=tree_map(wrap_fn, meta_args))
+            graph = ColoTracer(trace_act_ckpt=trace_act_ckpt).trace(root,
+                                                                    concrete_args=concrete_args,
+                                                                    meta_args=tree_map(wrap_fn, meta_args))
             root.cpu()
         else:
             graph = Tracer().trace(root, concrete_args=concrete_args)
     else:
         from .tracer import ColoTracer as OrigColoTracer
-        graph = OrigColoTracer().trace(root, concrete_args=concrete_args, meta_args=meta_args)
+        graph = OrigColoTracer(trace_act_ckpt=trace_act_ckpt).trace(root,
+                                                                    concrete_args=concrete_args,
+                                                                    meta_args=meta_args)
     name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
     return ColoGraphModule(root, graph, name)
 
@@ -471,11 +474,11 @@ def meta_prop_pass(gm: ColoGraphModule,
         node._meta_data = _meta_data_computing(meta_args, concrete_args, root, node.op, node.target, node.args,
                                                node.kwargs)
 
+
 def _meta_data_computing(meta_args, concrete_args, root, kind, target, args, kwargs):
     unwrap_fn = lambda n: n._meta_data if isinstance(n, Node) else n
     if kind == 'placeholder':
-        meta_out = meta_args[target] if target in meta_args else concrete_args.get(
-            _truncate_suffix(target), None)
+        meta_out = meta_args[target] if target in meta_args else concrete_args.get(_truncate_suffix(target), None)
     elif kind == 'get_attr':
         attr_itr = root
         atoms = target.split(".")
@@ -490,7 +493,7 @@ def _meta_data_computing(meta_args, concrete_args, root, kind, target, args, kwa
         else:
             if target not in _TensorPropertyMethod:
                 meta_out = getattr(unwrap_fn(args[0]), target)(*tree_map(unwrap_fn, args[1:]),
-                                                                       **tree_map(unwrap_fn, kwargs))
+                                                               **tree_map(unwrap_fn, kwargs))
     elif kind == 'call_module':
         mod = root.get_submodule(target)
         meta_out = mod.forward(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
@@ -498,6 +501,7 @@ def _meta_data_computing(meta_args, concrete_args, root, kind, target, args, kwa
         meta_out = None
     return meta_out
 
+
 def _meta_data_computing_v0(meta_args, root, kind, target, args, kwargs):
     if kind == "placeholder" and target in meta_args and meta_args[target].is_meta:
         meta_out = meta_args[target]
@@ -568,7 +572,7 @@ def _meta_data_computing_v0(meta_args, root, kind, target, args, kwargs):
     return meta_out
 
 
-def bias_addition_pass(gm: ColoGraphModule, root_model: torch.nn.Module, meta_args: Optional[Dict[str, Any]]=None):
+def bias_addition_pass(gm: ColoGraphModule, root_model: torch.nn.Module, meta_args: Optional[Dict[str, Any]] = None):
     result_graph = Graph()
     value_remap = {}
     unwrap_fn = lambda n: n._meta_data if isinstance(n, Node) else n
@@ -601,20 +605,24 @@ def wrap_fn(n):
                 if target == torch.nn.functional.linear:
                     if 'bias' in kwargs and kwargs['bias'] is not None:
                         function_to_substitute = func_to_func_dict[target]
-                        handle = bias_addition_function.get(target)(tracer, target, args_proxy, kwargs_proxy, function_to_substitute)
+                        handle = bias_addition_function.get(target)(tracer, target, args_proxy, kwargs_proxy,
+                                                                    function_to_substitute)
                 else:
                     function_to_substitute = func_to_func_dict[target]
-                    handle = bias_addition_function.get(target)(tracer, target, args_proxy, kwargs_proxy, function_to_substitute)
+                    handle = bias_addition_function.get(target)(tracer, target, args_proxy, kwargs_proxy,
+                                                                function_to_substitute)
             elif bias_addition_function.has(target.__name__):
                 # use name for some builtin op like @ (matmul)
                 function_to_substitute = func_to_func_dict[target]
-                handle = bias_addition_function.get(target.__name__)(tracer, target, args_proxy, kwargs_proxy, function_to_substitute)
+                handle = bias_addition_function.get(target.__name__)(tracer, target, args_proxy, kwargs_proxy,
+                                                                     function_to_substitute)
 
         elif kind == "call_method":
             method = getattr(args_metas[0].__class__, target)
             if bias_addition_method.has(method):
                 function_to_substitute = method_to_func_dict[method]
-                handle = bias_addition_method.get(method)(tracer, target, args_proxy, kwargs_proxy, function_to_substitute)
+                handle = bias_addition_method.get(method)(tracer, target, args_proxy, kwargs_proxy,
+                                                          function_to_substitute)
 
         elif kind == "call_module":
             # if not hasattr(self, "orig_forward"):
@@ -623,20 +631,20 @@ def wrap_fn(n):
             mod_type = type(mod)
             if bias_addition_module.has(mod_type) and mod.bias is not None:
                 function_to_substitute = module_to_func_dict[mod_type]
-                handle = bias_addition_module.get(mod_type)(tracer, target, args_proxy, kwargs_proxy, function_to_substitute)
+                handle = bias_addition_module.get(mod_type)(tracer, target, args_proxy, kwargs_proxy,
+                                                            function_to_substitute)
 
         if handle is not None:
             handle.generate()
             for node_inserted in tracer.graph.nodes:
-                value_remap[node_inserted] = result_graph.node_copy(node_inserted, lambda n : value_remap[n])
+                value_remap[node_inserted] = result_graph.node_copy(node_inserted, lambda n: value_remap[n])
                 last_node = value_remap[node_inserted]
             value_remap[orig_node] = last_node
         else:
-            value_remap[orig_node] = result_graph.node_copy(orig_node, lambda n : value_remap[n])
+            value_remap[orig_node] = result_graph.node_copy(orig_node, lambda n: value_remap[n])
 
         del tracer
 
     gm.graph = result_graph
     gm.recompile()
     meta_prop_pass(gm, root_model, meta_args)
-

From c72c827e95bf4f58ed6dd051326453a19b61c317 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 13:56:42 +0800
Subject: [PATCH 162/503] [cli] provided more details if colossalai run fail
 (#2442)

---
 colossalai/cli/launcher/multinode_runner.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/colossalai/cli/launcher/multinode_runner.py b/colossalai/cli/launcher/multinode_runner.py
index c45ad5e5a082..a51e1e371f13 100644
--- a/colossalai/cli/launcher/multinode_runner.py
+++ b/colossalai/cli/launcher/multinode_runner.py
@@ -1,8 +1,10 @@
-import fabric
-from .hostinfo import HostInfo, HostInfoList
 from multiprocessing import Pipe, Process
 from multiprocessing import connection as mp_connection
+
 import click
+import fabric
+
+from .hostinfo import HostInfo, HostInfoList
 
 
 def run_on_host(hostinfo: HostInfo, workdir: str, recv_conn: mp_connection.Connection,
@@ -45,8 +47,10 @@ def run_on_host(hostinfo: HostInfo, workdir: str, recv_conn: mp_connection.Conne
                             # execute on the remote machine
                             fab_conn.run(cmds, hide=False)
                     send_conn.send('success')
-            except:
-                click.echo(f"Error: failed to run {cmds} on {hostinfo.hostname}")
+            except Exception as e:
+                click.echo(
+                    f"Error: failed to run {cmds} on {hostinfo.hostname}, is localhost: {hostinfo.is_local_host}, exception: {e}"
+                )
                 send_conn.send('failure')
 
     # shutdown

From 2731531bc23a93282ca5408afa3b1a329c0e331d Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 11 Jan 2023 14:03:49 +0800
Subject: [PATCH 163/503] [autoparallel] integrate device mesh initialization
 into autoparallelize (#2393)

* [autoparallel] integrate device mesh initialization into autoparallelize

* add megatron solution

* update gpt autoparallel examples with latest api

* adapt beta value to fit the current computation cost
---
 .../auto_parallel/tensor_shard/initialize.py  |  61 ++++++++++--------
 colossalai/device/alpha_beta_profiler.py      |   4 +-
 colossalai/device/device_mesh.py              |  30 +++++----
 .../auto_parallel/auto_parallel_with_gpt.py   |  20 ++----
 .../saved_solution/solution_12_layers.pt      | Bin 0 -> 1903 bytes
 .../saved_solution/solution_1_layers.pt       | Bin 0 -> 559 bytes
 .../saved_solution/solution_4_layers.pt       | Bin 0 -> 943 bytes
 7 files changed, 64 insertions(+), 51 deletions(-)
 create mode 100644 examples/language/gpt/experiments/auto_parallel/saved_solution/solution_12_layers.pt
 create mode 100644 examples/language/gpt/experiments/auto_parallel/saved_solution/solution_1_layers.pt
 create mode 100644 examples/language/gpt/experiments/auto_parallel/saved_solution/solution_4_layers.pt

diff --git a/colossalai/auto_parallel/tensor_shard/initialize.py b/colossalai/auto_parallel/tensor_shard/initialize.py
index 0dce2564c519..8c24c0d7b5df 100644
--- a/colossalai/auto_parallel/tensor_shard/initialize.py
+++ b/colossalai/auto_parallel/tensor_shard/initialize.py
@@ -59,18 +59,6 @@ def extract_meta_args_from_dataloader(data_loader: torch.utils.data.DataLoader,
     pass
 
 
-def search_best_logical_mesh_shape(world_size: int, alpha_beta_dict: Dict[Tuple[int], Tuple[float]]):
-    '''
-    This method is used to search the best logical mesh shape for the given world size
-    based on the alpha_beta_dict.
-
-    For example:
-        if the world_size is 8, and the possible logical shape will be (1, 8), (2, 4), (4, 2), (8, 1).
-    '''
-    # TODO: implement this function
-    return (world_size, 1)
-
-
 def extract_alpha_beta_for_device_mesh(alpha_beta_dict: Dict[Tuple[int], Tuple[float]], logical_mesh_shape: Tuple[int]):
     '''
     This method is used to extract the mesh_alpha and mesh_beta for the given logical_mesh_shape
@@ -127,39 +115,56 @@ def transform_to_sharded_model(gm: GraphModule, solution: List[int], device_mesh
 
 
 def initialize_device_mesh(world_size: int = -1,
+                           physical_devices: List[int] = None,
                            alpha_beta_dict: Dict[Tuple[int], Tuple[float]] = None,
-                           logical_mesh_shape: Tuple[int] = None):
+                           logical_mesh_shape: Tuple[int] = None,
+                           logical_mesh_id: torch.Tensor = None):
     '''
     This method is used to initialize the device mesh.
 
     Args:
-        world_size(optional): the size of device mesh. If the world_size is -1,
+        world_size: the size of device mesh. If the world_size is -1,
             the world size will be set to the number of GPUs in the current machine.
+        physical_devices: the physical devices used to initialize the device mesh.
         alpha_beta_dict(optional): the alpha_beta_dict contains the alpha and beta values
             for each devices. if the alpha_beta_dict is None, the alpha_beta_dict will be
             generated by profile_alpha_beta function.
         logical_mesh_shape(optional): the logical_mesh_shape is used to specify the logical
-            mesh shape. If the logical_mesh_shape is None, the logical_mesh_shape will be
-            generated by search_best_logical_mesh_shape function.
+            mesh shape.
+        logical_mesh_id(optional): the logical_mesh_id is used to specify the logical mesh id.
     '''
     # if world_size is not set, use the world size from torch.distributed
     if world_size == -1:
         world_size = dist.get_world_size()
-    device1d = [i for i in range(world_size)]
+
+    if physical_devices is None:
+        physical_devices = [i for i in range(world_size)]
+    physical_mesh = torch.tensor(physical_devices)
 
     if alpha_beta_dict is None:
         # if alpha_beta_dict is not given, use a series of executions to profile alpha and beta values for each device
-        alpha_beta_dict = profile_alpha_beta(device1d)
+        ab_profiler = AlphaBetaProfiler(physical_devices)
+        alpha_beta_dict = ab_profiler.alpha_beta_dict
+    else:
+        ab_profiler = AlphaBetaProfiler(physical_devices, alpha_beta_dict=alpha_beta_dict)
 
-    if logical_mesh_shape is None:
+    if logical_mesh_shape is None and logical_mesh_id is None:
         # search for the best logical mesh shape
-        logical_mesh_shape = search_best_logical_mesh_shape(world_size, alpha_beta_dict)
+        logical_mesh_id = ab_profiler.search_best_logical_mesh()
+        logical_mesh_id = torch.Tensor(logical_mesh_id).to(torch.int)
+        logical_mesh_shape = logical_mesh_id.shape
+
+        # extract alpha and beta values for the chosen logical mesh shape
+        mesh_alpha, mesh_beta = ab_profiler.extract_alpha_beta_for_device_mesh()
+
+    elif logical_mesh_shape is not None and logical_mesh_id is None:
+        logical_mesh_id = physical_mesh.reshape(logical_mesh_shape)
+
+        # extract alpha and beta values for the chosen logical mesh shape
+        mesh_alpha, mesh_beta = extract_alpha_beta_for_device_mesh(alpha_beta_dict, logical_mesh_id)
 
-    # extract alpha and beta values for the chosen logical mesh shape
-    mesh_alpha, mesh_beta = extract_alpha_beta_for_device_mesh(alpha_beta_dict, logical_mesh_shape)
-    physical_mesh = torch.tensor(device1d)
     device_mesh = DeviceMesh(physical_mesh_id=physical_mesh,
-                             mesh_shape=logical_mesh_shape,
+                             logical_mesh_id=logical_mesh_id,
                              mesh_alpha=mesh_alpha,
                              mesh_beta=mesh_beta,
                              init_process_group=True)
@@ -224,6 +229,7 @@ def autoparallelize(model: nn.Module,
                     data_process_func: callable = None,
                     alpha_beta_dict: Dict[Tuple[int], Tuple[float]] = None,
                     logical_mesh_shape: Tuple[int] = None,
+                    logical_mesh_id: torch.Tensor = None,
                     save_solver_solution: bool = False,
                     load_solver_solution: bool = False,
                     solver_solution_path: str = None,
@@ -245,6 +251,7 @@ def autoparallelize(model: nn.Module,
         logical_mesh_shape(optional): the logical_mesh_shape is used to specify the logical
             mesh shape. If the logical_mesh_shape is None, the logical_mesh_shape will be
             generated by search_best_logical_mesh_shape function.
+        logical_mesh_id(optional): the logical_mesh_id is used to specify the logical mesh id.
         save_solver_solution(optional): if the save_solver_solution is True, the solution will be saved
             to the solution_path.
         load_solver_solution(optional): if the load_solver_solution is True, the solution will be loaded
@@ -254,7 +261,9 @@ def autoparallelize(model: nn.Module,
         memory_budget(optional): the max cuda memory could be used. If the memory budget is -1.0,
             the memory budget will be infinity.
     '''
-    device_mesh = initialize_device_mesh(alpha_beta_dict=alpha_beta_dict, logical_mesh_shape=logical_mesh_shape)
+    device_mesh = initialize_device_mesh(alpha_beta_dict=alpha_beta_dict,
+                                         logical_mesh_shape=logical_mesh_shape,
+                                         logical_mesh_id=logical_mesh_id)
     if meta_args is None:
         meta_args = extract_meta_args_from_dataloader(data_loader, data_process_func)
 
@@ -263,7 +272,7 @@ def autoparallelize(model: nn.Module,
                                      device_mesh,
                                      save_solver_solution=save_solver_solution,
                                      load_solver_solution=load_solver_solution,
-                                     solver_solution_path=solver_solution_path,
+                                     solution_path=solver_solution_path,
                                      return_solution=return_solution,
                                      memory_budget=memory_budget)
 
diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py
index 9c66cb85de5c..af2b10928c6f 100644
--- a/colossalai/device/alpha_beta_profiler.py
+++ b/colossalai/device/alpha_beta_profiler.py
@@ -381,6 +381,8 @@ def _extract_alpha_beta(pg, pg_handler):
         first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group)
         second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group)
         mesh_alpha = [first_latency, second_latency]
-        mesh_beta = [1 / first_bandwidth, 1 / second_bandwidth]
+        # The beta values have been enlarged by 1e10 times temporarily because the computation cost
+        # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in the future.
+        mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth]
 
         return mesh_alpha, mesh_beta
diff --git a/colossalai/device/device_mesh.py b/colossalai/device/device_mesh.py
index 7596a100bf93..b5a97eded90c 100644
--- a/colossalai/device/device_mesh.py
+++ b/colossalai/device/device_mesh.py
@@ -1,5 +1,6 @@
 import operator
 from functools import reduce
+from typing import List, Tuple
 
 import torch
 import torch.distributed as dist
@@ -15,7 +16,8 @@ class DeviceMesh:
 
     Arguments:
         physical_mesh_id (torch.Tensor): physical view of the devices in global rank.
-        mesh_shape (torch.Size): shape of logical view.
+        logical_mesh_id (torch.Tensor): logical view of the devices in global rank.
+        mesh_shape (torch.Size, optional): shape of logical view.
         mesh_alpha (List[float], optional): coefficients used for computing
             communication cost (default: None)
         mesh_beta (List[float], optional): coefficients used for computing
@@ -28,15 +30,21 @@ class DeviceMesh:
     """
 
     def __init__(self,
-                 physical_mesh_id,
-                 mesh_shape,
-                 mesh_alpha=None,
-                 mesh_beta=None,
-                 init_process_group=False,
-                 need_flatten=True):
+                 physical_mesh_id: torch.Tensor,
+                 mesh_shape: torch.Size = None,
+                 logical_mesh_id: torch.Tensor = None,
+                 mesh_alpha: List[float] = None,
+                 mesh_beta: List[float] = None,
+                 init_process_group: bool = False,
+                 need_flatten: bool = True):
         self.physical_mesh_id = physical_mesh_id
-        self.mesh_shape = mesh_shape
-        self._logical_mesh_id = self.physical_mesh_id.reshape(self.mesh_shape)
+        if logical_mesh_id is None:
+            self.mesh_shape = mesh_shape
+            self._logical_mesh_id = self.physical_mesh_id.reshape(self.mesh_shape)
+        else:
+            self._logical_mesh_id = logical_mesh_id
+            self.mesh_shape = self._logical_mesh_id.shape
+
         # map global rank into logical rank
         self.convert_map = {}
         self._global_rank_to_logical_rank_map(self._logical_mesh_id, [])
@@ -54,8 +62,8 @@ def __init__(self,
         if self.need_flatten and self._logical_mesh_id.dim() > 1:
             self.flatten_device_mesh = self.flatten()
             # Create a new member `flatten_device_meshes` to distinguish from original flatten methods (Because I'm not sure if there are functions that rely on the self.flatten())
-            self.flatten_device_meshes = FlattenDeviceMesh(self.physical_mesh_id, self.mesh_shape, self.mesh_alpha,
-                                                           self.mesh_beta)
+            # self.flatten_device_meshes = FlattenDeviceMesh(self.physical_mesh_id, self.mesh_shape, self.mesh_alpha,
+            #                                                self.mesh_beta)
 
     @property
     def shape(self):
diff --git a/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py b/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py
index 85c8d64d7809..6ceb7fd87c0a 100644
--- a/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py
+++ b/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py
@@ -16,14 +16,14 @@
 from colossalai.initialize import launch_from_torch
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 
-BATCH_SIZE = 8
-SEQ_LENGTH = 128
-HIDDEN_DIM = 3072
+BATCH_SIZE = 16
+SEQ_LENGTH = 1024
+HIDDEN_DIM = 4096
 NUM_HEADS = 16
-NUM_LAYERS = 1
+NUM_LAYERS = 4
 VOCAB_SIZE = 50257
 NUM_STEPS = 10
-FP16 = False
+FP16 = True
 
 
 def get_cpu_mem():
@@ -40,7 +40,7 @@ def get_mem_info(prefix=''):
 
 def get_tflops(model_numel, batch_size, seq_len, step_time):
     # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
-    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) / 4
+    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) / 8
 
 
 # Randomly Generated Data
@@ -66,13 +66,7 @@ def main():
         'attention_mask': torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64).to('meta'),
     }
 
-    # Both device mesh initialization and model initialization will be integrated into autoparallelize
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    # Enable auto-parallel
-    gm, solution = initialize_model(model, meta_input_sample, device_mesh, return_solution=True)
+    gm, solution = autoparallelize(model, meta_input_sample, return_solution=True)
 
     # print solution on rank 0
     if gpc.get_global_rank() == 0:
diff --git a/examples/language/gpt/experiments/auto_parallel/saved_solution/solution_12_layers.pt b/examples/language/gpt/experiments/auto_parallel/saved_solution/solution_12_layers.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7b8cd7edd11e6d1f605e0e9f992b6a13676ecd10
GIT binary patch
literal 1903
zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfho3-LBpE?19EtC
zc(Zvkdvil&n1O5#AV!C5-n>9@5CoYa3gm$x8x%4FX(k|K2dm)6Fbkp+L4xeSrykYz
zQM*UOd^D|%<}pzDF<Q33%4bAr3SnU@Q`x*3QuPA7**Si!U0L*pm4N|-18~+Q{0!{y
zx}+?%s5mn}&&`P$RFxQWflRqLFFlkASObA@fHxzEg4apNHVA+uPyl)jg{~XfWPTK#
zK5Pu=ZVK>bW7B~ul4I6|D+jv=$b-?<P?<?UBR~MCj~ztwf=LDjMz9zwSe6N>3#1;R
F768pwDM|nU

literal 0
HcmV?d00001

diff --git a/examples/language/gpt/experiments/auto_parallel/saved_solution/solution_1_layers.pt b/examples/language/gpt/experiments/auto_parallel/saved_solution/solution_1_layers.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9b431a45baba43b9581fb5cf3d4bf39a2aaea5d6
GIT binary patch
literal 559
zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfho3-LBpE?19EtC
zc(Zvkdvil&n1O5#AV!C5-n>9@5CoYa3gm$x8x%4FX(k|K2dm)6Fbkp+L4xc+SI_3n
zkg6Bp&Can)mSgS;pg%!40H?qC8KmL<E=w&c&dkqqa}r1MHarZBxj-ggoR=QT1T+<d
z1H2hQ6g*6kZ504XpaAp`M%RsOGCzvWA3z>VH`JX0-fV0-P(^agx^U%S_W*e?x*94I
d#0X&k^|6CkXQ6x$72wUv1`=ZeLXdihS^z_{RyzOy

literal 0
HcmV?d00001

diff --git a/examples/language/gpt/experiments/auto_parallel/saved_solution/solution_4_layers.pt b/examples/language/gpt/experiments/auto_parallel/saved_solution/solution_4_layers.pt
new file mode 100644
index 0000000000000000000000000000000000000000..79a448c1b06f1db8731d2d45f988ff0b57810b04
GIT binary patch
literal 943
zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfho3-LBpE?19EtC
zc(Zvkdvil&n1O5#AV!C5-n>9@5CoYa3gm$x8x%4FX(k|K2dm)6Fbkp+L4xeSrykYz
zQM+js=4{>!sd@q4>>LMcl}lup7#Kh}0B2g`XHbWywX)Qr;>`R!Hz#GZq=u62U>svE
zkS!PIrH2A7U;yC&Z$=OWPt(XQ5CBP_0Q3}&t{d58eiWTKKwDtCp>7WFW@FQVDw1Q?
qg)0ZU2grlb)livBPywJmc94)SGem+BNCkMavVnL^KnPL~Q40VIP?Na;

literal 0
HcmV?d00001


From 5521af7877cebf1f3147dd9d60224e20a3733b8f Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Wed, 11 Jan 2023 14:55:41 +0800
Subject: [PATCH 164/503] [zero] fix state_dict and load_state_dict for ddp
 ignored parameters (#2443)

* [ddp] add is_ddp_ignored

[ddp] rename to is_ddp_ignored

* [zero] fix state_dict and load_state_dict

* fix bugs

* [zero] update unit test for ZeroDDP
---
 colossalai/nn/parallel/data_parallel.py       | 22 +++++++++++++++----
 .../update/test_zeroddp_state_dict.py         | 12 ++++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index 649bd920d3b2..28a10c4b6c92 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -233,7 +233,7 @@ def __init__(self,
             assert isinstance(p, ColoParameter)
 
             if is_ddp_ignored(p):
-                p.data = p.data.half()
+                p.data = p.data.to(device=get_current_device(), dtype=torch.float16)
                 continue
 
             fp32_data = p.data.float()
@@ -451,8 +451,14 @@ def _save_to_state_dict(self, destination, prefix, keep_vars, only_rank_0=True):
         assert keep_vars is False, "`state_dict` with parameter, `keep_vars=True`, is not supported now."
 
         param_to_save_data = self._get_param_to_save_data(self.fp32_params, only_rank_0)
-        # TODO: (HELSON) deal with ddp ignored parameters
-        for (name, p), fp32_p in zip(self.named_parameters(), self.fp32_params):
+        ddp_param_list = []
+        for name, param in self.named_parameters():
+            if is_ddp_ignored(param):
+                # deal with ddp ignored parameters
+                destination[prefix + name] = param if keep_vars else param.detach()
+            else:
+                ddp_param_list.append((name, param))
+        for (name, p), fp32_p in zip(ddp_param_list, self.fp32_params):
             if p is not None:
                 assert fp32_p in param_to_save_data, "Parameter '{}' is neglected in the chunk list".format(name)
                 record_parameter = param_to_save_data[fp32_p]
@@ -588,8 +594,16 @@ def load(param_name, dest_tensor, copy_func):
         def load_fp32_parameter(chunk_slice, data):
             chunk_slice.copy_(data.flatten())
 
+        ddp_param_list = []
+        for name, param in self.named_parameters():
+            if is_ddp_ignored(param):
+                # deal with ddp ignored parameters
+                load(name, param, param.copy_)
+            else:
+                ddp_param_list.append((name, param))
+
         fp32_to_name = dict()
-        for (name, p), fp32_p in zip(self.named_parameters(), self.fp32_params):
+        for (name, p), fp32_p in zip(ddp_param_list, self.fp32_params):
             if p is not None:
                 fp32_to_name[fp32_p] = name
 
diff --git a/tests/test_gemini/update/test_zeroddp_state_dict.py b/tests/test_gemini/update/test_zeroddp_state_dict.py
index b902bb0f010e..266b8eab121b 100644
--- a/tests/test_gemini/update/test_zeroddp_state_dict.py
+++ b/tests/test_gemini/update/test_zeroddp_state_dict.py
@@ -4,6 +4,7 @@
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
+from torch.testing import assert_close
 
 import colossalai
 from colossalai.gemini.chunk import ChunkManager, search_chunk_configuration
@@ -17,6 +18,13 @@
 from tests.test_tensor.common_utils import debug_print, set_seed
 
 
+def ignore_the_first_parameter(model: torch.nn.Module):
+    for name, param in model.named_parameters():
+        print(f"parameter `{name}` is set ignored")
+        ZeroDDP.set_params_to_ignore([param])
+        return
+
+
 @parameterize('placement_policy', ['cuda', 'cpu', 'auto'])
 @parameterize('keep_gathered', [True, False])
 @parameterize('model_name', ['gpt2', 'bert'])
@@ -47,7 +55,7 @@ def exam_state_dict(placement_policy, keep_gathered, model_name: str):
     for key, value in torch_dict.items():
         assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
         temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
-        assert torch.equal(value, temp_zero_value), "parameter '{}' has problem.".format(key)
+        assert_close(value, temp_zero_value, rtol=1e-3, atol=1e-5)
 
 
 @parameterize('placement_policy', ['cuda', 'cpu', 'auto'])
@@ -84,7 +92,7 @@ def exam_load_state_dict(placement_policy, keep_gathered, model_name: str):
     for key, value in torch_dict.items():
         assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
         temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
-        assert torch.equal(value, temp_zero_value), "parameter '{}' has problem.".format(key)
+        assert_close(value, temp_zero_value, rtol=1e-3, atol=1e-5)
 
 
 def run_dist(rank, world_size, port):

From 39163417a1462516ac251766439411bfa203e217 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 15:17:17 +0800
Subject: [PATCH 165/503] [example] updated the hybrid parallel tutorial
 (#2444)

* [example] updated the hybrid parallel tutorial

* polish code
---
 colossalai/cli/launcher/run.py                | 52 +++++++++++++++----
 examples/tutorial/hybrid_parallel/README.md   | 47 ++++++++---------
 examples/tutorial/hybrid_parallel/config.py   |  6 +--
 .../tutorial/hybrid_parallel/requirements.txt |  6 +--
 examples/tutorial/hybrid_parallel/test_ci.sh  |  2 +-
 examples/tutorial/hybrid_parallel/train.py    | 22 +++-----
 6 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py
index e078a57c15c9..6411b4302e95 100644
--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
@@ -1,12 +1,15 @@
-import click
-import sys
 import os
+import sys
+from typing import List
+
+import click
 import torch
+from packaging import version
+
 from colossalai.context import Config
-from .multinode_runner import MultiNodeRunner
+
 from .hostinfo import HostInfo, HostInfoList
-from typing import List
-from packaging import version
+from .multinode_runner import MultiNodeRunner
 
 # Constants that define our syntax
 NODE_SEP = ','
@@ -15,7 +18,7 @@
 def fetch_hostfile(hostfile_path: str, ssh_port: int) -> HostInfoList:
     """
     Parse the hostfile to obtain a list of hosts.
-    
+
     A hostfile should look like:
     worker-0
     worker-1
@@ -63,7 +66,7 @@ def parse_device_filter(device_pool: HostInfoList, include_str=None, exclude_str
         device_pool (HostInfoList): a list of HostInfo objects
         include_str (str): --include option passed by user, default None
         exclude_str (str): --exclude option passed by user, default None
-    
+
     Returns:
         filtered_hosts (HostInfoList): filtered hosts after inclusion/exclusion
     '''
@@ -192,7 +195,7 @@ def launch_multi_processes(args: Config) -> None:
     Launch multiple processes on a single node or multiple nodes.
 
     The overall logic can be summarized as the pseudo code below:
-    
+
         if hostfile given:
             hostinfo = parse_hostfile(hostfile)
             hostinfo = include_or_exclude_hosts(hostinfo)
@@ -202,7 +205,7 @@ def launch_multi_processes(args: Config) -> None:
             launch_on_multi_nodes(hostinfo)
         else:
             launch_on_current_node()
-    
+
     Args:
         args (Config): the arguments taken from command line
 
@@ -276,6 +279,33 @@ def launch_multi_processes(args: Config) -> None:
                                  extra_launch_args=args.extra_launch_args)
         runner.send(hostinfo=hostinfo, cmd=cmd)
 
-    runner.recv_from_all()
+    # start training
+    msg_from_node = runner.recv_from_all()
+    has_error = False
+
+    # print node status
+    click.echo("\n====== Training on All Nodes =====")
+    for hostname, msg in msg_from_node.items():
+        click.echo(f"{hostname}: {msg}")
+
+        # check if a process failed
+        if msg == "failure":
+            has_error = True
+
+    # stop all nodes
     runner.stop_all()
-    runner.recv_from_all()
+
+    # receive the stop status
+    msg_from_node = runner.recv_from_all()
+
+    # print node status
+    click.echo("\n====== Stopping All Nodes =====")
+    for hostname, msg in msg_from_node.items():
+        click.echo(f"{hostname}: {msg}")
+
+    # give the process an exit code
+    # so that it behaves like a normal process
+    if has_error:
+        sys.exit(1)
+    else:
+        sys.exit(0)
diff --git a/examples/tutorial/hybrid_parallel/README.md b/examples/tutorial/hybrid_parallel/README.md
index 6f975e86330a..1b5e54f928d4 100644
--- a/examples/tutorial/hybrid_parallel/README.md
+++ b/examples/tutorial/hybrid_parallel/README.md
@@ -1,45 +1,40 @@
 # Multi-dimensional Parallelism with Colossal-AI
 
+## Table of contents
 
-## 🚀Quick Start
-1. Install our model zoo.
-```bash
-pip install titans
-```
-2. Run with synthetic data which is of similar shape to CIFAR10 with the `-s` flag.
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py -s
-```
+- [Overview](#-overview)
+- [Quick Start](#-quick-start)
 
-3. Modify the config file to play with different types of tensor parallelism, for example, change tensor parallel size to be 4 and mode to be 2d and run on 8 GPUs.
+## 📚 Overview
 
+This example lets you quickly try out the hybrid parallelism provided by Colossal-AI.
+You can change the parameters below to try out different settings in the `config.py`.
 
-## Install Titans Model Zoo
+```python
+# parallel setting
+TENSOR_PARALLEL_SIZE = 2
+TENSOR_PARALLEL_MODE = '1d'
 
-```bash
-pip install titans
+parallel = dict(
+    pipeline=2,
+    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
+)
 ```
 
+## 🚀 Quick Start
 
-## Prepare Dataset
+1. Install PyTorch
 
-We use CIFAR10 dataset in this example. You should invoke the `donwload_cifar10.py` in the tutorial root directory or directly run the `auto_parallel_with_resnet.py`.
-The dataset will be downloaded to `colossalai/examples/tutorials/data` by default.
-If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
+2. Install the dependencies.
 
 ```bash
-export DATA=/path/to/data
+pip install -r requirements.txt
 ```
 
-
-## Run on 2*2 device mesh
-
-Current configuration setting on `config.py` is TP=2, PP=2.
+3. Run the training scripts with synthetic data.
 
 ```bash
-# train with cifar10
 colossalai run --nproc_per_node 4 train.py --config config.py
-
-# train with synthetic data
-colossalai run --nproc_per_node 4 train.py --config config.py -s
 ```
+
+4. Modify the config file to play with different types of tensor parallelism, for example, change tensor parallel size to be 4 and mode to be 2d and run on 8 GPUs.
diff --git a/examples/tutorial/hybrid_parallel/config.py b/examples/tutorial/hybrid_parallel/config.py
index ac273c305006..fe9abf2f1955 100644
--- a/examples/tutorial/hybrid_parallel/config.py
+++ b/examples/tutorial/hybrid_parallel/config.py
@@ -3,7 +3,7 @@
 # hyperparameters
 # BATCH_SIZE is as per GPU
 # global batch size = BATCH_SIZE x data parallel size
-BATCH_SIZE = 256
+BATCH_SIZE = 4
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 NUM_EPOCHS = 2
@@ -12,11 +12,11 @@
 # model config
 IMG_SIZE = 224
 PATCH_SIZE = 16
-HIDDEN_SIZE = 512
+HIDDEN_SIZE = 128
 DEPTH = 4
 NUM_HEADS = 4
 MLP_RATIO = 2
-NUM_CLASSES = 1000
+NUM_CLASSES = 10
 CHECKPOINT = False
 SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
 
diff --git a/examples/tutorial/hybrid_parallel/requirements.txt b/examples/tutorial/hybrid_parallel/requirements.txt
index dbf6aaf3e4e2..99b7ecfe162e 100644
--- a/examples/tutorial/hybrid_parallel/requirements.txt
+++ b/examples/tutorial/hybrid_parallel/requirements.txt
@@ -1,3 +1,3 @@
-colossalai >= 0.1.12
-torch >= 1.8.1
-titans
\ No newline at end of file
+torch
+colossalai
+titans
diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh
index 8860b72a2fb3..e0dbef354e2d 100644
--- a/examples/tutorial/hybrid_parallel/test_ci.sh
+++ b/examples/tutorial/hybrid_parallel/test_ci.sh
@@ -2,4 +2,4 @@
 set -euxo pipefail
 
 pip install -r requirements.txt
-torchrun --standalone --nproc_per_node 4 train.py --config config.py -s
+colossalai run --nproc_per_node 4 train.py --config config.py
diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py
index 2a8576db747b..4953d5350f31 100644
--- a/examples/tutorial/hybrid_parallel/train.py
+++ b/examples/tutorial/hybrid_parallel/train.py
@@ -1,7 +1,6 @@
 import os
 
 import torch
-from titans.dataloader.cifar10 import build_cifar
 from titans.model.vit.vit import _create_vit_model
 from tqdm import tqdm
 
@@ -12,7 +11,7 @@
 from colossalai.nn import CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.pipeline.pipelinable import PipelinableContext
-from colossalai.utils import get_dataloader, is_using_pp
+from colossalai.utils import is_using_pp
 
 
 class DummyDataloader():
@@ -42,12 +41,9 @@ def __len__(self):
 
 
 def main():
-    # initialize distributed setting
+    # launch from torch
     parser = colossalai.get_default_parser()
-    parser.add_argument('-s', '--synthetic', action="store_true", help="whether use synthetic data")
     args = parser.parse_args()
-
-    # launch from torch
     colossalai.launch_from_torch(config=args.config)
 
     # get logger
@@ -94,15 +90,10 @@ def main():
         pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
     logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
 
-    # create dataloaders
-    root = os.environ.get('DATA', '../data')
-    if args.synthetic:
-        # if we use synthetic dataset
-        # we train for 10 steps and eval for 5 steps per epoch
-        train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
-        test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
-    else:
-        train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)
+    # use synthetic dataset
+    # we train for 10 steps and eval for 5 steps per epoch
+    train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
+    test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
 
     # create loss function
     criterion = CrossEntropyLoss(label_smoothing=0.1)
@@ -139,6 +130,7 @@ def main():
             engine.execute_schedule(data_iter, return_output_label=False)
             engine.step()
             lr_scheduler.step()
+    gpc.destroy()
 
 
 if __name__ == '__main__':

From 2bfeb24308aa8c55e7a2c8ea42eb87a680618b50 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Wed, 11 Jan 2023 15:30:09 +0800
Subject: [PATCH 166/503] [zero] add warning for ignored parameters (#2446)

---
 colossalai/gemini/chunk/utils.py          |  9 +++++++--
 colossalai/nn/optimizer/zero_optimizer.py | 15 +++++++++++++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/colossalai/gemini/chunk/utils.py b/colossalai/gemini/chunk/utils.py
index 883022fe89b8..ebfdee778979 100644
--- a/colossalai/gemini/chunk/utils.py
+++ b/colossalai/gemini/chunk/utils.py
@@ -10,13 +10,18 @@
 from colossalai.utils import is_ddp_ignored
 
 
+def safe_div(a, b):
+    if a == 0:
+        return 0
+    return a / b
+
+
 def init_chunk_manager(model: nn.Module,
                        init_device: Optional[torch.device] = None,
                        hidden_dim: Optional[int] = None,
                        search_range_mb: Optional[float] = None,
                        min_chunk_size_mb: Optional[float] = None,
                        filter_exlarge_params: Optional[bool] = None) -> ChunkManager:
-
     kwargs_dict = dict()
 
     if hidden_dim:
@@ -50,7 +55,7 @@ def init_chunk_manager(model: nn.Module,
     if dist.get_rank() == 0:
         print("searching chunk configuration is completed in {:.2f} s.\n".format(span_s),
               "used number: {:.2f} MB, wasted number: {:.2f} MB\n".format(total_size, wasted_size),
-              "total wasted percentage is {:.2f}%".format(100 * wasted_size / (total_size + wasted_size)),
+              "total wasted percentage is {:.2f}%".format(100 * safe_div(wasted_size, total_size + wasted_size)),
               sep='',
               flush=True)
     dist.barrier()
diff --git a/colossalai/nn/optimizer/zero_optimizer.py b/colossalai/nn/optimizer/zero_optimizer.py
index 3dd9d1e93b36..9f761efdb12c 100644
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@@ -1,4 +1,5 @@
 import math
+import warnings
 from enum import Enum
 from typing import Any, Dict, Set, Tuple
 
@@ -78,8 +79,16 @@ def __init__(self,
         if self.clipping_flag:
             assert norm_type == 2.0, "ZeroOptimizer only supports L2 norm now"
 
-        params_list = [p for p in module.parameters() if not is_ddp_ignored(p)]
-        for p, fp32_p in zip(params_list, module.fp32_params):
+        ddp_param_list = []
+        for name, param in module.named_parameters():
+            if is_ddp_ignored(param):
+                if param.requires_grad:
+                    warnings.warn(f"Parameter `{name}` is ignored by DDP but requires gradient! "
+                                  "You should handle its optimizer update by yourself!")
+            else:
+                ddp_param_list.append(param)
+
+        for p, fp32_p in zip(ddp_param_list, module.fp32_params):
             chunk_16 = self.chunk_manager.get_chunk(p)
             if chunk_16 not in self.chunk16_set:
                 chunk_16.l2_norm_flag = self.clipping_flag
@@ -290,6 +299,8 @@ def get_range_pair(local_chunk: Chunk, local_param: Parameter):
             fake_params_list = list()
 
             for param in group['params']:
+                if is_ddp_ignored(param):
+                    continue
                 chunk16 = self.chunk_manager.get_chunk(param)
                 range_pair = get_range_pair(chunk16, param)
                 if range_pair[0] >= range_pair[1]:

From ac18a445fafae6378d97d605f5d3edfb915666d9 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 16:27:31 +0800
Subject: [PATCH 167/503] [example] updated large-batch optimizer tutorial
 (#2448)

* [example] updated large-batch optimizer tutorial

* polish code

* polish code
---
 .../tutorial/large_batch_optimizer/README.md  | 38 +++++-----
 .../tutorial/large_batch_optimizer/config.py  | 26 +------
 .../large_batch_optimizer/requirements.txt    |  5 +-
 .../tutorial/large_batch_optimizer/test_ci.sh |  8 ++
 .../tutorial/large_batch_optimizer/train.py   | 76 +++++--------------
 5 files changed, 53 insertions(+), 100 deletions(-)
 create mode 100644 examples/tutorial/large_batch_optimizer/test_ci.sh

diff --git a/examples/tutorial/large_batch_optimizer/README.md b/examples/tutorial/large_batch_optimizer/README.md
index 20bddb383434..d85afa427518 100644
--- a/examples/tutorial/large_batch_optimizer/README.md
+++ b/examples/tutorial/large_batch_optimizer/README.md
@@ -1,31 +1,35 @@
 # Comparison of Large Batch Training Optimization
 
-## 🚀Quick Start
-Run with synthetic data
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py -s
-```
+## Table of contents
 
+- [Overview](#-overview)
+- [Quick Start](#-quick-start)
 
-## Prepare Dataset
+## 📚 Overview
 
-We use CIFAR10 dataset in this example. You should invoke the `donwload_cifar10.py` in the tutorial root directory or directly run the `auto_parallel_with_resnet.py`.
-The dataset will be downloaded to `colossalai/examples/tutorials/data` by default.
-If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
+This example lets you quickly try out the large batch training optimization provided by Colossal-AI. We use a synthetic dataset to go through the process, so you don't need to prepare any dataset. You can try out the `Lamb` and `Lars` optimizers from Colossal-AI with the following code.
 
-```bash
-export DATA=/path/to/data
+```python
+from colossalai.nn.optimizer import Lamb, Lars
 ```
 
-You can also use synthetic data for this tutorial if you don't wish to download the `CIFAR10` dataset by adding the `-s` or `--synthetic` flag to the command.
+## 🚀 Quick Start
+
+1. Install PyTorch
 
+2. Install the dependencies.
+
+```bash
+pip install -r requirements.txt
+```
 
-## Run on 2*2 device mesh
+3. Run the training scripts with synthetic data.
 
 ```bash
-# run with cifar10
-colossalai run --nproc_per_node 4 train.py --config config.py
+# run on 4 GPUs
+# run with lars
+colossalai run --nproc_per_node 4 train.py --config config.py --optimizer lars
 
-# run with synthetic dataset
-colossalai run --nproc_per_node 4 train.py --config config.py -s
+# run with lamb
+colossalai run --nproc_per_node 4 train.py --config config.py --optimizer lamb
 ```
diff --git a/examples/tutorial/large_batch_optimizer/config.py b/examples/tutorial/large_batch_optimizer/config.py
index e019154e4b12..2efa0ffd0556 100644
--- a/examples/tutorial/large_batch_optimizer/config.py
+++ b/examples/tutorial/large_batch_optimizer/config.py
@@ -6,31 +6,11 @@
 BATCH_SIZE = 512
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 10
-WARMUP_EPOCHS = 3
+NUM_EPOCHS = 2
+WARMUP_EPOCHS = 1
 
 # model config
-IMG_SIZE = 224
-PATCH_SIZE = 16
-HIDDEN_SIZE = 512
-DEPTH = 4
-NUM_HEADS = 4
-MLP_RATIO = 2
-NUM_CLASSES = 1000
-CHECKPOINT = False
-SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
-
-# parallel setting
-TENSOR_PARALLEL_SIZE = 2
-TENSOR_PARALLEL_MODE = '1d'
-
-parallel = dict(
-    pipeline=2,
-    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
-)
+NUM_CLASSES = 10
 
 fp16 = dict(mode=AMP_TYPE.NAIVE)
 clip_grad_norm = 1.0
-
-# pipeline config
-NUM_MICRO_BATCHES = parallel['pipeline']
diff --git a/examples/tutorial/large_batch_optimizer/requirements.txt b/examples/tutorial/large_batch_optimizer/requirements.txt
index 137a69e80498..c013287751bf 100644
--- a/examples/tutorial/large_batch_optimizer/requirements.txt
+++ b/examples/tutorial/large_batch_optimizer/requirements.txt
@@ -1,2 +1,3 @@
-colossalai >= 0.1.12
-torch >= 1.8.1
+colossalai
+torch
+titans
diff --git a/examples/tutorial/large_batch_optimizer/test_ci.sh b/examples/tutorial/large_batch_optimizer/test_ci.sh
new file mode 100644
index 000000000000..89f426c542b1
--- /dev/null
+++ b/examples/tutorial/large_batch_optimizer/test_ci.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -euxo pipefail
+
+pip install -r requirements.txt
+
+# run test
+colossalai run --nproc_per_node 4 --master_port 29500 train.py --config config.py --optimizer lars
+colossalai run --nproc_per_node 4 --master_port 29501 train.py --config config.py --optimizer lamb
diff --git a/examples/tutorial/large_batch_optimizer/train.py b/examples/tutorial/large_batch_optimizer/train.py
index d403c275d1af..35e54582f494 100644
--- a/examples/tutorial/large_batch_optimizer/train.py
+++ b/examples/tutorial/large_batch_optimizer/train.py
@@ -1,19 +1,13 @@
-import os
-
 import torch
-from titans.dataloader.cifar10 import build_cifar
-from titans.model.vit.vit import _create_vit_model
+import torch.nn as nn
+from torchvision.models import resnet18
 from tqdm import tqdm
 
 import colossalai
-from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import Lamb, Lars
-from colossalai.pipeline.pipelinable import PipelinableContext
-from colossalai.utils import get_dataloader, is_using_pp
 
 
 class DummyDataloader():
@@ -45,7 +39,10 @@ def __len__(self):
 def main():
     # initialize distributed setting
     parser = colossalai.get_default_parser()
-    parser.add_argument('-s', '--synthetic', action="store_true", help="whether use synthetic data")
+    parser.add_argument('--optimizer',
+                        choices=['lars', 'lamb'],
+                        help="Choose your large-batch optimizer",
+                        required=True)
     args = parser.parse_args()
 
     # launch from torch
@@ -55,59 +52,22 @@ def main():
     logger = get_dist_logger()
     logger.info("initialized distributed environment", ranks=[0])
 
-    if hasattr(gpc.config, 'LOG_PATH'):
-        if gpc.get_global_rank() == 0:
-            log_path = gpc.config.LOG_PATH
-            if not os.path.exists(log_path):
-                os.mkdir(log_path)
-            logger.log_to_file(log_path)
-
-    use_pipeline = is_using_pp()
-
-    # create model
-    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
-                        patch_size=gpc.config.PATCH_SIZE,
-                        hidden_size=gpc.config.HIDDEN_SIZE,
-                        depth=gpc.config.DEPTH,
-                        num_heads=gpc.config.NUM_HEADS,
-                        mlp_ratio=gpc.config.MLP_RATIO,
-                        num_classes=10,
-                        init_method='jax',
-                        checkpoint=gpc.config.CHECKPOINT)
-
-    if use_pipeline:
-        pipelinable = PipelinableContext()
-        with pipelinable:
-            model = _create_vit_model(**model_kwargs)
-        pipelinable.to_layer_list()
-        pipelinable.policy = "uniform"
-        model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
-    else:
-        model = _create_vit_model(**model_kwargs)
-
-    # count number of parameters
-    total_numel = 0
-    for p in model.parameters():
-        total_numel += p.numel()
-    if not gpc.is_initialized(ParallelMode.PIPELINE):
-        pipeline_stage = 0
-    else:
-        pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
-    logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
-
-    # create dataloaders
-    root = os.environ.get('DATA', '../data/')
-    if args.synthetic:
-        train_dataloader = DummyDataloader(length=30, batch_size=gpc.config.BATCH_SIZE)
-        test_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
-    else:
-        train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)
+    # create synthetic dataloaders
+    train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
+    test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
+
+    # build model
+    model = resnet18(num_classes=gpc.config.NUM_CLASSES)
 
     # create loss function
-    criterion = CrossEntropyLoss(label_smoothing=0.1)
+    criterion = nn.CrossEntropyLoss()
 
     # create optimizer
-    optimizer = Lars(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
+    if args.optimizer == "lars":
+        optim_cls = Lars
+    elif args.optimizer == "lamb":
+        optim_cls = Lamb
+    optimizer = optim_cls(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
 
     # create lr scheduler
     lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,

From cfd1d5ee4970ecf168a09ca6d5f187b4520eaea3 Mon Sep 17 00:00:00 2001
From: Haofan Wang <haofanwang.ai@gmail.com>
Date: Wed, 11 Jan 2023 16:56:15 +0800
Subject: [PATCH 168/503] [example] fixed seed error in
 train_dreambooth_colossalai.py (#2445)

---
 examples/images/dreambooth/train_dreambooth_colossalai.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index b7e24bfe4a15..7c90b939abaa 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -355,10 +355,11 @@ def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"):
 
 
 def main(args):
-    colossalai.launch_from_torch(config={})
 
-    if args.seed is not None:
-        gpc.set_seed(args.seed)
+    if args.seed is None:
+        colossalai.launch_from_torch(config={})
+    else:
+        colossalai.launch_from_torch(config={}, seed=args.seed)
 
     if args.with_prior_preservation:
         class_images_dir = Path(args.class_data_dir)

From 483efdabc5875545391f88c5fa7a71ff02800d58 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 17:22:11 +0800
Subject: [PATCH 169/503] [workflow] fixed the on-merge condition check (#2452)

---
 .github/workflows/draft_github_release_post.yml | 3 +--
 .github/workflows/release_docker.yml            | 9 ++++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/draft_github_release_post.yml b/.github/workflows/draft_github_release_post.yml
index 413714dafa86..53bfa9e8deb6 100644
--- a/.github/workflows/draft_github_release_post.yml
+++ b/.github/workflows/draft_github_release_post.yml
@@ -8,11 +8,10 @@ on:
     types:
       - closed
 
-
 jobs:
   release:
     name: Draft Release Post
-    if: github.repository == 'hpcaitech/ColossalAI'
+    if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI'
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
diff --git a/.github/workflows/release_docker.yml b/.github/workflows/release_docker.yml
index c72d3fb33edd..8da6e5f87606 100644
--- a/.github/workflows/release_docker.yml
+++ b/.github/workflows/release_docker.yml
@@ -2,13 +2,16 @@ name: Publish Docker Image to DockerHub
 
 on:
   workflow_dispatch:
-  release:
-    types: [published]
+  pull_request:
+    paths:
+      - 'version.txt'
+    types:
+      - closed
 
 jobs:
   release:
     name: Publish Docker Image to DockerHub
-    if: github.repository == 'hpcaitech/ColossalAI'
+    if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
       image: "hpcaitech/docker-in-docker:latest"

From c9ec5190a076b130c72ab8a86c35626ac6e3d5e7 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 11 Jan 2023 23:40:16 +0800
Subject: [PATCH 170/503] [workflow] automated the compatiblity test (#2453)

* [workflow] automated the compatiblity test

* polish code
---
 .compatibility                                |  3 +
 .github/workflows/README.md                   | 52 ++++++++-----
 .github/workflows/auto_compatibility_test.yml | 74 +++++++++++++++++++
 ...st.yml => dispatch_compatibility_test.yml} |  2 +-
 4 files changed, 110 insertions(+), 21 deletions(-)
 create mode 100644 .compatibility
 create mode 100644 .github/workflows/auto_compatibility_test.yml
 rename .github/workflows/{compatibility_test.yml => dispatch_compatibility_test.yml} (98%)

diff --git a/.compatibility b/.compatibility
new file mode 100644
index 000000000000..c8ac4083d2a2
--- /dev/null
+++ b/.compatibility
@@ -0,0 +1,3 @@
+1.12.0-11.3.0
+1.11.0-11.3.0
+1.10.1-11.3.0
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 65017a397c81..bc1f8504df3c 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -14,6 +14,7 @@
       - [Dispatch Example Test](#dispatch-example-test)
       - [Compatibility Test](#compatibility-test)
     - [User Friendliness](#user-friendliness)
+  - [Configuration](#configuration)
   - [Progress Log](#progress-log)
 
 ## Overview
@@ -37,30 +38,32 @@ In the section below, we will dive into the details of different workflows avail
 
 ### Regular Checks
 
-| Workflow Name           | File name                | Description                                                                                                            |
-| ----------------------- | ------------------------ | ---------------------------------------------------------------------------------------------------------------------- |
-| `Test example`          | `auto_example_check.yml` | This workflow will test all examples every Sunday                                                                      |
-| `Build on 8 GPUs`       | `build_gpu_8.yml`        | This workflow will run the unit tests everyday with 8 GPUs.                                                            |
-| `Synchronize submodule` | `submodule.yml`          | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. |
-| `Close inactive issues` | `close_inactive.yml`     | This workflow will close issues which are stale for 14 days.                                                           |
+| Workflow Name           | File name                     | Description                                                                                                                                                      |
+| ----------------------- | ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Test example`          | `auto_example_check.yml`      | This workflow will test all examples every Sunday                                                                                                                |
+| `Compatibility Test`    | `auto_compatibility_test.yml` | This workflow will check the compatibility of Colossal-AI against PyTorch and CUDA every Sunday. The PyTorch and CUDA versions are specified in `.compatibility`. |
+| `Build on 8 GPUs`       | `build_gpu_8.yml`             | This workflow will run the unit tests everyday with 8 GPUs.                                                                                                      |
+| `Synchronize submodule` | `submodule.yml`               | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers.                                           |
+| `Close inactive issues` | `close_inactive.yml`          | This workflow will close issues which are stale for 14 days.                                                                                                     |
 
 ### Release
 
-| Workflow Name               | File name                       | Description                                                                                                       |
-| --------------------------- | ------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
-| `Draft GitHub Release Post` | `draft_github_release_post.yml` | Compose a GitHub release post draft based on the commit history. Triggered when `version.txt` is updated.         |
-| `Release to PyPI`           | `release_pypi.yml`              | Build and release the wheel to PyPI. Triggered when `version.txt` is updated.                                     |
-| `Release Nightly to PyPI`   | `release_nightly.yml`           | Build and release the nightly wheel to PyPI as `colossalai-nightly`. Automatically executed every Sunday.         |
-| `Release Docker`            | `release_docker.yml`            | Build and release the Docker image to DockerHub. Triggered when `version.txt` is updated.                         |
-| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section. |
+| Workflow Name               | File name                       | Description                                                                                                                                           |
+| --------------------------- | ------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Draft GitHub Release Post` | `draft_github_release_post.yml` | Compose a GitHub release post draft based on the commit history.  Triggered when the change of `version.txt` is merged.                               |
+| `Release to PyPI`           | `release_pypi.yml`              | Build and release the wheel to PyPI.  Triggered when the change of `version.txt` is merged.                                                           |
+| `Release Nightly to PyPI`   | `release_nightly.yml`           | Build and release the nightly wheel to PyPI as `colossalai-nightly`. Automatically executed every Sunday.                                             |
+| `Release Docker`            | `release_docker.yml`            | Build and release the Docker image to DockerHub. Triggered when the change of `version.txt` is merged.                                                |
+| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section.                                     |
+| `Auto Compatibility Test`   | `auto_compatibility_test.yml`   | Check Colossal-AI's compatiblity against the PyTorch and CUDA version specified in `.compatibility`. Triggered when `version.txt` is changed in a PR. |
 
 ### Manual Dispatch
 
-| Workflow Name           | File name                    | Description                                            |
-| ----------------------- | ---------------------------- | ------------------------------------------------------ |
-| `Release bdist wheel`   | `release_bdist.yml`          | Build binary wheels with pre-built PyTorch extensions. |
-| `Dispatch Example Test` | `dispatch_example_check.yml` | Manually test a specified example.                     |
-| `Compatiblity Test`     | `compatiblity_test.yml`      | Test PyTorch and Python Compatibility.                 |
+| Workflow Name                | File name                        | Description                                            |
+| ---------------------------- | -------------------------------- | ------------------------------------------------------ |
+| `Release bdist wheel`        | `release_bdist.yml`              | Build binary wheels with pre-built PyTorch extensions. |
+| `Dispatch Example Test`      | `dispatch_example_check.yml`     | Manually test a specified example.                     |
+| `Dispatch Compatibility Test` | `dispatch_compatibility_test.yml` | Test PyTorch and Python Compatibility.                 |
 
 Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow.
 I will provide the details of each workflow below.
@@ -93,6 +96,15 @@ Parameters:
 | ----------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
 | `issue-translate` | `translate_comment.yml` | This workflow is triggered when a new issue comment is created. The comment will be translated into English if not written in English. |
 
+
+## Configuration
+
+This section lists the files used to configure the workflow.
+
+1. `.compatibility`
+
+This `.compatibility` file is to tell GitHub Actions which PyTorch and CUDA versions to test against. Each line in the file is in the format `${torch-version}-${cuda-version}`, which is a tag for Docker image. Thus, this tag must be present in the [docker registry](https://hub.docker.com/r/pytorch/conda-cuda) so as to perform the test.
+
 ## Progress Log
 
 - [x] unit testing
@@ -112,9 +124,9 @@ Parameters:
   - [x] check on PR
   - [x] regular check
   - [x] manual dispatch
-- [ ] compatiblity check
+- [x] compatiblity check
   - [x] manual dispatch
-  - [ ] auto test when release
+  - [x] auto test when release
 - [x] helpers
   - [x] comment translation
   - [x] submodule update
diff --git a/.github/workflows/auto_compatibility_test.yml b/.github/workflows/auto_compatibility_test.yml
new file mode 100644
index 000000000000..4b026c63e7f7
--- /dev/null
+++ b/.github/workflows/auto_compatibility_test.yml
@@ -0,0 +1,74 @@
+name: Compatibility Test
+
+on:
+  pull_request:
+    paths:
+      - 'version.txt'
+      - '.compatibility'
+  # run at 03:00 every Sunday (Singapore time, UTC+8), i.e. 19:00 UTC on Saturday
+  schedule:
+    - cron:  '0 19 * * 6'
+
+jobs:
+  matrix_preparation:
+    name: Prepare Container List
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v3
+      - id: set-matrix
+        run: |
+          IFS=','
+          DOCKER_IMAGE=()
+
+          while read tag; do
+            DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"")
+          done <.compatibility
+
+          container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
+          container="[${container}]"
+          echo "$container"
+          echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
+
+  build:
+    name: Test for PyTorch Compatibility
+    needs: matrix_preparation
+    if: github.repository == 'hpcaitech/ColossalAI'
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: ${{ matrix.container }}
+      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
+    timeout-minutes: 120
+    steps:
+      - name: Install dependencies
+        run: |
+          pip install -U pip setuptools wheel --user
+      - uses: actions/checkout@v2
+        with:
+          repository: hpcaitech/TensorNVMe
+          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+          path: TensorNVMe
+      - name: Install tensornvme
+        run: |
+          cd TensorNVMe
+          conda install cmake
+          pip install -r requirements.txt
+          pip install -v .
+      - uses: actions/checkout@v2
+        with:
+          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+      - name: Install Colossal-AI
+        run: |
+          pip install -v --no-cache-dir .
+          pip install -r requirements/requirements-test.txt
+      - name: Unit Testing
+        run: |
+          PYTHONPATH=$PWD pytest tests
+        env:
+          DATA: /data/scratch/cifar-10
+          NCCL_SHM_DISABLE: 1
+          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
diff --git a/.github/workflows/compatibility_test.yml b/.github/workflows/dispatch_compatibility_test.yml
similarity index 98%
rename from .github/workflows/compatibility_test.yml
rename to .github/workflows/dispatch_compatibility_test.yml
index eadd07886106..ac5669c6f7f0 100644
--- a/.github/workflows/compatibility_test.yml
+++ b/.github/workflows/dispatch_compatibility_test.yml
@@ -1,4 +1,4 @@
-name: Compatibility Test
+name: Dispatch Compatibility Test
 
 on:
   workflow_dispatch:

From 8221fd7485772d0133cb177ef7f5dbf984d7a76e Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Thu, 12 Jan 2023 09:35:10 +0800
Subject: [PATCH 171/503] [autoparallel] update binary elementwise handler
 (#2451)

* [autoparallel] update binary elementwise handler

* polish
---
 .../binary_elementwise_handler.py             | 27 ++++++--
 .../test_binary_elementwise_handler.py        | 63 ++++++++++++++-----
 .../test_node_handler/utils.py                |  5 +-
 3 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/binary_elementwise_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/binary_elementwise_handler.py
index f510f74776b6..db8f0b54ddee 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/binary_elementwise_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/binary_elementwise_handler.py
@@ -32,20 +32,32 @@ def _get_op_data_type(tensor):
                 return OperationDataType.ARG
 
         def _get_arg_value(idx):
+            non_tensor = False
             if isinstance(self.node.args[idx], Node):
                 meta_data = self.node.args[idx]._meta_data
+                # The meta_data of node type argument could also possibly be a non-tensor object.
+                if not isinstance(meta_data, torch.Tensor):
+                    assert isinstance(meta_data, (int, float))
+                    meta_data = torch.Tensor([meta_data]).to('meta')
+                    non_tensor = True
+
             else:
                 # this is in fact a real data like int 1
                 # but we can deem it as meta data
                 # as it won't affect the strategy generation
                 assert isinstance(self.node.args[idx], (int, float))
                 meta_data = torch.Tensor([self.node.args[idx]]).to('meta')
-            return meta_data
+                non_tensor = True
 
-        input_meta_data = _get_arg_value(0)
-        other_meta_data = _get_arg_value(1)
-        output_meta_data = self.node._meta_data
+            return meta_data, non_tensor
 
+        input_meta_data, non_tensor_input = _get_arg_value(0)
+        other_meta_data, non_tensor_other = _get_arg_value(1)
+        output_meta_data = self.node._meta_data
+        # we need record op_data with non-tensor data in this list,
+        # and filter the non-tensor op_data in post_process.
+        self.non_tensor_list = []
+        # assert False
         input_op_data = OperationData(name=str(self.node.args[0]),
                                       type=_get_op_data_type(input_meta_data),
                                       data=input_meta_data,
@@ -58,6 +70,10 @@ def _get_arg_value(idx):
                                        type=OperationDataType.OUTPUT,
                                        data=output_meta_data,
                                        logical_shape=bcast_shape)
+        if non_tensor_input:
+            self.non_tensor_list.append(input_op_data)
+        if non_tensor_other:
+            self.non_tensor_list.append(other_op_data)
 
         mapping = {'input': input_op_data, 'other': other_op_data, 'output': output_op_data}
         return mapping
@@ -73,9 +89,10 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li
         op_data_mapping = self.get_operation_data_mapping()
 
         for op_name, op_data in op_data_mapping.items():
-            if not isinstance(op_data.data, torch.Tensor):
+            if op_data in self.non_tensor_list:
                 # remove the sharding spec if the op_data is not a tensor, e.g. torch.pow(tensor, 2)
                 strategy.sharding_specs.pop(op_data)
+
             else:
                 # convert the logical sharding spec to physical sharding spec if broadcast
                 # e.g. torch.rand(4, 4) + torch.rand(4)
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_binary_elementwise_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_binary_elementwise_handler.py
index 42430d5a24cb..50385c0450a8 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_binary_elementwise_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_binary_elementwise_handler.py
@@ -122,25 +122,41 @@ def forward(self, x1, x2):
             assert input_sharding_spec.sharding_sequence[-1] == other_sharding_spec.sharding_sequence[-1]
 
 
-def check_binary_elementwise_handler_with_int(rank, op, other_dim, world_size, port):
-    disable_existing_loggers()
-    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+class BEOpModelWithNodeConst(nn.Module):
 
-    class BinaryElementwiseOpModel(nn.Module):
+    def __init__(self, op):
+        super().__init__()
+        self.op = op
 
-        def __init__(self, op, const):
-            super().__init__()
-            self.op = op
-            self.const = const
+    def forward(self, x1):
+        const = x1.dim()
+        out = self.op(x1, const)
+        return out
 
-        def forward(self, x1):
-            out = self.op(x1, self.const)
-            return out
+
+class BEOpModelWithIntConst(nn.Module):
+
+    def __init__(self, op, const):
+        super().__init__()
+        self.op = op
+        self.const = const
+
+    def forward(self, x1):
+        out = self.op(x1, self.const)
+        return out
+
+
+def check_binary_elementwise_handler_with_int(rank, op, other_dim, model_cls, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     physical_mesh_id = torch.arange(0, 4)
     mesh_shape = (2, 2)
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-    model = BinaryElementwiseOpModel(op, other_dim).cuda()
+    if model_cls == BEOpModelWithNodeConst:
+        model = model_cls(op).cuda()
+    else:
+        model = model_cls(op, other_dim).cuda()
     x1 = torch.rand(4, 4).cuda()
     # the index of binary-elementwise node in computation graph
     node_index = 1
@@ -159,9 +175,14 @@ def forward(self, x1):
     tracer = ColoTracer()
     meta_args = {'x1': torch.rand(4, 4).to('meta')}
     graph = tracer.trace(model, meta_args=meta_args)
+    print(graph)
+    # assert False
     gm = ColoGraphModule(model, graph)
 
-    op_node = list(graph.nodes)[1]
+    if model_cls == BEOpModelWithNodeConst:
+        op_node = list(graph.nodes)[2]
+    else:
+        op_node = list(graph.nodes)[1]
     strategies_vector = StrategiesVector(op_node)
 
     # build handler
@@ -212,7 +233,7 @@ def forward(self, x1):
 @parameterize('other_dim', [1, 2])
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
-def test_binary_elementwise_handler(op, other_dim):
+def test_binary_elementwise_handler_with_tensor(op, other_dim):
     world_size = 4
     run_func_tensor = partial(check_binary_elementwise_handler_with_tensor,
                               op=op,
@@ -220,8 +241,19 @@ def test_binary_elementwise_handler(op, other_dim):
                               world_size=world_size,
                               port=free_port())
     mp.spawn(run_func_tensor, nprocs=world_size)
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+@parameterize('op', [torch.add])
+@parameterize('other_dim', [1, 2])
+@parameterize('model_cls', [BEOpModelWithNodeConst, BEOpModelWithIntConst])
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_binary_elementwise_handler_with_int(op, model_cls, other_dim):
+    world_size = 4
     run_func_int = partial(check_binary_elementwise_handler_with_int,
                            op=op,
+                           model_cls=model_cls,
                            other_dim=other_dim,
                            world_size=world_size,
                            port=free_port())
@@ -229,4 +261,5 @@ def test_binary_elementwise_handler(op, other_dim):
 
 
 if __name__ == '__main__':
-    test_binary_elementwise_handler()
+    test_binary_elementwise_handler_with_tensor()
+    test_binary_elementwise_handler_with_int()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
index d02e1e31eb40..db76ed9b85df 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
@@ -90,7 +90,8 @@ def numerical_test_for_node_strategy(model: torch.nn.Module,
         solver_options = SolverOptions()
         strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
         strategies_constructor.build_strategies_and_cost()
-        target_node = list(graph.nodes)[node_index]
+        target_node = [strategies_vector.node for strategies_vector in strategies_constructor.leaf_strategies
+                      ][node_index]
         if node_type == 'normal':
             solution_len = len(strategies_constructor.leaf_strategies)
             solution = [0] * solution_len
@@ -112,7 +113,7 @@ def numerical_test_for_node_strategy(model: torch.nn.Module,
             ret = solver.call_solver_serialized_args()
             solution = list(ret[0])
         gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(
-            gm, solution, device_mesh)
+            gm, solution, device_mesh, strategies_constructor)
         gm = runtime_apply_pass(gm)
         gm.recompile()
 

From 32c46e146e96d9f6ee949b9f64b84a789e5479ea Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 12 Jan 2023 10:57:02 +0800
Subject: [PATCH 172/503] [workflow] automated bdist wheel build (#2459)

* [workflow] automated bdist wheel build

* polish workflow

* polish readme

* polish readme
---
 .bdist.json                              | 24 ++++++++
 .github/workflows/README.md              | 32 ++++++++---
 .github/workflows/auto_release_bdist.yml | 70 ++++++++++++++++++++++++
 3 files changed, 118 insertions(+), 8 deletions(-)
 create mode 100644 .bdist.json
 create mode 100644 .github/workflows/auto_release_bdist.yml

diff --git a/.bdist.json b/.bdist.json
new file mode 100644
index 000000000000..8693bca489e8
--- /dev/null
+++ b/.bdist.json
@@ -0,0 +1,24 @@
+{
+  "build": [
+    {
+      "torch_version": "1.11.0",
+      "cuda_image": "hpcaitech/cuda-conda:10.2"
+    },
+    {
+      "torch_version": "1.11.0",
+      "cuda_image": "hpcaitech/cuda-conda:11.3"
+    },
+    {
+      "torch_version": "1.12.1",
+      "cuda_image": "hpcaitech/cuda-conda:10.2"
+    },
+    {
+      "torch_version": "1.12.1",
+      "cuda_image": "hpcaitech/cuda-conda:11.3"
+    },
+    {
+      "torch_version": "1.12.1",
+      "cuda_image": "hpcaitech/cuda-conda:11.6"
+    }
+  ]
+}
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index bc1f8504df3c..cda6a3139a1b 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -48,14 +48,15 @@ In the section below, we will dive into the details of different workflows avail
 
 ### Release
 
-| Workflow Name               | File name                       | Description                                                                                                                                           |
-| --------------------------- | ------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Draft GitHub Release Post` | `draft_github_release_post.yml` | Compose a GitHub release post draft based on the commit history.  Triggered when the change of `version.txt` is merged.                               |
-| `Release to PyPI`           | `release_pypi.yml`              | Build and release the wheel to PyPI.  Triggered when the change of `version.txt` is merged.                                                           |
-| `Release Nightly to PyPI`   | `release_nightly.yml`           | Build and release the nightly wheel to PyPI as `colossalai-nightly`. Automatically executed every Sunday.                                             |
-| `Release Docker`            | `release_docker.yml`            | Build and release the Docker image to DockerHub. Triggered when the change of `version.txt` is merged.                                                |
-| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section.                                     |
-| `Auto Compatibility Test`   | `auto_compatibility_test.yml`   | Check Colossal-AI's compatiblity against the PyTorch and CUDA version specified in `.compatibility`. Triggered when `version.txt` is changed in a PR. |
+| Workflow Name               | File name                       | Description                                                                                                                                                 |
+| --------------------------- | ------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Draft GitHub Release Post` | `draft_github_release_post.yml` | Compose a GitHub release post draft based on the commit history.  Triggered when the change of `version.txt` is merged.                                     |
+| `Release to PyPI`           | `release_pypi.yml`              | Build and release the wheel to PyPI.  Triggered when the change of `version.txt` is merged.                                                                 |
+| `Release Nightly to PyPI`   | `release_nightly.yml`           | Build and release the nightly wheel to PyPI as `colossalai-nightly`. Automatically executed every Sunday.                                                   |
+| `Release Docker`            | `release_docker.yml`            | Build and release the Docker image to DockerHub. Triggered when the change of `version.txt` is merged.                                                      |
+| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section.                                           |
+| `Auto Release bdist wheel`  | `auto_release_bdist.yml`        | Build binary wheels with pre-built PyTorch extensions. Triggered when the change of `version.txt` is merged. Build specifications are stored in `.bdist.json` |
+| `Auto Compatibility Test`   | `auto_compatibility_test.yml`   | Check Colossal-AI's compatiblity against the PyTorch and CUDA version specified in `.compatibility`. Triggered when `version.txt` is changed in a PR.       |
 
 ### Manual Dispatch
 
@@ -105,6 +106,21 @@ This section lists the files used to configure the workflow.
 
 This `.compatibility` file is to tell GitHub Actions which PyTorch and CUDA versions to test against. Each line in the file is in the format `${torch-version}-${cuda-version}`, which is a tag for Docker image. Thus, this tag must be present in the [docker registry](https://hub.docker.com/r/pytorch/conda-cuda) so as to perform the test.
 
+2. `.bdist.json`
+
+This file controls what pytorch/cuda compatible pre-built releases will be built and published. You can add a new entry according to the json schema below if there is a new wheel that needs to be built with AOT compilation of PyTorch extensions.
+
+```json
+{
+  "build": [
+    {
+      "torch_version": "",
+      "cuda_image": ""
+    }
+  ]
+}
+```
+
 ## Progress Log
 
 - [x] unit testing
diff --git a/.github/workflows/auto_release_bdist.yml b/.github/workflows/auto_release_bdist.yml
new file mode 100644
index 000000000000..56a3036f8c94
--- /dev/null
+++ b/.github/workflows/auto_release_bdist.yml
@@ -0,0 +1,70 @@
+name: Auto Release bdist wheel
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - 'version.txt'
+    types:
+      - closed
+
+jobs:
+  matrix_preparation:
+    name: Prepare Container List
+    if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI'
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v3
+      - id: set-matrix
+        run: |
+          bdist=$(cat .bdist.json | tr '\n' ' ')
+          echo "matrix=${bdist}" >> $GITHUB_OUTPUT
+
+  build:
+    name: Release bdist wheels
+    needs: matrix_preparation
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: ${{ matrix.build.cuda_image }}
+      options: --gpus all --rm
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      # cub is for cuda 10.2
+      - name: Copy scripts
+        run: |
+          cp -r ./.github/workflows/scripts/* ./
+
+          # link the cache directories to current path
+          ln -s /github/home/conda_pkgs ./conda_pkgs
+          ln -s /github/home/pip_wheels ./pip_wheels
+
+          # set the conda package path
+          echo "pkgs_dirs:\n  - $PWD/conda_pkgs" > ~/.condarc
+
+          # set safe directory
+          git config --global --add safe.directory /__w/ColossalAI/ColossalAI
+
+          # get cub package for cuda 10.2
+          wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
+          unzip 1.8.0.zip
+      - name: Build bdist wheel
+        run: |
+          pip install beautifulsoup4 requests packaging
+          python ./build_colossalai_wheel.py --torch_version $TORCH_VERSIONS
+        env:
+          TORCH_VERSIONS: ${{ matrix.build.torch_version }}
+      - name: 🚀 Deploy
+        uses: garygrossgarten/github-action-scp@release
+        with:
+          local: all_dist
+          remote: ${{ secrets.PRIVATE_PYPI_DIR }}
+          host: ${{ secrets.PRIVATE_PYPI_HOST }}
+          username: ${{ secrets.PRIVATE_PYPI_USER }}
+          password: ${{ secrets.PRIVATE_PYPI_PASSWD }}

From 93582629927dd6e413f9d46e0b96801bb14bd1d2 Mon Sep 17 00:00:00 2001
From: Haofan Wang <haofanwang.ai@gmail.com>
Date: Thu, 12 Jan 2023 13:49:01 +0800
Subject: [PATCH 173/503] Fix False warning in initialize.py (#2456)

* Update initialize.py

* pre-commit run check
---
 colossalai/initialize.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index e907efddee69..f3719dcb47b3 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -15,26 +15,25 @@
 from torch.optim.optimizer import Optimizer
 from torch.utils.data import DataLoader
 
-from colossalai.core import global_context as gpc
-from colossalai.context.moe_context import MOE_CONTEXT
-
-from colossalai.logging import get_dist_logger
-
-from colossalai.engine.schedule import NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule, get_tensor_shape
-from colossalai.engine import Engine
-from colossalai.gemini.ophooks import BaseOpHook
-
-from colossalai.utils import (get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param)
-from colossalai.utils.moe import sync_moe_model_param
-
 from colossalai.amp import AMP_TYPE, convert_to_amp
 from colossalai.amp.naive_amp import NaiveAMPModel
 from colossalai.builder.builder import build_gradient_handler
 from colossalai.context import Config, ConfigException, ParallelMode
+from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.core import global_context as gpc
+from colossalai.engine import Engine
 from colossalai.engine.gradient_accumulation import accumulate_gradient
-
+from colossalai.engine.schedule import (
+    InterleavedPipelineSchedule,
+    NonPipelineSchedule,
+    PipelineSchedule,
+    get_tensor_shape,
+)
+from colossalai.gemini.ophooks import BaseOpHook
+from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer
-
+from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
+from colossalai.utils.moe import sync_moe_model_param
 from colossalai.zero import convert_to_zero_v2
 from colossalai.zero.sharded_optim.sharded_optim_v2 import ShardedOptimizerV2
 
@@ -301,9 +300,9 @@ def initialize(model: nn.Module,
             model = model().to(get_current_device())
 
         # optimizer maybe a optimizer_cls
-        logger.warning("Initializing an non ZeRO model with optimizer class")
         if isinstance(optimizer, Callable):
             optimizer = optimizer(model.parameters())
+            logger.warning("Initializing an non ZeRO model with optimizer class")
 
     if not use_zero:
         if is_using_sequence():

From c20529fe78f52e36df209bd2ab4143609eec7535 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Thu, 12 Jan 2023 14:30:58 +0800
Subject: [PATCH 174/503] [examples] update autoparallel tutorial demo (#2449)

* [examples] update autoparallel tutorial demo

* add test_ci.sh

* polish

* add conda yaml
---
 .../auto_parallel_with_resnet.py              | 132 +++---------------
 .../tutorial/auto_parallel/environment.yaml   |  32 +++++
 examples/tutorial/auto_parallel/setup.py      |  13 ++
 examples/tutorial/auto_parallel/test_ci.sh    |  11 ++
 4 files changed, 72 insertions(+), 116 deletions(-)
 create mode 100644 examples/tutorial/auto_parallel/environment.yaml
 create mode 100644 examples/tutorial/auto_parallel/setup.py
 create mode 100644 examples/tutorial/auto_parallel/test_ci.sh

diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
index e4aff13e484a..1f0d720449e5 100644
--- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
+++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
@@ -4,23 +4,14 @@
 
 import torch
 from titans.utils import barrier_context
-from torch.fx import GraphModule
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
 from torchvision.models import resnet50
 from tqdm import tqdm
 
 import colossalai
-from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
-from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.solver.cost_graph import CostGraph
-from colossalai.auto_parallel.tensor_shard.solver.graph_analysis import GraphAnalyser
-from colossalai.auto_parallel.tensor_shard.solver.options import DataloaderOption, SolverOptions
-from colossalai.auto_parallel.tensor_shard.solver.solver import Solver
-from colossalai.auto_parallel.tensor_shard.solver.strategies_constructor import StrategiesConstructor
+from colossalai.auto_parallel.tensor_shard.initialize import autoparallelize
 from colossalai.core import global_context as gpc
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingLR
 from colossalai.utils import get_dataloader
@@ -28,12 +19,6 @@
 DATA_ROOT = Path(os.environ.get('DATA', '../data')).absolute()
 
 
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-s', '--synthetic', action="store_true", help="use synthetic dataset instead of CIFAR10")
-    return parser.parse_args()
-
-
 def synthesize_data():
     img = torch.rand(gpc.config.BATCH_SIZE, 3, 32, 32)
     label = torch.randint(low=0, high=10, size=(gpc.config.BATCH_SIZE,))
@@ -41,82 +26,15 @@ def synthesize_data():
 
 
 def main():
-    args = parse_args()
     colossalai.launch_from_torch(config='./config.py')
 
     logger = get_dist_logger()
 
-    if not args.synthetic:
-        with barrier_context():
-            # build dataloaders
-            train_dataset = CIFAR10(root=DATA_ROOT,
-                                    download=True,
-                                    transform=transforms.Compose([
-                                        transforms.RandomCrop(size=32, padding=4),
-                                        transforms.RandomHorizontalFlip(),
-                                        transforms.ToTensor(),
-                                        transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
-                                                             std=[0.2023, 0.1994, 0.2010]),
-                                    ]))
-
-        test_dataset = CIFAR10(root=DATA_ROOT,
-                               train=False,
-                               transform=transforms.Compose([
-                                   transforms.ToTensor(),
-                                   transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
-                               ]))
-
-        train_dataloader = get_dataloader(
-            dataset=train_dataset,
-            add_sampler=True,
-            shuffle=True,
-            batch_size=gpc.config.BATCH_SIZE,
-            pin_memory=True,
-        )
-
-        test_dataloader = get_dataloader(
-            dataset=test_dataset,
-            add_sampler=True,
-            batch_size=gpc.config.BATCH_SIZE,
-            pin_memory=True,
-        )
-    else:
-        train_dataloader, test_dataloader = None, None
-
-    # initialize device mesh
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
     # trace the model with meta data
-    tracer = ColoTracer()
     model = resnet50(num_classes=10).cuda()
     input_sample = {'x': torch.rand([gpc.config.BATCH_SIZE * torch.distributed.get_world_size(), 3, 32, 32]).to('meta')}
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-
-    # prepare info for solver
-    solver_options = SolverOptions(dataloader_option=DataloaderOption.DISTRIBUTED)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    graph_analyser = GraphAnalyser(gm)
-
-    # solve the solution
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    ret = solver.call_solver_serialized_args()
-    solution = list(ret[0])
-    if gpc.get_global_rank() == 0:
-        for index, node in enumerate(graph.nodes):
-            print(node.name, node.strategies_vector[solution[index]].name)
-
-    # process the graph for distributed training ability
-    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(gm, solution, device_mesh)
-    gm = runtime_apply_pass(gm)
-    gm.recompile()
 
+    model = autoparallelize(model, input_sample)
     # build criterion
     criterion = torch.nn.CrossEntropyLoss()
 
@@ -127,65 +45,47 @@ def main():
     lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)
 
     for epoch in range(gpc.config.NUM_EPOCHS):
-        gm.train()
+        model.train()
 
-        if args.synthetic:
-            # if we use synthetic data
-            # we assume it only has 30 steps per epoch
-            num_steps = range(30)
+        # if we use synthetic data
+        # we assume it only has 30 steps per epoch
+        num_steps = range(30)
 
-        else:
-            # we use the actual number of steps for training
-            num_steps = range(len(train_dataloader))
-            data_iter = iter(train_dataloader)
         progress = tqdm(num_steps)
 
         for _ in progress:
-            if args.synthetic:
-                # generate fake data
-                img, label = synthesize_data()
-            else:
-                # get the real data
-                img, label = next(data_iter)
+            # generate fake data
+            img, label = synthesize_data()
 
             img = img.cuda()
             label = label.cuda()
             optimizer.zero_grad()
-            output = gm(img, sharding_spec_dict, origin_spec_dict, comm_actions_dict)
+            output = model(img)
             train_loss = criterion(output, label)
             train_loss.backward(train_loss)
             optimizer.step()
         lr_scheduler.step()
 
         # run evaluation
-        gm.eval()
+        model.eval()
         correct = 0
         total = 0
 
-        if args.synthetic:
-            # if we use synthetic data
-            # we assume it only has 10 steps for evaluation
-            num_steps = range(30)
+        # if we use synthetic data
+        # we assume it only has 10 steps for evaluation
+        num_steps = range(30)
 
-        else:
-            # we use the actual number of steps for training
-            num_steps = range(len(test_dataloader))
-            data_iter = iter(test_dataloader)
         progress = tqdm(num_steps)
 
         for _ in progress:
-            if args.synthetic:
-                # generate fake data
-                img, label = synthesize_data()
-            else:
-                # get the real data
-                img, label = next(data_iter)
+            # generate fake data
+            img, label = synthesize_data()
 
             img = img.cuda()
             label = label.cuda()
 
             with torch.no_grad():
-                output = gm(img, sharding_spec_dict, origin_spec_dict, comm_actions_dict)
+                output = model(img)
                 test_loss = criterion(output, label)
             pred = torch.argmax(output, dim=-1)
             correct += torch.sum(pred == label)
diff --git a/examples/tutorial/auto_parallel/environment.yaml b/examples/tutorial/auto_parallel/environment.yaml
new file mode 100644
index 000000000000..5b811631a19f
--- /dev/null
+++ b/examples/tutorial/auto_parallel/environment.yaml
@@ -0,0 +1,32 @@
+name: auto
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=2_kmp_llvm
+  - blas=1.0=mkl
+  - brotlipy=0.7.0=py38h27cfd23_1003
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2022.12.7=ha878542_0
+  - certifi=2022.12.7=pyhd8ed1ab_0
+  - cffi=1.15.1=py38h74dc2b5_0
+  - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - coin-or-cbc=2.10.8=h3786ebc_0
+  - coin-or-cgl=0.60.6=h6f57e76_2
+  - coin-or-clp=1.17.7=hc56784d_2
+  - coin-or-osi=0.108.7=h2720bb7_2
+  - coin-or-utils=2.11.6=h202d8b1_2
+  - python=3.8.13
+  - pip=22.2.2
+  - cudatoolkit=11.3
+  - pytorch=1.12.1
+  - torchvision=0.13.1
+  - numpy=1.23.1
+  - pip:
+    - titans
+    - torch==1.12.1
+    - pulp==2.7.0
+    - datasets
+    - colossalai
diff --git a/examples/tutorial/auto_parallel/setup.py b/examples/tutorial/auto_parallel/setup.py
new file mode 100644
index 000000000000..6e6cff32ed23
--- /dev/null
+++ b/examples/tutorial/auto_parallel/setup.py
@@ -0,0 +1,13 @@
+from setuptools import find_packages, setup
+
+setup(
+    name='auto_parallel',
+    version='0.0.1',
+    description='',
+    packages=find_packages(),
+    install_requires=[
+        'torch',
+        'numpy',
+        'tqdm',
+    ],
+)
diff --git a/examples/tutorial/auto_parallel/test_ci.sh b/examples/tutorial/auto_parallel/test_ci.sh
new file mode 100644
index 000000000000..74332548f623
--- /dev/null
+++ b/examples/tutorial/auto_parallel/test_ci.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -euxo pipefail
+
+conda init bash
+conda env create -f environment.yaml
+conda activate auto
+cd ../../..
+pip uninstall colossalai
+pip install -v .
+cd ./examples/tutorial/auto_parallel
+colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py -s

From 14d929936065fae2ebd99bc5d4ab32d8de7db11e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 12 Jan 2023 14:52:09 +0800
Subject: [PATCH 175/503] [cli] fixed hostname mismatch error (#2465)

---
 colossalai/cli/launcher/hostinfo.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/colossalai/cli/launcher/hostinfo.py b/colossalai/cli/launcher/hostinfo.py
index 2f0830c5880d..065cbc37101f 100644
--- a/colossalai/cli/launcher/hostinfo.py
+++ b/colossalai/cli/launcher/hostinfo.py
@@ -1,5 +1,5 @@
-from typing import List
 import socket
+from typing import List
 
 
 class HostInfo:
@@ -35,9 +35,14 @@ def is_host_localhost(hostname: str, port: str = None) -> None:
 
         if port is None:
             port = 22    # no port specified, lets just use the ssh port
-        hostname = socket.getfqdn(hostname)
+
+        # socket.getfqdn("127.0.0.1") does not return localhost
+        # on some users' machines
+        # thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
         if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
             return True
+
+        hostname = socket.getfqdn(hostname)
         localhost = socket.gethostname()
         localaddrs = socket.getaddrinfo(localhost, port)
         targetaddrs = socket.getaddrinfo(hostname, port)

From e6943e2d11fbe7dd69c694e435f598cd140b1574 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 12 Jan 2023 16:26:42 +0800
Subject: [PATCH 176/503] [example] integrate autoparallel demo with CI (#2466)

* [example] integrate autoparallel demo with CI

* polish code

* polish code

* polish code

* polish code
---
 examples/tutorial/auto_parallel/README.md     | 95 ++++++-------------
 .../auto_parallel_with_resnet.py              | 18 +---
 examples/tutorial/auto_parallel/config.py     |  4 +-
 .../tutorial/auto_parallel/environment.yaml   | 32 -------
 .../tutorial/auto_parallel/requirements.txt   |  9 +-
 examples/tutorial/auto_parallel/test_ci.sh    | 11 +--
 6 files changed, 43 insertions(+), 126 deletions(-)
 delete mode 100644 examples/tutorial/auto_parallel/environment.yaml

diff --git a/examples/tutorial/auto_parallel/README.md b/examples/tutorial/auto_parallel/README.md
index e99a018c2da1..bb014b9067b2 100644
--- a/examples/tutorial/auto_parallel/README.md
+++ b/examples/tutorial/auto_parallel/README.md
@@ -1,73 +1,52 @@
-# Auto-Parallelism with ResNet
+# Auto-Parallelism
 
-## 🚀Quick Start
-### Auto-Parallel Tutorial
-1. Install `pulp` and `coin-or-cbc` for the solver.
-```bash
-pip install pulp
-conda install -c conda-forge coin-or-cbc
-```
-2. Run the auto parallel resnet example with 4 GPUs with synthetic dataset.
-```bash
-colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py -s
-```
+## Table of contents
 
-You should expect to the log like this. This log shows the edge cost on the computation graph as well as the sharding strategy for an operation. For example, `layer1_0_conv1 S01R = S01R X RR` means that the first dimension (batch) of the input and output is sharded while the weight is not sharded (S means sharded, R means replicated), simply equivalent to data parallel training.
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-parallel%20demo.png)
+- [Auto-Parallelism](#auto-parallelism)
+  - [Table of contents](#table-of-contents)
+  - [📚 Overview](#-overview)
+  - [🚀 Quick Start](#-quick-start)
+    - [Setup](#setup)
+    - [Auto-Parallel Tutorial](#auto-parallel-tutorial)
+    - [Auto-Checkpoint Tutorial](#auto-checkpoint-tutorial)
 
 
-### Auto-Checkpoint Tutorial
-1. Stay in the `auto_parallel` folder.
-2. Install the dependencies.
-```bash
-pip install matplotlib transformers
-```
-3. Run a simple resnet50 benchmark to automatically checkpoint the model.
-```bash
-python auto_ckpt_solver_test.py --model resnet50
-```
+## 📚 Overview
 
-You should expect the log to be like this
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-ckpt%20demo.png)
+This tutorial folder contains a simple demo to run auto-parallelism with ResNet. Meanwhile, this directory also contains demo scripts to run automatic activation checkpointing, but both features are still experimental for now and there is no guarantee that they will work for your version of Colossal-AI.
 
-This shows that given different memory budgets, the model is automatically injected with activation checkpoint and its time taken per iteration. You can run this benchmark for GPT as well but it can much longer since the model is larger.
-```bash
-python auto_ckpt_solver_test.py --model gpt2
-```
+## 🚀 Quick Start
 
-4. Run a simple benchmark to find the optimal batch size for checkpointed model.
-```bash
-python auto_ckpt_batchsize_test.py
-```
+### Setup
 
-You can expect the log to be like
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-ckpt%20batchsize.png)
-
-
-## Prepare Dataset
-
-We use CIFAR10 dataset in this example. You should invoke the `donwload_cifar10.py` in the tutorial root directory or directly run the `auto_parallel_with_resnet.py`.
-The dataset will be downloaded to `colossalai/examples/tutorials/data` by default.
-If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
+1. Create a conda environment
 
 ```bash
-export DATA=/path/to/data
+conda create -n auto python=3.8
+conda activate auto
 ```
 
-## extra requirements to use autoparallel
+2. Install `requirements` and `coin-or-cbc` for the solver.
 
 ```bash
-pip install pulp
-conda install coin-or-cbc
+pip install -r requirements.txt
+conda install -c conda-forge coin-or-cbc
 ```
 
-## Run on 2*2 device mesh
+
+### Auto-Parallel Tutorial
+
+Run the auto parallel resnet example with 4 GPUs with synthetic dataset.
 
 ```bash
 colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py
 ```
 
-## Auto Checkpoint Benchmarking
+You should expect to see a log like this. This log shows the edge cost on the computation graph as well as the sharding strategy for an operation. For example, `layer1_0_conv1 S01R = S01R X RR` means that the first dimension (batch) of the input and output is sharded while the weight is not sharded (S means sharded, R means replicated), simply equivalent to data parallel training.
+![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-parallel%20demo.png)
+
+
+### Auto-Checkpoint Tutorial
 
 We prepare two bechmarks for you to test the performance of auto checkpoint
 
@@ -86,21 +65,3 @@ python auto_ckpt_solver_test.py --model resnet50
 # tun auto_ckpt_batchsize_test.py
 python auto_ckpt_batchsize_test.py
 ```
-
-There are some results for your reference
-
-## Auto Checkpoint Solver Test
-
-### ResNet 50
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/resnet50_benchmark.png)
-
-### GPT2 Medium
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gpt2_benchmark.png)
-
-## Auto Checkpoint Batch Size Test
-```bash
-===============test summary================
-batch_size: 512, peak memory: 73314.392 MB, through put: 254.286 images/s
-batch_size: 1024, peak memory: 73316.216 MB, through put: 397.608 images/s
-batch_size: 2048, peak memory: 72927.837 MB, through put: 277.429 images/s
-```
diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
index 1f0d720449e5..15429f19cbcf 100644
--- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
+++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
@@ -1,11 +1,4 @@
-import argparse
-import os
-from pathlib import Path
-
 import torch
-from titans.utils import barrier_context
-from torchvision import transforms
-from torchvision.datasets import CIFAR10
 from torchvision.models import resnet50
 from tqdm import tqdm
 
@@ -14,9 +7,6 @@
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingLR
-from colossalai.utils import get_dataloader
-
-DATA_ROOT = Path(os.environ.get('DATA', '../data')).absolute()
 
 
 def synthesize_data():
@@ -48,9 +38,8 @@ def main():
         model.train()
 
         # if we use synthetic data
-        # we assume it only has 30 steps per epoch
-        num_steps = range(30)
-
+        # we assume it only has 10 steps per epoch
+        num_steps = range(10)
         progress = tqdm(num_steps)
 
         for _ in progress:
@@ -73,8 +62,7 @@ def main():
 
         # if we use synthetic data
         # we assume it only has 10 steps for evaluation
-        num_steps = range(30)
-
+        num_steps = range(10)
         progress = tqdm(num_steps)
 
         for _ in progress:
diff --git a/examples/tutorial/auto_parallel/config.py b/examples/tutorial/auto_parallel/config.py
index fa14eda740f7..52e0abcef698 100644
--- a/examples/tutorial/auto_parallel/config.py
+++ b/examples/tutorial/auto_parallel/config.py
@@ -1,2 +1,2 @@
-BATCH_SIZE = 128
-NUM_EPOCHS = 10
+BATCH_SIZE = 32
+NUM_EPOCHS = 2
diff --git a/examples/tutorial/auto_parallel/environment.yaml b/examples/tutorial/auto_parallel/environment.yaml
deleted file mode 100644
index 5b811631a19f..000000000000
--- a/examples/tutorial/auto_parallel/environment.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: auto
-channels:
-  - pytorch
-  - conda-forge
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=conda_forge
-  - _openmp_mutex=4.5=2_kmp_llvm
-  - blas=1.0=mkl
-  - brotlipy=0.7.0=py38h27cfd23_1003
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2022.12.7=ha878542_0
-  - certifi=2022.12.7=pyhd8ed1ab_0
-  - cffi=1.15.1=py38h74dc2b5_0
-  - charset-normalizer=2.0.4=pyhd3eb1b0_0
-  - coin-or-cbc=2.10.8=h3786ebc_0
-  - coin-or-cgl=0.60.6=h6f57e76_2
-  - coin-or-clp=1.17.7=hc56784d_2
-  - coin-or-osi=0.108.7=h2720bb7_2
-  - coin-or-utils=2.11.6=h202d8b1_2
-  - python=3.8.13
-  - pip=22.2.2
-  - cudatoolkit=11.3
-  - pytorch=1.12.1
-  - torchvision=0.13.1
-  - numpy=1.23.1
-  - pip:
-    - titans
-    - torch==1.12.1
-    - pulp==2.7.0
-    - datasets
-    - colossalai
diff --git a/examples/tutorial/auto_parallel/requirements.txt b/examples/tutorial/auto_parallel/requirements.txt
index 137a69e80498..ce89e7c80070 100644
--- a/examples/tutorial/auto_parallel/requirements.txt
+++ b/examples/tutorial/auto_parallel/requirements.txt
@@ -1,2 +1,7 @@
-colossalai >= 0.1.12
-torch >= 1.8.1
+torch
+colossalai
+titans
+pulp
+datasets
+matplotlib
+transformers
diff --git a/examples/tutorial/auto_parallel/test_ci.sh b/examples/tutorial/auto_parallel/test_ci.sh
index 74332548f623..bf6275b673ff 100644
--- a/examples/tutorial/auto_parallel/test_ci.sh
+++ b/examples/tutorial/auto_parallel/test_ci.sh
@@ -1,11 +1,6 @@
 #!/bin/bash
 set -euxo pipefail
 
-conda init bash
-conda env create -f environment.yaml
-conda activate auto
-cd ../../..
-pip uninstall colossalai
-pip install -v .
-cd ./examples/tutorial/auto_parallel
-colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py -s
+pip install -r requirements.txt
+conda install -c conda-forge coin-or-cbc
+colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py

From 867c8c2d3a90bbf55a5bedba80a3aeabe0299d0f Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Fri, 13 Jan 2023 10:05:58 +0800
Subject: [PATCH 177/503] [zero] low level optim supports ProcessGroup (#2464)

---
 colossalai/zero/sharded_optim/_utils.py       | 25 +++++--
 .../sharded_optim/bookkeeping/base_store.py   | 13 +++-
 .../sharded_optim/bookkeeping/bucket_store.py |  9 +--
 .../bookkeeping/parameter_store.py            |  8 ++-
 .../zero/sharded_optim/low_level_optim.py     | 69 +++++++++++--------
 .../language/gpt/gemini/train_gpt_demo.py     | 17 +++--
 .../test_zero/low_level_zero/test_grad_acc.py | 13 +++-
 .../test_zero/low_level_zero/test_zero1_2.py  |  6 ++
 8 files changed, 107 insertions(+), 53 deletions(-)

diff --git a/colossalai/zero/sharded_optim/_utils.py b/colossalai/zero/sharded_optim/_utils.py
index 9a839a5705c3..7369f8a2edde 100644
--- a/colossalai/zero/sharded_optim/_utils.py
+++ b/colossalai/zero/sharded_optim/_utils.py
@@ -1,4 +1,5 @@
 import math
+from typing import Optional
 
 import torch
 import torch.distributed as dist
@@ -7,6 +8,7 @@
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.tensor import ProcessGroup
 from colossalai.utils import is_model_parallel_parameter
 
 
@@ -101,7 +103,7 @@ def split_half_float_double(tensor_list):
     return buckets
 
 
-def reduce_tensor(tensor, dtype=None, dst_rank=None, parallel_mode=ParallelMode.DATA):
+def reduce_tensor_dp_group(tensor, dtype=None, dst_rank=None, pg: Optional[ProcessGroup] = None):
     """
     Reduce the tensor in the data parallel process group
 
@@ -114,7 +116,7 @@ def reduce_tensor(tensor, dtype=None, dst_rank=None, parallel_mode=ParallelMode.
     :type tensor: torch.Tensor
     :type dtype: torch.dtype, optional
     :type dst_rank: int, optional
-    :type parallel_mode: ParallelMode, optional
+    :type pg: ProcessGroup, optional
     """
     # use the original dtype
     if dtype is None:
@@ -126,8 +128,13 @@ def reduce_tensor(tensor, dtype=None, dst_rank=None, parallel_mode=ParallelMode.
     else:
         tensor_to_reduce = tensor
 
-    world_size = gpc.get_world_size(parallel_mode)
-    group = gpc.get_group(parallel_mode)
+    if isinstance(pg, ProcessGroup):
+        group = pg.dp_process_group()
+        world_size = pg.dp_world_size()
+    else:
+        world_size = gpc.get_world_size(ParallelMode.DATA)
+        group = gpc.get_group(ParallelMode.DATA)
+
     tensor_to_reduce.div_(world_size)
 
     # if rank is None, all reduce will be used
@@ -137,13 +144,19 @@ def reduce_tensor(tensor, dtype=None, dst_rank=None, parallel_mode=ParallelMode.
     if use_all_reduce:
         dist.all_reduce(tensor_to_reduce, group=group)
     else:
-        ranks_in_group = gpc.get_ranks_in_group(parallel_mode)
+        if pg is not None:
+            ranks_in_group = pg.dp_rank_list()
+        else:
+            ranks_in_group = gpc.get_ranks_in_group(ParallelMode.DATA)
         global_rank = ranks_in_group[dst_rank]
         dist.reduce(tensor=tensor_to_reduce, dst=global_rank, group=group)
 
     # recover the original dtype
     if tensor.dtype != dtype and tensor is not tensor_to_reduce:
-        local_rank = gpc.get_local_rank(parallel_mode)
+        if pg is not None:
+            local_rank = pg.dp_local_rank()
+        else:
+            local_rank = gpc.get_local_rank(ParallelMode.DATA)
         if use_all_reduce or dst_rank == local_rank:
             tensor.copy_(tensor_to_reduce)
 
diff --git a/colossalai/zero/sharded_optim/bookkeeping/base_store.py b/colossalai/zero/sharded_optim/bookkeeping/base_store.py
index d4436acaa4bf..3623ed1f048c 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/base_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/base_store.py
@@ -1,12 +1,19 @@
+from typing import Optional
+
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.tensor import ProcessGroup
 
 
 class BaseStore:
 
-    def __init__(self, dp_parallel_mode=ParallelMode.DATA):
-        self._world_size = gpc.get_world_size(dp_parallel_mode)
-        self._local_rank = gpc.get_local_rank(dp_parallel_mode)
+    def __init__(self, pg: Optional[ProcessGroup] = None):
+        if isinstance(pg, ProcessGroup):
+            self._world_size = pg.dp_world_size()
+            self._local_rank = pg.dp_local_rank()
+        else:
+            self._world_size = gpc.get_world_size(ParallelMode.DATA)
+            self._local_rank = gpc.get_local_rank(ParallelMode.DATA)
 
     @property
     def world_size(self):
diff --git a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
index 0f2b1bb88b58..aba61624e46e 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
@@ -1,13 +1,14 @@
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
+from typing import Optional
+
+from colossalai.tensor import ProcessGroup
 
 from .base_store import BaseStore
 
 
 class BucketStore(BaseStore):
 
-    def __init__(self, dp_parallel_mode):
-        super().__init__(dp_parallel_mode)
+    def __init__(self, pg: Optional[ProcessGroup] = None):
+        super().__init__(pg)
         self._grads = dict()
         self._params = dict()
         self._num_elements_in_bucket = dict()
diff --git a/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py b/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
index 09ebaaf9938c..c22186abee0f 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
@@ -1,14 +1,16 @@
-from typing import List
+from typing import List, Optional
 
 from torch import Tensor
 
+from colossalai.tensor import ProcessGroup
+
 from .base_store import BaseStore
 
 
 class ParameterStore(BaseStore):
 
-    def __init__(self, dp_paralle_mode):
-        super().__init__(dp_paralle_mode)
+    def __init__(self, pg: Optional[ProcessGroup] = None):
+        super().__init__(pg)
         # param partitioning data structures
         self._fp16_param_to_rank = dict()
         self._rank_groupid_to_fp16_param_list = dict()
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index c437ac54939c..e372eaa50be4 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -1,5 +1,5 @@
 from functools import partial
-from itertools import groupby
+from typing import Optional
 
 import torch
 import torch.distributed as dist
@@ -10,6 +10,7 @@
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.tensor import ProcessGroup
 from colossalai.utils.cuda import get_current_device
 
 from ._utils import (
@@ -18,7 +19,7 @@
     flatten,
     get_grad_accumulate_object,
     has_inf_or_nan,
-    reduce_tensor,
+    reduce_tensor_dp_group,
     release_param_grad,
     split_half_float_double,
     sync_param,
@@ -33,7 +34,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
     def __init__(
             self,
             optimizer: Optimizer,
-
+            pg: Optional[ProcessGroup] = None,
     # grad scaler config
             initial_scale=2**16,
             min_scale=1,
@@ -54,9 +55,6 @@ def __init__(
 
     # stage 2
             partition_grad=False,
-            dp_parallel_mode=ParallelMode.DATA,
-            mp_parallel_mode=ParallelMode.MODEL,
-
     # cpu offload
             cpu_offload=False,
 
@@ -76,21 +74,33 @@ def __init__(
         # stage 2
         self._partition_grads = partition_grad
 
-        # cpu_offload
         self._cpu_offload = cpu_offload
 
-        # get process groups
-        self._dp_parallel_mode = dp_parallel_mode
-        self._mp_parallel_mode = mp_parallel_mode
-        self._local_rank = gpc.get_local_rank(dp_parallel_mode)
-        self._world_size = gpc.get_world_size(dp_parallel_mode)
-
-        self._dp_group = gpc.get_group(dp_parallel_mode)
-        if gpc.is_initialized(mp_parallel_mode) and gpc.get_world_size(mp_parallel_mode) > 1:
-            self._mp_group = gpc.get_group(mp_parallel_mode)
+        self._pg = pg
+        if isinstance(pg, ProcessGroup):
+            self._local_rank = pg.dp_local_rank()
+            self._world_size = pg.dp_world_size()
+            self._dp_group = pg.dp_process_group()
+            if pg.tp_world_size() > 1:
+                self._mp_group = pg.tp_process_group()
+            else:
+                self._mp_group = None
+        elif pg is None:
+            dp_parallel_mode = ParallelMode.DATA
+            mp_parallel_mode = ParallelMode.MODEL
+
+            self._dp_parallel_mode = dp_parallel_mode
+            self._mp_parallel_mode = mp_parallel_mode
+            self._local_rank = gpc.get_local_rank(dp_parallel_mode)
+            self._world_size = gpc.get_world_size(dp_parallel_mode)
+
+            self._dp_group = gpc.get_group(dp_parallel_mode)
+            if gpc.is_initialized(mp_parallel_mode) and gpc.get_world_size(mp_parallel_mode) > 1:
+                self._mp_group = gpc.get_group(mp_parallel_mode)
+            else:
+                self._mp_group = None
         else:
-            self._mp_group = None
-
+            raise TypeError(f"pg should be None or a ProcesGroup")
         # fp16 and fp32 params for mixed precision training
         self._fp16_param_groups = dict()
         self._fp32_flat_param_groups_of_current_rank = dict()
@@ -126,9 +136,14 @@ def __init__(
 
         # ParameterStore will manage the tensor buffers used for zero
         # it will not manage the tensors used by mixed precision training
-        self._param_store = ParameterStore(self._dp_parallel_mode)
-        self._grad_store = GradientStore(self._dp_parallel_mode)
-        self._bucket_store = BucketStore(self._dp_parallel_mode)
+        if self._pg is not None:
+            self._param_store = ParameterStore(self._pg)
+            self._grad_store = GradientStore(self._pg)
+            self._bucket_store = BucketStore(self._pg)
+        else:
+            self._param_store = ParameterStore(self._dp_parallel_mode)
+            self._grad_store = GradientStore(self._dp_parallel_mode)
+            self._bucket_store = BucketStore(self._dp_parallel_mode)
 
         # iterate over the param group in the optimizer
         # partition these param groups for data parallel training
@@ -223,9 +238,7 @@ def _partition_param_list(self, param_list):
             numel_per_rank[rank_to_go] += param.numel()
 
         if self._verbose:
-            self._logger.info(f'Number of elements on ranks: {numel_per_rank}',
-                              ranks=[0],
-                              parallel_mode=self._dp_parallel_mode)
+            self._logger.info(f'Number of elements on ranks: {numel_per_rank}', ranks=[0])
         return params_per_rank
 
     def _sanity_checks(self):
@@ -371,10 +384,10 @@ def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
 
         with torch.cuda.stream(stream):
             flat = bucket.flatten()
-            reduced_flat = reduce_tensor(tensor=flat,
-                                         dtype=self._communication_dtype,
-                                         dst_rank=reduce_rank,
-                                         parallel_mode=self._dp_parallel_mode)
+            reduced_flat = reduce_tensor_dp_group(tensor=flat,
+                                                  dtype=self._communication_dtype,
+                                                  dst_rank=reduce_rank,
+                                                  pg=self._pg)
 
             # update the reduced tensor
             if reduce_rank is None or reduce_rank == self._local_rank:
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 92cb7393c37b..7bec980f95bd 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -290,14 +290,19 @@ def main():
             from torch.distributed.optim import ZeroRedundancyOptimizer
             optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
     elif args.distplan.startswith("zero"):
+        pg = ProcessGroup()
         model = model.half()
-        partition_flag = args.distplan == "zero2"
+        partition_flag = (args.distplan == "zero2")
         optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
-        optimizer = LowLevelZeroOptimizer(optimizer,
-                                          reduce_bucket_size=12 * 1024 * 1024,
-                                          overlap_communication=True,
-                                          partition_grad=partition_flag,
-                                          verbose=True)
+
+        optimizer = LowLevelZeroOptimizer(
+            optimizer,
+            pg=pg,
+            reduce_bucket_size=12 * 1024 * 1024,
+            overlap_communication=True,
+            partition_grad=partition_flag,
+            verbose=True,
+        )
 
     # model is shared after TP
     numel = get_model_size(model)
diff --git a/tests/test_zero/low_level_zero/test_grad_acc.py b/tests/test_zero/low_level_zero/test_grad_acc.py
index c23b3a3e8fd8..a0d1ac531485 100644
--- a/tests/test_zero/low_level_zero/test_grad_acc.py
+++ b/tests/test_zero/low_level_zero/test_grad_acc.py
@@ -9,6 +9,7 @@
 from torch.testing import assert_close
 
 import colossalai
+from colossalai.tensor import ProcessGroup
 from colossalai.testing.random import seed_all
 from colossalai.utils import free_port
 from colossalai.zero import LowLevelZeroOptimizer
@@ -34,16 +35,18 @@ def exam_zero_1_2_grad_acc():
     # create model
     zero1_model = TestModel().cuda()
     zero2_model = copy.deepcopy(zero1_model)
-
+    pg = ProcessGroup()
     # create optimizer
     zero1_optimizer = torch.optim.Adam(zero1_model.parameters(), lr=1)
     zero2_optimizer = torch.optim.Adam(zero2_model.parameters(), lr=1)
     zero1_optimizer = LowLevelZeroOptimizer(zero1_optimizer,
+                                            pg=pg,
                                             overlap_communication=True,
                                             initial_scale=32,
                                             clip_grad_norm=1.0,
                                             verbose=True)
     zero2_optimizer = LowLevelZeroOptimizer(zero2_optimizer,
+                                            pg=pg,
                                             overlap_communication=True,
                                             partition_grad=True,
                                             initial_scale=32,
@@ -83,7 +86,7 @@ def fwd_bwd_func(number, cur_data):
         assert torch.equal(z1p.data, z2p.data)
 
 
-def exam_zero_1_grad_acc():
+def exam_zero_1_grad_acc(use_pg=True):
     local_rank = torch.distributed.get_rank()
     grad_scale = 32
     seed_all(2008)
@@ -92,6 +95,7 @@ def exam_zero_1_grad_acc():
     zero_model = TestModel()
     torch_model = copy.deepcopy(zero_model)
 
+    seed_all(2008)
     zero_model = zero_model.cuda()
     torch_model = DDP(torch_model.cuda(), bucket_cap_mb=0)
 
@@ -101,7 +105,9 @@ def exam_zero_1_grad_acc():
     # we only test stage 1 here
     # in `check_sharded_param_consistency.py`, we will test whether
     # level 1 and 2 will produce exactly the same results
+    pg = ProcessGroup() if use_pg else None    #ProcessGroup()
     zero_optimizer = LowLevelZeroOptimizer(zero_optimizer,
+                                           pg=pg,
                                            overlap_communication=False,
                                            initial_scale=grad_scale,
                                            reduce_bucket_size=262144,
@@ -152,7 +158,8 @@ def fwd_bwd_func(number, cur_data, check_flag):
 def run_dist(rank, world_size, port):
     colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
 
-    exam_zero_1_grad_acc()
+    exam_zero_1_grad_acc(True)
+    exam_zero_1_grad_acc(False)
     # exam_zero_1_2_grad_acc()
 
 
diff --git a/tests/test_zero/low_level_zero/test_zero1_2.py b/tests/test_zero/low_level_zero/test_zero1_2.py
index b02d3a6a4486..6924827fe4b4 100644
--- a/tests/test_zero/low_level_zero/test_zero1_2.py
+++ b/tests/test_zero/low_level_zero/test_zero1_2.py
@@ -9,6 +9,7 @@
 from torch.testing import assert_close
 
 import colossalai
+from colossalai.tensor import ProcessGroup
 from colossalai.testing.random import seed_all
 from colossalai.utils import free_port
 from colossalai.zero import LowLevelZeroOptimizer
@@ -58,14 +59,17 @@ def exam_zero_1_2():
     zero1_model = TestModel().cuda()
     zero2_model = copy.deepcopy(zero1_model)
 
+    pg = ProcessGroup()
     # create optimizer
     zero1_optimizer = torch.optim.Adam(zero1_model.parameters(), lr=1)
     zero2_optimizer = torch.optim.Adam(zero2_model.parameters(), lr=1)
     zero1_optimizer = LowLevelZeroOptimizer(zero1_optimizer,
+                                            pg=pg,
                                             overlap_communication=True,
                                             initial_scale=128,
                                             verbose=True)
     zero2_optimizer = LowLevelZeroOptimizer(zero2_optimizer,
+                                            pg=pg,
                                             overlap_communication=True,
                                             partition_grad=True,
                                             initial_scale=128)
@@ -127,7 +131,9 @@ def exam_zero_1_torch_ddp():
     # we only test stage 1 here
     # in `check_sharded_param_consistency.py`, we will test whether
     # level 1 and 2 will produce exactly the same results
+    pg = ProcessGroup()
     zero_optimizer = LowLevelZeroOptimizer(zero_optimizer,
+                                           pg=pg,
                                            overlap_communication=True,
                                            initial_scale=1,
                                            reduce_bucket_size=262144)

From 8e85d2440a7d980a37431110ed583260d6cca7fe Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 13 Jan 2023 13:31:27 +0800
Subject: [PATCH 178/503] [example] update vit ci script (#2469)

* [example] update vit ci script

* [example] update requirements

* [example] update requirements
---
 examples/images/vit/configs/vit_1d_tp2_ci.py | 32 ++++++++++++++++++++
 examples/images/vit/requirements.txt         |  6 ++++
 examples/images/vit/test_ci.sh               |  9 ++++++
 examples/images/vit/train.py                 | 25 +++++++++++----
 examples/images/vit/vit.py                   | 23 ++++++++------
 5 files changed, 79 insertions(+), 16 deletions(-)
 create mode 100644 examples/images/vit/configs/vit_1d_tp2_ci.py
 create mode 100644 examples/images/vit/test_ci.sh

diff --git a/examples/images/vit/configs/vit_1d_tp2_ci.py b/examples/images/vit/configs/vit_1d_tp2_ci.py
new file mode 100644
index 000000000000..e491e4ada45e
--- /dev/null
+++ b/examples/images/vit/configs/vit_1d_tp2_ci.py
@@ -0,0 +1,32 @@
+from colossalai.amp import AMP_TYPE
+
+# hyperparameters
+# BATCH_SIZE is per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 8
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 3
+WARMUP_EPOCHS = 1
+
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 32
+DEPTH = 2
+NUM_HEADS = 4
+MLP_RATIO = 4
+NUM_CLASSES = 10
+CHECKPOINT = False
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
+
+USE_DDP = True
+TP_WORLD_SIZE = 2
+TP_TYPE = 'row'
+parallel = dict(tensor=dict(mode="1d", size=TP_WORLD_SIZE),)
+
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+gradient_accumulation = 2
+
+LOG_PATH = "./log_ci"
diff --git a/examples/images/vit/requirements.txt b/examples/images/vit/requirements.txt
index 137a69e80498..1f69794ebe70 100644
--- a/examples/images/vit/requirements.txt
+++ b/examples/images/vit/requirements.txt
@@ -1,2 +1,8 @@
 colossalai >= 0.1.12
 torch >= 1.8.1
+numpy>=1.24.1
+timm>=0.6.12
+titans>=0.0.7
+tqdm>=4.61.2
+transformers>=4.25.1
+nvidia-dali-cuda110>=1.8.0 --extra-index-url https://developer.download.nvidia.com/compute/redist
diff --git a/examples/images/vit/test_ci.sh b/examples/images/vit/test_ci.sh
new file mode 100644
index 000000000000..41d25ee23521
--- /dev/null
+++ b/examples/images/vit/test_ci.sh
@@ -0,0 +1,9 @@
+export OMP_NUM_THREADS=4
+
+pip install -r requirements.txt
+
+# train
+colossalai run \
+--nproc_per_node 4 train.py \
+--config configs/vit_1d_tp2_ci.py \
+--dummy_data
diff --git a/examples/images/vit/train.py b/examples/images/vit/train.py
index de39801c7972..0b4489244368 100644
--- a/examples/images/vit/train.py
+++ b/examples/images/vit/train.py
@@ -7,6 +7,7 @@
 from timm.models.vision_transformer import _create_vision_transformer
 from titans.dataloader.imagenet import build_dali_imagenet
 from tqdm import tqdm
+from vit import DummyDataLoader
 
 import colossalai
 from colossalai.core import global_context as gpc
@@ -56,8 +57,8 @@ def init_spec_func(model, tp_type):
 def train_imagenet():
 
     parser = colossalai.get_default_parser()
-    parser.add_argument('--from_torch', default=True, action='store_true')
-    parser.add_argument('--resume_from', default=False)
+    parser.add_argument('--resume_from', default=False, action='store_true')
+    parser.add_argument('--dummy_data', default=False, action='store_true')
 
     args = parser.parse_args()
     colossalai.launch_from_torch(config=args.config)
@@ -74,10 +75,22 @@ def train_imagenet():
             logger.log_to_file(log_path)
 
     logger.info('Build data loader', ranks=[0])
-    root = os.environ['DATA']
-    train_dataloader, test_dataloader = build_dali_imagenet(root,
-                                                            train_batch_size=gpc.config.BATCH_SIZE,
-                                                            test_batch_size=gpc.config.BATCH_SIZE)
+    if not args.dummy_data:
+        root = os.environ['DATA']
+        train_dataloader, test_dataloader = build_dali_imagenet(root,
+                                                                train_batch_size=gpc.config.BATCH_SIZE,
+                                                                test_batch_size=gpc.config.BATCH_SIZE)
+    else:
+        train_dataloader = DummyDataLoader(length=10,
+                                           batch_size=gpc.config.BATCH_SIZE,
+                                           category=gpc.config.NUM_CLASSES,
+                                           image_size=gpc.config.IMG_SIZE,
+                                           return_dict=False)
+        test_dataloader = DummyDataLoader(length=5,
+                                          batch_size=gpc.config.BATCH_SIZE,
+                                          category=gpc.config.NUM_CLASSES,
+                                          image_size=gpc.config.IMG_SIZE,
+                                          return_dict=False)
 
     logger.info('Build model', ranks=[0])
 
diff --git a/examples/images/vit/vit.py b/examples/images/vit/vit.py
index 14c870b39268..f22e8ea90cec 100644
--- a/examples/images/vit/vit.py
+++ b/examples/images/vit/vit.py
@@ -32,21 +32,24 @@ def __len__(self):
 
 
 class DummyDataLoader(DummyDataGenerator):
-    batch_size = 4
-    channel = 3
-    category = 8
-    image_size = 224
+
+    def __init__(self, length=10, batch_size=4, channel=3, category=8, image_size=224, return_dict=True):
+        super().__init__(length)
+        self.batch_size = batch_size
+        self.channel = channel
+        self.category = category
+        self.image_size = image_size
+        self.return_dict = return_dict
 
     def generate(self):
         image_dict = {}
-        image_dict['pixel_values'] = torch.rand(DummyDataLoader.batch_size,
-                                                DummyDataLoader.channel,
-                                                DummyDataLoader.image_size,
-                                                DummyDataLoader.image_size,
-                                                device=get_current_device()) * 2 - 1
-        image_dict['label'] = torch.randint(DummyDataLoader.category, (DummyDataLoader.batch_size,),
+        image_dict['pixel_values'] = torch.rand(
+            self.batch_size, self.channel, self.image_size, self.image_size, device=get_current_device()) * 2 - 1
+        image_dict['label'] = torch.randint(self.category, (self.batch_size,),
                                             dtype=torch.int64,
                                             device=get_current_device())
+        if not self.return_dict:
+            return image_dict['pixel_values'], image_dict['label']
         return image_dict
 
 
From 8b7495dd541ea12e1af84b3a3a0e24abc1e847d1 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 13 Jan 2023 14:40:05 +0800
Subject: [PATCH 179/503] [example] integrate seq-parallel tutorial with CI
 (#2463)

---
 .../kernel/cuda_native/scaled_softmax.py      |  17 ++-
 .../tutorial/large_batch_optimizer/README.md  |   8 +-
 examples/tutorial/sequence_parallel/README.md | 141 ++++--------------
 examples/tutorial/sequence_parallel/config.py |  15 +-
 .../sequence_parallel/requirements.txt        |   4 +-
 .../tutorial/sequence_parallel/test_ci.sh     |   7 +
 examples/tutorial/sequence_parallel/train.py  |  44 ++----
 7 files changed, 69 insertions(+), 167 deletions(-)
 create mode 100644 examples/tutorial/sequence_parallel/test_ci.sh

diff --git a/colossalai/kernel/cuda_native/scaled_softmax.py b/colossalai/kernel/cuda_native/scaled_softmax.py
index 3f0260aaed87..44d750c5cbde 100644
--- a/colossalai/kernel/cuda_native/scaled_softmax.py
+++ b/colossalai/kernel/cuda_native/scaled_softmax.py
@@ -114,6 +114,13 @@ def __init__(
         self.softmax_in_fp32 = softmax_in_fp32
         self.scale = scale
 
+        try:
+            from colossalai._C import scaled_masked_softmax
+        except ImportError:
+            from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
+            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
+        self.scaled_masked_softmax = scaled_masked_softmax
+
         assert (self.scale is None or softmax_in_fp32), "softmax should be in fp32 when scaled"
 
     def forward(self, input, mask):
@@ -178,11 +185,5 @@ def forward_torch_softmax(self, input, mask):
 
         return probs
 
-    @staticmethod
-    def get_batch_per_block(sq, sk, b, np):
-        try:
-            import colossalai._C.scaled_masked_softmax
-        except ImportError:
-            raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
-
-        return colossalai._C.scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
+    def get_batch_per_block(self, sq, sk, b, np):
+        return self.scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
diff --git a/examples/tutorial/large_batch_optimizer/README.md b/examples/tutorial/large_batch_optimizer/README.md
index d85afa427518..1a17c2d8740f 100644
--- a/examples/tutorial/large_batch_optimizer/README.md
+++ b/examples/tutorial/large_batch_optimizer/README.md
@@ -1,9 +1,11 @@
-# Comparison of Large Batch Training Optimization
+# Large Batch Training Optimization
 
 ## Table of contents
 
-- [Overview](#-overview)
-- [Quick Start](#-quick-start)
+- [Large Batch Training Optimization](#large-batch-training-optimization)
+  - [Table of contents](#table-of-contents)
+  - [📚 Overview](#-overview)
+  - [🚀 Quick Start](#-quick-start)
 
 ## 📚 Overview
 
diff --git a/examples/tutorial/sequence_parallel/README.md b/examples/tutorial/sequence_parallel/README.md
index 7058f53db8b6..1b7c60e22861 100644
--- a/examples/tutorial/sequence_parallel/README.md
+++ b/examples/tutorial/sequence_parallel/README.md
@@ -1,139 +1,56 @@
-# Sequence Parallelism with BERT
+# Sequence Parallelism
 
-In this example, we implemented BERT with sequence parallelism. Sequence parallelism splits the input tensor and intermediate
-activation along the sequence dimension. This method can achieve better memory efficiency and allows us to train with larger batch size and longer sequence length.
+## Table of contents
 
-Paper: [Sequence Parallelism: Long Sequence Training from System Perspective](https://arxiv.org/abs/2105.13120)
+- [Sequence Parallelism](#sequence-parallelism)
+  - [Table of contents](#table-of-contents)
+  - [📚 Overview](#-overview)
+  - [🚀 Quick Start](#-quick-start)
+  - [🏎 How to Train with Sequence Parallelism](#-how-to-train-with-sequence-parallelism)
+    - [Step 1. Configure your parameters](#step-1-configure-your-parameters)
+    - [Step 2. Invoke parallel training](#step-2-invoke-parallel-training)
 
-## 🚀Quick Start
-1. Run with the following command
-```bash
-export PYTHONPATH=$PWD
-colossalai run --nproc_per_node 4 train.py -s
-```
-2. The default config is sequence parallel size = 2, pipeline size = 1, let’s change pipeline size to be 2 and try it again.
-
-
-## How to Prepare WikiPedia Dataset
-
-First, let's prepare the WikiPedia dataset from scratch. To generate a preprocessed dataset, we need four items:
-1. raw WikiPedia dataset
-2. wikipedia extractor (extract data from the raw dataset)
-3. vocabulary file
-4. preprocessing scripts (generate final data from extracted data)
-
-For the preprocessing script, we thank Megatron-LM for providing a preprocessing script to generate the corpus file.
-
-```python
-# download raw data
-mkdir data && cd ./data
-wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
-
-# install wiki extractor
-git clone https://github.com/FrankLeeeee/wikiextractor.git
-pip install ./wikiextractor
-
-# extractmodule
-wikiextractor --json enwiki-latest-pages-articles.xml.bz2
-cat text/*/* > ./corpus.json
-cd ..
-
-# download vocab file
-mkdir vocab && cd ./vocab
-wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-cd ..
-
-# preprocess some data
-git clone https://github.com/NVIDIA/Megatron-LM.git
-cd ./Megatron-LM
-python tools/preprocess_data.py \
-    --input ../data/corpus.json \
-    --output-prefix my-bert \
-    --vocab ../vocab/bert-large-uncased-vocab.txt \
-    --dataset-impl mmap \
-    --tokenizer-type BertWordPieceLowerCase \
-    --split-sentences \
-    --workers 24
-```
+## 📚 Overview
 
-After running the preprocessing scripts, you will obtain two files:
-1. my-bert_text_sentence.bin
-2. my-bert_text_sentence.idx
-
-If you happen to encouter `index out of range` problem when running Megatron's script,
-this is probably because that a sentence starts with a punctuation and cannot be tokenized. A work-around is to update `Encoder.encode` method with the code below:
-
-```python
-class Encoder(object):
-    def __init__(self, args):
-        ...
-
-    def initializer(self):
-        ...
-
-    def encode(self, json_line):
-        data = json.loads(json_line)
-        ids = {}
-        for key in self.args.json_keys:
-            text = data[key]
-            doc_ids = []
-
-            # lsg: avoid sentences which start with a punctuation
-            # as it cannot be tokenized by splitter
-            if len(text) > 0 and text[0] in string.punctuation:
-                text = text[1:]
-
-            for sentence in Encoder.splitter.tokenize(text):
-                sentence_ids = Encoder.tokenizer.tokenize(sentence)
-                if len(sentence_ids) > 0:
-                    doc_ids.append(sentence_ids)
-            if len(doc_ids) > 0 and self.args.append_eod:
-                doc_ids[-1].append(Encoder.tokenizer.eod)
-            ids[key] = doc_ids
-        return ids, len(json_line)
-```
+In this tutorial, we implement BERT with sequence parallelism. Sequence parallelism splits the input tensor and intermediate
+activations along the sequence dimension. This method can achieve better memory efficiency and allows us to train with larger batch sizes and longer sequence lengths.
 
-## How to Train with Sequence Parallelism
+Paper: [Sequence Parallelism: Long Sequence Training from System Perspective](https://arxiv.org/abs/2105.13120)
 
-We provided `train.py` for you to execute training. Before invoking the script, there are several
-steps to perform.
+## 🚀 Quick Start
 
-### Step 1. Set data path and vocab path
+1. Install PyTorch
 
-At the top of `config.py`, you can see two global variables `DATA_PATH` and `VOCAB_FILE_PATH`.
+2. Install the dependencies.
 
-```python
-DATA_PATH = <data-path>
-VOCAB_FILE_PATH = <vocab-path>
+```bash
+pip install -r requirements.txt
 ```
 
-`DATA_PATH` refers to the path to the data file generated by Megatron's script. For example, in the section above, you should get two data files (my-bert_text_sentence.bin and my-bert_text_sentence.idx). You just need to `DATA_PATH` to the path to the bin file without the file extension.
+3. Run with the following command
 
-For example, if your my-bert_text_sentence.bin is /home/Megatron-LM/my-bert_text_sentence.bin, then you should set
+```bash
+export PYTHONPATH=$PWD
 
-```python
-DATA_PATH = '/home/Megatron-LM/my-bert_text_sentence'
+# run with synthetic dataset
+colossalai run --nproc_per_node 4 train.py
 ```
 
-The `VOCAB_FILE_PATH` refers to the path to the vocabulary downloaded when you prepare the dataset
-(e.g. bert-large-uncased-vocab.txt).
+> The default config uses sequence parallel size = 2 and pipeline size = 1. Try changing the pipeline size to 2 and running it again.
 
-### Step 3. Make Dataset Helper
 
-Build BERT dataset helper. Requirements are `CUDA`, `g++`, `pybind11` and `make`.
+## 🏎 How to Train with Sequence Parallelism
 
-```python
-cd ./data/datasets
-make
-```
+We provide `train.py` for you to execute training. Before invoking the script, there are several
+steps to perform.
 
-### Step 3. Configure your parameters
+### Step 1. Configure your parameters
 
 In the `config.py` provided, a set of parameters are defined including training scheme, model, etc.
 You can also modify the ColossalAI setting. For example, if you wish to parallelize over the
 sequence dimension on 8 GPUs. You can change `size=4` to `size=8`. If you wish to use pipeline parallelism, you can set `pipeline=<num_of_pipeline_stages>`.
 
-### Step 4. Invoke parallel training
+### Step 2. Invoke parallel training
 
 Lastly, you can start training with sequence parallelism. How you invoke `train.py` depends on your
 machine setting.
diff --git a/examples/tutorial/sequence_parallel/config.py b/examples/tutorial/sequence_parallel/config.py
index df0c5282f032..6edf9cc2c7e5 100644
--- a/examples/tutorial/sequence_parallel/config.py
+++ b/examples/tutorial/sequence_parallel/config.py
@@ -1,11 +1,8 @@
 from colossalai.amp import AMP_TYPE
 
-DATA_PATH = ''
-VOCAB_FILE_PATH = ''
-
 # hyper-parameters
-TRAIN_ITERS = 1000000
-DECAY_ITERS = 990000
+TRAIN_ITERS = 10
+DECAY_ITERS = 4
 WARMUP_FRACTION = 0.01
 GLOBAL_BATCH_SIZE = 32    # dp world size * sentences per GPU
 EVAL_ITERS = 10
@@ -13,12 +10,12 @@
 LR = 0.0001
 MIN_LR = 1e-05
 WEIGHT_DECAY = 0.01
-SEQ_LENGTH = 512
+SEQ_LENGTH = 128
 
 # BERT config
-DEPTH = 12
-NUM_ATTENTION_HEADS = 12
-HIDDEN_SIZE = 768
+DEPTH = 4
+NUM_ATTENTION_HEADS = 4
+HIDDEN_SIZE = 128
 
 # model config
 ADD_BINARY_HEAD = False
diff --git a/examples/tutorial/sequence_parallel/requirements.txt b/examples/tutorial/sequence_parallel/requirements.txt
index 137a69e80498..b49a94554afb 100644
--- a/examples/tutorial/sequence_parallel/requirements.txt
+++ b/examples/tutorial/sequence_parallel/requirements.txt
@@ -1,2 +1,2 @@
-colossalai >= 0.1.12
-torch >= 1.8.1
+colossalai
+torch
diff --git a/examples/tutorial/sequence_parallel/test_ci.sh b/examples/tutorial/sequence_parallel/test_ci.sh
new file mode 100644
index 000000000000..7bc20de3b6e4
--- /dev/null
+++ b/examples/tutorial/sequence_parallel/test_ci.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -euxo pipefail
+
+pip install -r requirements.txt
+
+# run test
+colossalai run --nproc_per_node 4 train.py
diff --git a/examples/tutorial/sequence_parallel/train.py b/examples/tutorial/sequence_parallel/train.py
index b92061000d10..a89747b5845e 100644
--- a/examples/tutorial/sequence_parallel/train.py
+++ b/examples/tutorial/sequence_parallel/train.py
@@ -1,9 +1,8 @@
 import argparse
 
 import torch
-from data import build_train_valid_test_data_iterators
 from data.bert_helper import SequenceParallelDataIterator, get_batch_for_sequence_parallel
-from data.tokenizer import get_padded_vocab_size, initialize_tokenizer
+from data.dummy_dataloader import DummyDataloader
 from loss_func.bert_loss import BertLoss
 from lr_scheduler import AnnealingLR
 from model.bert import BertForPretrain, build_pipeline_bert
@@ -36,7 +35,7 @@ def parse_args():
 
 
 def pipeline_data_process_func(stage_output, micro_batch_data):
-    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = micro_batch_data 
+    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = micro_batch_data
     if gpc.is_first_rank(ParallelMode.PIPELINE):
         data = (tokens, padding_mask, types, lm_labels)
         label = (loss_mask, sentence_order)
@@ -53,36 +52,15 @@ def main():
 
     logger = get_dist_logger()
 
-    # build dataloader
-    if not args.synthetic:
-        initialize_tokenizer(gpc.config.VOCAB_FILE_PATH, tokenizer_type='BertWordPieceLowerCase')
-        VOCAB_SIZE = get_padded_vocab_size()
-        trainloader, validloader, testloader = build_train_valid_test_data_iterators(
-            train_iters=gpc.config.TRAIN_ITERS,
-            global_batch_size=gpc.config.GLOBAL_BATCH_SIZE,
-            eval_interval=gpc.config.EVAL_INTERVAL,
-            eval_iters=gpc.config.EVAL_ITERS,
-            data_prefix=[gpc.config.DATA_PATH],
-            data_impl='mmap',
-            splits_string='949,50,1',
-            max_seq_length=gpc.config.SEQ_LENGTH,
-            masked_lm_prob=0.15,
-            short_seq_prob=0.1,
-            seed=1234,
-            skip_warmup=True,
-            binary_head=False,
-        )
-    else:
-        from data.dummy_dataloader import DummyDataloader
-
-        BATCH_SIZE_PER_GPUS = gpc.config.GLOBAL_BATCH_SIZE // gpc.get_world_size(ParallelMode.DATA)
-        VOCAB_SIZE = 30528
-        trainloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
-                                      vocab_size=VOCAB_SIZE,
-                                      seq_length=gpc.config.SEQ_LENGTH)
-        validloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
-                                      vocab_size=VOCAB_SIZE,
-                                      seq_length=gpc.config.SEQ_LENGTH)
+    # build synthetic dataloader
+    BATCH_SIZE_PER_GPUS = gpc.config.GLOBAL_BATCH_SIZE // gpc.get_world_size(ParallelMode.DATA)
+    VOCAB_SIZE = 30528
+    trainloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
+                                  vocab_size=VOCAB_SIZE,
+                                  seq_length=gpc.config.SEQ_LENGTH)
+    validloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
+                                  vocab_size=VOCAB_SIZE,
+                                  seq_length=gpc.config.SEQ_LENGTH)
 
     logger.info("Dataloaders are built", ranks=[0])
 

From a5dc4253c6ea5ea1d9a5529a379eb2ffdf81622b Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Fri, 13 Jan 2023 14:56:17 +0800
Subject: [PATCH 180/503] [zero] polish low level optimizer (#2473)

---
 colossalai/zero/sharded_optim/_utils.py       |  30 ++--
 .../sharded_optim/bookkeeping/base_store.py   |  17 +--
 .../sharded_optim/bookkeeping/bucket_store.py |   8 +-
 .../bookkeeping/parameter_store.py            |   9 +-
 .../zero/sharded_optim/low_level_optim.py     | 135 +++++++++---------
 .../test_zero/low_level_zero/test_grad_acc.py |  12 +-
 .../test_zero/low_level_zero/test_zero1_2.py  |   8 +-
 7 files changed, 95 insertions(+), 124 deletions(-)

diff --git a/colossalai/zero/sharded_optim/_utils.py b/colossalai/zero/sharded_optim/_utils.py
index 7369f8a2edde..70d9c040cb53 100644
--- a/colossalai/zero/sharded_optim/_utils.py
+++ b/colossalai/zero/sharded_optim/_utils.py
@@ -103,7 +103,11 @@ def split_half_float_double(tensor_list):
     return buckets
 
 
-def reduce_tensor_dp_group(tensor, dtype=None, dst_rank=None, pg: Optional[ProcessGroup] = None):
+def reduce_tensor_dp_group(tensor: torch.Tensor,
+                           dtype: Optional[torch.dtype] = None,
+                           dst_local_rank: Optional[int] = None,
+                           dst_global_rank: Optional[int] = None,
+                           group: Optional[dist.ProcessGroup] = None):
     """
     Reduce the tensor in the data parallel process group
 
@@ -128,36 +132,22 @@ def reduce_tensor_dp_group(tensor, dtype=None, dst_rank=None, pg: Optional[Proce
     else:
         tensor_to_reduce = tensor
 
-    if isinstance(pg, ProcessGroup):
-        group = pg.dp_process_group()
-        world_size = pg.dp_world_size()
-    else:
-        world_size = gpc.get_world_size(ParallelMode.DATA)
-        group = gpc.get_group(ParallelMode.DATA)
-
+    world_size = dist.get_world_size(group=group)
     tensor_to_reduce.div_(world_size)
 
     # if rank is None, all reduce will be used
     # else, reduce is used
-    use_all_reduce = dst_rank is None
+    use_all_reduce = dst_local_rank is None
 
     if use_all_reduce:
         dist.all_reduce(tensor_to_reduce, group=group)
     else:
-        if pg is not None:
-            ranks_in_group = pg.dp_rank_list()
-        else:
-            ranks_in_group = gpc.get_ranks_in_group(ParallelMode.DATA)
-        global_rank = ranks_in_group[dst_rank]
-        dist.reduce(tensor=tensor_to_reduce, dst=global_rank, group=group)
+        dist.reduce(tensor=tensor_to_reduce, dst=dst_global_rank, group=group)
 
     # recover the original dtype
     if tensor.dtype != dtype and tensor is not tensor_to_reduce:
-        if pg is not None:
-            local_rank = pg.dp_local_rank()
-        else:
-            local_rank = gpc.get_local_rank(ParallelMode.DATA)
-        if use_all_reduce or dst_rank == local_rank:
+        local_rank = dist.get_rank(group=group)
+        if use_all_reduce or dst_local_rank == local_rank:
             tensor.copy_(tensor_to_reduce)
 
     return tensor
diff --git a/colossalai/zero/sharded_optim/bookkeeping/base_store.py b/colossalai/zero/sharded_optim/bookkeeping/base_store.py
index 3623ed1f048c..2ebd122464f4 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/base_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/base_store.py
@@ -1,19 +1,12 @@
-from typing import Optional
-
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.tensor import ProcessGroup
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
 
 
 class BaseStore:
 
-    def __init__(self, pg: Optional[ProcessGroup] = None):
-        if isinstance(pg, ProcessGroup):
-            self._world_size = pg.dp_world_size()
-            self._local_rank = pg.dp_local_rank()
-        else:
-            self._world_size = gpc.get_world_size(ParallelMode.DATA)
-            self._local_rank = gpc.get_local_rank(ParallelMode.DATA)
+    def __init__(self, torch_pg: ProcessGroup):
+        self._world_size = dist.get_world_size(group=torch_pg)
+        self._local_rank = dist.get_rank(group=torch_pg)
 
     @property
     def world_size(self):
diff --git a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
index aba61624e46e..9e0c05d8941a 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
@@ -1,14 +1,12 @@
-from typing import Optional
-
-from colossalai.tensor import ProcessGroup
+from torch.distributed import ProcessGroup
 
 from .base_store import BaseStore
 
 
 class BucketStore(BaseStore):
 
-    def __init__(self, pg: Optional[ProcessGroup] = None):
-        super().__init__(pg)
+    def __init__(self, torch_pg: ProcessGroup):
+        super().__init__(torch_pg)
         self._grads = dict()
         self._params = dict()
         self._num_elements_in_bucket = dict()
diff --git a/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py b/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
index c22186abee0f..cbf708b3471f 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
@@ -1,16 +1,15 @@
-from typing import List, Optional
+from typing import List
 
 from torch import Tensor
-
-from colossalai.tensor import ProcessGroup
+from torch.distributed import ProcessGroup
 
 from .base_store import BaseStore
 
 
 class ParameterStore(BaseStore):
 
-    def __init__(self, pg: Optional[ProcessGroup] = None):
-        super().__init__(pg)
+    def __init__(self, torch_pg: ProcessGroup):
+        super().__init__(torch_pg)
         # param partitioning data structures
         self._fp16_param_to_rank = dict()
         self._rank_groupid_to_fp16_param_list = dict()
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index e372eaa50be4..38736d01afef 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -10,7 +10,7 @@
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.tensor import ProcessGroup
+from colossalai.tensor import ColoParameter, ProcessGroup
 from colossalai.utils.cuda import get_current_device
 
 from ._utils import (
@@ -34,32 +34,21 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
     def __init__(
             self,
             optimizer: Optimizer,
-            pg: Optional[ProcessGroup] = None,
-    # grad scaler config
-            initial_scale=2**16,
-            min_scale=1,
-            growth_factor=2,
-            backoff_factor=0.5,
-            growth_interval=2000,
-            hysteresis=2,
+            initial_scale: int = 2**16,    # grad scaler config
+            min_scale: int = 1,
+            growth_factor: float = 2.,
+            backoff_factor: float = .5,
+            growth_interval: int = 2000,
+            hysteresis: int = 2,
             max_scale: int = 2**24,
-
-    # grad clipping
-            clip_grad_norm=0.0,
-            verbose=False,
-
-    # communication
-            reduce_bucket_size=1024 * 1024,
-            communication_dtype=None,
-            overlap_communication=False,
-
-    # stage 2
-            partition_grad=False,
-    # cpu offload
-            cpu_offload=False,
-
-    # forced dtype
-            forced_dtype=None):
+            clip_grad_norm: float = 0.0,    # grad clipping
+            verbose: bool = False,
+            reduce_bucket_size: int = 1024 * 1024,    # communication
+            communication_dtype: Optional[torch.dtype] = None,
+            overlap_communication: bool = False,
+            partition_grad: bool = False,    # stage 2
+            cpu_offload: bool = False,    # cpu offload
+            forced_dtype: Optional[torch.dtype] = None):
 
         # TODO: add support for
         # 1. fp16 master weights
@@ -76,16 +65,16 @@ def __init__(
 
         self._cpu_offload = cpu_offload
 
-        self._pg = pg
-        if isinstance(pg, ProcessGroup):
-            self._local_rank = pg.dp_local_rank()
-            self._world_size = pg.dp_world_size()
-            self._dp_group = pg.dp_process_group()
-            if pg.tp_world_size() > 1:
-                self._mp_group = pg.tp_process_group()
-            else:
-                self._mp_group = None
-        elif pg is None:
+        colo_pg = self._search_colo_process_group()
+        if isinstance(colo_pg, ProcessGroup):
+            self._local_rank = colo_pg.dp_local_rank()
+            self._world_size = colo_pg.dp_world_size()
+            self._dp_global_ranks = colo_pg.get_ranks_in_dp()
+            self._dp_torch_group = colo_pg.dp_process_group()
+            self._mp_torch_group = None
+            if colo_pg.tp_world_size() > 1:
+                self._mp_torch_group = colo_pg.tp_process_group()
+        elif colo_pg is None:
             dp_parallel_mode = ParallelMode.DATA
             mp_parallel_mode = ParallelMode.MODEL
 
@@ -93,14 +82,13 @@ def __init__(
             self._mp_parallel_mode = mp_parallel_mode
             self._local_rank = gpc.get_local_rank(dp_parallel_mode)
             self._world_size = gpc.get_world_size(dp_parallel_mode)
-
-            self._dp_group = gpc.get_group(dp_parallel_mode)
+            self._dp_global_ranks = gpc.get_ranks_in_group(dp_parallel_mode)
+            self._dp_torch_group = gpc.get_group(dp_parallel_mode)
+            self._mp_torch_group = None
             if gpc.is_initialized(mp_parallel_mode) and gpc.get_world_size(mp_parallel_mode) > 1:
-                self._mp_group = gpc.get_group(mp_parallel_mode)
-            else:
-                self._mp_group = None
+                self._mp_torch_group = gpc.get_group(mp_parallel_mode)
         else:
-            raise TypeError(f"pg should be None or a ProcesGroup")
+            raise NotImplementedError
         # fp16 and fp32 params for mixed precision training
         self._fp16_param_groups = dict()
         self._fp32_flat_param_groups_of_current_rank = dict()
@@ -136,14 +124,9 @@ def __init__(
 
         # ParameterStore will manage the tensor buffers used for zero
         # it will not manage the tensors used by mixed precision training
-        if self._pg is not None:
-            self._param_store = ParameterStore(self._pg)
-            self._grad_store = GradientStore(self._pg)
-            self._bucket_store = BucketStore(self._pg)
-        else:
-            self._param_store = ParameterStore(self._dp_parallel_mode)
-            self._grad_store = GradientStore(self._dp_parallel_mode)
-            self._bucket_store = BucketStore(self._dp_parallel_mode)
+        self._param_store = ParameterStore(self._dp_torch_group)
+        self._grad_store = GradientStore(self._dp_torch_group)
+        self._bucket_store = BucketStore(self._dp_torch_group)
 
         # iterate over the param group in the optimizer
         # partition these param groups for data parallel training
@@ -224,6 +207,30 @@ def loss_scale(self):
     def num_param_groups(self):
         return len(self._fp16_param_groups)
 
+    def _sanity_checks(self):
+        assert torch.cuda.is_available(), 'CUDA is required'
+        for param_group in self.optim.param_groups:
+            group_params = param_group['params']
+            for param in group_params:
+                assert param.dtype == self._dtype, \
+                    f"Parameters are expected to have the same dtype `{self._dtype}`, but got `{param.dtype}`"
+
+    def _search_colo_process_group(self):
+        colo_flag = False
+        colo_pg = None
+        for param_group in self.optim.param_groups:
+            group_params = param_group['params']
+            for param in group_params:
+                if isinstance(param, ColoParameter):
+                    colo_flag = True
+                    if colo_pg is None:
+                        colo_pg = param.get_process_group()
+                    else:
+                        assert colo_pg == param.get_process_group(), "All parameters should be in a same process group"
+                elif colo_flag:
+                    raise RuntimeError("All parameters should be ColoParameter if you use ColoParameter.")
+        return colo_pg
+
     def _partition_param_list(self, param_list):
         params_per_rank = [[] for _ in range(self._world_size)]
         numel_per_rank = [0 for _ in range(self._world_size)]
@@ -241,14 +248,6 @@ def _partition_param_list(self, param_list):
             self._logger.info(f'Number of elements on ranks: {numel_per_rank}', ranks=[0])
         return params_per_rank
 
-    def _sanity_checks(self):
-        assert torch.cuda.is_available(), 'CUDA is required'
-        for param_group in self.optim.param_groups:
-            group_params = param_group['params']
-            for param in group_params:
-                assert param.dtype == self._dtype, \
-                    f"Parameters are expected to have the same dtype `{self._dtype}`, but got `{param.dtype}`"
-
     ###########################################################
     # Backward Reduction Hook
     ###########################################################
@@ -384,10 +383,14 @@ def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
 
         with torch.cuda.stream(stream):
             flat = bucket.flatten()
+            reduce_global_rank = None
+            if reduce_rank is not None:
+                reduce_global_rank = self._dp_global_ranks[reduce_rank]
             reduced_flat = reduce_tensor_dp_group(tensor=flat,
                                                   dtype=self._communication_dtype,
-                                                  dst_rank=reduce_rank,
-                                                  pg=self._pg)
+                                                  dst_local_rank=reduce_rank,
+                                                  dst_global_rank=reduce_global_rank,
+                                                  group=self._dp_torch_group)
 
             # update the reduced tensor
             if reduce_rank is None or reduce_rank == self._local_rank:
@@ -456,8 +459,8 @@ def step(self, closure=None):
             norm_group = compute_norm(gradients=self._grad_store._averaged_gradients[group_id],
                                       params=self._param_store.get_fp16_params_by_rank_group(group_id=group_id,
                                                                                              rank=self._local_rank),
-                                      dp_group=self._dp_group,
-                                      mp_group=self._mp_group)
+                                      dp_group=self._dp_torch_group,
+                                      mp_group=self._mp_torch_group)
             norm_groups.append(norm_group)
 
             # create flat gradient for the flat fp32 params
@@ -497,7 +500,7 @@ def step(self, closure=None):
         for group_id in range(self.num_param_groups):
             for rank in range(self._world_size):
                 fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
-                handle = dist.broadcast(fp16_param, src=rank, group=self._dp_group, async_op=True)
+                handle = dist.broadcast(fp16_param, src=rank, group=self._dp_torch_group, async_op=True)
                 handles.append(handle)
 
         for handle in handles:
@@ -519,11 +522,11 @@ def _check_overflow(self):
                     break
 
         # all-reduce across dp group
-        dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._dp_group)
+        dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._dp_torch_group)
 
         # all-reduce over model parallel group
-        if self._mp_group:
-            dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._mp_group)
+        if self._mp_torch_group:
+            dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._mp_torch_group)
 
         if self._found_overflow.item() > 0:
             return True
diff --git a/tests/test_zero/low_level_zero/test_grad_acc.py b/tests/test_zero/low_level_zero/test_grad_acc.py
index a0d1ac531485..69795ed6a2e5 100644
--- a/tests/test_zero/low_level_zero/test_grad_acc.py
+++ b/tests/test_zero/low_level_zero/test_grad_acc.py
@@ -35,18 +35,15 @@ def exam_zero_1_2_grad_acc():
     # create model
     zero1_model = TestModel().cuda()
     zero2_model = copy.deepcopy(zero1_model)
-    pg = ProcessGroup()
     # create optimizer
     zero1_optimizer = torch.optim.Adam(zero1_model.parameters(), lr=1)
     zero2_optimizer = torch.optim.Adam(zero2_model.parameters(), lr=1)
     zero1_optimizer = LowLevelZeroOptimizer(zero1_optimizer,
-                                            pg=pg,
                                             overlap_communication=True,
                                             initial_scale=32,
                                             clip_grad_norm=1.0,
                                             verbose=True)
     zero2_optimizer = LowLevelZeroOptimizer(zero2_optimizer,
-                                            pg=pg,
                                             overlap_communication=True,
                                             partition_grad=True,
                                             initial_scale=32,
@@ -86,7 +83,7 @@ def fwd_bwd_func(number, cur_data):
         assert torch.equal(z1p.data, z2p.data)
 
 
-def exam_zero_1_grad_acc(use_pg=True):
+def exam_zero_1_grad_acc():
     local_rank = torch.distributed.get_rank()
     grad_scale = 32
     seed_all(2008)
@@ -105,9 +102,7 @@ def exam_zero_1_grad_acc(use_pg=True):
     # we only test stage 1 here
     # in `check_sharded_param_consistency.py`, we will test whether
     # level 1 and 2 will produce exactly the same results
-    pg = ProcessGroup() if use_pg else None    #ProcessGroup()
     zero_optimizer = LowLevelZeroOptimizer(zero_optimizer,
-                                           pg=pg,
                                            overlap_communication=False,
                                            initial_scale=grad_scale,
                                            reduce_bucket_size=262144,
@@ -158,9 +153,8 @@ def fwd_bwd_func(number, cur_data, check_flag):
 def run_dist(rank, world_size, port):
     colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
 
-    exam_zero_1_grad_acc(True)
-    exam_zero_1_grad_acc(False)
-    # exam_zero_1_2_grad_acc()
+    exam_zero_1_grad_acc()
+    exam_zero_1_2_grad_acc()
 
 
 @pytest.mark.dist
diff --git a/tests/test_zero/low_level_zero/test_zero1_2.py b/tests/test_zero/low_level_zero/test_zero1_2.py
index 6924827fe4b4..8771bfbe6049 100644
--- a/tests/test_zero/low_level_zero/test_zero1_2.py
+++ b/tests/test_zero/low_level_zero/test_zero1_2.py
@@ -9,7 +9,6 @@
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.tensor import ProcessGroup
 from colossalai.testing.random import seed_all
 from colossalai.utils import free_port
 from colossalai.zero import LowLevelZeroOptimizer
@@ -59,17 +58,14 @@ def exam_zero_1_2():
     zero1_model = TestModel().cuda()
     zero2_model = copy.deepcopy(zero1_model)
 
-    pg = ProcessGroup()
     # create optimizer
     zero1_optimizer = torch.optim.Adam(zero1_model.parameters(), lr=1)
     zero2_optimizer = torch.optim.Adam(zero2_model.parameters(), lr=1)
     zero1_optimizer = LowLevelZeroOptimizer(zero1_optimizer,
-                                            pg=pg,
                                             overlap_communication=True,
                                             initial_scale=128,
                                             verbose=True)
     zero2_optimizer = LowLevelZeroOptimizer(zero2_optimizer,
-                                            pg=pg,
                                             overlap_communication=True,
                                             partition_grad=True,
                                             initial_scale=128)
@@ -119,7 +115,7 @@ def exam_zero_1_torch_ddp():
     torch_model = copy.deepcopy(zero_model)
 
     zero_model = zero_model.cuda().half()
-    # torch_model = DDP(torch_model.cuda(), bucket_cap_mb=0)
+    torch_model = DDP(torch_model.cuda(), bucket_cap_mb=0)
     torch_model = torch_model.cuda()
 
     # for (n, p), z1p in zip(torch_model.named_parameters(), zero_model.parameters()):
@@ -131,9 +127,7 @@ def exam_zero_1_torch_ddp():
     # we only test stage 1 here
     # in `check_sharded_param_consistency.py`, we will test whether
     # level 1 and 2 will produce exactly the same results
-    pg = ProcessGroup()
     zero_optimizer = LowLevelZeroOptimizer(zero_optimizer,
-                                           pg=pg,
                                            overlap_communication=True,
                                            initial_scale=1,
                                            reduce_bucket_size=262144)

From fef5c949c35b1f1e0075a9e4abb23a5ec0f48e3c Mon Sep 17 00:00:00 2001
From: Ziyue Jiang <ziyue.jiang97@gmail.com>
Date: Fri, 13 Jan 2023 16:56:01 +0800
Subject: [PATCH 181/503] polish pp middleware (#2476)

Co-authored-by: Ziyue Jiang <ziyue.jiang@gmail.com>
---
 colossalai/pipeline/rpc/_pipeline_base.py                     | 4 ++--
 colossalai/pipeline/rpc/_pipeline_schedule.py                 | 3 ---
 .../gpt/experiments/pipeline_parallel/train_gpt_pp.py         | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/colossalai/pipeline/rpc/_pipeline_base.py b/colossalai/pipeline/rpc/_pipeline_base.py
index 4739cdaa9bd3..1edc1ac70d20 100644
--- a/colossalai/pipeline/rpc/_pipeline_base.py
+++ b/colossalai/pipeline/rpc/_pipeline_base.py
@@ -211,7 +211,7 @@ def _get_output_all(self, key: UniqueKey, ref_use=False, rank=None):
                 refcount = 0
 
             with self.output_list_condition_lock:
-                if refcount < lifecycle:
+                if refcount <= lifecycle:
                     self.output_list[key] = output_work_item
                     self.output_list_condition_lock.notify_all()
 
@@ -390,7 +390,7 @@ def _subscribe_producer(self, microbatch_id: int, forward_only: bool):
                         subscribe_forward_futures[target_index] = []
                     else:
                         subscribe_forward_futures[target_index] = producer_worker_rref.rpc_async().get_output_by_key(
-                            producer_output_key, rank=self.pp_rank)
+                            producer_output_key, rank=self.pp_rank, offsets=offsets)
 
             else:
                 for i in range(producer_num):
diff --git a/colossalai/pipeline/rpc/_pipeline_schedule.py b/colossalai/pipeline/rpc/_pipeline_schedule.py
index e6aa961f19bc..0d572231d378 100644
--- a/colossalai/pipeline/rpc/_pipeline_schedule.py
+++ b/colossalai/pipeline/rpc/_pipeline_schedule.py
@@ -29,9 +29,6 @@ def _get_work_item_key(self) -> UniqueKey:
 
         target_key = UniqueKey(target_microbatch_id, target_phase)
 
-        with self.work_list_condition_lock:
-            self.work_list_condition_lock.wait_for(lambda: target_key in self.work_list)
-
         return target_key
 
 
diff --git a/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py b/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
index 79efa61b0783..c3451c18db8f 100644
--- a/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
+++ b/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
@@ -120,7 +120,7 @@ def run_master(args):
         logger.info(f'{rank=} numel in the partition:{numel}')
 
     # build optim
-    pp_engine.initialize_optimizer(HybridAdam, lr=1e-3)
+    pp_engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)
 
     ranks_tflops = {}
     for n in range(NUM_STEPS):

From f525d1f528dc25518c931f9e1f294787cf1b59b6 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 13 Jan 2023 22:37:31 +0800
Subject: [PATCH 182/503] [example] update gpt gemini example ci test (#2477)

---
 .../language/gpt/gemini/train_gpt_demo.py     |  5 ++--
 examples/language/gpt/test_ci.sh              | 27 +++++++++----------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 7bec980f95bd..f77be12d2d05 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -65,6 +65,7 @@ def parse_args():
         default="gpt2_medium",
         help="model model scale",
     )
+    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
     args = parser.parse_args()
     return args
 
@@ -236,7 +237,7 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
 
-    NUM_STEPS = 10
+    NUM_STEPS = args.steps
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
@@ -290,14 +291,12 @@ def main():
             from torch.distributed.optim import ZeroRedundancyOptimizer
             optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
     elif args.distplan.startswith("zero"):
-        pg = ProcessGroup()
         model = model.half()
         partition_flag = (args.distplan == "zero2")
         optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
 
         optimizer = LowLevelZeroOptimizer(
             optimizer,
-            pg=pg,
             reduce_bucket_size=12 * 1024 * 1024,
             overlap_communication=True,
             partition_grad=partition_flag,
diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh
index ad0cfa325d37..d04ece182016 100644
--- a/examples/language/gpt/test_ci.sh
+++ b/examples/language/gpt/test_ci.sh
@@ -1,16 +1,15 @@
 pip install -r requirements.txt
 
-# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPAN="colossalai"
-
-# The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
-export GPUNUM=4
-export PLACEMENT='cpu'
-export USE_SHARD_INIT=False
-export BATCH_SIZE=8
-export MODEL_TYPE="gpt2_medium"
-
-
-mkdir -p logs
-torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
+# test colossalai
+for TP in 1 2; do
+    for PLACEMENT in "cpu" "cuda" "auto" "const"; do
+        for SHARD in "True" "False"; do
+            colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
+        done
+    done
+done
+
+# test zero1&2
+for DIST in "zero1" "zero2"; do
+    colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
+done

From 21c88220ce64203dab1a462e4c2894233242468d Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Sun, 15 Jan 2023 10:42:01 +0800
Subject: [PATCH 183/503] [zero] add unit test for low-level zero init (#2474)

---
 .../low_level_zero/test_zero_init.py          | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 tests/test_zero/low_level_zero/test_zero_init.py

diff --git a/tests/test_zero/low_level_zero/test_zero_init.py b/tests/test_zero/low_level_zero/test_zero_init.py
new file mode 100644
index 000000000000..84d7b8c514b6
--- /dev/null
+++ b/tests/test_zero/low_level_zero/test_zero_init.py
@@ -0,0 +1,61 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+import colossalai
+from colossalai.tensor import ProcessGroup
+from colossalai.utils import free_port, get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import LowLevelZeroOptimizer
+
+
+class TestModel(nn.Module):
+    """Two chained linear layers (128 -> 256 -> 512) serving as the model under test."""
+
+    def __init__(self):
+        super().__init__()
+        self.linear1 = nn.Linear(128, 256)
+        self.linear2 = nn.Linear(256, 512)
+
+    def forward(self, x):
+        hidden = self.linear1(x)
+        return self.linear2(hidden)
+
+
+def exam_zero_init():
+    """Assert LowLevelZeroOptimizer sets up identical ranks/groups for a plain and a ColoInitContext model."""
+    dp_2_tp_2_pg = ProcessGroup(dp_degree=2, tp_degree=2)
+    model1 = TestModel().cuda()
+    with ColoInitContext(device=get_current_device(), default_pg=dp_2_tp_2_pg):
+        model2 = TestModel()
+    optimizer1 = LowLevelZeroOptimizer(torch.optim.Adam(model1.parameters(), lr=1))
+    optimizer2 = LowLevelZeroOptimizer(torch.optim.Adam(model2.parameters(), lr=1))
+    assert optimizer1._local_rank == optimizer2._local_rank
+    assert optimizer1._world_size == optimizer2._world_size
+    assert optimizer1._dp_global_ranks == optimizer2._dp_global_ranks
+    # the model-parallel torch group must also agree in size and in this process's rank
+    mp_group1 = optimizer1._mp_torch_group
+    mp_group2 = optimizer2._mp_torch_group
+    assert dist.get_world_size(mp_group1) == dist.get_world_size(mp_group2)
+    assert dist.get_rank(mp_group1) == dist.get_rank(mp_group2)
+
+
+def run_dist(rank, world_size, port):
+    config_dict = dict(parallel=dict(data=2, tensor=dict(size=2, mode='1d')))  # data=2, tensor=2 (1d mode)
+    colossalai.launch(config=config_dict, rank=rank, world_size=world_size, port=port, host='localhost')
+    exam_zero_init()
+
+
+@pytest.mark.dist
+def test_zero_init():
+    world_size = 4  # number of processes handed to mp.spawn below
+    run_func = partial(run_dist, world_size=world_size, port=free_port())  # rank is left for mp.spawn to fill in
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_zero_init()

From 579dba572f77a28c14f610c0bff48aba31a685b8 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 16 Jan 2023 10:05:41 +0800
Subject: [PATCH 184/503] [workflow] fixed the skip condition of  example
 weekly check workflow (#2481)

---
 .github/workflows/auto_example_check.yml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml
index f88b6858e003..5e4022f7f0ea 100644
--- a/.github/workflows/auto_example_check.yml
+++ b/.github/workflows/auto_example_check.yml
@@ -81,9 +81,8 @@ jobs:
   # This is for all files' weekly check. Specifically, this job is to find all the directories.
   matrix_preparation:
     if: |
-        github.event.pull_request.draft == false &&
-        github.base_ref == 'main' &&
-        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule'
+        github.repository == 'hpcaitech/ColossalAI' &&
+        github.event_name == 'schedule'
     name: Prepare matrix for weekly check
     runs-on: ubuntu-latest
     outputs:
@@ -101,9 +100,8 @@ jobs:
 
   weekly_check:
     if: |
-        github.event.pull_request.draft == false &&
-        github.base_ref == 'main' &&
-        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule'
+        github.repository == 'hpcaitech/ColossalAI' &&
+        github.event_name == 'schedule'
     name: Weekly check all examples
     needs: matrix_preparation
     runs-on: [self-hosted, gpu]

From f78bad21ede6ce227cfe86d0ed46b8ce958667fd Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 11:34:26 +0800
Subject: [PATCH 185/503] [example] stable diffusion add roadmap

---
 examples/images/diffusion/README.md | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index abb1d24c0262..725052bdb69d 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -26,6 +26,17 @@ Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1]
 
 More details can be found in our [blog of Stable Diffusion v1](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper) and [blog of Stable Diffusion v2](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0).
 
+
+## Roadmap
+This project is in rapid development.
+
+- [X] Train a stable diffusion model v1/v2 from scatch
+- [X] finetune a pretrained Stable diffusion v1 model
+- [X] Inference a pretrained model using PyTorch
+- [ ] finetune a pretrained Stable diffusion v2 model
+- [ ] Inference a pretrained model using TensoRT
+
+
 ## Installation
 
 ### Option #1: install from source
@@ -123,7 +134,7 @@ git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
 
 ### stable-diffusion-v1-5 from runway
 
-If you want to useed the Last [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) wiegh from runwayml
+If you want to useed the Last [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) weight from runwayml
 
 ```
 git lfs install
@@ -156,7 +167,7 @@ You can change the trainging config in the yaml file
 - precision: the precision type used in training, default 16 (fp16), you must use fp16 if you want to apply colossalai
 - more information about the configuration of ColossalAIStrategy can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/advanced/model_parallel.html#colossal-ai)
 
-## Finetune Example
+## Finetune Example (Work In Progress)
 ### Training on Teyvat Datasets
 
 We provide the finetuning example on [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset, which is create by BLIP generated captions.

From 9cba38b49257a748f782655f1c6a0d3935f0f8f6 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 12:03:48 +0800
Subject: [PATCH 186/503] add dummy test_ci.sh

---
 examples/images/diffusion/README.md  | 5 ++---
 examples/images/diffusion/test_ci.sh | 0
 2 files changed, 2 insertions(+), 3 deletions(-)
 create mode 100644 examples/images/diffusion/test_ci.sh

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index 725052bdb69d..ddc7e2d97128 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -31,12 +31,11 @@ More details can be found in our [blog of Stable Diffusion v1](https://www.hpc-a
 This project is in rapid development.
 
 - [X] Train a stable diffusion model v1/v2 from scatch
-- [X] finetune a pretrained Stable diffusion v1 model
+- [X] Finetune a pretrained Stable diffusion v1 model
 - [X] Inference a pretrained model using PyTorch
-- [ ] finetune a pretrained Stable diffusion v2 model
+- [ ] Finetune a pretrained Stable diffusion v2 model
 - [ ] Inference a pretrained model using TensoRT
 
-
 ## Installation
 
 ### Option #1: install from source
diff --git a/examples/images/diffusion/test_ci.sh b/examples/images/diffusion/test_ci.sh
new file mode 100644
index 000000000000..e69de29bb2d1

From e4c38ba36728629423b26f2f382476cd9c3e65c4 Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 12:14:49 +0800
Subject: [PATCH 187/503] [example] stable diffusion add roadmap (#2482)

---
 examples/images/diffusion/README.md  | 14 ++++++++++++--
 examples/images/diffusion/test_ci.sh |  0
 2 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 examples/images/diffusion/test_ci.sh

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index abb1d24c0262..ddc7e2d97128 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -26,6 +26,16 @@ Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1]
 
 More details can be found in our [blog of Stable Diffusion v1](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper) and [blog of Stable Diffusion v2](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0).
 
+
+## Roadmap
+This project is in rapid development.
+
+- [X] Train a stable diffusion model v1/v2 from scratch
+- [X] Finetune a pretrained Stable diffusion v1 model
+- [X] Inference a pretrained model using PyTorch
+- [ ] Finetune a pretrained Stable diffusion v2 model
+- [ ] Inference a pretrained model using TensorRT
+
 ## Installation
 
 ### Option #1: install from source
@@ -123,7 +133,7 @@ git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
 
 ### stable-diffusion-v1-5 from runway
 
-If you want to useed the Last [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) wiegh from runwayml
+If you want to use the latest [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) weights from runwayml
 
 ```
 git lfs install
@@ -156,7 +166,7 @@ You can change the trainging config in the yaml file
 - precision: the precision type used in training, default 16 (fp16), you must use fp16 if you want to apply colossalai
 - more information about the configuration of ColossalAIStrategy can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/advanced/model_parallel.html#colossal-ai)
 
-## Finetune Example
+## Finetune Example (Work In Progress)
 ### Training on Teyvat Datasets
 
 We provide the finetuning example on [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset, which is create by BLIP generated captions.
diff --git a/examples/images/diffusion/test_ci.sh b/examples/images/diffusion/test_ci.sh
new file mode 100644
index 000000000000..e69de29bb2d1

From 7c317062277d6fbb082f6b9b051d8c7b30ce7cc4 Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 14:44:29 +0800
Subject: [PATCH 188/503] [CI] add test_ci.sh for palm, opt and gpt (#2475)

---
 examples/language/gpt/gemini/run_gemini.sh    |  3 +-
 examples/language/gpt/gemini/test_ci.sh       | 35 ++++++++++
 .../language/gpt/gemini/train_gpt_demo.py     | 11 +++-
 examples/language/gpt/test_ci.sh              | 17 +----
 examples/language/opt/test_ci.sh              |  4 ++
 examples/language/palm/run.sh                 |  2 +-
 examples/language/palm/test_ci.sh             |  9 +++
 examples/language/palm/train.py               | 64 +++++++++++++------
 8 files changed, 107 insertions(+), 38 deletions(-)
 create mode 100644 examples/language/gpt/gemini/test_ci.sh
 create mode 100644 examples/language/opt/test_ci.sh
 create mode 100644 examples/language/palm/test_ci.sh

diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh
index 0c2ea660f1e0..6f0710d54f01 100644
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
@@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
 export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
 export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
-
+export TRAIN_STEP=${TRAIN_STEP:-10}
 # export PYTHONPATH=$PWD:$PYTHONPATH
 
 mkdir -p gemini_logs
@@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --placement=${PLACEMENT} \
 --shardinit=${USE_SHARD_INIT} \
 --distplan=${DISTPLAN} \
+--train_step=${TRAIN_STEP} \
 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh
new file mode 100644
index 000000000000..6079d5ed615b
--- /dev/null
+++ b/examples/language/gpt/gemini/test_ci.sh
@@ -0,0 +1,35 @@
+set -x
+cd "$(dirname "$0")" || exit 1  # run from this script's directory so ./run_gemini.sh resolves
+# TRAIN_STEP is read by run_gemini.sh and forwarded to train_gpt_demo.py as --train_step
+export TRAIN_STEP=4
+for MODEL_TYPE in "gpt2_medium"; do
+  for DISTPLAN in "colossalai"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1 2; do
+          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
+            continue
+          fi
+          for PLACEMENT in "cpu" "auto"; do
+            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+            bash ./run_gemini.sh
+          done
+        done
+      done
+    done
+  done
+  # zero1/zero2: same sweep with TPDEGREE fixed to 1; PLACEMENT is not passed to run_gemini.sh here
+  for DISTPLAN in "zero1" "zero2"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1; do
+          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
+            continue
+          fi
+          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
+          bash ./run_gemini.sh
+        done
+      done
+    done
+  done
+done
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index f77be12d2d05..713de6f9fb45 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -65,7 +65,13 @@ def parse_args():
         default="gpt2_medium",
         help="model model scale",
     )
-    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
+    parser.add_argument(
+        "--train_step",
+        type=int,
+        default=10,
+        help="training iterations for test",
+    )
+
     args = parser.parse_args()
     return args
 
@@ -237,7 +243,8 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
 
-    NUM_STEPS = args.steps
+    NUM_STEPS = args.train_step
+
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh
index d04ece182016..d67c17229e71 100644
--- a/examples/language/gpt/test_ci.sh
+++ b/examples/language/gpt/test_ci.sh
@@ -1,15 +1,2 @@
-pip install -r requirements.txt
-
-# test colossalai
-for TP in 1 2; do
-    for PLACEMENT in "cpu" "cuda" "auto" "const"; do
-        for SHARD in "True" "False"; do
-            colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
-        done
-    done
-done
-
-# test zero1&2
-for DIST in "zero1" "zero2"; do
-    colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
-done
+set -x
+cd gemini && bash test_ci.sh
diff --git a/examples/language/opt/test_ci.sh b/examples/language/opt/test_ci.sh
new file mode 100644
index 000000000000..317f602cda3c
--- /dev/null
+++ b/examples/language/opt/test_ci.sh
@@ -0,0 +1,4 @@
+# Run ./run_gemini.sh with MODEL=125m and BS=2, first with GPUNUM=2 and then with GPUNUM=1.
+for GPUNUM in 2 1; do
+  env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh
+done
diff --git a/examples/language/palm/run.sh b/examples/language/palm/run.sh
index 4aa868953f7b..7a533509e009 100644
--- a/examples/language/palm/run.sh
+++ b/examples/language/palm/run.sh
@@ -8,4 +8,4 @@ export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
 export BATCH_SIZE=4
 
-env OMP_NUM_THREADS=12 torchrun  --standalone --nproc_per_node=${GPUNUM}  --master_port 29501  train_new.py  --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
\ No newline at end of file
+env OMP_NUM_THREADS=12 torchrun  --standalone --nproc_per_node=${GPUNUM}  --master_port 29501  train.py  --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh
new file mode 100644
index 000000000000..f21095578077
--- /dev/null
+++ b/examples/language/palm/test_ci.sh
@@ -0,0 +1,9 @@
+# Run from this script's directory so that train.py resolves.
+cd "$(dirname "$0")" || exit 1
+
+# Launch train.py on random data (--dummy_data) with 1 and then 4 processes per node.
+for BATCH_SIZE in 2; do
+  for GPUNUM in 1 4; do
+    env OMP_NUM_THREADS=12 torchrun  --standalone --nproc_per_node=${GPUNUM}  --master_port 29501  train.py --dummy_data=True --batch_size=${BATCH_SIZE}  2>&1 | tee run.log
+  done
+done
diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py
index 6725c07dfac7..a334ea9511fb 100644
--- a/examples/language/palm/train.py
+++ b/examples/language/palm/train.py
@@ -1,11 +1,12 @@
 import gzip
 import random
-from time import time
 from functools import partial
+from time import time
+
 import numpy as np
 import torch
-import torch.optim as optim
 import torch.nn as nn
+import torch.optim as optim
 import tqdm
 from packaging import version
 from palm_pytorch import PaLM
@@ -23,7 +24,7 @@
 
 # constants
 
-NUM_BATCHES = int(100)
+NUM_BATCHES = int(10)
 WARMUP_BATCHES = 1
 GRADIENT_ACCUMULATE_EVERY = 1
 LEARNING_RATE = 2e-4
@@ -66,9 +67,16 @@ def parse_args():
         default=8,
         help="batch size per DP group of training.",
     )
+    parser.add_argument(
+        "--dummy_data",  # parsed by hand: type=bool would turn the string "False" into True
+        type=lambda text: str(text).strip().lower() in ("1", "true", "yes", "y"),
+        default=False,
+        help="use dummy dataset.",
+    )
     args = parser.parse_args()
     return args
 
+
 # helpers
 def cycle(loader):
     while True:
@@ -79,12 +87,15 @@ def cycle(loader):
 def decode_token(token):
     return str(chr(max(32, token)))
 
+
 def get_tflops(model_numel, batch_size, seq_len, step_time):
     return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
 
+
 def decode_tokens(tokens):
     return "".join(list(map(decode_token, tokens)))
 
+
 def get_model_size(model: nn.Module):
     total_numel = 0
     for module in model.modules():
@@ -92,6 +103,7 @@ def get_model_size(model: nn.Module):
             total_numel += p.numel()
     return total_numel
 
+
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
@@ -115,6 +127,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
         raise NotImplemented(f"CAI version {cai_version} is not supported")
     return model
 
+
 ## Parameter Sharding Strategies for Tensor Parallelism
 def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
     spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
@@ -128,6 +141,7 @@ def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
 def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
     split_param_single_dim_tp1d(-1, param, pg)
 
+
 # Tensor Parallel
 def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
     """tensor_parallelize
@@ -159,15 +173,28 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 
 args = parse_args()
 if args.distplan not in ["colossalai", "pytorch"]:
-        raise TypeError(f"{args.distplan} is error")
+    raise TypeError(f"{args.distplan} is error")
 disable_existing_loggers()
 colossalai.launch_from_torch(config={})
 logger = get_dist_logger()
 
-with gzip.open("./data/enwik8.gz") as file:
-    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
-    trX, vaX = np.split(X, [int(90e6)])
-    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+
+def generate_dataset(dummy_data: bool = False):
+    """Return (data_train, data_val): random ids if ``dummy_data``, else a 90e6/5e6-byte split of enwik8."""
+    if dummy_data:
+        # random ids in [0, 100) with the same lengths as the enwik8 split below
+        return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,))
+    with gzip.open("./data/enwik8.gz") as file:
+        # frombuffer replaces the deprecated binary-mode np.fromstring; copy() keeps the array writable
+        X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()
+    trX, vaX = np.split(X, [int(90e6)])
+    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+    return data_train, data_val
+
+
+data_train, data_val = generate_dataset(args.dummy_data)
+
+print("generate dataset ready!")
 
 
 class TextSamplerDataset(Dataset):
@@ -216,7 +243,7 @@ def __len__(self):
     model.cuda()
     optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
 
- # model is shared after TP
+# model is shared after TP
 numel = get_model_size(model)
 get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
 
@@ -251,7 +278,7 @@ def __len__(self):
         )
         if i >= WARMUP_BATCHES:
             tflops_list.append(step_tflops)
-    
+
     else:
         for __ in range(GRADIENT_ACCUMULATE_EVERY):
             loss = model(next(train_loader))
@@ -261,18 +288,17 @@ def __len__(self):
         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
         optim.step()
         optim.zero_grad()
-    
+
 tflops_list.sort()
 median_index = ((NUM_BATCHES - WARMUP_BATCHES) >> 1) + WARMUP_BATCHES
 logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
 
-
-    # TODO
-    # if i % VALIDATE_EVERY == 0:
-    #     model.eval()
-    #     with torch.no_grad():
-    #         loss = model(next(val_loader))
-    #         print(f"validation loss: {loss.item()}")
+# TODO
+# if i % VALIDATE_EVERY == 0:
+#     model.eval()
+#     with torch.no_grad():
+#         loss = model(next(val_loader))
+#         print(f"validation loss: {loss.item()}")
 
     # if i % GENERATE_EVERY == 0:
     #     model.eval()
@@ -282,4 +308,4 @@ def __len__(self):
 
     #     sample = model.generate(inp[None, ...], GENERATE_LENGTH)
     #     output_str = decode_tokens(sample[0])
-    #     print(output_str)
\ No newline at end of file
+    #     print(output_str)

From e64a05b38b37436f9fa6872373260b455b3e8645 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 14:45:06 +0800
Subject: [PATCH 189/503] polish code

---
 examples/language/palm/train.py | 38 +++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py
index 6725c07dfac7..b17496954f06 100644
--- a/examples/language/palm/train.py
+++ b/examples/language/palm/train.py
@@ -1,22 +1,22 @@
 import gzip
 import random
-from time import time
 from functools import partial
+from time import time
+
 import numpy as np
 import torch
-import torch.optim as optim
 import torch.nn as nn
+import torch.optim as optim
 import tqdm
 from packaging import version
 from palm_pytorch import PaLM
 from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper
-from torch.nn import functional as F
 from torch.utils.data import DataLoader, Dataset
 
 import colossalai
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
-from colossalai.nn.parallel import GeminiDDP, ZeroDDP
+from colossalai.nn.parallel import ZeroDDP
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import MultiTimer, get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
@@ -69,6 +69,7 @@ def parse_args():
     args = parser.parse_args()
     return args
 
+
 # helpers
 def cycle(loader):
     while True:
@@ -79,12 +80,15 @@ def cycle(loader):
 def decode_token(token):
     return str(chr(max(32, token)))
 
+
 def get_tflops(model_numel, batch_size, seq_len, step_time):
     return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
 
+
 def decode_tokens(tokens):
     return "".join(list(map(decode_token, tokens)))
 
+
 def get_model_size(model: nn.Module):
     total_numel = 0
     for module in model.modules():
@@ -92,6 +96,7 @@ def get_model_size(model: nn.Module):
             total_numel += p.numel()
     return total_numel
 
+
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
@@ -115,6 +120,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
         raise NotImplemented(f"CAI version {cai_version} is not supported")
     return model
 
+
 ## Parameter Sharding Strategies for Tensor Parallelism
 def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
     spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
@@ -128,6 +134,7 @@ def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
 def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
     split_param_single_dim_tp1d(-1, param, pg)
 
+
 # Tensor Parallel
 def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
     """tensor_parallelize
@@ -159,7 +166,7 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 
 args = parse_args()
 if args.distplan not in ["colossalai", "pytorch"]:
-        raise TypeError(f"{args.distplan} is error")
+    raise TypeError(f"{args.distplan} is error")
 disable_existing_loggers()
 colossalai.launch_from_torch(config={})
 logger = get_dist_logger()
@@ -216,7 +223,7 @@ def __len__(self):
     model.cuda()
     optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
 
- # model is shared after TP
+# model is shared after TP
 numel = get_model_size(model)
 get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
 
@@ -251,7 +258,7 @@ def __len__(self):
         )
         if i >= WARMUP_BATCHES:
             tflops_list.append(step_tflops)
-    
+
     else:
         for __ in range(GRADIENT_ACCUMULATE_EVERY):
             loss = model(next(train_loader))
@@ -261,18 +268,17 @@ def __len__(self):
         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
         optim.step()
         optim.zero_grad()
-    
+
 tflops_list.sort()
 median_index = ((NUM_BATCHES - WARMUP_BATCHES) >> 1) + WARMUP_BATCHES
 logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
 
-
-    # TODO
-    # if i % VALIDATE_EVERY == 0:
-    #     model.eval()
-    #     with torch.no_grad():
-    #         loss = model(next(val_loader))
-    #         print(f"validation loss: {loss.item()}")
+# TODO
+# if i % VALIDATE_EVERY == 0:
+#     model.eval()
+#     with torch.no_grad():
+#         loss = model(next(val_loader))
+#         print(f"validation loss: {loss.item()}")
 
     # if i % GENERATE_EVERY == 0:
     #     model.eval()
@@ -282,4 +288,4 @@ def __len__(self):
 
     #     sample = model.generate(inp[None, ...], GENERATE_LENGTH)
     #     output_str = decode_tokens(sample[0])
-    #     print(output_str)
\ No newline at end of file
+    #     print(output_str)

From 37baea20cb6e2cd35f0364bb6604950706c83ce4 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 14:59:25 +0800
Subject: [PATCH 190/503] [example] titans for gpt

---
 examples/language/gpt/titans/LICENSE          | 201 ++++++
 examples/language/gpt/titans/README.md        |  48 ++
 .../titans/configs/gpt2_small_zero3_pp1d.py   |  31 +
 .../gpt/titans/configs/gpt3_zero3_pp1d.py     |  31 +
 .../language/gpt/titans/model/__init__.py     |   3 +
 examples/language/gpt/titans/model/embed.py   | 599 ++++++++++++++++++
 examples/language/gpt/titans/model/gpt1d.py   | 349 ++++++++++
 .../gpt/titans/model/pipeline_gpt1d.py        | 322 ++++++++++
 examples/language/gpt/titans/requirements.txt |   4 +
 examples/language/gpt/titans/run.sh           |   2 +
 examples/language/gpt/titans/test_ci.sh       |   1 +
 examples/language/gpt/titans/train_gpt.py     | 148 +++++
 12 files changed, 1739 insertions(+)
 create mode 100644 examples/language/gpt/titans/LICENSE
 create mode 100644 examples/language/gpt/titans/README.md
 create mode 100644 examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
 create mode 100644 examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py
 create mode 100644 examples/language/gpt/titans/model/__init__.py
 create mode 100644 examples/language/gpt/titans/model/embed.py
 create mode 100644 examples/language/gpt/titans/model/gpt1d.py
 create mode 100644 examples/language/gpt/titans/model/pipeline_gpt1d.py
 create mode 100644 examples/language/gpt/titans/requirements.txt
 create mode 100644 examples/language/gpt/titans/run.sh
 create mode 100644 examples/language/gpt/titans/test_ci.sh
 create mode 100644 examples/language/gpt/titans/train_gpt.py

diff --git a/examples/language/gpt/titans/LICENSE b/examples/language/gpt/titans/LICENSE
new file mode 100644
index 000000000000..261eeb9e9f8b
--- /dev/null
+++ b/examples/language/gpt/titans/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/examples/language/gpt/titans/README.md b/examples/language/gpt/titans/README.md
new file mode 100644
index 000000000000..14c07442b82a
--- /dev/null
+++ b/examples/language/gpt/titans/README.md
@@ -0,0 +1,48 @@
+# Run GPT With Colossal-AI
+
+## How to Prepare Webtext Dataset
+
+You can download the preprocessed sample dataset for this demo via our [Google Drive sharing link](https://drive.google.com/file/d/1QKI6k-e2gJ7XgS8yIpgPPiMmwiBP_BPE/view?usp=sharing).
+
+
+You can also skip dataset preparation by passing `--use_dummy_data` when launching training.
+
+## Run this Demo
+
+Use the following commands to install prerequisites.
+
+```bash
+# assuming using cuda 11.3
+conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch
+pip install colossalai==0.1.9+torch1.11cu11.3 -f https://release.colossalai.org
+```
+
+Use the following commands to execute training.
+
+```Bash
+#!/usr/bin/env sh
+export DATA=/path/to/small-gpt-dataset.json
+
+# run on a single node
+colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch
+
+# run on multiple nodes with the colossalai launcher
+colossalai run --nproc_per_node=<num_gpus> \
+   --master_addr <hostname> \
+   --master_port <port-number> \
+   --hosts <list-of-hostname-separated-by-comma> \
+   train_gpt.py \
+   --config configs/<config_file> \
+   --from_torch \
+   --use_dummy_data
+
+# run on multiple nodes with slurm
+srun python \
+   train_gpt.py \
+   --config configs/<config_file> \
+   --host <master_node> \
+   --use_dummy_data
+
+```
+
+You can set `<config_file>` to any file in the `configs` folder. To simply get it running, you can start with `gpt2_small_zero3_pp1d.py` on a single node first. The comments in each config file explain how to change the parallel settings.
diff --git a/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
new file mode 100644
index 000000000000..8ef81cb0a14f
--- /dev/null
+++ b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
@@ -0,0 +1,31 @@
+from model import GPT2_small_pipeline_hybrid
+
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.zero.shard_utils import TensorShardStrategy
+
+BATCH_SIZE = 8
+NUM_EPOCHS = 10
+SEQ_LEN = 1024
+NUM_MICRO_BATCHES = 4
+HIDDEN_SIZE = 768
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE)
+
+# if you do not want zero, just comment out this dictionary
+zero = dict(model_config=dict(tensor_placement_policy='cuda', shard_strategy=TensorShardStrategy()),
+            optimizer_config=dict(initial_scale=2**16))
+
+optimizer = dict(
+    type=HybridAdam,
+    lr=0.00015,
+    weight_decay=1e-2,
+)
+
+model = dict(type=GPT2_small_pipeline_hybrid, checkpoint=True, num_chunks=1)
+
+# pipeline parallel: modify integer value for the number of pipeline stages
+# tensor parallel: modify size to set the tensor parallel size, usually the number of GPUs per node
+# for the current model implementation, mode can only be 1D or None
+parallel = dict(
+    pipeline=1,
+    tensor=dict(size=2, mode='1d'),
+)
diff --git a/examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py b/examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py
new file mode 100644
index 000000000000..9f9816b3004f
--- /dev/null
+++ b/examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py
@@ -0,0 +1,31 @@
+from model import GPT3_pipeline_hybrid
+
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.zero.shard_utils import TensorShardStrategy
+
+BATCH_SIZE = 192
+NUM_EPOCHS = 60
+SEQ_LEN = 2048
+NUM_MICRO_BATCHES = 192
+HIDDEN_SIZE = 12288
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE)
+
+# if you do not want zero, just comment out this dictionary
+zero = dict(model_config=dict(tensor_placement_policy='cuda', shard_strategy=TensorShardStrategy()),
+            optimizer_config=dict(initial_scale=2**16))
+
+optimizer = dict(
+    type=HybridAdam,
+    lr=0.00015,
+    weight_decay=1e-2,
+)
+
+model = dict(type=GPT3_pipeline_hybrid, checkpoint=True, num_chunks=1)
+
+# pipeline parallel: modify integer value for the number of pipeline stages
+# tensor parallel: modify size to set the tensor parallel size, usually the number of GPUs per node
+# for the current model implementation, mode can only be 1D or None
+parallel = dict(
+    pipeline=1,
+    tensor=dict(size=2, mode='1d'),    # for the current model implementation, mode can only be 1D or None
+)
diff --git a/examples/language/gpt/titans/model/__init__.py b/examples/language/gpt/titans/model/__init__.py
new file mode 100644
index 000000000000..eec48ef893fb
--- /dev/null
+++ b/examples/language/gpt/titans/model/__init__.py
@@ -0,0 +1,3 @@
+from .embed import vocab_parallel_cross_entropy
+from .gpt1d import *
+from .pipeline_gpt1d import *
diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py
new file mode 100644
index 000000000000..6369b9f8c5a1
--- /dev/null
+++ b/examples/language/gpt/titans/model/embed.py
@@ -0,0 +1,599 @@
+import torch
+import torch.nn.init as init
+from torch import Tensor
+from torch import distributed as dist
+from torch import nn as nn
+from torch.nn import functional as F
+from torch.nn.parameter import Parameter
+
+from colossalai.context import ParallelMode, seed
+from colossalai.core import global_context as gpc
+from colossalai.nn.layer.base_layer import ParallelLayer
+from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
+from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
+from colossalai.nn.layer.utils import divide
+from colossalai.registry import LAYERS, LOSSES, MODELS
+from colossalai.utils import get_current_device
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Language model embeddings.
+
+    Arguments:
+        hidden_size: hidden size
+        vocab_size: vocabulary size
+        max_sequence_length: maximum size of sequence. This
+                             is used for positional embedding
+        embedding_dropout_prob: dropout probability for embeddings
+        dtype: dtype passed to the word/position/token-type embedding tables
+        num_tokentypes: size of the token-type embeddings. 0 value
+                        will ignore this embedding
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 vocab_size,
+                 max_sequence_length,
+                 embedding_dropout_prob,
+                 num_tokentypes=0,
+                 dtype=torch.float):
+        super(VocabParallelEmbedding, self).__init__()
+
+        self.hidden_size = hidden_size
+        self.num_tokentypes = num_tokentypes
+
+        # Word embeddings (parallel).
+        self.word_embeddings = VocabParallelEmbedding1D(vocab_size, self.hidden_size, dtype=dtype)
+        self._word_embeddings_key = 'word_embeddings'
+
+        # Position embedding (serial).
+        self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size, dtype=dtype)
+        self._position_embeddings_key = 'position_embeddings'
+        # Initialize the position embeddings.
+        # self.init_method(self.position_embeddings.weight)
+
+        # Token type embedding.
+        # Add this as an optional field that can be added through
+        # method call so we can load a pretrain model without
+        # token types and add them as needed.
+        self._tokentype_embeddings_key = 'tokentype_embeddings'
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size, dtype=dtype)
+            # Initialize the token-type embeddings.
+            # self.init_method(self.tokentype_embeddings.weight)
+        else:
+            self.tokentype_embeddings = None
+
+        # Embeddings dropout
+        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+
+    def zero_parameters(self):
+        """Zero out all parameters in embedding."""
+        self.word_embeddings.weight.data.fill_(0)
+        self.word_embeddings.weight.shared = True
+        self.position_embeddings.weight.data.fill_(0)
+        self.position_embeddings.weight.shared = True
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings.weight.data.fill_(0)
+            self.tokentype_embeddings.weight.shared = True
+
+    def add_tokentype_embeddings(self, num_tokentypes):
+        """Add token-type embedding. This function is provided so we can add
+        token-type embeddings in case the pretrained model does not have it.
+        This allows us to load the model normally and then add this embedding.
+        """
+        if self.tokentype_embeddings is not None:
+            raise Exception('tokentype embeddings is already initialized')
+        if torch.distributed.get_rank() == 0:
+            print('adding embedding for {} tokentypes'.format(num_tokentypes), flush=True)
+        self.num_tokentypes = num_tokentypes
+        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, self.hidden_size)
+        # Initialize the token-type embeddings.
+        # self.init_method(self.tokentype_embeddings.weight)
+
+    def forward(self, input_ids, position_ids=None, tokentype_ids=None):
+        # Embeddings. NOTE(review): input_ids=None leaves input_shape unbound; tokentype_ids is not used below.
+        if input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        words_embeddings = self.word_embeddings(input_ids)
+
+        if position_ids is not None:
+            position_ids = position_ids.view(-1, input_shape[-1])
+        if position_ids is None:
+            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+        position_embeddings = self.position_embeddings(position_ids)
+
+        embeddings = words_embeddings + position_embeddings
+
+        # Dropout.
+        with seed(ParallelMode.TENSOR):
+            embeddings = self.embedding_dropout(embeddings)
+        return embeddings
+
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False):
+        """Return the sub-module state dicts keyed by the names that load_state_dict looks up."""
+
+        state_dict_ = {}
+        state_dict_[self._word_embeddings_key] \
+            = self.word_embeddings.state_dict(destination, prefix, keep_vars)
+        state_dict_[self._position_embeddings_key] \
+            = self.position_embeddings.state_dict(
+                destination, prefix, keep_vars)
+        if self.num_tokentypes > 0:
+            state_dict_[self._tokentype_embeddings_key] \
+                = self.tokentype_embeddings.state_dict(
+                    destination, prefix, keep_vars)
+
+        return state_dict_
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Load from nested per-embedding dicts, or from flat keys that contain the embedding names."""
+
+        # Word embedding.
+        if self._word_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._word_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'word_embeddings' in key:
+                    state_dict_[key.split('word_embeddings.')[1]] \
+                        = state_dict[key]
+        self.word_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Position embedding.
+        if self._position_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._position_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'position_embeddings' in key:
+                    state_dict_[key.split('position_embeddings.')[1]] \
+                        = state_dict[key]
+        self.position_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Tokentype embedding.
+        if self.num_tokentypes > 0:
+            state_dict_ = {}
+            if self._tokentype_embeddings_key in state_dict:
+                state_dict_ = state_dict[self._tokentype_embeddings_key]
+            else:
+                # for backward compatibility.
+                for key in state_dict.keys():
+                    if 'tokentype_embeddings' in key:
+                        state_dict_[key.split('tokentype_embeddings.')[1]] \
+                            = state_dict[key]
+            if len(state_dict_.keys()) > 0:
+                self.tokentype_embeddings.load_state_dict(state_dict_, strict=strict)
+            else:
+                print('***WARNING*** expected tokentype embeddings in the '
+                      'checkpoint but could not find it',
+                      flush=True)
+
+
+class VocabParallelEmbedding1D(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding with the default options kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        dtype: dtype of the locally held weight shard.
+        init_method: accepted but unused; weights are drawn from U(-1, 1).
+    """
+
+    def __init__(self, num_embeddings, embedding_dim, dtype=None, init_method=None):
+        super(VocabParallelEmbedding1D, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set the details for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        self.tensor_model_parallel_size = gpc.tensor_parallel_size
+        # Divide the weight matrix along the vocabulary dimension.
+        self.vocab_start_index, self.vocab_end_index = \
+            VocabUtility.vocab_range_from_global_vocab_size(
+                self.num_embeddings, gpc.get_local_rank(ParallelMode.PARALLEL_1D),
+                self.tensor_model_parallel_size)
+        self.num_embeddings_per_partition = self.vocab_end_index - \
+            self.vocab_start_index
+
+        # Allocate weights and initialize.
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        self.weight = Parameter(torch.empty(self.num_embeddings_per_partition, self.embedding_dim, **factory_kwargs))
+        init.uniform_(self.weight, -1, 1)
+
+    def forward(self, input_):
+        if self.tensor_model_parallel_size > 1:
+            # Build the mask of ids that fall outside this rank's vocab range.
+            input_mask = (input_ < self.vocab_start_index) | \
+                         (input_ >= self.vocab_end_index)
+            # Shift ids to local row indices; out-of-range ids are pointed at row 0.
+            masked_input = input_.clone() - self.vocab_start_index
+            masked_input[input_mask] = 0
+        else:
+            masked_input = input_
+        # Get the embeddings.
+        output_parallel = F.embedding(masked_input, self.weight, self.padding_idx, self.max_norm, self.norm_type,
+                                      self.scale_grad_by_freq, self.sparse)
+        # Zero the rows looked up for out-of-range ids so they add nothing to the reduction.
+        if self.tensor_model_parallel_size > 1:
+            output_parallel[input_mask, :] = 0.0
+        # Reduce across all the model parallel GPUs.
+        output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
+        return output
+
+
+@LOSSES.register_module
+class vocab_parallel_cross_entropy(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, vocab_parallel_logits, target):
+        """Shift logits and targets by one position, flatten them, and apply _VocabParallelCrossEntropy."""
+        vocab_parallel_logits = vocab_parallel_logits[..., :-1, :].contiguous()
+        target = target[..., 1:].contiguous()
+        return _VocabParallelCrossEntropy.apply(vocab_parallel_logits.view(-1, vocab_parallel_logits.size(-1)),
+                                                target.view(-1))
+
+
+class _VocabParallelCrossEntropy(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, vocab_parallel_logits, target):
+        """Return the mean cross entropy over all rows; `vocab_parallel_logits` is overwritten in place."""
+        # Maximum value along vocab dimension across all GPUs.
+        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
+        torch.distributed.all_reduce(logits_max,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=gpc.get_group(ParallelMode.PARALLEL_1D))
+        # Subtract the maximum value (in place).
+        vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
+
+        # Get the partition's vocab indices
+        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
+        partition_vocab_size = vocab_parallel_logits.size()[-1]
+        rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
+        world_size = gpc.tensor_parallel_size
+        vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size)
+
+        # Create a mask of valid vocab ids (1 means it needs to be masked).
+        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
+        masked_target = target.clone() - vocab_start_index
+        masked_target[target_mask] = 0
+
+        # Get predicted-logits = logits[target].
+        # For Simplicity, we convert logits to a 2-D tensor with size
+        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
+        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
+        masked_target_1d = masked_target.view(-1)
+        arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device)
+        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
+        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
+        predicted_logits = predicted_logits_1d.view_as(target)
+        predicted_logits[target_mask] = 0.0
+        # All reduce is needed to get the chunks from other GPUs.
+        torch.distributed.all_reduce(predicted_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=gpc.get_group(ParallelMode.PARALLEL_1D))
+
+        # Sum of exponential of logits along vocab dimension across all GPUs.
+        exp_logits = vocab_parallel_logits
+        torch.exp(vocab_parallel_logits, out=exp_logits)
+        sum_exp_logits = exp_logits.sum(dim=-1)
+        torch.distributed.all_reduce(sum_exp_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=gpc.get_group(ParallelMode.PARALLEL_1D))
+
+        # Loss = log(sum(exp(logits))) - predicted-logit, averaged over all rows.
+        loss = torch.log(sum_exp_logits) - predicted_logits
+        loss = loss.mean()
+        # Store softmax, target-mask and masked-target for backward pass.
+        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return d(mean loss)/d(logits) for the local vocab shard; `target` receives no gradient."""
+        # Retrieve tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+
+        # All the inputs have softmax as their gradient (the saved tensor is reused in place).
+        grad_input = softmax
+        # For simplicity, work with the 2D gradient.
+        partition_vocab_size = softmax.size()[-1]
+        grad_2d = grad_input.view(-1, partition_vocab_size)
+
+        # Add the gradient from matching classes.
+        arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
+        grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float())
+
+        # Scale by the upstream gradient; divide by the row count because forward returned loss.mean().
+        grad_input.mul_(grad_output.unsqueeze(dim=-1) / grad_2d.size()[0])
+
+        return grad_input, None
+
+
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks and return the
+        first and last index of the vocabulary belonging to the `rank`
+        partition. Note that the indices form the half-open range [first, last)."""
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size)
+
+
+class VocabParallelGPTLMHead1D(ParallelLayer):
+    """
+    Language model head that uses `embed` as its weight holder if given, else builds a VocabParallelEmbedding1D.
+    """
+
+    def __init__(self, embed=None, vocab_size=None, dtype=None, embed_dim=None):
+        super().__init__()
+        if embed is not None:
+            self.head = embed
+        else:
+            self.head = VocabParallelEmbedding1D(vocab_size, embed_dim, dtype=dtype)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = reduce_grad(x, ParallelMode.PARALLEL_1D)
+        x = F.linear(x, self.head.weight)
+        return x
+
+
+###################################
+
+
+class HiddenParallelEmbedding(torch.nn.Module):
+    """Language model embeddings.
+
+    Arguments:
+        hidden_size: hidden size
+        vocab_size: vocabulary size
+        max_sequence_length: maximum size of sequence. This
+                             is used for positional embedding
+        embedding_dropout_prob: dropout probability for embeddings
+        dtype, padding_idx: forwarded to the word embedding layer
+        num_tokentypes: size of the token-type embeddings. 0 value
+                        will ignore this embedding
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        vocab_size,
+        max_sequence_length,
+        embedding_dropout_prob,
+        dtype=torch.float,
+        padding_idx: int = 0,
+        num_tokentypes=0,
+    ):
+        super(HiddenParallelEmbedding, self).__init__()
+
+        self.hidden_size = hidden_size
+        self.num_tokentypes = num_tokentypes
+
+        # Word embeddings (parallel).
+        self.word_embeddings = HiddenParallelEmbedding1D(vocab_size, hidden_size, dtype, padding_idx)
+        self._word_embeddings_key = 'word_embeddings'
+
+        # Position embedding (serial).
+        self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size)
+        self._position_embeddings_key = 'position_embeddings'
+        # Initialize the position embeddings.
+        # self.init_method(self.position_embeddings.weight)
+
+        # Token type embedding.
+        # Add this as an optional field that can be added through
+        # method call so we can load a pretrain model without
+        # token types and add them as needed.
+        self._tokentype_embeddings_key = 'tokentype_embeddings'
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size)
+            # Initialize the token-type embeddings.
+            # self.init_method(self.tokentype_embeddings.weight)
+        else:
+            self.tokentype_embeddings = None
+
+        # Embeddings dropout
+        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+
+    def zero_parameters(self):
+        """Zero out all embedding weights and mark them as shared."""
+        self.word_embeddings.weight.data.fill_(0)
+        self.word_embeddings.weight.shared = True
+        self.position_embeddings.weight.data.fill_(0)
+        self.position_embeddings.weight.shared = True
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings.weight.data.fill_(0)
+            self.tokentype_embeddings.weight.shared = True
+
+    def add_tokentype_embeddings(self, num_tokentypes):
+        """Add token-type embedding. This function is provided so we can add
+        token-type embeddings in case the pretrained model does not have it.
+        This allows us to load the model normally and then add this embedding.
+        """
+        if self.tokentype_embeddings is not None:
+            raise Exception('tokentype embeddings is already initialized')
+        if torch.distributed.get_rank() == 0:
+            print('adding embedding for {} tokentypes'.format(num_tokentypes), flush=True)
+        self.num_tokentypes = num_tokentypes
+        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, self.hidden_size)
+        # Initialize the token-type embeddings.
+        # self.init_method(self.tokentype_embeddings.weight)
+
+    def forward(self, input_ids, position_ids=None, tokentype_ids=None):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        words_embeddings = self.word_embeddings(input_ids)
+
+        if position_ids is not None:
+            position_ids = position_ids.view(-1, input_shape[-1])
+        if position_ids is None:
+            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+        position_embeddings = self.position_embeddings(position_ids)
+
+        embeddings = words_embeddings + position_embeddings
+
+        # Dropout.
+        with seed(ParallelMode.TENSOR):
+            embeddings = self.embedding_dropout(embeddings)
+        return embeddings
+
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False):
+        """Return the state dicts of the sub-embeddings, keyed per module, for easy loading."""
+
+        state_dict_ = {}
+        state_dict_[self._word_embeddings_key] \
+            = self.word_embeddings.state_dict(destination, prefix, keep_vars)
+        state_dict_[self._position_embeddings_key] \
+            = self.position_embeddings.state_dict(
+                destination, prefix, keep_vars)
+        if self.num_tokentypes > 0:
+            state_dict_[self._tokentype_embeddings_key] \
+                = self.tokentype_embeddings.state_dict(
+                    destination, prefix, keep_vars)
+
+        return state_dict_
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load: accepts per-module keys or, for backward compatibility, flat parameter keys."""
+
+        # Word embedding.
+        if self._word_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._word_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'word_embeddings' in key:
+                    state_dict_[key.split('word_embeddings.')[1]] \
+                        = state_dict[key]
+        self.word_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Position embedding.
+        if self._position_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._position_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'position_embeddings' in key:
+                    state_dict_[key.split('position_embeddings.')[1]] \
+                        = state_dict[key]
+        self.position_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Tokentype embedding.
+        if self.num_tokentypes > 0:
+            state_dict_ = {}
+            if self._tokentype_embeddings_key in state_dict:
+                state_dict_ = state_dict[self._tokentype_embeddings_key]
+            else:
+                # for backward compatibility.
+                for key in state_dict.keys():
+                    if 'tokentype_embeddings' in key:
+                        state_dict_[key.split('tokentype_embeddings.')[1]] \
+                            = state_dict[key]
+            if len(state_dict_.keys()) > 0:
+                self.tokentype_embeddings.load_state_dict(state_dict_, strict=strict)
+            else:
+                print('***WARNING*** expected tokentype embeddings in the '
+                      'checkpoint but could not find it',
+                      flush=True)
+
+
+class HiddenParallelEmbedding1D(torch.nn.Module):
+    """Embedding parallelized in the hidden (embedding) dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: currently unused; weights are drawn from uniform(-1, 1).
+    """
+
+    def __init__(self, num_embeddings, embedding_dim, dtype=torch.float, padding_idx: int = None, init_method=None):
+        super(HiddenParallelEmbedding1D, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        embed_dim_per_partition = divide(embedding_dim, gpc.tensor_parallel_size)
+        # Set the details for compatibility.
+        self.padding_idx = padding_idx
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+
+        # Allocate weights and initialize.
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        self.weight = Parameter(torch.empty(num_embeddings, embed_dim_per_partition, **factory_kwargs))
+        init.uniform_(self.weight, -1, 1)
+
+    def forward(self, input_):
+
+        # Get the embeddings.
+        output_parallel = F.embedding(input_, self.weight, self.padding_idx, self.max_norm, self.norm_type,
+                                      self.scale_grad_by_freq, self.sparse)
+
+        # Gather the partitions along the last dimension across the PARALLEL_1D group.
+        output = gather_forward_split_backward(output_parallel, ParallelMode.PARALLEL_1D, dim=-1)
+        return output
+
+
+@LAYERS.register_module
+class HiddenParallelGPTLMHead1D(ParallelLayer):
+    """
+    Language model head that uses the weight of `embed` if given and otherwise creates its own Linear1D_Row.
+    """
+
+    def __init__(
+        self,
+        embed=None,
+        embed_dim=None,
+        vocab_size=None,
+        dtype=None,
+    ):
+        super().__init__()
+        if embed is not None:
+            self.head = embed
+            self.synced_embed = True
+        else:
+            # self.embedding = HiddenParallelEmbedding1D(vocab_size, hidden_size, dtype, padding_idx)
+            # (hidden_size/q, vocab_size)
+            self.synced_embed = False
+            self.head = Linear1D_Row(in_features=embed_dim,
+                                     out_features=vocab_size,
+                                     bias=False,
+                                     dtype=dtype,
+                                     parallel_input=False)
+
+    def forward(self, x: Tensor) -> Tensor:
+        if self.synced_embed:
+            x = F.linear(x, self.head.weight)
+        else:
+            x = self.head(x)
+
+        return x
diff --git a/examples/language/gpt/titans/model/gpt1d.py b/examples/language/gpt/titans/model/gpt1d.py
new file mode 100644
index 000000000000..2edd03606b7d
--- /dev/null
+++ b/examples/language/gpt/titans/model/gpt1d.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import math
+
+import torch
+from torch import Tensor
+from torch import nn as nn
+
+from colossalai import kernel
+from colossalai import nn as col_nn
+from colossalai.core import global_context as gpc
+from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
+from colossalai.nn.layer import Linear1D_Col, Linear1D_Row
+from colossalai.nn.layer.base_layer import ParallelLayer
+from colossalai.nn.layer.utils import ACT2FN, divide
+from colossalai.utils import checkpoint
+from colossalai.utils.activation_checkpoint import checkpoint
+
+__all__ = [
+    'GPTMLP1D', 'GPTSelfAttention1D', 'GPTTransformerLayer1D', 'FusedGPTSelfAttention1D', 'FusedGPTTransformerLayer1D'
+]
+
+
+class GPTMLP1D(ParallelLayer):
+
+    def __init__(
+        self,
+        in_features: int,
+        mlp_ratio: int,
+        act_func: str = 'gelu',
+        dropout_prob: float = 0.,
+        dtype=None,
+        checkpoint: bool = False,
+        skip_bias_add: bool = False,
+    ):
+        super().__init__()
+
+        self.in_features = in_features
+        self.mlp_ratio = mlp_ratio
+        self.checkpoint = checkpoint
+        self.skip_bias_add = skip_bias_add
+
+        self.act = ACT2FN[act_func]
+        skip_dense_1_add_bias = False
+
+        # Project to mlp_ratio * h.
+        self.dense_1 = Linear1D_Col(
+            self.in_features,
+            int(self.mlp_ratio * self.in_features),
+            dtype=dtype,
+            gather_output=False,
+            skip_bias_add=skip_dense_1_add_bias,
+        )
+
+        # Project back to h.
+        self.dense_2 = Linear1D_Row(
+            int(self.mlp_ratio * self.in_features),
+            self.in_features,
+            dtype=dtype,
+            parallel_input=True,
+        )
+
+        self.dropout = col_nn.Dropout(dropout_prob)
+
+    def _forward(self, hidden_states: Tensor) -> Tensor:
+        intermediate_output = self.dense_1(hidden_states)
+        intermediate_output = self.act(intermediate_output)
+
+        output = self.dense_2(intermediate_output)
+        output = self.dropout(output)
+        return output
+
+    def _checkpoint_forward(self, hidden_states: Tensor) -> Tensor:
+        return checkpoint(self._forward, False, hidden_states)
+
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        if self.checkpoint:
+            return self._checkpoint_forward(hidden_states)
+        else:
+            return self._forward(hidden_states)
+
+
+class GenericGPTSelfAttention1D(ParallelLayer):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        attention_dropout_prob: float,
+        hidden_dropout_prob: float,
+        dtype=None,
+        checkpoint: bool = False,
+        max_position_embeddings=1024,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.attention_head_size = divide(hidden_size, num_attention_heads)
+        self.num_attention_heads_per_partition = divide(num_attention_heads, gpc.tensor_parallel_size)
+        self.hidden_size_per_partition = divide(hidden_size, gpc.tensor_parallel_size)
+        self.checkpoint = checkpoint
+        self.query_key_value = Linear1D_Col(
+            hidden_size,
+            3 * hidden_size,
+            dtype=dtype,
+        )
+        self.attention_dropout = col_nn.Dropout(attention_dropout_prob)
+        self.dense = Linear1D_Row(
+            hidden_size,
+            hidden_size,
+            dtype=dtype,
+            parallel_input=True,
+        )
+        self.dropout = col_nn.Dropout(hidden_dropout_prob)
+
+    def softmax_forward(self, attention_scores, attention_mask, query_layer, key_layer):
+        raise NotImplementedError
+
+    def _forward(self, hidden_states: Tensor, attention_mask=None) -> Tensor:
+        query_key_value = self.query_key_value(hidden_states)
+        new_qkv_shape = query_key_value.shape[:-1] + \
+            (self.num_attention_heads_per_partition, 3 * self.attention_head_size)
+        query_key_value = query_key_value.view(new_qkv_shape)
+        query_key_value = query_key_value.permute((0, 2, 1, 3))
+        query_layer, key_layer, value_layer = torch.chunk(query_key_value, 3, dim=-1)
+
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = self.softmax_forward(attention_scores, attention_mask, query_layer, key_layer)
+
+        attention_scores = attention_scores.type(value_layer.dtype)
+
+        attention_probs = self.attention_dropout(attention_scores)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.transpose(1, 2)
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+        context_layer = context_layer.reshape(new_context_layer_shape)
+        output = self.dense(context_layer)
+        output = self.dropout(output)
+
+        return output
+
+    def _checkpoint_forward(self, hidden_states: Tensor, attention_mask=None) -> Tensor:
+        return checkpoint(self._forward, False, hidden_states, attention_mask)
+
+    def forward(self, hidden_states: Tensor, attention_mask=None) -> Tensor:
+        if self.checkpoint:
+            return self._checkpoint_forward(hidden_states, attention_mask)
+        else:
+            return self._forward(hidden_states, attention_mask)
+
+
+class GPTSelfAttention1D(GenericGPTSelfAttention1D):
+
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 attention_dropout_prob: float,
+                 hidden_dropout_prob: float,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings=1024):
+        super().__init__(hidden_size,
+                         num_attention_heads,
+                         attention_dropout_prob,
+                         hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings)
+        self.softmax = nn.Softmax(dim=-1)
+        max_positions = max_position_embeddings
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions),
+                                  dtype=torch.uint8)).view(1, 1, max_positions, max_positions),
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4))
+
+    def softmax_forward(self, attention_scores, attention_mask, query_layer, key_layer):
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        # causal mask
+        query_length, key_length = query_layer.size(-2), key_layer.size(-2)
+        causal_mask = self.bias[:, :, key_length - query_length:key_length, :key_length].bool()
+        attention_scores = torch.where(causal_mask, attention_scores, self.masked_bias.to(attention_scores))
+        if attention_mask is not None:
+            # Apply the attention mask
+            attention_scores = attention_scores + attention_mask
+        attention_scores = self.softmax(attention_scores)
+        return attention_scores
+
+
+class FusedGPTSelfAttention1D(GenericGPTSelfAttention1D):
+
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 attention_dropout_prob: float,
+                 hidden_dropout_prob: float,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings=1024):
+        super().__init__(hidden_size,
+                         num_attention_heads,
+                         attention_dropout_prob,
+                         hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings)
+        self.softmax = kernel.FusedScaleMaskSoftmax(input_in_fp16=True,
+                                                    input_in_bf16=False,
+                                                    attn_mask_type=AttnMaskType.causal,
+                                                    scaled_masked_softmax_fusion=True,
+                                                    mask_func=None,
+                                                    softmax_in_fp32=True,
+                                                    scale=math.sqrt(self.attention_head_size))
+
+    def softmax_forward(self, attention_scores, attention_mask, query_layer, key_layer):
+        return self.softmax(attention_scores, attention_mask)
+
+
+class GenericGPTTransformerLayer1D(ParallelLayer):
+
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 act_func: str = 'gelu',
+                 mlp_ratio: float = 4.0,
+                 attention_dropout_prob: float = 0.,
+                 hidden_dropout_prob: float = 0.,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 attention=None,
+                 layer_norm=None):
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.dtype = dtype
+        self.norm1 = layer_norm(hidden_size, eps=layer_norm_epsilon)
+        self.apply_post_layer_norm = apply_post_layer_norm
+        self.attention = attention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            attention_dropout_prob=attention_dropout_prob,
+            hidden_dropout_prob=hidden_dropout_prob,
+            dtype=dtype,
+            max_position_embeddings=max_position_embeddings,
+            checkpoint=False,
+        )
+
+        self.norm2 = layer_norm(hidden_size, eps=layer_norm_epsilon)
+        self.mlp = GPTMLP1D(
+            in_features=hidden_size,
+            dropout_prob=hidden_dropout_prob,
+            act_func=act_func,
+            mlp_ratio=mlp_ratio,
+            dtype=dtype,
+            checkpoint=False,
+        )
+
+    def _forward(self, hidden_states, attention_mask) -> Tensor:
+        if not self.apply_post_layer_norm:
+            residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        if self.apply_post_layer_norm:
+            residual = hidden_states
+        attention_output = self.attention(hidden_states, attention_mask)
+        hidden_states = residual + attention_output
+
+        if not self.apply_post_layer_norm:
+            residual = hidden_states
+        hidden_states = self.norm2(hidden_states)
+        if self.apply_post_layer_norm:
+            residual = hidden_states
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + feed_forward_hidden_states
+
+        output = (hidden_states, attention_mask)
+        return output
+
+    def forward(self, hidden_states, attention_mask):
+        if self.checkpoint:
+            return checkpoint(self._forward, False, hidden_states, attention_mask)
+        else:
+            return self._forward(hidden_states, attention_mask)
+
+
+class GPTTransformerLayer1D(GenericGPTTransformerLayer1D):
+
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 act_func: str = 'gelu',
+                 mlp_ratio: float = 4,
+                 attention_dropout_prob: float = 0,
+                 hidden_dropout_prob: float = 0,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 0.00001,
+                 apply_post_layer_norm: bool = False):
+        attention = GPTSelfAttention1D
+        layer_norm = nn.LayerNorm
+        super().__init__(hidden_size,
+                         num_attention_heads,
+                         act_func=act_func,
+                         mlp_ratio=mlp_ratio,
+                         attention_dropout_prob=attention_dropout_prob,
+                         hidden_dropout_prob=hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings,
+                         layer_norm_epsilon=layer_norm_epsilon,
+                         apply_post_layer_norm=apply_post_layer_norm,
+                         attention=attention,
+                         layer_norm=layer_norm)
+
+
+class FusedGPTTransformerLayer1D(GenericGPTTransformerLayer1D):
+
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 act_func: str = 'gelu',
+                 mlp_ratio: float = 4,
+                 attention_dropout_prob: float = 0,
+                 hidden_dropout_prob: float = 0,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 0.00001,
+                 apply_post_layer_norm: bool = False):
+        attention = FusedGPTSelfAttention1D
+        layer_norm = kernel.LayerNorm
+        super().__init__(hidden_size,
+                         num_attention_heads,
+                         act_func=act_func,
+                         mlp_ratio=mlp_ratio,
+                         attention_dropout_prob=attention_dropout_prob,
+                         hidden_dropout_prob=hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings,
+                         layer_norm_epsilon=layer_norm_epsilon,
+                         apply_post_layer_norm=apply_post_layer_norm,
+                         attention=attention,
+                         layer_norm=layer_norm)
diff --git a/examples/language/gpt/titans/model/pipeline_gpt1d.py b/examples/language/gpt/titans/model/pipeline_gpt1d.py
new file mode 100644
index 000000000000..30180285bc70
--- /dev/null
+++ b/examples/language/gpt/titans/model/pipeline_gpt1d.py
@@ -0,0 +1,322 @@
+import inspect
+
+# import model_zoo.gpt.gpt as col_gpt
+import titans.model.gpt.gpt as col_gpt
+import torch
+import torch.nn as nn
+
+from colossalai import kernel
+from colossalai import nn as col_nn
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.pipeline.utils import partition_uniform
+
+from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D
+from .gpt1d import FusedGPTTransformerLayer1D, GPTTransformerLayer1D
+
+__all__ = [
+    'GPT2_small_pipeline_1D',
+    'GPT2_exlarge_pipeline_1D',
+    'GPT3_pipeline_1D',
+    'GPT2_exlarge_pipeline_hybrid',
+    'GPT2_small_pipeline_hybrid',
+    'GPT3_pipeline_hybrid',
+]
+
+
+class GenericPipelineGPT(nn.Module):
+
+    def __init__(self, embedding=None, blocks=None, norm=None, head=None) -> None:
+        super().__init__()
+        self.embedding = embedding
+        self.blocks = blocks
+        self.norm = norm
+        self.head = head
+        assert blocks is not None
+        if norm is not None or head is not None:
+            assert norm is not None and head is not None
+
+    def forward(self, hidden_states=None, input_ids=None, attention_mask=None):
+        if self.embedding is not None:
+            hidden_states = self.embedding(input_ids=input_ids)
+        batch_size = hidden_states.shape[0]
+        attention_mask = attention_mask.view(batch_size, -1)
+        attention_mask = attention_mask[:, None, None, :]
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)    # fp16 compatibility
+        attention_mask = (1.0 - attention_mask) * -10000.0
+        for block in self.blocks:
+            hidden_states, attention_mask = block(hidden_states, attention_mask)
+        if self.norm is not None:
+            hidden_states = self.head(self.norm(hidden_states))
+        return hidden_states
+
+
+class PipelineGPT1D(GenericPipelineGPT):
+
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: str = 'gelu',
+                 mlp_ratio: int = 4.0,
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 first: bool = False,
+                 last: bool = False,
+                 embed_split_hidden=False):
+        embedding = None
+        norm = None
+        head = None
+        embed_cls = VocabParallelEmbedding
+        head_cls = VocabParallelGPTLMHead1D
+        if embed_split_hidden:
+            embed_cls = HiddenParallelEmbedding
+            head_cls = HiddenParallelGPTLMHead1D
+        if first:
+            embedding = embed_cls(hidden_size, vocab_size, max_position_embeddings, embed_drop_rate, dtype=dtype)
+        blocks = nn.ModuleList([
+            GPTTransformerLayer1D(hidden_size,
+                                  num_attention_heads,
+                                  act_func=act_func,
+                                  mlp_ratio=mlp_ratio,
+                                  attention_dropout_prob=attn_drop_rate,
+                                  hidden_dropout_prob=drop_rate,
+                                  dtype=dtype,
+                                  checkpoint=checkpoint,
+                                  max_position_embeddings=max_position_embeddings,
+                                  layer_norm_epsilon=layer_norm_epsilon,
+                                  apply_post_layer_norm=apply_post_layer_norm) for _ in range(num_layers)
+        ])
+        if last:
+            norm = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            head = head_cls(vocab_size=vocab_size, embed_dim=hidden_size, dtype=dtype)
+        super().__init__(embedding=embedding, blocks=blocks, norm=norm, head=head)
+
+
+class FusedPipelineGPT1D(GenericPipelineGPT):
+
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: str = 'gelu',
+                 mlp_ratio: int = 4.0,
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 first: bool = False,
+                 last: bool = False,
+                 embed_split_hidden=False):
+        embedding = None
+        norm = None
+        head = None
+        embed_cls = VocabParallelEmbedding
+        head_cls = VocabParallelGPTLMHead1D
+        if embed_split_hidden:
+            embed_cls = HiddenParallelEmbedding
+            head_cls = HiddenParallelGPTLMHead1D
+        if first:
+            embedding = embed_cls(hidden_size, vocab_size, max_position_embeddings, embed_drop_rate, dtype=dtype)
+        blocks = nn.ModuleList([
+            FusedGPTTransformerLayer1D(hidden_size,
+                                       num_attention_heads,
+                                       act_func=act_func,
+                                       mlp_ratio=mlp_ratio,
+                                       attention_dropout_prob=attn_drop_rate,
+                                       hidden_dropout_prob=drop_rate,
+                                       dtype=dtype,
+                                       checkpoint=checkpoint,
+                                       max_position_embeddings=max_position_embeddings,
+                                       layer_norm_epsilon=layer_norm_epsilon,
+                                       apply_post_layer_norm=apply_post_layer_norm) for _ in range(num_layers)
+        ])
+        if last:
+            norm = kernel.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            head = head_cls(vocab_size=vocab_size, embed_dim=hidden_size, dtype=dtype)
+        super().__init__(embedding=embedding, blocks=blocks, norm=norm, head=head)
+
+    def forward(self, hidden_states=None, input_ids=None, attention_mask=None):
+        if self.embedding is not None:    # with an embedding set, hidden_states is rebuilt from input_ids
+            hidden_states = self.embedding(input_ids=input_ids)
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)    # fp16 compatibility; mask must not be None
+        for block in self.blocks:    # each block returns the (hidden_states, attention_mask) pair for the next one
+            hidden_states, attention_mask = block(hidden_states, attention_mask)
+        if self.norm is not None:    # head is applied together with norm; only norm is checked
+            hidden_states = self.head(self.norm(hidden_states))
+        return hidden_states
+
+
+class PipelineGPTHybrid(GenericPipelineGPT):
+    """GPT pipeline stage built from ``col_gpt`` blocks; embedding only if ``first``, norm and head only if ``last``."""
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: str = 'gelu',
+                 mlp_ratio: int = 4,
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 first: bool = False,
+                 last: bool = False,
+                 embed_split_hidden=False):
+        embedding = None
+        norm = None
+        head = None
+        if first:
+            embedding = col_gpt.GPTEmbedding(hidden_size,
+                                             vocab_size,
+                                             max_position_embeddings,
+                                             dropout=embed_drop_rate,
+                                             dtype=dtype)
+        blocks = nn.ModuleList([
+            col_gpt.GPTBlock(hidden_size,
+                             num_attention_heads,
+                             mlp_ratio=mlp_ratio,
+                             attention_dropout=attn_drop_rate,
+                             dropout=drop_rate,
+                             dtype=dtype,
+                             checkpoint=checkpoint,
+                             activation=nn.functional.gelu) for _ in range(num_layers)
+        ])
+        if last:
+            norm = col_nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            # The head is a bias-free ``col_nn.Classifier`` built from (hidden_size, vocab_size).
+            # An alternative ``col_gpt.GPTLMHead(vocab_size=..., hidden_size=..., dtype=..., bias=False)``
+            # head was left disabled here by the author.
+            # NOTE(review): act_func, apply_post_layer_norm and embed_split_hidden are accepted but never used.
+            head = col_nn.Classifier(hidden_size, vocab_size, dtype=dtype, bias=False)
+        super().__init__(embedding=embedding, blocks=blocks, norm=norm, head=head)
+
+
+def _filter_kwargs(func, kwargs):
+    accepted = inspect.signature(func).parameters    # parameter names declared by ``func``
+    return {name: value for name, value in kwargs.items() if name in accepted}
+
+
+def _build_generic_gpt_pipeline_1d(module_cls, num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    """Instantiate the pipeline stage(s) of ``module_cls`` assigned to the current pipeline rank.
+
+    ``partition_uniform(num_layers, pipeline_size, num_chunks)`` yields the (start, end) layer ranges of
+    this rank; each range becomes one ``module_cls`` instance moved to ``device``.
+
+    Args:
+        module_cls: stage class; called with the entries of ``kwargs`` that its ``__init__`` accepts.
+        num_layers: total number of layers of the whole model.
+        num_chunks: forwarded to ``partition_uniform`` as its third argument.
+        device: device every stage is moved to.
+        **kwargs: constructor arguments; ``num_layers``, ``first`` and ``last`` are overwritten per stage.
+
+    Returns the single stage, or an ``nn.ModuleList`` when this rank does not hold exactly one stage.
+    """
+    logger = get_dist_logger()
+    has_pipeline = gpc.is_initialized(ParallelMode.PIPELINE)
+    pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) if has_pipeline else 1
+    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) if has_pipeline else 0
+    rank = gpc.get_global_rank()
+
+    # presumably shares the registered modules between pipeline ranks 0 and pipeline_size - 1 - TODO confirm
+    wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1]) if pipeline_size > 1 else None
+    models = []
+    for start, end in partition_uniform(num_layers, pipeline_size, num_chunks)[pipeline_rank]:
+        kwargs.update(num_layers=end - start, first=start == 0, last=end == num_layers)
+        logger.info(f'Rank{rank} build layer {start}-{end}, {end-start}/{num_layers} layers')
+        chunk = module_cls(**_filter_kwargs(module_cls.__init__, kwargs)).to(device)
+        if wrapper is not None:
+            if start == 0:
+                wrapper.register_module(chunk.embedding.word_embeddings)
+            elif end == num_layers:
+                wrapper.register_module(chunk.head)
+        models.append(chunk)
+    model = models[0] if len(models) == 1 else nn.ModuleList(models)
+
+    # the reported size uses 2 bytes per parameter, whatever dtype the stages were built with
+    numel = sum(param.numel() for param in model.parameters())
+    logger.info(f'Rank{rank}/{pipeline_rank} model size = {numel * 2 / 1e9} GB')
+    return model
+
+
+def _build_gpt_pipeline_1d(num_layers, num_chunks, device=torch.device('cuda'), fused=False, **kwargs):
+    stage_cls = FusedPipelineGPT1D if fused else PipelineGPT1D    # ``fused`` only selects the stage class
+    return _build_generic_gpt_pipeline_1d(stage_cls, num_layers, num_chunks, device, **kwargs)
+
+
+def _build_gpt_pipeline_hybrid(num_layers, num_chunks, device=torch.device('cuda'), **kwargs):    # PipelineGPTHybrid stages
+    return _build_generic_gpt_pipeline_1d(PipelineGPTHybrid, num_layers, num_chunks, device, **kwargs)
+
+
+def GPT2_small_pipeline_1D(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False, fused=False):
+    """Build this rank's pipeline stage(s) of a 12-layer GPT with hidden size 768 and 12 attention heads."""
+    model_kwargs = {
+        'hidden_size': 768, 'num_attention_heads': 12,
+        'checkpoint': checkpoint, 'dtype': dtype, 'embed_split_hidden': embed_split_hidden,
+    }
+    return _build_gpt_pipeline_1d(12, num_chunks, fused=fused, **model_kwargs)
+
+
+def GPT2_exlarge_pipeline_1D(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False, fused=False):
+    """Build this rank's pipeline stage(s) of a 48-layer GPT with hidden size 1600 and 32 attention heads."""
+    model_kwargs = {
+        'hidden_size': 1600, 'num_attention_heads': 32,
+        'checkpoint': checkpoint, 'dtype': dtype, 'embed_split_hidden': embed_split_hidden,
+    }
+    return _build_gpt_pipeline_1d(48, num_chunks, fused=fused, **model_kwargs)
+
+
+def GPT3_pipeline_1D(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False, fused=False):
+    """Build this rank's pipeline stage(s) of a 96-layer GPT: hidden size 12288, 96 heads, 2048 positions."""
+    model_kwargs = {
+        'hidden_size': 12288, 'num_attention_heads': 96,
+        'checkpoint': checkpoint, 'max_position_embeddings': 2048,
+        'dtype': dtype, 'embed_split_hidden': embed_split_hidden,
+    }
+    return _build_gpt_pipeline_1d(96, num_chunks, fused=fused, **model_kwargs)
+
+
+def GPT2_exlarge_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False):
+    """Build this rank's ``PipelineGPTHybrid`` stage(s): 48 layers, hidden size 1600, 32 attention heads."""
+    model_kwargs = {
+        'hidden_size': 1600, 'num_attention_heads': 32,
+        'checkpoint': checkpoint, 'dtype': dtype, 'embed_split_hidden': embed_split_hidden,
+    }
+    return _build_gpt_pipeline_hybrid(48, num_chunks, **model_kwargs)
+
+
+def GPT2_small_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False):
+    """Build this rank's ``PipelineGPTHybrid`` stage(s): 12 layers, hidden size 768, 12 attention heads."""
+    model_kwargs = {
+        'hidden_size': 768, 'num_attention_heads': 12,
+        'checkpoint': checkpoint, 'dtype': dtype, 'embed_split_hidden': embed_split_hidden,
+    }
+    return _build_gpt_pipeline_hybrid(12, num_chunks, **model_kwargs)
+
+
+def GPT3_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False):
+    """Build this rank's ``PipelineGPTHybrid`` stage(s): 96 layers, hidden size 12288, 96 heads, 2048 positions."""
+    model_kwargs = {
+        'hidden_size': 12288, 'num_attention_heads': 96,
+        'checkpoint': checkpoint, 'max_position_embeddings': 2048,
+        'dtype': dtype, 'embed_split_hidden': embed_split_hidden,
+    }
+    return _build_gpt_pipeline_hybrid(96, num_chunks, **model_kwargs)
diff --git a/examples/language/gpt/titans/requirements.txt b/examples/language/gpt/titans/requirements.txt
new file mode 100644
index 000000000000..64ff7a4abcd8
--- /dev/null
+++ b/examples/language/gpt/titans/requirements.txt
@@ -0,0 +1,4 @@
+torch==1.12.1
+titans==0.0.7
+colossalai==0.2.0+torch1.12cu11.3
+-f https://release.colossalai.org
diff --git a/examples/language/gpt/titans/run.sh b/examples/language/gpt/titans/run.sh
new file mode 100644
index 000000000000..157bd377aa34
--- /dev/null
+++ b/examples/language/gpt/titans/run.sh
@@ -0,0 +1,2 @@
+export DATA=/data/scratch/gpt_data/small-gpt-dataset.json
+colossalai run --nproc_per_node=4 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch
diff --git a/examples/language/gpt/titans/test_ci.sh b/examples/language/gpt/titans/test_ci.sh
new file mode 100644
index 000000000000..7cb24c1a4082
--- /dev/null
+++ b/examples/language/gpt/titans/test_ci.sh
@@ -0,0 +1 @@
+colossalai run --nproc_per_node=4 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch --use_dummy_dataset
diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
new file mode 100644
index 000000000000..1380b4b3a7da
--- /dev/null
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -0,0 +1,148 @@
+import contextlib
+import os
+
+import torch
+import torch.nn as nn
+from titans.model.gpt import GPTLMLoss
+
+import colossalai
+import colossalai.utils as utils
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn import LinearWarmupLR
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
+from colossalai.utils.timer import MultiTimer
+from colossalai.zero.init_ctx import ZeroInitContext
+
+
+def calc_local_model_size(model: torch.nn.Module):
+    """Return the number of parameter elements held by ``model``."""
+    # every tensor yielded by model.parameters() contributes its numel()
+    element_counts = (p.numel() for p in model.parameters())
+    return sum(element_counts)
+
+
+VOCAB_SIZE = 50257
+
+
+def main():
+    """Train a GPT model with the Colossal-AI ``Trainer``, configured by the file passed as ``--config``.
+
+    With ``--use_dummy_dataset`` the model is fed random token ids; without it, the Webtext dataset at
+    the path in the ``DATA`` environment variable is loaded.
+    """
+    parser = colossalai.get_default_parser()
+    parser.add_argument('--from_torch', default=False, action='store_true')
+    # opt-in flag: a default of True would make it impossible to select the real dataset
+    parser.add_argument('--use_dummy_dataset', default=False, action='store_true')
+    args = parser.parse_args()
+    disable_existing_loggers()
+    if args.from_torch:
+        colossalai.launch_from_torch(config=args.config)
+    else:
+        colossalai.launch_from_slurm(config=args.config, host=args.host, port=29500, seed=42)
+    logger = get_dist_logger()
+
+    if not args.use_dummy_dataset:
+        data_path = os.environ['DATA']
+        logger.info(f'Build data loader from path {data_path}', ranks=[0])
+        from dataset.webtext import WebtextDataset
+        train_ds = WebtextDataset(data_path, seq_len=gpc.config.SEQ_LEN)
+        train_dataloader = utils.get_dataloader(train_ds,
+                                                seed=42,
+                                                batch_size=gpc.config.BATCH_SIZE,
+                                                pin_memory=True,
+                                                shuffle=True,
+                                                drop_last=True)
+    else:
+        # build a dummy train_dataloader
+        logger.info('Build data loader using dummy data', ranks=[0])
+
+        def get_data(batch_size, seq_len, vocab_size):
+            input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+            attention_mask = torch.ones_like(input_ids)
+            return input_ids, attention_mask
+
+        # BATCH_SIZE * 10 random sequences; the all-ones attention mask is not used below
+        input_ids, _ = get_data(gpc.config.BATCH_SIZE * 10, gpc.config.SEQ_LEN, VOCAB_SIZE)
+        from torch.utils.data import DataLoader, Dataset
+
+        class TextSamplerDataset(Dataset):
+
+            def __init__(self, data, seq_len):
+                super().__init__()
+                self.data = data
+                self.seq_len = seq_len
+
+            def __getitem__(self, index):
+                # NOTE(review): ``data`` is (samples, seq_len) yet is sliced along dim 0 like a token stream - confirm
+                rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
+                full_seq = self.data[rand_start:rand_start + self.seq_len + 1].long()
+                return full_seq.cuda()
+
+            def __len__(self):
+                return self.data.size(0) // self.seq_len
+
+        train_dataset = TextSamplerDataset(input_ids, gpc.config.SEQ_LEN)
+        train_dataloader = DataLoader(train_dataset, batch_size=gpc.config.BATCH_SIZE)
+
+    logger.info('Build model', ranks=[0])
+    use_pipeline = is_using_pp()
+    use_interleaved = hasattr(gpc.config.model, 'num_chunks')
+    use_zero3 = hasattr(gpc.config, 'zero')
+    ctx = contextlib.nullcontext()
+    if use_zero3:
+        ctx = ZeroInitContext(target_device=torch.cuda.current_device(),
+                              shard_strategy=gpc.config.zero.model_config.shard_strategy,
+                              shard_param=True)
+    with ctx:
+        model = gpc.config.model.pop('type')(**gpc.config.model)
+    if use_pipeline and use_interleaved and not isinstance(model, nn.ModuleList):
+        model = nn.ModuleList([model])
+
+    if use_zero3:
+        numel = ctx.model_numel_tensor.item()
+    else:
+        numel = calc_local_model_size(model)
+
+    tflop = numel * gpc.config.BATCH_SIZE * gpc.config.SEQ_LEN \
+        * gpc.get_world_size(ParallelMode.MODEL) * gpc.get_world_size(ParallelMode.DATA) * 8 / (1024 ** 4)
+
+    criterion = getattr(gpc.config, 'loss_fn', None)
+    if criterion is not None:
+        criterion = criterion.type()
+    else:
+        criterion = GPTLMLoss()
+    logger.info('Build optimizer', ranks=[0])
+    optimizer = gpc.config.optimizer.pop('type')(model.parameters(), **gpc.config.optimizer)
+    lr_scheduler = LinearWarmupLR(optimizer, total_steps=gpc.config.NUM_EPOCHS, warmup_steps=5)
+    engine, train_dataloader, _, lr_scheduler = colossalai.initialize(model,
+                                                                      optimizer,
+                                                                      criterion,
+                                                                      train_dataloader=train_dataloader,
+                                                                      lr_scheduler=lr_scheduler)
+    global_batch_size = gpc.config.BATCH_SIZE * \
+        gpc.get_world_size(ParallelMode.DATA) * getattr(gpc.config, "gradient_accumulation", 1)
+    logger.info(f'Init done, global batch size = {global_batch_size}', ranks=[0])
+    timer = MultiTimer()
+    trainer = Trainer(engine=engine, logger=logger, timer=timer)
+    hook_list = [
+        hooks.LossHook(),
+        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
+        hooks.LogMetricByEpochHook(logger),
+        hooks.ThroughputHook(ignored_steps=10, tflop_per_step=tflop),
+        hooks.LogMetricByStepHook(),
+        hooks.LogMemoryByEpochHook(logger),
+    ]
+    trainer.fit(train_dataloader=train_dataloader,
+                epochs=gpc.config.NUM_EPOCHS,
+                test_interval=1,
+                hooks=hook_list,
+                display_progress=True,
+                return_output_label=False)
+
+
+if __name__ == '__main__':
+    main()

From 315e1433ce4a4f8a7a1c2de6b87ccc63a7203941 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 15:17:27 +0800
Subject: [PATCH 191/503] polish readme

---
 examples/language/gpt/README.md        | 17 ++++++++++++++---
 examples/language/gpt/titans/README.md | 12 ++++++------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index 8fdf6be3b6d9..7e6acb3d399b 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -39,9 +39,15 @@ If you want to test ZeRO1 and ZeRO2 in Colossal-AI, you need to ensure Colossal-
 For simplicity, the input data is randonly generated here.
 
 ## Training
-We provide two solutions. One utilizes the hybrid parallel strategies of Gemini, DDP/ZeRO, and Tensor Parallelism.
-The other one uses Pipeline Parallelism Only.
-In the future, we are going merge them together and they can be used orthogonally to each other.
+We provide two stable solutions.
+One uses Gemini to implement hybrid parallel strategies combining Gemini, DDP/ZeRO, and Tensor Parallelism for a Hugging Face GPT model.
+The other one uses [Titans](https://github.com/hpcaitech/Titans), a model zoo for distributed execution maintained by ColossalAI, to implement the hybrid parallel strategies of TP + ZeRO + PP.
+
+We recommend using Gemini to quickly run your model in a distributed manner.
+It doesn't require significant changes to the model structure, so you can apply it to a new model easily.
+Use Titans as a more advanced tool when you want to push performance further.
+Titans already includes some typical models, such as ViT and GPT.
+However, it takes some effort to get started with a new model structure.
 
 ### GeminiDPP/ZeRO + Tensor Parallelism
 ```bash
@@ -56,6 +62,11 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the pla
 - Pytorch DDP
 - Pytorch ZeRO
 
+### Titans (Tensor Parallelism) + ZeRO + Pipeline Parallelism
+
+Titans provides a customized GPT model, which uses distributed operators as building blocks.
+In [./titans/README.md](./titans/README.md), we provide an example that combines ZeRO, TP and PP.
+You can switch parallel strategies using a config file.
 
 ## Performance
 
diff --git a/examples/language/gpt/titans/README.md b/examples/language/gpt/titans/README.md
index 14c07442b82a..9fc26ad801db 100644
--- a/examples/language/gpt/titans/README.md
+++ b/examples/language/gpt/titans/README.md
@@ -5,7 +5,7 @@
 You can download the preprocessed sample dataset for this demo via our [Google Drive sharing link](https://drive.google.com/file/d/1QKI6k-e2gJ7XgS8yIpgPPiMmwiBP_BPE/view?usp=sharing).
 
 
-You can also avoid dataset preparation by using `--use_dummy_data` during running.
+You can also avoid dataset preparation by using `--use_dummy_dataset` during running.
 
 ## Run this Demo
 
@@ -13,15 +13,15 @@ Use the following commands to install prerequisites.
 
 ```bash
 # assuming using cuda 11.3
-conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch
-pip install colossalai==0.1.9+torch1.11cu11.3 -f https://release.colossalai.org
+pip install -r requirements.txt
 ```
 
 Use the following commands to execute training.
 
 ```Bash
 #!/usr/bin/env sh
-export DATA=/path/to/small-gpt-dataset.json'
+# if you want to use real dataset, then remove --use_dummy_dataset
+# export DATA=/path/to/small-gpt-dataset.json'
 
 # run on a single node
 colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch
@@ -34,14 +34,14 @@ colossalai run --nproc_per_node=<num_gpus> \
    train_gpt.py \
    --config configs/<config_file> \
    --from_torch \
-   --use_dummy_data
+   --use_dummy_dataset
 
 # run on multiple nodes with slurm
 srun python \
    train_gpt.py \
    --config configs/<config_file> \
    --host <master_node> \
-   --use_dummy_data
+   --use_dummy_dataset
 
 ```
 

From 92f65fbbe36ce92bc52638382b21384506c55aae Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 15:18:49 +0800
Subject: [PATCH 192/503] remove license

---
 examples/language/gpt/titans/README.md | 48 --------------------------
 1 file changed, 48 deletions(-)
 delete mode 100644 examples/language/gpt/titans/README.md

diff --git a/examples/language/gpt/titans/README.md b/examples/language/gpt/titans/README.md
deleted file mode 100644
index 9fc26ad801db..000000000000
--- a/examples/language/gpt/titans/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# Run GPT With Colossal-AI
-
-## How to Prepare Webtext Dataset
-
-You can download the preprocessed sample dataset for this demo via our [Google Drive sharing link](https://drive.google.com/file/d/1QKI6k-e2gJ7XgS8yIpgPPiMmwiBP_BPE/view?usp=sharing).
-
-
-You can also avoid dataset preparation by using `--use_dummy_dataset` during running.
-
-## Run this Demo
-
-Use the following commands to install prerequisites.
-
-```bash
-# assuming using cuda 11.3
-pip install -r requirements.txt
-```
-
-Use the following commands to execute training.
-
-```Bash
-#!/usr/bin/env sh
-# if you want to use real dataset, then remove --use_dummy_dataset
-# export DATA=/path/to/small-gpt-dataset.json'
-
-# run on a single node
-colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch
-
-# run on multiple nodes with slurm
-colossalai run --nproc_per_node=<num_gpus> \
-   --master_addr <hostname> \
-   --master_port <port-number> \
-   --hosts <list-of-hostname-separated-by-comma> \
-   train_gpt.py \
-   --config configs/<config_file> \
-   --from_torch \
-   --use_dummy_dataset
-
-# run on multiple nodes with slurm
-srun python \
-   train_gpt.py \
-   --config configs/<config_file> \
-   --host <master_node> \
-   --use_dummy_dataset
-
-```
-
-You can set the `<config_file>` to any file in the `configs` folder. To simply get it running, you can start with `gpt_small_zero3_pp1d.py` on a single node first. You can view the explanations in the config file regarding how to change the parallel setting.

From 38424db6ffbdafbc9e65a1802640202a07e53c50 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 15:21:22 +0800
Subject: [PATCH 193/503] polish code

---
 examples/language/gpt/titans/README.md | 48 ++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 examples/language/gpt/titans/README.md

diff --git a/examples/language/gpt/titans/README.md b/examples/language/gpt/titans/README.md
new file mode 100644
index 000000000000..9fc26ad801db
--- /dev/null
+++ b/examples/language/gpt/titans/README.md
@@ -0,0 +1,48 @@
+# Run GPT With Colossal-AI
+
+## How to Prepare Webtext Dataset
+
+You can download the preprocessed sample dataset for this demo via our [Google Drive sharing link](https://drive.google.com/file/d/1QKI6k-e2gJ7XgS8yIpgPPiMmwiBP_BPE/view?usp=sharing).
+
+
+You can also avoid dataset preparation by using `--use_dummy_dataset` during running.
+
+## Run this Demo
+
+Use the following commands to install prerequisites.
+
+```bash
+# assuming using cuda 11.3
+pip install -r requirements.txt
+```
+
+Use the following commands to execute training.
+
+```Bash
+#!/usr/bin/env sh
+# if you want to use real dataset, then remove --use_dummy_dataset
+# export DATA=/path/to/small-gpt-dataset.json'
+
+# run on a single node
+colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch
+
+# run on multiple nodes with slurm
+colossalai run --nproc_per_node=<num_gpus> \
+   --master_addr <hostname> \
+   --master_port <port-number> \
+   --hosts <list-of-hostname-separated-by-comma> \
+   train_gpt.py \
+   --config configs/<config_file> \
+   --from_torch \
+   --use_dummy_dataset
+
+# run on multiple nodes with slurm
+srun python \
+   train_gpt.py \
+   --config configs/<config_file> \
+   --host <master_node> \
+   --use_dummy_dataset
+
+```
+
+You can set the `<config_file>` to any file in the `configs` folder. To simply get it running, you can start with `gpt2_small_zero3_pp1d.py` on a single node first. You can view the explanations in the config file regarding how to change the parallel setting.

From 438ea608f3492aa341223621e0b5d1ed537c8621 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 15:54:36 +0800
Subject: [PATCH 194/503] update readme

---
 examples/language/gpt/titans/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/gpt/titans/README.md b/examples/language/gpt/titans/README.md
index 9fc26ad801db..fe1854c9ffdf 100644
--- a/examples/language/gpt/titans/README.md
+++ b/examples/language/gpt/titans/README.md
@@ -24,7 +24,7 @@ Use the following commands to execute training.
 # export DATA=/path/to/small-gpt-dataset.json'
 
 # run on a single node
-colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch
+colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch --use_dummy_dataset
 
 # run on multiple nodes with slurm
 colossalai run --nproc_per_node=<num_gpus> \

From 3a21485ead4b290b8a9590a392615e97749e36bd Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Mon, 16 Jan 2023 15:55:41 +0800
Subject: [PATCH 195/503] [example] titans for gpt (#2484)

---
 examples/language/gpt/README.md               |  17 +-
 examples/language/gpt/titans/LICENSE          | 201 ++++++
 examples/language/gpt/titans/README.md        |  48 ++
 .../titans/configs/gpt2_small_zero3_pp1d.py   |  31 +
 .../gpt/titans/configs/gpt3_zero3_pp1d.py     |  31 +
 .../language/gpt/titans/model/__init__.py     |   3 +
 examples/language/gpt/titans/model/embed.py   | 599 ++++++++++++++++++
 examples/language/gpt/titans/model/gpt1d.py   | 349 ++++++++++
 .../gpt/titans/model/pipeline_gpt1d.py        | 322 ++++++++++
 examples/language/gpt/titans/requirements.txt |   4 +
 examples/language/gpt/titans/run.sh           |   2 +
 examples/language/gpt/titans/test_ci.sh       |   1 +
 examples/language/gpt/titans/train_gpt.py     | 148 +++++
 examples/language/palm/train.py               |   3 +-
 14 files changed, 1754 insertions(+), 5 deletions(-)
 create mode 100644 examples/language/gpt/titans/LICENSE
 create mode 100644 examples/language/gpt/titans/README.md
 create mode 100644 examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
 create mode 100644 examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py
 create mode 100644 examples/language/gpt/titans/model/__init__.py
 create mode 100644 examples/language/gpt/titans/model/embed.py
 create mode 100644 examples/language/gpt/titans/model/gpt1d.py
 create mode 100644 examples/language/gpt/titans/model/pipeline_gpt1d.py
 create mode 100644 examples/language/gpt/titans/requirements.txt
 create mode 100644 examples/language/gpt/titans/run.sh
 create mode 100644 examples/language/gpt/titans/test_ci.sh
 create mode 100644 examples/language/gpt/titans/train_gpt.py

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index 8fdf6be3b6d9..7e6acb3d399b 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -39,9 +39,15 @@ If you want to test ZeRO1 and ZeRO2 in Colossal-AI, you need to ensure Colossal-
 For simplicity, the input data is randonly generated here.
 
 ## Training
-We provide two solutions. One utilizes the hybrid parallel strategies of Gemini, DDP/ZeRO, and Tensor Parallelism.
-The other one uses Pipeline Parallelism Only.
-In the future, we are going merge them together and they can be used orthogonally to each other.
+We provide two stable solutions.
+One uses Gemini to implement hybrid parallel strategies combining Gemini, DDP/ZeRO, and Tensor Parallelism for a Hugging Face GPT model.
+The other one uses [Titans](https://github.com/hpcaitech/Titans), a model zoo for distributed execution maintained by ColossalAI, to implement the hybrid parallel strategies of TP + ZeRO + PP.
+
+We recommend using Gemini to quickly run your model in a distributed manner.
+It doesn't require significant changes to the model structure, so you can apply it to a new model easily.
+Use Titans as a more advanced tool when you want to push performance further.
+Titans already includes some typical models, such as ViT and GPT.
+However, it takes some effort to get started with a new model structure.
 
 ### GeminiDPP/ZeRO + Tensor Parallelism
 ```bash
@@ -56,6 +62,11 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the pla
 - Pytorch DDP
 - Pytorch ZeRO
 
+### Titans (Tensor Parallelism) + ZeRO + Pipeline Parallelism
+
+Titans provides a customized GPT model, which uses distributed operators as building blocks.
+In [./titans/README.md](./titans/README.md), we provide an example that combines ZeRO, TP and PP.
+You can switch parallel strategies using a config file.
 
 ## Performance
 
diff --git a/examples/language/gpt/titans/LICENSE b/examples/language/gpt/titans/LICENSE
new file mode 100644
index 000000000000..261eeb9e9f8b
--- /dev/null
+++ b/examples/language/gpt/titans/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/examples/language/gpt/titans/README.md b/examples/language/gpt/titans/README.md
new file mode 100644
index 000000000000..fe1854c9ffdf
--- /dev/null
+++ b/examples/language/gpt/titans/README.md
@@ -0,0 +1,48 @@
+# Run GPT With Colossal-AI
+
+## How to Prepare Webtext Dataset
+
+You can download the preprocessed sample dataset for this demo via our [Google Drive sharing link](https://drive.google.com/file/d/1QKI6k-e2gJ7XgS8yIpgPPiMmwiBP_BPE/view?usp=sharing).
+
+
+You can also avoid dataset preparation by using `--use_dummy_dataset` during running.
+
+## Run this Demo
+
+Use the following commands to install prerequisites.
+
+```bash
+# assuming using cuda 11.3
+pip install -r requirements.txt
+```
+
+Use the following commands to execute training.
+
+```Bash
+#!/usr/bin/env sh
+# if you want to use real dataset, then remove --use_dummy_dataset
+# export DATA=/path/to/small-gpt-dataset.json
+
+# run on a single node
+colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch --use_dummy_dataset
+
+# run on multiple nodes with colossalai run
+colossalai run --nproc_per_node=<num_gpus> \
+   --master_addr <hostname> \
+   --master_port <port-number> \
+   --hosts <list-of-hostname-separated-by-comma> \
+   train_gpt.py \
+   --config configs/<config_file> \
+   --from_torch \
+   --use_dummy_dataset
+
+# run on multiple nodes with slurm
+srun python \
+   train_gpt.py \
+   --config configs/<config_file> \
+   --host <master_node> \
+   --use_dummy_dataset
+
+```
+
+You can set the `<config_file>` to any file in the `configs` folder. To simply get it running, you can start with `gpt2_small_zero3_pp1d.py` on a single node first. You can view the explanations in the config file regarding how to change the parallel setting.
diff --git a/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
new file mode 100644
index 000000000000..8ef81cb0a14f
--- /dev/null
+++ b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
@@ -0,0 +1,31 @@
+from model import GPT2_small_pipeline_hybrid
+
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.zero.shard_utils import TensorShardStrategy
+
+BATCH_SIZE = 8
+NUM_EPOCHS = 10
+SEQ_LEN = 1024
+NUM_MICRO_BATCHES = 4
+HIDDEN_SIZE = 768
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE)
+
+# if you do not want ZeRO, just comment out this dictionary
+zero = dict(model_config=dict(tensor_placement_policy='cuda', shard_strategy=TensorShardStrategy()),
+            optimizer_config=dict(initial_scale=2**16))
+
+optimizer = dict(
+    type=HybridAdam,
+    lr=0.00015,
+    weight_decay=1e-2,
+)
+
+model = dict(type=GPT2_small_pipeline_hybrid, checkpoint=True, num_chunks=1)
+
+# pipeline parallel: modify integer value for the number of pipeline stages
+# tensor parallel: modify size to set the tensor parallel size, usually the number of GPUs per node
+# for the current model implementation, mode can only be 1D or None
+parallel = dict(
+    pipeline=1,
+    tensor=dict(size=2, mode='1d'),
+)
diff --git a/examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py b/examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py
new file mode 100644
index 000000000000..9f9816b3004f
--- /dev/null
+++ b/examples/language/gpt/titans/configs/gpt3_zero3_pp1d.py
@@ -0,0 +1,31 @@
+from model import GPT3_pipeline_hybrid
+
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.zero.shard_utils import TensorShardStrategy
+
+BATCH_SIZE = 192
+NUM_EPOCHS = 60
+SEQ_LEN = 2048
+NUM_MICRO_BATCHES = 192
+HIDDEN_SIZE = 12288
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE)
+
+# if you do not want ZeRO, just comment out this dictionary
+zero = dict(model_config=dict(tensor_placement_policy='cuda', shard_strategy=TensorShardStrategy()),
+            optimizer_config=dict(initial_scale=2**16))
+
+optimizer = dict(
+    type=HybridAdam,
+    lr=0.00015,
+    weight_decay=1e-2,
+)
+
+model = dict(type=GPT3_pipeline_hybrid, checkpoint=True, num_chunks=1)
+
+# pipeline parallel: modify integer value for the number of pipeline stages
+# tensor parallel: modify size to set the tensor parallel size, usually the number of GPUs per node
+# for the current model implementation, mode can only be 1D or None
+parallel = dict(
+    pipeline=1,
+    tensor=dict(size=2, mode='1d'),
+)
diff --git a/examples/language/gpt/titans/model/__init__.py b/examples/language/gpt/titans/model/__init__.py
new file mode 100644
index 000000000000..eec48ef893fb
--- /dev/null
+++ b/examples/language/gpt/titans/model/__init__.py
@@ -0,0 +1,3 @@
+from .embed import vocab_parallel_cross_entropy
+from .gpt1d import *
+from .pipeline_gpt1d import *
diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py
new file mode 100644
index 000000000000..6369b9f8c5a1
--- /dev/null
+++ b/examples/language/gpt/titans/model/embed.py
@@ -0,0 +1,599 @@
+import torch
+import torch.nn.init as init
+from torch import Tensor
+from torch import distributed as dist
+from torch import nn as nn
+from torch.nn import functional as F
+from torch.nn.parameter import Parameter
+
+from colossalai.context import ParallelMode, seed
+from colossalai.core import global_context as gpc
+from colossalai.nn.layer.base_layer import ParallelLayer
+from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
+from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
+from colossalai.nn.layer.utils import divide
+from colossalai.registry import LAYERS, LOSSES, MODELS
+from colossalai.utils import get_current_device
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Language model embeddings.
+
+    Arguments:
+        hidden_size: hidden size
+        vocab_size: vocabulary size
+        max_sequence_length: maximum size of sequence. This
+                             is used for positional embedding
+        embedding_dropout_prob: dropout probability for embeddings
+        dtype: data type passed to the embedding layers created here
+        num_tokentypes: size of the token-type embeddings. 0 value
+                        will ignore this embedding
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 vocab_size,
+                 max_sequence_length,
+                 embedding_dropout_prob,
+                 num_tokentypes=0,
+                 dtype=torch.float):
+        super(VocabParallelEmbedding, self).__init__()
+
+        self.hidden_size = hidden_size
+        self.num_tokentypes = num_tokentypes
+
+        # Word embeddings (parallel).
+        self.word_embeddings = VocabParallelEmbedding1D(vocab_size, self.hidden_size, dtype=dtype)
+        self._word_embeddings_key = 'word_embeddings'
+
+        # Position embedding (serial).
+        self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size, dtype=dtype)
+        self._position_embeddings_key = 'position_embeddings'
+        # Initialize the position embeddings.
+        # self.init_method(self.position_embeddings.weight)
+
+        # Token type embedding.
+        # Add this as an optional field that can be added through
+        # method call so we can load a pretrain model without
+        # token types and add them as needed.
+        self._tokentype_embeddings_key = 'tokentype_embeddings'
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size, dtype=dtype)
+            # Initialize the token-type embeddings.
+            # self.init_method(self.tokentype_embeddings.weight)
+        else:
+            self.tokentype_embeddings = None
+
+        # Embeddings dropout
+        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+
+    def zero_parameters(self):
+        """Zero out all parameters in embedding."""
+        self.word_embeddings.weight.data.fill_(0)
+        self.word_embeddings.weight.shared = True
+        self.position_embeddings.weight.data.fill_(0)
+        self.position_embeddings.weight.shared = True
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings.weight.data.fill_(0)
+            self.tokentype_embeddings.weight.shared = True
+
+    def add_tokentype_embeddings(self, num_tokentypes):
+        """Add token-type embedding. This function is provided so we can add
+        token-type embeddings in case the pretrained model does not have it.
+        This allows us to load the model normally and then add this embedding.
+        """
+        if self.tokentype_embeddings is not None:
+            raise Exception('tokentype embeddings is already initialized')
+        if torch.distributed.get_rank() == 0:
+            print('adding embedding for {} tokentypes'.format(num_tokentypes), flush=True)
+        self.num_tokentypes = num_tokentypes
+        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, self.hidden_size)
+        # Initialize the token-type embeddings.
+        # self.init_method(self.tokentype_embeddings.weight)
+
+    def forward(self, input_ids, position_ids=None, tokentype_ids=None):
+        # Embeddings. NOTE(review): tokentype_ids is accepted but never used below.
+        if input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        words_embeddings = self.word_embeddings(input_ids)
+
+        if position_ids is not None:
+            position_ids = position_ids.view(-1, input_shape[-1])
+        if position_ids is None:
+            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+        position_embeddings = self.position_embeddings(position_ids)
+
+        embeddings = words_embeddings + position_embeddings
+
+        # Dropout.
+        with seed(ParallelMode.TENSOR):
+            embeddings = self.embedding_dropout(embeddings)
+        return embeddings
+
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False):
+        """For easy load."""
+
+        state_dict_ = {}
+        state_dict_[self._word_embeddings_key] \
+            = self.word_embeddings.state_dict(destination, prefix, keep_vars)
+        state_dict_[self._position_embeddings_key] \
+            = self.position_embeddings.state_dict(
+                destination, prefix, keep_vars)
+        if self.num_tokentypes > 0:
+            state_dict_[self._tokentype_embeddings_key] \
+                = self.tokentype_embeddings.state_dict(
+                    destination, prefix, keep_vars)
+
+        return state_dict_
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load."""
+
+        # Word embedding.
+        if self._word_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._word_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'word_embeddings' in key:
+                    state_dict_[key.split('word_embeddings.')[1]] \
+                        = state_dict[key]
+        self.word_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Position embedding.
+        if self._position_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._position_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'position_embeddings' in key:
+                    state_dict_[key.split('position_embeddings.')[1]] \
+                        = state_dict[key]
+        self.position_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Tokentype embedding.
+        if self.num_tokentypes > 0:
+            state_dict_ = {}
+            if self._tokentype_embeddings_key in state_dict:
+                state_dict_ = state_dict[self._tokentype_embeddings_key]
+            else:
+                # for backward compatibility.
+                for key in state_dict.keys():
+                    if 'tokentype_embeddings' in key:
+                        state_dict_[key.split('tokentype_embeddings.')[1]] \
+                            = state_dict[key]
+            if len(state_dict_.keys()) > 0:
+                self.tokentype_embeddings.load_state_dict(state_dict_, strict=strict)
+            else:
+                print('***WARNING*** expected tokentype embeddings in the '
+                      'checkpoint but could not find it',
+                      flush=True)
+
+
+class VocabParallelEmbedding1D(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: method to initialize weights.
+    """
+
+    def __init__(self, num_embeddings, embedding_dim, dtype=None, init_method=None):
+        super(VocabParallelEmbedding1D, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set the details for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        self.tensor_model_parallel_size = gpc.tensor_parallel_size
+        # Divide the weight matrix along the vocabulary dimension.
+        self.vocab_start_index, self.vocab_end_index = \
+            VocabUtility.vocab_range_from_global_vocab_size(
+                self.num_embeddings, gpc.get_local_rank(ParallelMode.PARALLEL_1D),
+                self.tensor_model_parallel_size)
+        self.num_embeddings_per_partition = self.vocab_end_index - \
+            self.vocab_start_index
+
+        # Allocate weights and initialize.
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        self.weight = Parameter(torch.empty(self.num_embeddings_per_partition, self.embedding_dim, **factory_kwargs))
+        init.uniform_(self.weight, -1, 1)
+
+    def forward(self, input_):
+        if self.tensor_model_parallel_size > 1:
+            # Build the mask.
+            input_mask = (input_ < self.vocab_start_index) | \
+                         (input_ >= self.vocab_end_index)
+            # Mask the input.
+            masked_input = input_.clone() - self.vocab_start_index
+            masked_input[input_mask] = 0
+        else:
+            masked_input = input_
+            # Get the embeddings.
+        output_parallel = F.embedding(masked_input, self.weight, self.padding_idx, self.max_norm, self.norm_type,
+                                      self.scale_grad_by_freq, self.sparse)
+        # Mask the output embedding.
+        if self.tensor_model_parallel_size > 1:
+            output_parallel[input_mask, :] = 0.0
+        # Reduce across all the model parallel GPUs.
+        output = output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
+        return output
+
+
+@LOSSES.register_module
+class vocab_parallel_cross_entropy(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, vocab_parallel_logits, target):
+        """Drop the last logit position and the first target position, then apply _VocabParallelCrossEntropy."""
+        vocab_parallel_logits = vocab_parallel_logits[..., :-1, :].contiguous()
+        target = target[..., 1:].contiguous()
+        return _VocabParallelCrossEntropy.apply(vocab_parallel_logits.view(-1, vocab_parallel_logits.size(-1)),
+                                                target.view(-1))
+
+
+class _VocabParallelCrossEntropy(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, vocab_parallel_logits, target):
+
+        # Maximum value along vocab dimension across all GPUs.
+        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
+        torch.distributed.all_reduce(logits_max,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=gpc.get_group(ParallelMode.PARALLEL_1D))
+        # Subtract the maximum value.
+        vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
+
+        # Get the partition's vocab indices
+        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
+        partition_vocab_size = vocab_parallel_logits.size()[-1]
+        rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
+        world_size = gpc.tensor_parallel_size
+        vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size)
+
+        # Create a mask of valid vocab ids (1 means it needs to be masked).
+        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
+        masked_target = target.clone() - vocab_start_index
+        masked_target[target_mask] = 0
+
+        # Get predicted-logits = logits[target].
+        # For Simplicity, we convert logits to a 2-D tensor with size
+        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
+        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
+        masked_target_1d = masked_target.view(-1)
+        arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device)
+        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
+        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
+        predicted_logits = predicted_logits_1d.view_as(target)
+        predicted_logits[target_mask] = 0.0
+        # All reduce is needed to get the chunks from other GPUs.
+        torch.distributed.all_reduce(predicted_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=gpc.get_group(ParallelMode.PARALLEL_1D))
+
+        # Sum of exponential of logits along vocab dimension across all GPUs.
+        exp_logits = vocab_parallel_logits
+        torch.exp(vocab_parallel_logits, out=exp_logits)
+        sum_exp_logits = exp_logits.sum(dim=-1)
+        torch.distributed.all_reduce(sum_exp_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=gpc.get_group(ParallelMode.PARALLEL_1D))
+
+        # Loss = log(sum(exp(logits))) - predicted-logit.
+        loss = torch.log(sum_exp_logits) - predicted_logits
+        loss = loss.mean()
+        # Store softmax, target-mask and masked-target for backward pass.
+        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+
+        # Retreive tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+
+        # All the inputs have softmax as their gradient.
+        grad_input = softmax
+        # For simplicity, work with the 2D gradient.
+        partition_vocab_size = softmax.size()[-1]
+        grad_2d = grad_input.view(-1, partition_vocab_size)
+
+        # Add the gradient from matching classes.
+        arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
+        grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float())
+
+        # Finally elementwise multiplication with the output gradients.
+        grad_input.mul_(grad_output.unsqueeze(dim=-1))
+
+        return grad_input, None
+
+
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks and return the
+        first and last index of the vocabulary belonging to the `rank`
+        partition. Note that the indices form the half-open range [first, last)."""
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size)
+
+
+class VocabParallelGPTLMHead1D(ParallelLayer):
+    """
+    Language model head: reuses ``embed`` (sharing its weight) when given, otherwise builds its own VocabParallelEmbedding1D.
+    """
+
+    def __init__(self, embed=None, vocab_size=None, dtype=None, embed_dim=None):
+        super().__init__()
+        if embed is not None:
+            self.head = embed
+        else:
+            self.head = VocabParallelEmbedding1D(vocab_size, embed_dim, dtype=dtype)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = reduce_grad(x, ParallelMode.PARALLEL_1D)    # helper defined outside this chunk; semantics not visible here
+        x = F.linear(x, self.head.weight)    # only the weight of ``head`` is used, as a linear projection
+        return x
+
+
+###################################
+
+
+class HiddenParallelEmbedding(torch.nn.Module):
+    """Language model embeddings: word embeddings plus position embeddings, followed by dropout.
+
+    Arguments:
+        hidden_size: hidden size
+        vocab_size: vocabulary size
+        max_sequence_length: maximum size of sequence. This
+                             is used for positional embedding
+        embedding_dropout_prob: dropout probability for embeddings
+        dtype, padding_idx: forwarded to the word embedding (HiddenParallelEmbedding1D)
+        num_tokentypes: size of the token-type embeddings. 0 value
+                        will ignore this embedding
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        vocab_size,
+        max_sequence_length,
+        embedding_dropout_prob,
+        dtype=torch.float,
+        padding_idx: int = 0,
+        num_tokentypes=0,
+    ):
+        super(HiddenParallelEmbedding, self).__init__()
+
+        self.hidden_size = hidden_size
+        self.num_tokentypes = num_tokentypes
+
+        # Word embeddings (parallel).
+        self.word_embeddings = HiddenParallelEmbedding1D(vocab_size, hidden_size, dtype, padding_idx)
+        self._word_embeddings_key = 'word_embeddings'
+
+        # Position embedding (serial).
+        self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size)
+        self._position_embeddings_key = 'position_embeddings'
+        # Custom initialization of the position embeddings is currently disabled:
+        # self.init_method(self.position_embeddings.weight)
+
+        # Token type embedding.
+        # Add this as an optional field that can be added through
+        # method call so we can load a pretrain model without
+        # token types and add them as needed.
+        self._tokentype_embeddings_key = 'tokentype_embeddings'
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size)
+            # Custom initialization of the token-type embeddings is currently disabled:
+            # self.init_method(self.tokentype_embeddings.weight)
+        else:
+            self.tokentype_embeddings = None
+
+        # Embeddings dropout
+        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+
+    def zero_parameters(self):
+        """Zero out all embedding weights and mark each weight with ``shared = True``."""
+        self.word_embeddings.weight.data.fill_(0)
+        self.word_embeddings.weight.shared = True
+        self.position_embeddings.weight.data.fill_(0)
+        self.position_embeddings.weight.shared = True
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings.weight.data.fill_(0)
+            self.tokentype_embeddings.weight.shared = True
+
+    def add_tokentype_embeddings(self, num_tokentypes):
+        """Add token-type embedding. This function is provided so we can add
+        token-type embeddings in case the pretrained model does not have it.
+        This allows us to load the model normally and then add this embedding.
+        """
+        if self.tokentype_embeddings is not None:
+            raise Exception('tokentype embeddings is already initialized')
+        if torch.distributed.get_rank() == 0:
+            print('adding embedding for {} tokentypes'.format(num_tokentypes), flush=True)
+        self.num_tokentypes = num_tokentypes
+        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, self.hidden_size)
+        # Custom initialization of the token-type embeddings is currently disabled:
+        # self.init_method(self.tokentype_embeddings.weight)
+
+    def forward(self, input_ids, position_ids=None, tokentype_ids=None):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        words_embeddings = self.word_embeddings(input_ids)
+        # NOTE(review): input_shape is only bound when input_ids is not None, yet it is read below.
+        if position_ids is not None:
+            position_ids = position_ids.view(-1, input_shape[-1])
+        if position_ids is None:
+            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+        position_embeddings = self.position_embeddings(position_ids)
+
+        embeddings = words_embeddings + position_embeddings
+        # NOTE(review): tokentype_ids is accepted but never read; token-type embeddings are not added here.
+        # Dropout.
+        with seed(ParallelMode.TENSOR):
+            embeddings = self.embedding_dropout(embeddings)
+        return embeddings
+
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False):
+        """Return a dict holding one sub-state-dict per embedding, keyed by the ``_*_key`` names."""
+
+        state_dict_ = {}
+        state_dict_[self._word_embeddings_key] \
+            = self.word_embeddings.state_dict(destination, prefix, keep_vars)
+        state_dict_[self._position_embeddings_key] \
+            = self.position_embeddings.state_dict(
+                destination, prefix, keep_vars)
+        if self.num_tokentypes > 0:
+            state_dict_[self._tokentype_embeddings_key] \
+                = self.tokentype_embeddings.state_dict(
+                    destination, prefix, keep_vars)
+
+        return state_dict_
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Load each embedding from its keyed sub-dict, else from flat entries whose key contains the embedding name."""
+
+        # Word embedding.
+        if self._word_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._word_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'word_embeddings' in key:
+                    state_dict_[key.split('word_embeddings.')[1]] \
+                        = state_dict[key]
+        self.word_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Position embedding.
+        if self._position_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._position_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'position_embeddings' in key:
+                    state_dict_[key.split('position_embeddings.')[1]] \
+                        = state_dict[key]
+        self.position_embeddings.load_state_dict(state_dict_, strict=strict)
+
+        # Tokentype embedding.
+        if self.num_tokentypes > 0:
+            state_dict_ = {}
+            if self._tokentype_embeddings_key in state_dict:
+                state_dict_ = state_dict[self._tokentype_embeddings_key]
+            else:
+                # for backward compatibility.
+                for key in state_dict.keys():
+                    if 'tokentype_embeddings' in key:
+                        state_dict_[key.split('tokentype_embeddings.')[1]] \
+                            = state_dict[key]
+            if len(state_dict_.keys()) > 0:
+                self.tokentype_embeddings.load_state_dict(state_dict_, strict=strict)
+            else:
+                print('***WARNING*** expected tokentype embeddings in the '
+                      'checkpoint but could not find it',
+                      flush=True)
+
+
+class HiddenParallelEmbedding1D(torch.nn.Module):
+    """Embedding parallelized in the hidden (embedding) dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: accepted but not read; the weight is always initialized with uniform_(-1, 1).
+    """
+
+    def __init__(self, num_embeddings, embedding_dim, dtype=torch.float, padding_idx: int = None, init_method=None):
+        super(HiddenParallelEmbedding1D, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        embed_dim_per_partition = divide(embedding_dim, gpc.tensor_parallel_size)
+        # Set the details for compatibility.
+        self.padding_idx = padding_idx
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+
+        # Allocate the local (num_embeddings, embed_dim_per_partition) weight and initialize it.
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        self.weight = Parameter(torch.empty(num_embeddings, embed_dim_per_partition, **factory_kwargs))
+        init.uniform_(self.weight, -1, 1)
+
+    def forward(self, input_):
+
+        # Get the embeddings.
+        output_parallel = F.embedding(input_, self.weight, self.padding_idx, self.max_norm, self.norm_type,
+                                      self.scale_grad_by_freq, self.sparse)
+
+        # Combine the per-rank outputs along the last dim (helper defined outside this chunk).
+        output = gather_forward_split_backward(output_parallel, ParallelMode.PARALLEL_1D, dim=-1)
+        return output
+
+
+@LAYERS.register_module
+class HiddenParallelGPTLMHead1D(ParallelLayer):
+    """
+    Language model head: projects with ``embed``'s weight when ``embed`` is given, otherwise with a bias-free Linear1D_Row.
+    """
+
+    def __init__(
+        self,
+        embed=None,
+        embed_dim=None,
+        vocab_size=None,
+        dtype=None,
+    ):
+        super().__init__()
+        if embed is not None:
+            self.head = embed
+            self.synced_embed = True    # forward() will use head.weight through F.linear
+        else:
+            # self.embedding = HiddenParallelEmbedding1D(vocab_size, hidden_size, dtype, padding_idx)
+            # (hidden_size/q, vocab_size)
+            self.synced_embed = False    # forward() will call the Linear1D_Row module directly
+            self.head = Linear1D_Row(in_features=embed_dim,
+                                     out_features=vocab_size,
+                                     bias=False,
+                                     dtype=dtype,
+                                     parallel_input=False)
+
+    def forward(self, x: Tensor) -> Tensor:
+        if self.synced_embed:
+            x = F.linear(x, self.head.weight)
+        else:
+            x = self.head(x)
+
+        return x
diff --git a/examples/language/gpt/titans/model/gpt1d.py b/examples/language/gpt/titans/model/gpt1d.py
new file mode 100644
index 000000000000..2edd03606b7d
--- /dev/null
+++ b/examples/language/gpt/titans/model/gpt1d.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import math
+
+import torch
+from torch import Tensor
+from torch import nn as nn
+
+from colossalai import kernel
+from colossalai import nn as col_nn
+from colossalai.core import global_context as gpc
+from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
+from colossalai.nn.layer import Linear1D_Col, Linear1D_Row
+from colossalai.nn.layer.base_layer import ParallelLayer
+from colossalai.nn.layer.utils import ACT2FN, divide
+from colossalai.utils import checkpoint
+from colossalai.utils.activation_checkpoint import checkpoint
+
+__all__ = [
+    'GPTMLP1D', 'GPTSelfAttention1D', 'GPTTransformerLayer1D', 'FusedGPTSelfAttention1D', 'FusedGPTTransformerLayer1D'
+]
+
+
+class GPTMLP1D(ParallelLayer):
+    """MLP block: Linear1D_Col -> activation -> Linear1D_Row -> dropout, optionally run through ``checkpoint``."""
+    def __init__(
+        self,
+        in_features: int,
+        mlp_ratio: int,    # NOTE(review): callers in this file pass floats (e.g. 4.0); the product is cast with int() below
+        act_func: str = 'gelu',
+        dropout_prob: float = 0.,
+        dtype=None,
+        checkpoint: bool = False,
+        skip_bias_add: bool = False,
+    ):
+        super().__init__()
+
+        self.in_features = in_features
+        self.mlp_ratio = mlp_ratio
+        self.checkpoint = checkpoint
+        self.skip_bias_add = skip_bias_add    # stored only; not read elsewhere in this class
+
+        self.act = ACT2FN[act_func]
+        skip_dense_1_add_bias = False
+
+        # Project to mlp_ratio * h.
+        self.dense_1 = Linear1D_Col(
+            self.in_features,
+            int(self.mlp_ratio * self.in_features),
+            dtype=dtype,
+            gather_output=False,
+            skip_bias_add=skip_dense_1_add_bias,
+        )
+
+        # Project back to h.
+        self.dense_2 = Linear1D_Row(
+            int(self.mlp_ratio * self.in_features),
+            self.in_features,
+            dtype=dtype,
+            parallel_input=True,
+        )
+
+        self.dropout = col_nn.Dropout(dropout_prob)
+
+    def _forward(self, hidden_states: Tensor) -> Tensor:
+        intermediate_output = self.dense_1(hidden_states)
+        intermediate_output = self.act(intermediate_output)
+
+        output = self.dense_2(intermediate_output)
+        output = self.dropout(output)
+        return output
+
+    def _checkpoint_forward(self, hidden_states: Tensor) -> Tensor:
+        return checkpoint(self._forward, False, hidden_states)    # module-level ``checkpoint`` import, not self.checkpoint
+
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        if self.checkpoint:
+            return self._checkpoint_forward(hidden_states)
+        else:
+            return self._forward(hidden_states)
+
+
+class GenericGPTSelfAttention1D(ParallelLayer):
+    """Base self-attention layer; subclasses must implement ``softmax_forward``."""
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        attention_dropout_prob: float,
+        hidden_dropout_prob: float,
+        dtype=None,
+        checkpoint: bool = False,
+        max_position_embeddings=1024,    # not read by this base class
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.attention_head_size = divide(hidden_size, num_attention_heads)
+        self.num_attention_heads_per_partition = divide(num_attention_heads, gpc.tensor_parallel_size)
+        self.hidden_size_per_partition = divide(hidden_size, gpc.tensor_parallel_size)
+        self.checkpoint = checkpoint
+        self.query_key_value = Linear1D_Col(
+            hidden_size,
+            3 * hidden_size,
+            dtype=dtype,
+        )
+        self.attention_dropout = col_nn.Dropout(attention_dropout_prob)
+        self.dense = Linear1D_Row(
+            hidden_size,
+            hidden_size,
+            dtype=dtype,
+            parallel_input=True,
+        )
+        self.dropout = col_nn.Dropout(hidden_dropout_prob)
+
+    def softmax_forward(self, attention_scores, attention_mask, query_layer, key_layer):
+        raise NotImplementedError
+
+    def _forward(self, hidden_states: Tensor, attention_mask=None) -> Tensor:
+        query_key_value = self.query_key_value(hidden_states)
+        new_qkv_shape = query_key_value.shape[:-1] + \
+            (self.num_attention_heads_per_partition, 3 * self.attention_head_size)
+        query_key_value = query_key_value.view(new_qkv_shape)
+        query_key_value = query_key_value.permute((0, 2, 1, 3))    # four axes here, so hidden_states is 3-D; presumably (batch, seq, hidden) -- TODO confirm
+        query_layer, key_layer, value_layer = torch.chunk(query_key_value, 3, dim=-1)
+
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        # Scaling, masking and normalization are delegated to the subclass hook.
+        attention_scores = self.softmax_forward(attention_scores, attention_mask, query_layer, key_layer)
+
+        attention_scores = attention_scores.type(value_layer.dtype)
+
+        attention_probs = self.attention_dropout(attention_scores)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.transpose(1, 2)
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+        context_layer = context_layer.reshape(new_context_layer_shape)    # merges the last two dims into hidden_size_per_partition
+        output = self.dense(context_layer)
+        output = self.dropout(output)
+
+        return output
+
+    def _checkpoint_forward(self, hidden_states: Tensor, attention_mask=None) -> Tensor:
+        return checkpoint(self._forward, False, hidden_states, attention_mask)
+
+    def forward(self, hidden_states: Tensor, attention_mask=None) -> Tensor:
+        if self.checkpoint:
+            return self._checkpoint_forward(hidden_states, attention_mask)
+        else:
+            return self._forward(hidden_states, attention_mask)
+
+
+class GPTSelfAttention1D(GenericGPTSelfAttention1D):
+    """Self-attention using ``nn.Softmax`` and a lower-triangular causal-mask buffer."""
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 attention_dropout_prob: float,
+                 hidden_dropout_prob: float,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings=1024):
+        super().__init__(hidden_size,
+                         num_attention_heads,
+                         attention_dropout_prob,
+                         hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings)
+        self.softmax = nn.Softmax(dim=-1)
+        max_positions = max_position_embeddings
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions),
+                                  dtype=torch.uint8)).view(1, 1, max_positions, max_positions),
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4))    # value written where the causal mask is False
+
+    def softmax_forward(self, attention_scores, attention_mask, query_layer, key_layer):
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        # causal mask: slice the (1, 1, max_positions, max_positions) buffer to the current query/key lengths
+        query_length, key_length = query_layer.size(-2), key_layer.size(-2)
+        causal_mask = self.bias[:, :, key_length - query_length:key_length, :key_length].bool()
+        attention_scores = torch.where(causal_mask, attention_scores, self.masked_bias.to(attention_scores))
+        if attention_mask is not None:
+            # Apply the attention mask (added to the scores, so it is expected to be additive)
+            attention_scores = attention_scores + attention_mask
+        attention_scores = self.softmax(attention_scores)
+        return attention_scores
+
+
+class FusedGPTSelfAttention1D(GenericGPTSelfAttention1D):
+    """Self-attention whose softmax step is ``kernel.FusedScaleMaskSoftmax`` configured with a causal mask type."""
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 attention_dropout_prob: float,
+                 hidden_dropout_prob: float,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings=1024):
+        super().__init__(hidden_size,
+                         num_attention_heads,
+                         attention_dropout_prob,
+                         hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings)
+        self.softmax = kernel.FusedScaleMaskSoftmax(input_in_fp16=True,    # hard-coded regardless of ``dtype``
+                                                    input_in_bf16=False,
+                                                    attn_mask_type=AttnMaskType.causal,
+                                                    scaled_masked_softmax_fusion=True,
+                                                    mask_func=None,
+                                                    softmax_in_fp32=True,
+                                                    scale=math.sqrt(self.attention_head_size))    # NOTE(review): unfused path divides by this value; confirm how ``scale`` is applied
+
+    def softmax_forward(self, attention_scores, attention_mask, query_layer, key_layer):
+        return self.softmax(attention_scores, attention_mask)    # query_layer / key_layer are not used here
+
+
+class GenericGPTTransformerLayer1D(ParallelLayer):
+    """Transformer layer (norm -> attention -> residual add, norm -> MLP -> residual add) built from injected classes."""
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 act_func: str = 'gelu',
+                 mlp_ratio: float = 4.0,
+                 attention_dropout_prob: float = 0.,
+                 hidden_dropout_prob: float = 0.,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 attention=None,    # attention class to instantiate; called unconditionally below, so None is not usable
+                 layer_norm=None):    # layer-norm class to instantiate; likewise called unconditionally
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.dtype = dtype
+        self.norm1 = layer_norm(hidden_size, eps=layer_norm_epsilon)
+        self.apply_post_layer_norm = apply_post_layer_norm
+        self.attention = attention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            attention_dropout_prob=attention_dropout_prob,
+            hidden_dropout_prob=hidden_dropout_prob,
+            dtype=dtype,
+            max_position_embeddings=max_position_embeddings,
+            checkpoint=False,    # checkpointing, if enabled, wraps the whole layer in forward()
+        )
+
+        self.norm2 = layer_norm(hidden_size, eps=layer_norm_epsilon)
+        self.mlp = GPTMLP1D(
+            in_features=hidden_size,
+            dropout_prob=hidden_dropout_prob,
+            act_func=act_func,
+            mlp_ratio=mlp_ratio,
+            dtype=dtype,
+            checkpoint=False,    # see the note on the attention sub-module above
+        )
+
+    def _forward(self, hidden_states, attention_mask) -> Tensor:
+        if not self.apply_post_layer_norm:
+            residual = hidden_states    # residual is the un-normalized input
+        hidden_states = self.norm1(hidden_states)
+        if self.apply_post_layer_norm:
+            residual = hidden_states    # residual is the normalized input
+        attention_output = self.attention(hidden_states, attention_mask)
+        hidden_states = residual + attention_output
+        # Same residual selection for the MLP half.
+        if not self.apply_post_layer_norm:
+            residual = hidden_states
+        hidden_states = self.norm2(hidden_states)
+        if self.apply_post_layer_norm:
+            residual = hidden_states
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + feed_forward_hidden_states
+        # The mask is returned unchanged; NOTE(review): the ``-> Tensor`` annotation does not match this tuple.
+        output = (hidden_states, attention_mask)
+        return output
+
+    def forward(self, hidden_states, attention_mask):
+        if self.checkpoint:
+            return checkpoint(self._forward, False, hidden_states, attention_mask)
+        else:
+            return self._forward(hidden_states, attention_mask)
+
+
+class GPTTransformerLayer1D(GenericGPTTransformerLayer1D):
+    """GenericGPTTransformerLayer1D specialized with ``GPTSelfAttention1D`` and ``nn.LayerNorm``."""
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 act_func: str = 'gelu',
+                 mlp_ratio: float = 4,
+                 attention_dropout_prob: float = 0,
+                 hidden_dropout_prob: float = 0,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 0.00001,
+                 apply_post_layer_norm: bool = False):
+        attention = GPTSelfAttention1D
+        layer_norm = nn.LayerNorm
+        super().__init__(hidden_size,    # every other argument is forwarded unchanged
+                         num_attention_heads,
+                         act_func=act_func,
+                         mlp_ratio=mlp_ratio,
+                         attention_dropout_prob=attention_dropout_prob,
+                         hidden_dropout_prob=hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings,
+                         layer_norm_epsilon=layer_norm_epsilon,
+                         apply_post_layer_norm=apply_post_layer_norm,
+                         attention=attention,
+                         layer_norm=layer_norm)
+
+
+class FusedGPTTransformerLayer1D(GenericGPTTransformerLayer1D):
+    """GenericGPTTransformerLayer1D specialized with ``FusedGPTSelfAttention1D`` and ``kernel.LayerNorm``."""
+    def __init__(self,
+                 hidden_size: int,
+                 num_attention_heads: int,
+                 act_func: str = 'gelu',
+                 mlp_ratio: float = 4,
+                 attention_dropout_prob: float = 0,
+                 hidden_dropout_prob: float = 0,
+                 dtype=None,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 0.00001,
+                 apply_post_layer_norm: bool = False):
+        attention = FusedGPTSelfAttention1D
+        layer_norm = kernel.LayerNorm
+        super().__init__(hidden_size,    # every other argument is forwarded unchanged
+                         num_attention_heads,
+                         act_func=act_func,
+                         mlp_ratio=mlp_ratio,
+                         attention_dropout_prob=attention_dropout_prob,
+                         hidden_dropout_prob=hidden_dropout_prob,
+                         dtype=dtype,
+                         checkpoint=checkpoint,
+                         max_position_embeddings=max_position_embeddings,
+                         layer_norm_epsilon=layer_norm_epsilon,
+                         apply_post_layer_norm=apply_post_layer_norm,
+                         attention=attention,
+                         layer_norm=layer_norm)
diff --git a/examples/language/gpt/titans/model/pipeline_gpt1d.py b/examples/language/gpt/titans/model/pipeline_gpt1d.py
new file mode 100644
index 000000000000..30180285bc70
--- /dev/null
+++ b/examples/language/gpt/titans/model/pipeline_gpt1d.py
@@ -0,0 +1,322 @@
+import inspect
+
+# import model_zoo.gpt.gpt as col_gpt
+import titans.model.gpt.gpt as col_gpt
+import torch
+import torch.nn as nn
+
+from colossalai import kernel
+from colossalai import nn as col_nn
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.pipeline.utils import partition_uniform
+
+from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D
+from .gpt1d import FusedGPTTransformerLayer1D, GPTTransformerLayer1D
+
+__all__ = [
+    'GPT2_small_pipeline_1D',
+    'GPT2_exlarge_pipeline_1D',
+    'GPT3_pipeline_1D',
+    'GPT2_exlarge_pipeline_hybrid',
+    'GPT2_small_pipeline_hybrid',
+    'GPT3_pipeline_hybrid',
+]
+
+
+class GenericPipelineGPT(nn.Module):
+    """One pipeline stage of GPT: optional embedding, a list of blocks, and optional final norm + head."""
+    def __init__(self, embedding=None, blocks=None, norm=None, head=None) -> None:
+        super().__init__()
+        self.embedding = embedding
+        self.blocks = blocks
+        self.norm = norm
+        self.head = head
+        assert blocks is not None
+        if norm is not None or head is not None:
+            assert norm is not None and head is not None    # norm and head must be supplied together
+
+    def forward(self, hidden_states=None, input_ids=None, attention_mask=None):
+        if self.embedding is not None:
+            hidden_states = self.embedding(input_ids=input_ids)
+        batch_size = hidden_states.shape[0]
+        attention_mask = attention_mask.view(batch_size, -1)    # NOTE(review): defaults to None but is dereferenced unconditionally
+        attention_mask = attention_mask[:, None, None, :]
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)    # fp16 compatibility
+        attention_mask = (1.0 - attention_mask) * -10000.0    # entries equal to 1 become 0, entries equal to 0 become -10000
+        for block in self.blocks:
+            hidden_states, attention_mask = block(hidden_states, attention_mask)
+        if self.norm is not None:
+            hidden_states = self.head(self.norm(hidden_states))
+        return hidden_states
+
+
+class PipelineGPT1D(GenericPipelineGPT):
+    """Pipeline stage built from ``GPTTransformerLayer1D`` blocks; ``first``/``last`` add the embedding and norm + head."""
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: str = 'gelu',
+                 mlp_ratio: int = 4.0,    # NOTE(review): annotated int but the default is a float
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 first: bool = False,
+                 last: bool = False,
+                 embed_split_hidden=False):
+        embedding = None
+        norm = None
+        head = None
+        embed_cls = VocabParallelEmbedding    # default embedding / head classes
+        head_cls = VocabParallelGPTLMHead1D
+        if embed_split_hidden:
+            embed_cls = HiddenParallelEmbedding    # switch both classes to the Hidden* variants
+            head_cls = HiddenParallelGPTLMHead1D
+        if first:
+            embedding = embed_cls(hidden_size, vocab_size, max_position_embeddings, embed_drop_rate, dtype=dtype)
+        blocks = nn.ModuleList([
+            GPTTransformerLayer1D(hidden_size,
+                                  num_attention_heads,
+                                  act_func=act_func,
+                                  mlp_ratio=mlp_ratio,
+                                  attention_dropout_prob=attn_drop_rate,
+                                  hidden_dropout_prob=drop_rate,
+                                  dtype=dtype,
+                                  checkpoint=checkpoint,
+                                  max_position_embeddings=max_position_embeddings,
+                                  layer_norm_epsilon=layer_norm_epsilon,
+                                  apply_post_layer_norm=apply_post_layer_norm) for _ in range(num_layers)
+        ])
+        if last:
+            norm = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            head = head_cls(vocab_size=vocab_size, embed_dim=hidden_size, dtype=dtype)    # no ``embed`` passed, so the head builds its own weights
+        super().__init__(embedding=embedding, blocks=blocks, norm=norm, head=head)
+
+
+class FusedPipelineGPT1D(GenericPipelineGPT):
+    """Pipeline stage built from ``FusedGPTTransformerLayer1D`` blocks; ``first``/``last`` add the embedding and norm + head."""
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: str = 'gelu',
+                 mlp_ratio: int = 4.0,    # NOTE(review): annotated int but the default is a float
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 first: bool = False,
+                 last: bool = False,
+                 embed_split_hidden=False):
+        embedding = None
+        norm = None
+        head = None
+        embed_cls = VocabParallelEmbedding    # default embedding / head classes
+        head_cls = VocabParallelGPTLMHead1D
+        if embed_split_hidden:
+            embed_cls = HiddenParallelEmbedding    # switch both classes to the Hidden* variants
+            head_cls = HiddenParallelGPTLMHead1D
+        if first:
+            embedding = embed_cls(hidden_size, vocab_size, max_position_embeddings, embed_drop_rate, dtype=dtype)
+        blocks = nn.ModuleList([
+            FusedGPTTransformerLayer1D(hidden_size,
+                                       num_attention_heads,
+                                       act_func=act_func,
+                                       mlp_ratio=mlp_ratio,
+                                       attention_dropout_prob=attn_drop_rate,
+                                       hidden_dropout_prob=drop_rate,
+                                       dtype=dtype,
+                                       checkpoint=checkpoint,
+                                       max_position_embeddings=max_position_embeddings,
+                                       layer_norm_epsilon=layer_norm_epsilon,
+                                       apply_post_layer_norm=apply_post_layer_norm) for _ in range(num_layers)
+        ])
+        if last:
+            norm = kernel.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            head = head_cls(vocab_size=vocab_size, embed_dim=hidden_size, dtype=dtype)    # no ``embed`` passed, so the head builds its own weights
+        super().__init__(embedding=embedding, blocks=blocks, norm=norm, head=head)
+
+    def forward(self, hidden_states=None, input_ids=None, attention_mask=None):    # unlike the base class, the mask is only cast, not reshaped or rescaled
+        if self.embedding is not None:
+            hidden_states = self.embedding(input_ids=input_ids)
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)    # fp16 compatibility
+        for block in self.blocks:
+            hidden_states, attention_mask = block(hidden_states, attention_mask)
+        if self.norm is not None:
+            hidden_states = self.head(self.norm(hidden_states))
+        return hidden_states
+
+
+class PipelineGPTHybrid(GenericPipelineGPT):
+    """
+    One pipeline stage of a GPT model assembled from ``col_gpt`` / ``col_nn`` layers.
+
+    A ``first`` stage gets the embedding; a ``last`` stage gets the final layer norm and a bias-free classifier
+    head. ``act_func``, ``apply_post_layer_norm`` and ``embed_split_hidden`` are accepted but not read here.
+    """
+
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: str = 'gelu',
+                 mlp_ratio: int = 4,
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 apply_post_layer_norm: bool = False,
+                 first: bool = False,
+                 last: bool = False,
+                 embed_split_hidden=False):
+        embedding = norm = head = None
+        # only a first stage owns the embedding
+        if first:
+            embedding = col_gpt.GPTEmbedding(hidden_size, vocab_size, max_position_embeddings,
+                                             dropout=embed_drop_rate, dtype=dtype)
+        # every block uses GELU; ``act_func`` is not consulted
+        layers = [
+            col_gpt.GPTBlock(hidden_size,
+                             num_attention_heads,
+                             mlp_ratio=mlp_ratio,
+                             attention_dropout=attn_drop_rate,
+                             dropout=drop_rate,
+                             dtype=dtype,
+                             checkpoint=checkpoint,
+                             activation=nn.functional.gelu) for _ in range(num_layers)
+        ]
+        # only a last stage owns the final norm and the classifier head
+        if last:
+            norm = col_nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            head = col_nn.Classifier(hidden_size, vocab_size, dtype=dtype, bias=False)
+        super().__init__(embedding=embedding, blocks=nn.ModuleList(layers), norm=norm, head=head)
+
+
+def _filter_kwargs(func, kwargs):
+    accepted = inspect.signature(func).parameters    # parameters that ``func`` declares
+    return {name: kwargs[name] for name in kwargs if name in accepted}
+
+
+def _build_generic_gpt_pipeline_1d(module_cls, num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    """
+    Build the chunk(s) of ``module_cls`` owned by the current pipeline rank.
+
+    ``partition_uniform`` yields the ``(start, end)`` layer ranges of this rank; each range becomes one
+    ``module_cls`` instance with ``end - start`` layers, moved to ``device``. Entries of ``kwargs`` are passed on
+    only if ``module_cls.__init__`` names them. Returns the chunk itself, or an ``nn.ModuleList`` of the chunks.
+    ``num_layers``, ``first`` and ``last`` in ``kwargs`` are overwritten for every chunk.
+    """
+    logger = get_dist_logger()
+    in_pipeline = gpc.is_initialized(ParallelMode.PIPELINE)
+    pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) if in_pipeline else 1
+    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) if in_pipeline else 0
+    rank = gpc.get_global_rank()
+
+    # the shared-module wrapper over [0, pipeline_size - 1] only exists when there is more than one stage
+    wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1]) if pipeline_size > 1 else None
+
+    # one module per (start, end) range assigned to this rank
+    models = []
+    for start, end in partition_uniform(num_layers, pipeline_size, num_chunks)[pipeline_rank]:
+        is_first, is_last = start == 0, end == num_layers
+        kwargs.update(num_layers=end - start, first=is_first, last=is_last)
+        logger.info(f'Rank{rank} build layer {start}-{end}, {end-start}/{num_layers} layers')
+        chunk = module_cls(**_filter_kwargs(module_cls.__init__, kwargs)).to(device)
+
+        if wrapper is not None:
+            # a first chunk registers its word embeddings, otherwise a last chunk registers its head
+            if is_first:
+                wrapper.register_module(chunk.embedding.word_embeddings)
+            elif is_last:
+                wrapper.register_module(chunk.head)
+        models.append(chunk)
+
+    model = models[0] if len(models) == 1 else nn.ModuleList(models)
+
+    # NOTE(review): the factor 2 presumably assumes 2-byte (fp16) parameters -- confirm
+    numel = sum(param.numel() for _, param in model.named_parameters(recurse=True))
+    logger.info(f'Rank{rank}/{pipeline_rank} model size = {numel * 2 / 1e9} GB')
+    return model
+
+
+def _build_gpt_pipeline_1d(num_layers, num_chunks, device=torch.device('cuda'), fused=False, **kwargs):
+    module_cls = FusedPipelineGPT1D if fused else PipelineGPT1D    # the fused variant is opt-in
+    return _build_generic_gpt_pipeline_1d(module_cls, num_layers, num_chunks, device=device, **kwargs)
+
+
+def _build_gpt_pipeline_hybrid(num_layers, num_chunks, device=torch.device('cuda'), **kwargs):    # hybrid variant
+    return _build_generic_gpt_pipeline_1d(PipelineGPTHybrid, num_layers, num_chunks, device=device, **kwargs)
+
+
+def GPT2_small_pipeline_1D(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False, fused=False):
+    """
+    Build this rank's part of the 12-layer GPT-2 small model (hidden size 768, 12 attention heads).
+    The remaining arguments are forwarded unchanged to ``_build_gpt_pipeline_1d``.
+    """
+    return _build_gpt_pipeline_1d(12, num_chunks, fused=fused, hidden_size=768, num_attention_heads=12,
+                                  checkpoint=checkpoint, dtype=dtype, embed_split_hidden=embed_split_hidden)
+
+
+def GPT2_exlarge_pipeline_1D(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False, fused=False):
+    """
+    Build this rank's part of the 48-layer GPT-2 exlarge model (hidden size 1600, 32 attention heads).
+    The remaining arguments are forwarded unchanged to ``_build_gpt_pipeline_1d``.
+    """
+    return _build_gpt_pipeline_1d(48, num_chunks, fused=fused, hidden_size=1600, num_attention_heads=32,
+                                  checkpoint=checkpoint, dtype=dtype, embed_split_hidden=embed_split_hidden)
+
+
+def GPT3_pipeline_1D(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False, fused=False):
+    """
+    Build this rank's part of the 96-layer GPT-3 model (hidden size 12288, 96 attention heads, 2048 positions).
+    The remaining arguments are forwarded unchanged to ``_build_gpt_pipeline_1d``.
+    """
+    return _build_gpt_pipeline_1d(96, num_chunks, fused=fused, hidden_size=12288, num_attention_heads=96,
+                                  checkpoint=checkpoint, max_position_embeddings=2048, dtype=dtype,
+                                  embed_split_hidden=embed_split_hidden)
+
+
+def GPT2_exlarge_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False):
+    """
+    Build this rank's part of the 48-layer hybrid GPT-2 exlarge model (hidden size 1600, 32 attention heads).
+    The remaining arguments are forwarded unchanged to ``_build_gpt_pipeline_hybrid``.
+    """
+    return _build_gpt_pipeline_hybrid(48, num_chunks, hidden_size=1600, num_attention_heads=32,
+                                      checkpoint=checkpoint, dtype=dtype, embed_split_hidden=embed_split_hidden)
+
+
+def GPT2_small_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False):
+    """
+    Build this rank's part of the 12-layer hybrid GPT-2 small model (hidden size 768, 12 attention heads).
+    The remaining arguments are forwarded unchanged to ``_build_gpt_pipeline_hybrid``.
+    """
+    return _build_gpt_pipeline_hybrid(12, num_chunks, hidden_size=768, num_attention_heads=12,
+                                      checkpoint=checkpoint, dtype=dtype, embed_split_hidden=embed_split_hidden)
+
+
+def GPT3_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float, embed_split_hidden=False):
+    """
+    Build this rank's part of the 96-layer hybrid GPT-3 model (hidden size 12288, 96 heads, 2048 positions).
+    The remaining arguments are forwarded unchanged to ``_build_gpt_pipeline_hybrid``.
+    """
+    return _build_gpt_pipeline_hybrid(96, num_chunks, hidden_size=12288, num_attention_heads=96,
+                                      checkpoint=checkpoint, max_position_embeddings=2048, dtype=dtype,
+                                      embed_split_hidden=embed_split_hidden)
diff --git a/examples/language/gpt/titans/requirements.txt b/examples/language/gpt/titans/requirements.txt
new file mode 100644
index 000000000000..64ff7a4abcd8
--- /dev/null
+++ b/examples/language/gpt/titans/requirements.txt
@@ -0,0 +1,4 @@
+torch==1.12.1
+titans==0.0.7
+colossalai==0.2.0+torch1.12cu11.3
+-f https://release.colossalai.org
diff --git a/examples/language/gpt/titans/run.sh b/examples/language/gpt/titans/run.sh
new file mode 100644
index 000000000000..157bd377aa34
--- /dev/null
+++ b/examples/language/gpt/titans/run.sh
@@ -0,0 +1,2 @@
+export DATA=/data/scratch/gpt_data/small-gpt-dataset.json
+colossalai run --nproc_per_node=4 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch
diff --git a/examples/language/gpt/titans/test_ci.sh b/examples/language/gpt/titans/test_ci.sh
new file mode 100644
index 000000000000..7cb24c1a4082
--- /dev/null
+++ b/examples/language/gpt/titans/test_ci.sh
@@ -0,0 +1 @@
+colossalai run --nproc_per_node=4 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch --use_dummy_dataset
diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
new file mode 100644
index 000000000000..1380b4b3a7da
--- /dev/null
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -0,0 +1,148 @@
+import contextlib
+import os
+
+import torch
+import torch.nn as nn
+from titans.model.gpt import GPTLMLoss
+
+import colossalai
+import colossalai.utils as utils
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn import LinearWarmupLR
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
+from colossalai.utils.timer import MultiTimer
+from colossalai.zero.init_ctx import ZeroInitContext
+
+
+def calc_local_model_size(model: torch.nn.Module):
+    """
+    Return the total number of elements over all parameters held by ``model``.
+    """
+    return sum(param.numel() for param in model.parameters())
+
+
+VOCAB_SIZE = 50257
+
+
+def main():
+    """
+    Parse the command line, launch ColossalAI, then build the data loader, model, optimizer and trainer from
+    the loaded config and run training. The dataset at the path in the ``DATA`` environment variable is used
+    unless ``--use_dummy_dataset`` is given.
+    """
+    parser = colossalai.get_default_parser()
+    parser.add_argument('--from_torch', default=False, action='store_true')
+    # must default to False: a ``store_true`` flag that defaults to True can never be switched off
+    parser.add_argument('--use_dummy_dataset', default=False, action='store_true')
+    args = parser.parse_args()
+    disable_existing_loggers()
+    if args.from_torch:
+        colossalai.launch_from_torch(config=args.config)
+    else:
+        colossalai.launch_from_slurm(config=args.config, host=args.host, port=29500, seed=42)
+    logger = get_dist_logger()
+
+    if not args.use_dummy_dataset:
+        data_path = os.environ['DATA']
+        logger.info(f'Build data loader from path {data_path}', ranks=[0])
+        from dataset.webtext import WebtextDataset
+        train_ds = WebtextDataset(data_path, seq_len=gpc.config.SEQ_LEN)
+        train_dataloader = utils.get_dataloader(train_ds,
+                                                seed=42,
+                                                batch_size=gpc.config.BATCH_SIZE,
+                                                pin_memory=True,
+                                                shuffle=True,
+                                                drop_last=True)
+    else:
+        # build a dummy train_dataloader
+        logger.info('Build data loader using dummy data', ranks=[0])
+
+        def get_data(batch_size, seq_len, vocab_size):
+            input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+            attention_mask = torch.ones_like(input_ids)
+            return input_ids, attention_mask
+
+        # 10 iterations
+        input_ids, _ = get_data(gpc.config.BATCH_SIZE * 10, gpc.config.SEQ_LEN, VOCAB_SIZE)
+        from torch.utils.data import DataLoader, Dataset
+
+        class TextSamplerDataset(Dataset):
+
+            def __init__(self, data, seq_len):
+                super().__init__()
+                self.data = data
+                self.seq_len = seq_len
+
+            def __getitem__(self, index):
+                # NOTE(review): ``randint`` needs data.size(0) > seq_len here -- confirm for the configs used
+                rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
+                full_seq = self.data[rand_start:rand_start + self.seq_len + 1].long()
+                return full_seq.cuda()
+
+            def __len__(self):
+                return self.data.size(0) // self.seq_len
+
+        train_dataset = TextSamplerDataset(input_ids, gpc.config.SEQ_LEN)
+        train_dataloader = DataLoader(train_dataset, batch_size=gpc.config.BATCH_SIZE)
+
+    logger.info('Build model', ranks=[0])
+    use_pipeline = is_using_pp()
+    use_interleaved = hasattr(gpc.config.model, 'num_chunks')
+    use_zero3 = hasattr(gpc.config, 'zero')
+    ctx = contextlib.nullcontext()
+    if use_zero3:
+        ctx = ZeroInitContext(target_device=torch.cuda.current_device(),
+                              shard_strategy=gpc.config.zero.model_config.shard_strategy,
+                              shard_param=True)
+    with ctx:
+        model = gpc.config.model.pop('type')(**gpc.config.model)
+    if use_pipeline and use_interleaved and not isinstance(model, nn.ModuleList):
+        model = nn.ModuleList([model])
+
+    if use_zero3:
+        numel = ctx.model_numel_tensor.item()
+    else:
+        numel = calc_local_model_size(model)
+
+    tflop = numel * gpc.config.BATCH_SIZE * gpc.config.SEQ_LEN \
+        * gpc.get_world_size(ParallelMode.MODEL) * gpc.get_world_size(ParallelMode.DATA) * 8 / (1024 ** 4)
+
+    criterion = getattr(gpc.config, 'loss_fn', None)
+    if criterion is not None:
+        criterion = criterion.type()
+    else:
+        criterion = GPTLMLoss()
+    logger.info('Build optimizer', ranks=[0])
+    optimizer = gpc.config.optimizer.pop('type')(model.parameters(), **gpc.config.optimizer)
+    lr_scheduler = LinearWarmupLR(optimizer, total_steps=gpc.config.NUM_EPOCHS, warmup_steps=5)
+    engine, train_dataloader, _, lr_scheduler = colossalai.initialize(model,
+                                                                      optimizer,
+                                                                      criterion,
+                                                                      train_dataloader=train_dataloader,
+                                                                      lr_scheduler=lr_scheduler)
+    global_batch_size = gpc.config.BATCH_SIZE * \
+        gpc.get_world_size(ParallelMode.DATA) * getattr(gpc.config, "gradient_accumulation", 1)
+    logger.info(f'Init done, global batch size = {global_batch_size}', ranks=[0])
+    timer = MultiTimer()
+    trainer = Trainer(engine=engine, logger=logger, timer=timer)
+    hook_list = [
+        hooks.LossHook(),
+        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
+        hooks.LogMetricByEpochHook(logger),
+        hooks.ThroughputHook(ignored_steps=10, tflop_per_step=tflop),
+        hooks.LogMetricByStepHook(),
+        hooks.LogMemoryByEpochHook(logger),
+    ]
+    trainer.fit(train_dataloader=train_dataloader,
+                epochs=gpc.config.NUM_EPOCHS,
+                test_interval=1,
+                hooks=hook_list,
+                display_progress=True,
+                return_output_label=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py
index a334ea9511fb..2f012780da77 100644
--- a/examples/language/palm/train.py
+++ b/examples/language/palm/train.py
@@ -11,13 +11,12 @@
 from packaging import version
 from palm_pytorch import PaLM
 from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper
-from torch.nn import functional as F
 from torch.utils.data import DataLoader, Dataset
 
 import colossalai
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
-from colossalai.nn.parallel import GeminiDDP, ZeroDDP
+from colossalai.nn.parallel import ZeroDDP
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import MultiTimer, get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext

From 67e1912b59760a54b5dc00ff6f7d9b47a309916c Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Mon, 16 Jan 2023 16:25:13 +0800
Subject: [PATCH 196/503] [autoparallel] support origin activation ckpt on
 autoprallel system (#2468)

---
 .../passes/runtime_apply_pass.py              | 33 +++++++++
 .../passes/runtime_preparation_pass.py        |  2 +
 .../auto_parallel/tensor_shard/initialize.py  | 11 +--
 .../test_tensor_shard/test_checkpoint.py      | 70 +++++++++++++++++++
 4 files changed, 111 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_checkpoint.py

diff --git a/colossalai/auto_parallel/passes/runtime_apply_pass.py b/colossalai/auto_parallel/passes/runtime_apply_pass.py
index 7f2aac42b7f8..9d83f105748b 100644
--- a/colossalai/auto_parallel/passes/runtime_apply_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_apply_pass.py
@@ -128,6 +128,8 @@ def _shape_consistency_apply(gm: torch.fx.GraphModule):
                                                                    runtime_apply,
                                                                    args=(node, origin_dict_node, input_dict_node,
                                                                          node_to_index_dict[node], user_node_index))
+            if 'activation_checkpoint' in user_node.meta:
+                shape_consistency_node.meta['activation_checkpoint'] = user_node.meta['activation_checkpoint']
 
             new_args = list(user_node.args)
             new_kwargs = dict(user_node.kwargs)
@@ -208,6 +210,37 @@ def _comm_spec_apply(gm: torch.fx.GraphModule):
                         # substitute the origin node with comm_spec_apply_node
                         new_kwargs[str(node)] = comm_spec_apply_node
                         user.kwargs = new_kwargs
+
+            if 'activation_checkpoint' in node.meta:
+                comm_spec_apply_node.meta['activation_checkpoint'] = node.meta['activation_checkpoint']
+
+    return gm
+
+
+def _act_annotataion_pass(gm: torch.fx.GraphModule):
+    """
+    Give nodes without an ``activation_checkpoint`` annotation the annotation shared by their neighbours.
+    A node is annotated only when its first annotated user and its first annotated input carry the same
+    value; already annotated nodes are left untouched. Node metadata is edited in place and ``gm`` is returned.
+    """
+    for node in tuple(gm.graph.nodes):
+        # ``node.meta`` is a dict, so membership (not ``hasattr``) tells whether the annotation exists
+        if 'activation_checkpoint' in node.meta:
+            continue
+        # -1 marks "no annotated neighbour found"
+        user_act_annotation = -1
+        input_act_annotation = -1
+        for user_node in node.users.keys():
+            if 'activation_checkpoint' in user_node.meta:
+                user_act_annotation = user_node.meta['activation_checkpoint']
+                break
+        for input_node in node._input_nodes.keys():
+            if 'activation_checkpoint' in input_node.meta:
+                input_act_annotation = input_node.meta['activation_checkpoint']
+                break
+        if user_act_annotation == input_act_annotation and user_act_annotation != -1:
+            node.meta['activation_checkpoint'] = user_act_annotation
+
     return gm
 
 
diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index f9b89026393d..1c25e4c94f24 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -179,6 +179,8 @@ def _size_value_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
                 # It will be used to replace the original node with processing node in slice object
                 node_pairs[node] = size_processing_node
                 size_processing_node._meta_data = node._meta_data
+                if 'activation_checkpoint' in node.meta:
+                    size_processing_node.meta['activation_checkpoint'] = node.meta['activation_checkpoint']
 
             user_list = list(node.users.keys())
             for user in user_list:
diff --git a/colossalai/auto_parallel/tensor_shard/initialize.py b/colossalai/auto_parallel/tensor_shard/initialize.py
index 8c24c0d7b5df..387a682a1ad9 100644
--- a/colossalai/auto_parallel/tensor_shard/initialize.py
+++ b/colossalai/auto_parallel/tensor_shard/initialize.py
@@ -18,6 +18,7 @@
 )
 from colossalai.device.alpha_beta_profiler import AlphaBetaProfiler
 from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.tracer import ColoTracer
 from colossalai.tensor.sharding_spec import ShardingSpec
 
@@ -28,7 +29,7 @@ class ModuleWrapper(nn.Module):
     into the forward function.
     '''
 
-    def __init__(self, module: GraphModule, sharding_spec_dict: Dict[int, List[ShardingSpec]],
+    def __init__(self, module: ColoGraphModule, sharding_spec_dict: Dict[int, List[ShardingSpec]],
                  origin_spec_dict: Dict[int, ShardingSpec], comm_actions_dict: Dict[int, Dict[str, CommAction]]):
         '''
         Args:
@@ -81,7 +82,7 @@ def build_strategy_constructor(graph: Graph, device_mesh: DeviceMesh):
     return strategies_constructor
 
 
-def solve_solution(gm: GraphModule, strategy_constructor: StrategiesConstructor, memory_budget: float = -1.0):
+def solve_solution(gm: ColoGraphModule, strategy_constructor: StrategiesConstructor, memory_budget: float = -1.0):
     '''
     This method is used to solve the best solution for the given graph.
     The solution is a list of integers, each integer represents the best strategy index of the corresponding node.
@@ -97,7 +98,7 @@ def solve_solution(gm: GraphModule, strategy_constructor: StrategiesConstructor,
     return solution
 
 
-def transform_to_sharded_model(gm: GraphModule, solution: List[int], device_mesh: DeviceMesh,
+def transform_to_sharded_model(gm: ColoGraphModule, solution: List[int], device_mesh: DeviceMesh,
                                strategies_constructor: StrategiesConstructor):
     '''
     This method is used to transform the original graph to the sharded graph.
@@ -197,10 +198,10 @@ def initialize_model(model: nn.Module,
             solution will be used to debug or help to analyze the sharding result. Therefore, we will not just
             return a series of integers, but return the best strategies.
     '''
-    tracer = ColoTracer()
+    tracer = ColoTracer(trace_act_ckpt=True)
 
     graph = tracer.trace(root=model, meta_args=meta_args)
-    gm = GraphModule(model, graph, model.__class__.__name__)
+    gm = ColoGraphModule(model, graph, model.__class__.__name__)
     gm.recompile()
     strategies_constructor = build_strategy_constructor(graph, device_mesh)
     if load_solver_solution:
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_checkpoint.py b/tests/test_auto_parallel/test_tensor_shard/test_checkpoint.py
new file mode 100644
index 000000000000..0b42722fec5f
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_checkpoint.py
@@ -0,0 +1,70 @@
+from functools import partial
+from typing import Optional, Tuple, Union
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from transformers.pytorch_utils import Conv1D
+
+from colossalai.auto_parallel.tensor_shard.initialize import initialize_model
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.tracer import ColoTracer
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.tensor.shape_consistency import ShapeConsistencyManager
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.utils import free_port
+
+HIDDEN_SIZE = 16
+
+
+class GPT2MLPWithCkpt(nn.Module):
+    """
+    Test model: ``c_fc`` -> checkpointed ``c_proj`` -> ReLU.
+    """
+
+    def __init__(self, intermediate_size, hidden_size):
+        super().__init__()
+        self.c_fc = Conv1D(intermediate_size, hidden_size)
+        self.c_proj = Conv1D(hidden_size, intermediate_size)
+        self.act = torch.nn.ReLU()
+
+    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        expanded = self.c_fc(hidden_states)
+        projected = checkpoint(self.c_proj, expanded)    # only the projection is checkpointed
+        return self.act(projected)
+
+
+def check_act_ckpt(rank, world_size, port):
+    """Check that the generated code contains the comm-spec node and the activation checkpoint call."""
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    mlp = GPT2MLPWithCkpt(intermediate_size=4 * HIDDEN_SIZE, hidden_size=HIDDEN_SIZE)
+    meta_args = {'hidden_states': torch.rand(1, 64, HIDDEN_SIZE).to('meta')}
+    # 2 x 2 device mesh over the ids 0..3:
+    # [[0, 1]
+    #  [2, 3]]
+    device_mesh = DeviceMesh(torch.arange(0, 4), (2, 2), init_process_group=True)
+    sharded = initialize_model(mlp, meta_args, device_mesh)
+    generated = sharded.module.graph.python_code('self').src
+    expected_comm = "runtime_comm_spec_apply_1 = colossalai_auto_parallel_passes_runtime_apply_pass_runtime_comm_spec_apply(linear_1, comm_actions_dict, 12, 'linear_1')"
+    expected_ckpt = "view_3 = colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, False, view_1, comm_actions_dict, use_reentrant=True)"
+    assert expected_comm in generated
+    assert expected_ckpt in generated
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_mlp_layer():
+    # one worker per device of the 2 x 2 mesh; ``mp.spawn`` supplies the rank as the first argument
+    nprocs = 4
+    mp.spawn(partial(check_act_ckpt, world_size=nprocs, port=free_port()), nprocs=nprocs)
+
+
+if __name__ == '__main__':
+    test_mlp_layer()

From 4953b4ace15da59f4d122c5e433b08ad9db13054 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Mon, 16 Jan 2023 19:25:05 +0800
Subject: [PATCH 197/503] [autochunk] support evoformer tracer (#2485)

Support the full Evoformer tracer. Evoformer is a main module of AlphaFold; previously we only supported a simplified version of it.
1. support some of Evoformer's ops in fx
2. support the Evoformer test
3. add repos for test code
---
 colossalai/autochunk/autochunk_codegen.py     | 168 ++----
 colossalai/autochunk/trace_flow.py            |  92 +--
 colossalai/autochunk/trace_indice.py          |  62 +-
 colossalai/autochunk/utils.py                 |  48 +-
 colossalai/fx/profiler/opcount.py             |   5 +-
 ...chunk.py => benchmark_simple_evoformer.py} |  50 +-
 tests/test_autochunk/evoformer/evoformer.py   |  59 --
 tests/test_autochunk/evoformer/initializer.py |  29 -
 tests/test_autochunk/evoformer/kernel.py      |  19 -
 tests/test_autochunk/evoformer/msa.py         |  95 ----
 tests/test_autochunk/evoformer/ops.py         | 176 ------
 tests/test_autochunk/evoformer/triangle.py    | 192 -------
 .../test_autochunk/openfold/checkpointing.py  |  84 ---
 tests/test_autochunk/openfold/dropout.py      |  78 ---
 tests/test_autochunk/openfold/evoformer.py    | 431 --------------
 tests/test_autochunk/openfold/msa.py          | 331 -----------
 .../openfold/outer_product_mean.py            | 129 -----
 .../openfold/pair_transition.py               |  99 ----
 tests/test_autochunk/openfold/primitives.py   | 529 ------------------
 tests/test_autochunk/openfold/tensor_utils.py | 408 --------------
 .../openfold/triangular_attention.py          | 139 -----
 .../triangular_multiplicative_update.py       | 127 -----
 .../test_autochunk/test_evoformer_codegen.py  | 164 ++++++
 ...en.py => test_simple_evoformer_codegen.py} |  20 +-
 ...rch.py => test_simple_evoformer_search.py} |  20 +-
 25 files changed, 339 insertions(+), 3215 deletions(-)
 rename tests/test_autochunk/{benchmark_autochunk.py => benchmark_simple_evoformer.py} (66%)
 delete mode 100644 tests/test_autochunk/evoformer/evoformer.py
 delete mode 100755 tests/test_autochunk/evoformer/initializer.py
 delete mode 100644 tests/test_autochunk/evoformer/kernel.py
 delete mode 100644 tests/test_autochunk/evoformer/msa.py
 delete mode 100755 tests/test_autochunk/evoformer/ops.py
 delete mode 100644 tests/test_autochunk/evoformer/triangle.py
 delete mode 100644 tests/test_autochunk/openfold/checkpointing.py
 delete mode 100644 tests/test_autochunk/openfold/dropout.py
 delete mode 100644 tests/test_autochunk/openfold/evoformer.py
 delete mode 100644 tests/test_autochunk/openfold/msa.py
 delete mode 100644 tests/test_autochunk/openfold/outer_product_mean.py
 delete mode 100644 tests/test_autochunk/openfold/pair_transition.py
 delete mode 100644 tests/test_autochunk/openfold/primitives.py
 delete mode 100644 tests/test_autochunk/openfold/tensor_utils.py
 delete mode 100644 tests/test_autochunk/openfold/triangular_attention.py
 delete mode 100644 tests/test_autochunk/openfold/triangular_multiplicative_update.py
 create mode 100644 tests/test_autochunk/test_evoformer_codegen.py
 rename tests/test_autochunk/{test_autochunk_codegen.py => test_simple_evoformer_codegen.py} (88%)
 rename tests/test_autochunk/{test_autochunk_search.py => test_simple_evoformer_search.py} (87%)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index e8af9bde86d8..ceccb9a9fde2 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -48,9 +48,7 @@ def _gen_chunk_slice_dim(chunk_dim: int, chunk_indice_name: str, shape: List) ->
     return new_shape
 
 
-def _gen_loop_start(
-    chunk_input: List[Node], chunk_output: Node, chunk_ouput_dim: int, chunk_size=2
-) -> str:
+def _gen_loop_start(chunk_input: List[Node], chunk_output: Node, chunk_ouput_dim: int, chunk_size=2) -> str:
     """
     Generate chunk loop start
 
@@ -72,9 +70,8 @@ def _gen_loop_start(
     out_shape = get_node_shape(chunk_output)
     out_str = str(list(out_shape))
     context = (
-        "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range"
-        % (out_str, input_node.name, input_node.name, chunk_size)
-    )
+        "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range" %
+        (out_str, input_node.name, input_node.name, chunk_size))
     context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim])
     return context
 
@@ -105,26 +102,17 @@ def _gen_loop_end(
     chunk_outputs_name = chunk_outputs.name
     chunk_outputs_idx = find_idx_by_name(chunk_outputs_name, node_list)
     chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
-    chunk_slice = _gen_chunk_slice_dim(
-        chunk_outputs_dim, "chunk_idx", chunk_output_shape
-    )
+    chunk_slice = _gen_chunk_slice_dim(chunk_outputs_dim, "chunk_idx", chunk_output_shape)
     context = "    chunk_result%s = %s;  %s = None\n" % (
         chunk_slice,
         chunk_outputs_name,
         chunk_outputs_name,
     )
-    context += (
-        chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None"
-    )
+    context += (chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None")
 
     # determine if its the last use for chunk input
     for chunk_input in chunk_inputs + chunk_non_compute_inputs:
-        if all(
-            [
-                find_idx_by_name(user.name, node_list) <= chunk_outputs_idx
-                for user in chunk_input.users.keys()
-            ]
-        ):
+        if all([find_idx_by_name(user.name, node_list) <= chunk_outputs_idx for user in chunk_input.users.keys()]):
             context += ";  %s = None" % chunk_input.name
 
     context += "\n"
@@ -171,17 +159,10 @@ def _replace_ones_like(
         chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"]
         if get_node_shape(meta_node)[chunk_dim] != 1:
             source_node = meta_node.args[0].args[0]
-            if (
-                source_node not in chunk_infos[region_idx]["node_chunk_dim"]
-                or chunk_infos[region_idx]["node_chunk_dim"][source_node]["chunk_dim"]
-                is None
-            ):
-                chunk_slice = _gen_chunk_slice_dim(
-                    chunk_dim, "chunk_idx", get_node_shape(node)
-                )
-                body[-1] = _replace_name(
-                    body[-1], node.args[0].name, node.args[0].name + chunk_slice
-                )
+            if (source_node not in chunk_infos[region_idx]["node_chunk_dim"]
+                    or chunk_infos[region_idx]["node_chunk_dim"][source_node]["chunk_dim"] is None):
+                chunk_slice = _gen_chunk_slice_dim(chunk_dim, "chunk_idx", get_node_shape(node))
+                body[-1] = _replace_name(body[-1], node.args[0].name, node.args[0].name + chunk_slice)
     return body
 
 
@@ -198,12 +179,8 @@ def _replace_input_node(
     for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
         for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
             if idx == node_idx:
-                chunk_slice = _gen_chunk_slice_dim(
-                    dim[0], "chunk_idx", get_node_shape(input_node)
-                )
-                body[-1] = _replace_name(
-                    body[-1], input_node.name, input_node.name + chunk_slice
-                )
+                chunk_slice = _gen_chunk_slice_dim(dim[0], "chunk_idx", get_node_shape(input_node))
+                body[-1] = _replace_name(body[-1], input_node.name, input_node.name + chunk_slice)
     return body
 
 
@@ -236,14 +213,10 @@ def emit_code_with_chunk(
     chunk_ends = [i["region"][1] for i in chunk_infos]
 
     # chunk inputs
-    chunk_inputs = [i["inputs"] for i in chunk_infos]  # input with chunk
-    chunk_inputs_non_chunk = [
-        i["inputs_non_chunk"] for i in chunk_infos
-    ]  # input without chunk
-    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]  # input chunk dim
-    chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
-        j.name for i in chunk_inputs_non_chunk for j in i
-    ]
+    chunk_inputs = [i["inputs"] for i in chunk_infos]    # input with chunk
+    chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]    # input without chunk
+    chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos]    # input chunk dim
+    chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [j.name for i in chunk_inputs_non_chunk for j in i]
 
     # chunk outputs
     chunk_outputs = [i["outputs"][0] for i in chunk_infos]
@@ -267,23 +240,16 @@ def emit_code_with_chunk(
                     chunk_outputs[region_idx],
                     chunk_outputs_dim[region_idx],
                     chunk_infos[region_idx]["chunk_size"],
-                )
-            )
+                ))
 
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
-            body = _replace_input_node(
-                chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body
-            )
+            body = _replace_input_node(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body)
             # ones like
-            body = _replace_ones_like(
-                search_chunk, chunk_infos, region_idx, node_idx, node, body
-            )
+            body = _replace_ones_like(search_chunk, chunk_infos, region_idx, node_idx, node, body)
             # reassgin reshape size
-            body[-1] = _replace_reshape_size(
-                body[-1], node.name, chunk_infos[region_idx]["reshape_size"]
-            )
+            body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"])
             body[-1] = "    " + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
         else:
@@ -300,8 +266,7 @@ def emit_code_with_chunk(
                     chunk_outputs[region_idx],
                     chunk_outputs_dim[region_idx],
                     node_list,
-                )
-            )
+                ))
             within_chunk_region = False
 
         node_idx += 1
@@ -310,18 +275,14 @@ def emit_code_with_chunk(
 if CODEGEN_AVAILABLE:
 
     class AutoChunkCodeGen(CodeGen):
+
         def __init__(self, meta_graph, max_memory=None, print_mem=False):
             super().__init__()
-            self.meta_graph = meta_graph
-            self.max_memory = max_memory
-            self.meta_node = list(meta_graph.graph.nodes)
             # find the chunk regions
             self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem)
             self.chunk_infos = self.search_chunk.search_region()
 
-        def _gen_python_code(
-            self, nodes, root_module: str, namespace: _Namespace
-        ) -> PythonCode:
+        def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
             free_vars: List[str] = []
             body: List[str] = []
             globals_: Dict[str, Any] = {}
@@ -338,9 +299,7 @@ def add_global(name_hint: str, obj: Any):
 
                 Returns: the global name that should be used to reference 'obj' in generated source.
                 """
-                if (
-                    _is_from_torch(obj) and obj != torch.device
-                ):  # to support registering torch.device
+                if (_is_from_torch(obj) and obj != torch.device):    # to support registering torch.device
                     # HACK: workaround for how torch custom ops are registered. We
                     # can't import them like normal modules so they must retain their
                     # fully qualified name.
@@ -356,9 +315,7 @@ def add_global(name_hint: str, obj: Any):
                 return global_name
 
             # set _custom_builtins here so that we needn't import colossalai in forward
-            _custom_builtins["colossalai"] = _CustomBuiltin(
-                "import colossalai", colossalai
-            )
+            _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)
 
             # Pre-fill the globals table with registered builtins.
             for name, (_, obj) in _custom_builtins.items():
@@ -394,9 +351,8 @@ def type_repr(o: Any):
                 # Common case: this is a regular module name like 'foo.bar.baz'
                 return add_global(typename, o)
 
-            def _format_args(
-                args: Tuple[Argument, ...], kwargs: Dict[str, Argument]
-            ) -> str:
+            def _format_args(args: Tuple[Argument, ...], kwargs: Dict[str, Argument]) -> str:
+
                 def _get_repr(arg):
                     # Handle NamedTuples (if it has `_fields`) via add_global.
                     if isinstance(arg, tuple) and hasattr(arg, "_fields"):
@@ -444,26 +400,18 @@ def delete_unused_values(user: Node, body, to_keep=[]):
                 nodes_to_delete = user_to_last_uses.get(user, [])
                 nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep]
                 if len(nodes_to_delete):
-                    to_delete_str = " = ".join(
-                        [repr(n) for n in nodes_to_delete] + ["None"]
-                    )
+                    to_delete_str = " = ".join([repr(n) for n in nodes_to_delete] + ["None"])
                     body.append(f";  {to_delete_str}\n")
                 else:
                     body.append("\n")
 
             # NOTE: we add a variable to distinguish body and ckpt_func
             def emit_node(node: Node, body):
-                maybe_type_annotation = (
-                    "" if node.type is None else f" : {type_repr(node.type)}"
-                )
+                maybe_type_annotation = ("" if node.type is None else f" : {type_repr(node.type)}")
                 if node.op == "placeholder":
                     assert isinstance(node.target, str)
-                    maybe_default_arg = (
-                        "" if not node.args else f" = {repr(node.args[0])}"
-                    )
-                    free_vars.append(
-                        f"{node.target}{maybe_type_annotation}{maybe_default_arg}"
-                    )
+                    maybe_default_arg = ("" if not node.args else f" = {repr(node.args[0])}")
+                    free_vars.append(f"{node.target}{maybe_type_annotation}{maybe_default_arg}")
                     raw_name = node.target.replace("*", "")
                     if raw_name != repr(node):
                         body.append(f"{repr(node)} = {raw_name}\n")
@@ -472,68 +420,46 @@ def emit_node(node: Node, body):
                     assert isinstance(node.target, str)
                     body.append(
                         f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}"
-                        f"({_format_args(node.args[1:], node.kwargs)})"
-                    )
+                        f"({_format_args(node.args[1:], node.kwargs)})")
                     return
                 elif node.op == "call_function":
                     assert callable(node.target)
                     # pretty print operators
-                    if (
-                        node.target.__module__ == "_operator"
-                        and node.target.__name__ in magic_methods
-                    ):
+                    if (node.target.__module__ == "_operator" and node.target.__name__ in magic_methods):
                         assert isinstance(node.args, tuple)
-                        body.append(
-                            f"{repr(node)}{maybe_type_annotation} = "
-                            f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}"
-                        )
+                        body.append(f"{repr(node)}{maybe_type_annotation} = "
+                                    f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}")
                         return
 
                     # pretty print inplace operators; required for jit.script to work properly
                     # not currently supported in normal FX graphs, but generated by torchdynamo
-                    if (
-                        node.target.__module__ == "_operator"
-                        and node.target.__name__ in inplace_methods
-                    ):
-                        body.append(
-                            f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
-                            f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}"
-                        )
+                    if (node.target.__module__ == "_operator" and node.target.__name__ in inplace_methods):
+                        body.append(f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  "
+                                    f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}")
                         return
 
                     qualified_name = _get_qualified_name(node.target)
                     global_name = add_global(qualified_name, node.target)
                     # special case for getattr: node.args could be 2-argument or 3-argument
                     # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
-                    if (
-                        global_name == "getattr"
-                        and isinstance(node.args, tuple)
-                        and isinstance(node.args[1], str)
-                        and node.args[1].isidentifier()
-                        and len(node.args) == 2
-                    ):
+                    if (global_name == "getattr" and isinstance(node.args, tuple) and isinstance(node.args[1], str)
+                            and node.args[1].isidentifier() and len(node.args) == 2):
                         body.append(
-                            f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}"
-                        )
+                            f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}")
                         return
                     body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})"
-                    )
+                        f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})")
                     if node.meta.get("is_wrapped", False):
                         wrapped_fns.setdefault(global_name)
                     return
                 elif node.op == "call_module":
                     assert isinstance(node.target, str)
-                    body.append(
-                        f"{repr(node)}{maybe_type_annotation} = "
-                        f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})"
-                    )
+                    body.append(f"{repr(node)}{maybe_type_annotation} = "
+                                f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})")
                     return
                 elif node.op == "get_attr":
                     assert isinstance(node.target, str)
-                    body.append(
-                        f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}"
-                    )
+                    body.append(f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}")
                     return
                 elif node.op == "output":
                     if node.type is not None:
@@ -564,9 +490,7 @@ def emit_node(node: Node, body):
 
             if len(wrapped_fns) > 0:
                 wrap_name = add_global("wrap", torch.fx.wrap)
-                wrap_stmts = "\n".join(
-                    [f'{wrap_name}("{name}")' for name in wrapped_fns]
-                )
+                wrap_stmts = "\n".join([f'{wrap_name}("{name}")' for name in wrapped_fns])
             else:
                 wrap_stmts = ""
 
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 1e2e6dc1258b..ec1e012beb17 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -10,6 +10,7 @@
 
 
 class TraceFlow(object):
+
     def __init__(self, trace_indice: TraceIndice) -> None:
         self.trace_indice = trace_indice
 
@@ -28,9 +29,7 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node
         start_node_idx = find_idx_by_name(start_node.name, self.trace_indice.node_list)
         end_node_trace = self.trace_indice._find_trace_from_node(end_node)
         end_node_trace_source = end_node_trace["source"][end_dim]
-        sorted_source = sorted(
-            end_node_trace_source.items(), key=lambda d: d[0], reverse=True
-        )
+        sorted_source = sorted(end_node_trace_source.items(), key=lambda d: d[0], reverse=True)
         for node_idx, node_dim in sorted_source:
             if node_idx == start_node_idx and start_dim in node_dim:
                 return True
@@ -70,10 +69,8 @@ def _find_inherit_dim(self, input_node, input_dim, node):
         input_node_idx = find_idx_by_name(input_node.name, self.trace_indice.node_list)
         node_trace_source = self.trace_indice._find_source_trace_from_node(node)
         for node_dim in range(len(get_node_shape(node))):
-            if (
-                input_node_idx in node_trace_source[node_dim]
-                and input_dim[0] in node_trace_source[node_dim][input_node_idx]
-            ):
+            if (input_node_idx in node_trace_source[node_dim]
+                    and input_dim[0] in node_trace_source[node_dim][input_node_idx]):
                 return node_dim
         return None
 
@@ -81,15 +78,11 @@ def check_index_duplicate(self, chunk_infos, return_dim=False):
         input_dim_after_node = {}
         for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
             for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
-                inherit_dim = self._find_inherit_dim(
-                    input_node, v, self.trace_indice.node_list[k]
-                )
+                inherit_dim = self._find_inherit_dim(input_node, v, self.trace_indice.node_list[k])
                 if inherit_dim:
                     input_dim_after_node[k] = inherit_dim
 
-        for node in self.trace_indice.node_list[
-            chunk_infos["region"][0] : chunk_infos["region"][1] + 1
-        ]:
+        for node in self.trace_indice.node_list[chunk_infos["region"][0]:chunk_infos["region"][1] + 1]:
             if is_non_compute_node_except_placeholder(node):
                 continue
             count = 0
@@ -159,9 +152,7 @@ def _assgin_single_node_flow(
         if arg_node in all_node_info:
             if all_node_info[arg_node]["chunk_dim"] != arg_dim:
                 return False
-            all_node_info[arg_node]["fix_dim"] = list(
-                set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim)
-            )
+            all_node_info[arg_node]["fix_dim"] = list(set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim))
         # else add it to list
         else:
             all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim}
@@ -170,9 +161,7 @@ def _assgin_single_node_flow(
         return True
 
     def _get_all_node_info(self, end_dim, start_idx, end_idx):
-        cur_node_list = [
-            self.trace_indice.node_list[end_idx]
-        ]  # start from the last node
+        cur_node_list = [self.trace_indice.node_list[end_idx]]    # start from the last node
         all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
 
         while len(cur_node_list) > 0:
@@ -183,12 +172,8 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                 cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
                 cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
                 if cur_node_chunk_dim:
-                    cur_node_compute = self.trace_indice._find_compute_trace_from_node(
-                        cur_node
-                    )
-                    cur_node_source = self.trace_indice._find_source_trace_from_node(
-                        cur_node
-                    )
+                    cur_node_compute = self.trace_indice._find_compute_trace_from_node(cur_node)
+                    cur_node_source = self.trace_indice._find_source_trace_from_node(cur_node)
                 else:
                     cur_node_compute = cur_node_source = None
 
@@ -215,15 +200,9 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                         return None
 
                 if len(arg_list) == 2:
-                    if any(i in cur_node.name for i in ["add", "mul"]):
+                    if any(i in cur_node.name for i in ["add", "mul", "truediv"]):
                         for arg in arg_list:
-                            if not (
-                                start_idx
-                                <= find_idx_by_name(
-                                    arg.name, self.trace_indice.node_list
-                                )
-                                < end_idx
-                            ):
+                            if not (start_idx <= find_idx_by_name(arg.name, self.trace_indice.node_list) < end_idx):
                                 continue
                             arg_chunk_dim = all_node_info[arg]["chunk_dim"]
                             arg_fix_dim = all_node_info[arg]["fix_dim"]
@@ -249,9 +228,7 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
         remove_inputs = []
         for input_node in inputs:
             input_dict = {}
-            input_node_idx = find_idx_by_name(
-                input_node.name, self.trace_indice.node_list
-            )
+            input_node_idx = find_idx_by_name(input_node.name, self.trace_indice.node_list)
             for user in input_node.users.keys():
                 if is_non_compute_node(user):
                     continue
@@ -259,9 +236,7 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
                 if start_idx <= user_idx <= end_idx:
                     chunk_dim = all_node_info[user]["chunk_dim"]
                     if chunk_dim is not None:
-                        user_source = self.trace_indice._find_source_trace_from_node(
-                            user
-                        )[chunk_dim]
+                        user_source = self.trace_indice._find_source_trace_from_node(user)[chunk_dim]
                         if input_node_idx in user_source:
                             input_dict[user_idx] = user_source[input_node_idx]
                         else:
@@ -284,7 +259,7 @@ def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
         maybe_prepose_nodes.sort(
             key=lambda x: find_idx_by_name(x.name, self.trace_indice.node_list),
             reverse=True,
-        )  # from last node to first node
+        )    # from last node to first node
         prepose_nodes = []
         # set every node as root, search its args, if all legal, turn root and args as prepose nodes
         while len(maybe_prepose_nodes) > 0:
@@ -305,13 +280,8 @@ def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
                         if type(cur_prepose_node_arg) != type(cur_prepose_node):
                             continue
                         # out of loop
-                        if not (
-                            start_idx
-                            <= find_idx_by_name(
-                                cur_prepose_node_arg.name, self.trace_indice.node_list
-                            )
-                            < end_idx
-                        ):
+                        if not (start_idx <= find_idx_by_name(cur_prepose_node_arg.name, self.trace_indice.node_list) <
+                                end_idx):
                             continue
                         # compute op in loop
                         elif cur_prepose_node_arg in all_node_info:
@@ -335,15 +305,13 @@ def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
                     if n in maybe_prepose_nodes:
                         maybe_prepose_nodes.remove(n)
         # sort by index
-        prepose_nodes.sort(
-            key=lambda x: find_idx_by_name(x.name, self.trace_indice.node_list)
-        )
+        prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.trace_indice.node_list))
 
         return prepose_nodes
 
     def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
         # we need to log input nodes to avoid deleteing them in the loop
-        chunk_node_list = self.trace_indice.node_list[start_idx : end_idx + 1]
+        chunk_node_list = self.trace_indice.node_list[start_idx:end_idx + 1]
         # also need to get some prepose node's arg out of non_chunk_inputs
         for n in chunk_info["args"]["prepose_nodes"]:
             chunk_node_list.remove(n)
@@ -354,9 +322,7 @@ def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
         return chunk_info
 
     def flow_search(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = find_chunk_compute_input_and_output_nodes(
-            self.trace_indice.node_list[start_idx : end_idx + 1]
-        )
+        inputs, outputs = find_chunk_compute_input_and_output_nodes(self.trace_indice.node_list[start_idx:end_idx + 1])
         # only single ouput
         if len(outputs) > 1:
             return None
@@ -367,9 +333,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
             return None
 
         # get input nodes' chunk dim
-        inputs, inputs_dim = self._get_input_nodes_dim(
-            inputs, start_idx, end_idx, all_node_info
-        )
+        inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info)
         if inputs is None:
             return None
 
@@ -385,9 +349,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         }
 
         # move useless nodes ahead of loop
-        chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes(
-            all_node_info, start_idx, end_idx
-        )
+        chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes(all_node_info, start_idx, end_idx)
 
         # find non chunk inputs
         chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)
@@ -400,10 +362,8 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
     def _reassgin_reshape_size(self, chunk_info):
         chunk_region = chunk_info["region"]
         reshape_size = {}
-        chunk_shape = get_node_shape(chunk_info["outputs"][0])[
-            chunk_info["outputs_dim"]
-        ]
-        for node in self.trace_indice.node_list[chunk_region[0] : chunk_region[1] + 1]:
+        chunk_shape = get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]]
+        for node in self.trace_indice.node_list[chunk_region[0]:chunk_region[1] + 1]:
             if any(i in node.name for i in ["reshape", "view"]):
                 reshape_args = node.args[1:]
                 reshape_log = self.trace_indice.indice_view_list[node]
@@ -413,8 +373,6 @@ def _reassgin_reshape_size(self, chunk_info):
                     if reshape_arg_dim in reshape_log["dim_to"]:
                         continue
                     if reshape_arg_dim == chunk_dim:
-                        reshape_size[node.name][reshape_arg.name] = (
-                            "min(chunk_size, %d - chunk_idx)" % chunk_shape
-                        )
+                        reshape_size[node.name][reshape_arg.name] = ("min(chunk_size, %d - chunk_idx)" % chunk_shape)
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 1e16ab9bdf35..5a5d15e0a1f4 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -3,7 +3,7 @@
 
 from torch.fx.node import Node
 
-from .utils import find_idx_by_name, get_node_shape
+from .utils import find_first_tensor_arg, find_idx_by_name, get_node_shape, unflat_list
 
 
 class TraceIndice(object):
@@ -79,9 +79,7 @@ def _inherit_indice(self, node_from, node_from_dim, node_to, node_to_dim):
         node_from_trace = self._find_trace_from_node(node_from)
         node_to_trace = self._find_trace_from_node(node_to)
         node_to_trace["indice"][node_to_dim] = node_from_trace["indice"][node_from_dim]
-        node_to_trace["compute"][node_to_dim] = copy.deepcopy(
-            node_from_trace["compute"][node_from_dim]
-        )
+        node_to_trace["compute"][node_to_dim] = copy.deepcopy(node_from_trace["compute"][node_from_dim])
         self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True)
 
     def _inherit_all_computation(self, node_from, node_to):
@@ -209,7 +207,7 @@ def _assign_indice_as_input(self, node, node_idx, input_node=None):
             node_idx (int)
         """
         if input_node == None:
-            input_node = node.args[0]
+            input_node = find_first_tensor_arg(node)
         input_node_idx = find_idx_by_name(input_node.name, self.node_list)
         input_node_idx_trace = self.indice_trace_list[input_node_idx]["indice"]
 
@@ -227,6 +225,8 @@ def _assign_all_indice(self, node, node_idx):
             node_idx (int)
         """
         shape = node.meta["tensor_meta"].shape
+        if shape is None:
+            return
         new_trace = []
         for _ in shape:
             new_trace.append(self._add_indice())
@@ -259,7 +259,7 @@ def _assign_permute_indice(self, node, node_idx):
             node (node)
             node_idx (int)
         """
-        permute_dim = node.args[1:]
+        permute_dim = unflat_list(node.args[1:])
         input_node = node.args[0]
 
         self._assign_indice_as_input(node, node_idx, input_node)
@@ -359,6 +359,15 @@ def _assign_einsum_indice(self, node, idx):
         left, right = patterns.split("->")
         left = left.split(",")
 
+        if '...' in right:
+            replace_list = "!@#$%^&*"
+            target_len = len(get_node_shape(node))
+            add_len = target_len - len(right) + 3
+            replace_str = replace_list[:add_len]
+            right = right.replace("...", replace_str)
+            for ll in range(len(left)):
+                left[ll] = left[ll].replace("...", replace_str)
+
         all_index = []
         for i in left:
             for c in i:
@@ -369,9 +378,7 @@ def _assign_einsum_indice(self, node, idx):
             for left_idx, left_str in enumerate(left):
                 if right_indice in left_str:
                     source_idx = left_str.index(right_indice)
-                    self._inherit_indice(
-                        input_nodes[left_idx], source_idx, node, right_idx
-                    )
+                    self._inherit_indice(input_nodes[left_idx], source_idx, node, right_idx)
 
     def _assign_softmax_indice(self, node, idx):
         """
@@ -440,11 +447,12 @@ def _assign_view_reshape_indice(self, node, node_idx):
         origin_node = node.args[0]
         origin_shape = origin_node.meta["tensor_meta"].shape
         target_shape = []
-        for i in range(1, len(node.args)):
-            if isinstance(node.args[i], int):
-                target_shape.append(node.args[i])
+        unflated_args = unflat_list(node.args)
+        for i in range(1, len(unflated_args)):
+            if isinstance(unflated_args[i], int):
+                target_shape.append(unflated_args[i])
             else:
-                target_shape.append(node.args[i].meta["fwd_out"][0])
+                target_shape.append(unflated_args[i].meta["fwd_out"][0])
 
         # compute the value of -1
         if -1 in target_shape:
@@ -472,13 +480,7 @@ def _assign_view_reshape_indice(self, node, node_idx):
             dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
             self._del_dim(node_idx, -1)
         else:
-            raise NotImplementedError(
-                "shape"
-                + str(origin_shape)
-                + "and"
-                + str(target_shape)
-                + "view not implemented"
-            )
+            raise NotImplementedError("shape" + str(origin_shape) + "and" + str(target_shape) + "view not implemented")
 
         # get new indice
         origin_trace = self._find_indice_trace_from_node(origin_node)
@@ -521,6 +523,8 @@ def trace_indice(self):
                     self._assign_unsqueeze_indice(node, idx)
                 elif any(i in node.name for i in ["to", "contiguous"]):
                     self._assgin_no_change_indice(node, idx)
+                elif "new_ones" in node.name:
+                    self._assign_ones_like_indice(node, idx)
                 else:
                     raise NotImplementedError(node.name, "method not implemented yet!")
             elif node.op == "call_function":
@@ -530,7 +534,7 @@ def trace_indice(self):
                     self._assign_matmul_indice(node, idx)
                 elif "softmax" in node.name:
                     self._assign_softmax_indice(node, idx)
-                elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu"]):
+                elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu", "sub", "truediv"]):
                     self._assign_elementwise_indice(node, idx)
                 elif "ones_like" in node.name:
                     self._assign_ones_like_indice(node, idx)
@@ -538,21 +542,21 @@ def trace_indice(self):
                     self._assign_dropout_indice(node, idx)
                 elif "einsum" in node.name:
                     self._assign_einsum_indice(node, idx)
-                elif "getattr" in node.name:
-                    continue  # get attr like shape
-                elif "getitem" in node.name:
-                    continue  # get item in list
+                elif "layer_norm" in node.name:
+                    self._assign_layernorm_indice(node, idx)
+                elif any(i in node.name for i in ["getattr", "getitem", "eq", "_assert"]):
+                    continue
                 else:
-                    raise NotImplementedError(
-                        node.name, "function not implemented yet!"
-                    )
+                    raise NotImplementedError(node.name, "function not implemented yet!")
             elif node.op == "call_module":
                 if any(n in node.name for n in ["layernorm", "norm"]):
                     self._assign_layernorm_indice(node, idx)
+                elif any(n in node.name for n in ["sigmoid", "dropout", "relu"]):
+                    self._assign_elementwise_indice(node, idx)
                 else:
                     raise NotImplementedError(node.name, "module not implemented yet!")
             elif node.op == "get_attr":
-                self._assign_all_indice(node, idx)  # get param
+                self._assign_all_indice(node, idx)    # get param
             elif node.op == "output":
                 continue
             else:
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index b62a6600adc8..5f3ea3bf482d 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -3,10 +3,32 @@
 from torch.fx.node import Node
 
 
+def unflat_list(inputs):
+    """
+    unflat a list by recursion
+    """
+    res = []
+    for i in inputs:
+        if isinstance(i, list) or isinstance(i, set) or isinstance(i, tuple):
+            res.extend(unflat_list(i))
+        else:
+            res.append(i)
+    return res
+
+
+def find_first_tensor_arg(node):
+    """
+    Find the first input tensor arg for a node
+    """
+    for arg in node.args:
+        if type(arg) == type(node):
+            return arg
+    raise RuntimeError()
+
+
 def is_non_compute_node(node):
     if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(
-        i in node.name for i in ["getitem", "getattr"]
-    ):
+            i in node.name for i in ["getitem", "getattr"]):
         return True
     return False
 
@@ -18,17 +40,13 @@ def get_node_shape(node):
 
 
 def is_non_compute_node_except_placeholder(node):
-    if any(i in node.op for i in ["get_attr", "output"]) or any(
-        i in node.name for i in ["getitem", "getattr"]
-    ):
+    if any(i in node.op for i in ["get_attr", "output"]) or any(i in node.name for i in ["getitem", "getattr"]):
         return True
     return False
 
 
 def is_non_compute_node_except_placeholder_output(node):
-    if any(i in node.op for i in ["get_attr"]) or any(
-        i in node.name for i in ["getitem", "getattr"]
-    ):
+    if any(i in node.op for i in ["get_attr"]) or any(i in node.name for i in ["getitem", "getattr"]):
         return True
     return False
 
@@ -74,22 +92,16 @@ def find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
     # we treat that input node as the input of the checkpoint function
     for node in nodes:
         for input_node in node._input_nodes.keys():
-            if (
-                input_node not in nodes
-                and input_node not in input_nodes
-                and not is_non_compute_node_except_placeholder(input_node)
-            ):
+            if (input_node not in nodes and input_node not in input_nodes
+                    and not is_non_compute_node_except_placeholder(input_node)):
                 input_nodes.append(input_node)
 
     # if a node has a user node which is not in the node list
     # we treat that user node as the node receiving the current node output
     for node in nodes:
         for output_node in node.users.keys():
-            if (
-                output_node not in nodes
-                and node not in output_nodes
-                and not is_non_compute_node_except_placeholder_output(output_node)
-            ):
+            if (output_node not in nodes and node not in output_nodes
+                    and not is_non_compute_node_except_placeholder_output(output_node)):
                 output_nodes.append(node)
 
     return input_nodes, output_nodes
diff --git a/colossalai/fx/profiler/opcount.py b/colossalai/fx/profiler/opcount.py
index 1c39dc247750..6bd612ad2fd1 100644
--- a/colossalai/fx/profiler/opcount.py
+++ b/colossalai/fx/profiler/opcount.py
@@ -249,6 +249,8 @@ def zero_flop_jit(*args):
         aten.sum.default,
         aten.sum.dim_IntList,
         aten.mean.dim,
+        aten.sub.Tensor,
+        aten.sub_.Tensor,
 
     # activation op
         aten.hardswish.default,
@@ -313,7 +315,8 @@ def zero_flop_jit(*args):
         aten.where.self,
         aten.zero_.default,
         aten.zeros_like.default,
-    ]
+        aten.fill_.Scalar
+    ]  # yapf: disable
 
     for op in zero_flop_aten:
         flop_mapping[op] = zero_flop_jit
diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_simple_evoformer.py
similarity index 66%
rename from tests/test_autochunk/benchmark_autochunk.py
rename to tests/test_autochunk/benchmark_simple_evoformer.py
index 6632ece61376..8b5d8a8bee77 100644
--- a/tests/test_autochunk/benchmark_autochunk.py
+++ b/tests/test_autochunk/benchmark_simple_evoformer.py
@@ -2,14 +2,13 @@
 
 import torch
 import torch.fx
+from simple_evoformer import base_evoformer, openfold_evoformer
 
 from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
 from colossalai.fx import ColoTracer
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.fx.profiler import MetaTensor
-from tests.test_autochunk.evoformer.evoformer import evoformer_base
-from tests.test_autochunk.openfold.evoformer import EvoformerBlock
 
 
 def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None):
@@ -34,10 +33,7 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=N
         time2 = time.time()
 
     new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    print(
-        "%s: time %.4fs, mem %dMB"
-        % (title, (time2 - time1) / loop, new_max_mem - now_mem)
-    )
+    print("%s: time %.4fs, mem %dMB" % (title, (time2 - time1) / loop, new_max_mem - now_mem))
 
 
 def _build_autochunk(model, max_memory, node, pair):
@@ -50,18 +46,14 @@ def _build_autochunk(model, max_memory, node, pair):
         },
     )
 
-    gm_prop = torch.fx.symbolic_trace(model)  # must use symbolic_trace
+    gm_prop = torch.fx.symbolic_trace(model)    # must use symbolic_trace
     interp = MetaInfoProp(gm_prop)
-    interp.propagate(
-        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
-    )
+    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
 
     # now run it twice to get meta info in graph module, not necessary
     gm = torch.fx.GraphModule(model, graph)
     interp = MetaInfoProp(gm)
-    interp.propagate(
-        MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0")
-    )
+    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
 
     # set code_gen
     codegen = AutoChunkCodeGen(gm_prop, max_memory, print_mem=False)
@@ -75,42 +67,22 @@ def _build_autochunk(model, max_memory, node, pair):
     return gm
 
 
-def _build_openfold():
-    model = EvoformerBlock(
-        c_m=256,
-        c_z=128,
-        c_hidden_msa_att=32,
-        c_hidden_opm=32,
-        c_hidden_mul=128,
-        c_hidden_pair_att=32,
-        no_heads_msa=8,
-        no_heads_pair=4,
-        transition_n=4,
-        msa_dropout=0.15,
-        pair_dropout=0.15,
-        inf=1e4,
-        eps=1e-4,
-        is_multimer=False,
-    ).cuda()
-    return model
-
-
 def benchmark_evoformer():
     # init data and model
-    msa_len = 256
-    pair_len = 512
+    msa_len = 128
+    pair_len = 256
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-    model = evoformer_base().cuda()
+    model = base_evoformer().cuda()
 
     # build autochunk model
     # max_memory = 1000  # MB, fit memory mode
-    max_memory = None  # min memory mode
-    autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair)
+    max_memory = None    # min memory mode
+    autochunk = _build_autochunk(base_evoformer().cuda(), max_memory, node, pair)
 
     # build openfold
     chunk_size = 64
-    openfold = _build_openfold()
+    openfold = openfold_evoformer().cuda()
 
     # benchmark
     _benchmark_evoformer(model, node, pair, "base")
diff --git a/tests/test_autochunk/evoformer/evoformer.py b/tests/test_autochunk/evoformer/evoformer.py
deleted file mode 100644
index cfd2bb2a2529..000000000000
--- a/tests/test_autochunk/evoformer/evoformer.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .msa import MSAStack
-from .ops import OutProductMean
-from .triangle import PairStack
-
-
-def print_memory(init_mem, text=None):
-    now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem
-    max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem
-    print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem))
-    torch.cuda.reset_peak_memory_stats()
-
-
-class EvoformerBlock(nn.Module):
-
-    def __init__(self, d_node, d_pair):
-        super(EvoformerBlock, self).__init__()
-
-        self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15)
-        self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32)
-        self.pair_stack = PairStack(d_pair=d_pair)
-
-    def forward(self, node, pair):
-        node = self.msa_stack(node, pair)
-        pair = pair + self.communication(node)
-        pair = self.pair_stack(pair)
-        return node, pair
-
-
-class Evoformer(nn.Module):
-
-    def __init__(self, d_node, d_pair):
-        super(Evoformer, self).__init__()
-
-        self.blocks = nn.ModuleList()
-        for _ in range(1):
-            self.blocks.append(EvoformerBlock(d_node, d_pair))
-
-    def forward(self, node, pair):
-        for b in self.blocks:
-            node, pair = b(node, pair)
-        return node, pair
-
-
-def evoformer_tiny():
-    return Evoformer(d_node=64, d_pair=32)
-
-
-def evoformer_base():
-    return Evoformer(d_node=256, d_pair=128)
-
-
-def evoformer_large():
-    return Evoformer(d_node=512, d_pair=256)
-
-
-__all__ = ['Evoformer', 'evoformer_base', 'evoformer_large']
diff --git a/tests/test_autochunk/evoformer/initializer.py b/tests/test_autochunk/evoformer/initializer.py
deleted file mode 100755
index c6ce0659e597..000000000000
--- a/tests/test_autochunk/evoformer/initializer.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import math
-
-import numpy as np
-import torch.nn as nn
-
-
-def glorot_uniform_af(x, gain=1.0):
-    """
-    initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different:
-    In PyTorch:
-    [feature_out, feature_in, n_head ...]
-    In Jax:
-    [... n_head, feature_in, feature_out]
-    However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like:
-    [feature_in, n_head, feature_out]
-
-    In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors
-    """
-    fan_in, fan_out = x.shape[-2:]
-    if len(x.shape) > 2:
-        receptive_field_size = np.prod(x.shape[:-2])
-        fan_in *= receptive_field_size
-        fan_out *= receptive_field_size
-    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
-    dev = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
-
-    nn.init.uniform_(x, -dev, dev)
-
-    return x
diff --git a/tests/test_autochunk/evoformer/kernel.py b/tests/test_autochunk/evoformer/kernel.py
deleted file mode 100644
index 26ab5dc53261..000000000000
--- a/tests/test_autochunk/evoformer/kernel.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import torch
-import torch.nn.functional as F
-
-
-def bias_sigmod_ele(y, bias, z):
-    return torch.sigmoid(y + bias) * z
-
-
-def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor,
-                     residual: torch.Tensor, prob: float) -> torch.Tensor:
-    out = (x + bias) * F.dropout(dropmask, p=prob, training=False)
-    out = residual + out
-    return out
-
-
-def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor,
-                              dropout_mask: torch.Tensor, Z_raw: torch.Tensor,
-                              prob: float) -> torch.Tensor:
-    return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b))
\ No newline at end of file
diff --git a/tests/test_autochunk/evoformer/msa.py b/tests/test_autochunk/evoformer/msa.py
deleted file mode 100644
index cac456638a55..000000000000
--- a/tests/test_autochunk/evoformer/msa.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from torch.nn import LayerNorm
-
-from .kernel import bias_dropout_add
-from .ops import SelfAttention, Transition
-
-
-class MSARowAttentionWithPairBias(nn.Module):
-
-    def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15):
-        super(MSARowAttentionWithPairBias, self).__init__()
-        self.d_node = d_node
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernormM = LayerNorm(d_node)
-        self.layernormZ = LayerNorm(d_pair)
-
-        _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True)
-
-        self.attention = SelfAttention(qkv_dim=d_node,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_node,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True)
-
-    def forward(self, M_raw, Z):
-        ## Input projections
-        M = self.layernormM(M_raw)
-        Z = self.layernormZ(Z)
-        b = F.linear(Z, self.linear_b_weights)
-        b = b.permute(0, 3, 1, 2)
-        # b = rearrange(b, 'b q k h -> b h q k')
-
-        M = self.attention(M, b)
-        dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype)
-
-        return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop)
-
-
-class MSAColumnAttention(nn.Module):
-
-    def __init__(self, d_node, c=32, n_head=8):
-        super(MSAColumnAttention, self).__init__()
-        self.d_node = d_node
-        self.c = c
-        self.n_head = n_head
-
-        self.layernormM = LayerNorm(d_node)
-        self.attention = SelfAttention(qkv_dim=d_node,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_node,
-                                       gating=True)
-
-    def forward(self, M_raw):
-        M = M_raw.transpose(-2, -3)
-        M = self.layernormM(M)
-
-        M = self.attention(M)
-
-        M = M.transpose(-2, -3)
-        return M_raw + M
-
-
-class MSAStack(nn.Module):
-
-    def __init__(self, d_node, d_pair, p_drop=0.15):
-        super(MSAStack, self).__init__()
-
-        self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node,
-                                                                       d_pair=d_pair,
-                                                                       p_drop=p_drop)
-
-        self.MSAColumnAttention = MSAColumnAttention(d_node=d_node)
-        self.MSATransition = Transition(d=d_node)
-
-    def forward(self, node, pair):
-        node = self.MSARowAttentionWithPairBias(node, pair)
-        node = self.MSAColumnAttention(node)
-        node = self.MSATransition(node)
-
-        return node
diff --git a/tests/test_autochunk/evoformer/ops.py b/tests/test_autochunk/evoformer/ops.py
deleted file mode 100755
index a56057522eaa..000000000000
--- a/tests/test_autochunk/evoformer/ops.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from torch.nn import LayerNorm
-
-from .initializer import glorot_uniform_af
-from .kernel import bias_sigmod_ele
-
-
-class DropoutRowwise(nn.Module):
-
-    def __init__(self, p):
-        super(DropoutRowwise, self).__init__()
-        self.p = p
-        self.dropout = nn.Dropout(p=p)
-
-    def forward(self, x):
-        dropout_mask = torch.ones_like(x[:, 0:1, :, :])
-        dropout_mask = self.dropout(dropout_mask)
-        return dropout_mask * x
-
-
-class DropoutColumnwise(nn.Module):
-
-    def __init__(self, p):
-        super(DropoutColumnwise, self).__init__()
-        self.p = p
-        self.dropout = nn.Dropout(p=p)
-
-    def forward(self, x):
-        dropout_mask = torch.ones_like(x[:, :, 0:1, :])
-        dropout_mask = self.dropout(dropout_mask)
-        return dropout_mask * x
-
-
-class Transition(nn.Module):
-
-    def __init__(self, d, n=4):
-        super(Transition, self).__init__()
-        self.norm = LayerNorm(d)
-        self.linear1 = Linear(d, n * d, initializer='relu')
-        self.linear2 = Linear(n * d, d, initializer='zeros')
-
-    def forward(self, src):
-        x = self.norm(src)
-        x = self.linear2(F.relu(self.linear1(x)))
-        return src + x
-
-
-class OutProductMean(nn.Module):
-
-    def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32):
-        super(OutProductMean, self).__init__()
-
-        self.layernormM = LayerNorm(n_feat)
-        self.linear_a = Linear(n_feat, n_feat_proj)
-        self.linear_b = Linear(n_feat, n_feat_proj)
-
-        self.o_linear = Linear(n_feat_proj * n_feat_proj,
-                               n_feat_out,
-                               initializer='zero',
-                               use_bias=True)
-
-    def forward(self, M):
-        M = self.layernormM(M)
-        left_act = self.linear_a(M)
-        right_act = self.linear_b(M)
-
-        o = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous()
-        # O = rearrange(O, 'b i j d e -> b i j (d e)')
-        o = o.reshape(o.shape[0], o.shape[1], o.shape[2], -1)
-        Z = self.o_linear(o)
-
-        return Z
-
-
-class Linear(nn.Linear):
-    """
-    A Linear layer with built-in nonstandard initializations. Called just
-    like torch.nn.Linear.
-    Implements the initializers in 1.11.4, plus some additional ones found
-    in the code.
-    """
-
-    def __init__(
-        self,
-        feature_in: int,
-        feature_out: int,
-        initializer: str = 'linear',
-        use_bias: bool = True,
-        bias_init: float = 0.,
-    ):
-        super(Linear, self).__init__(feature_in, feature_out, bias=use_bias)
-
-        self.use_bias = use_bias
-        if initializer == 'linear':
-            glorot_uniform_af(self.weight, gain=1.0)
-        elif initializer == 'relu':
-            glorot_uniform_af(self.weight, gain=2.0)
-        elif initializer == 'zeros':
-            nn.init.zeros_(self.weight)
-        if self.use_bias:
-            with torch.no_grad():
-                self.bias.fill_(bias_init)
-
-
-class SelfAttention(nn.Module):
-    """
-    Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors
-    """
-
-    def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False):
-        super(SelfAttention, self).__init__()
-        self.qkv_dim = qkv_dim
-        self.c = c
-        self.n_head = n_head
-        self.out_dim = out_dim
-        self.gating = gating
-        self.last_bias_fuse = last_bias_fuse
-
-        self.scaling = self.c**(-0.5)
-
-        # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear')
-        self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-        self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-        self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False)
-
-        if gating:
-            self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,)))
-            self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False)
-
-        self.o_linear = Linear(n_head * c,
-                               out_dim,
-                               initializer='zero',
-                               use_bias=(not last_bias_fuse))
-
-    def forward(self, in_data, nonbatched_bias=None):
-        """
-        :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim]
-        :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv]
-        :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv]
-        """
-
-        # qkv = self.to_qkv(in_data).chunk(3, dim=-1)
-        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv)
-
-        q = self.to_q(in_data)
-        k = self.to_k(in_data)
-        v = self.to_v(in_data)
-
-        # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head),
-        #               [q, k, v])
-        q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4),
-                      [q, k, v])
-        
-        q = q * self.scaling
-
-        logits = torch.matmul(q, k.transpose(-1, -2))
-
-        if nonbatched_bias is not None:
-            logits += nonbatched_bias.unsqueeze(1)
-        weights = torch.softmax(logits, dim=-1)
-        # weights = softmax(logits)
-
-        weighted_avg = torch.matmul(weights, v)
-        # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)')
-        weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4)
-        weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1)
-
-        if self.gating:
-            gate_values = self.gating_linear(in_data)
-            weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg)
-
-        output = self.o_linear(weighted_avg)
-        return output
diff --git a/tests/test_autochunk/evoformer/triangle.py b/tests/test_autochunk/evoformer/triangle.py
deleted file mode 100644
index f479469c3836..000000000000
--- a/tests/test_autochunk/evoformer/triangle.py
+++ /dev/null
@@ -1,192 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-from torch.nn import LayerNorm
-
-from .kernel import bias_dropout_add, bias_ele_dropout_residual
-from .ops import Linear, SelfAttention, Transition
-
-
-def permute_final_dims(tensor, inds):
-    zero_index = -1 * len(inds)
-    first_inds = list(range(len(tensor.shape[:zero_index])))
-    return tensor.permute(first_inds + [zero_index + i for i in inds])
-
-
-class TriangleMultiplicationOutgoing(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=128):
-        super(TriangleMultiplicationOutgoing, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-
-        self.layernorm1 = LayerNorm(d_pair)
-        self.left_projection = Linear(d_pair, c)
-        self.right_projection = Linear(d_pair, c)
-        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-
-        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
-        self.layernorm2 = LayerNorm(c)
-        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
-        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-        self.p_drop = p_drop
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        left_proj_act = self.left_projection(Z)
-        right_proj_act = self.right_projection(Z)
-
-        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
-        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
-
-        g = torch.sigmoid(self.output_gate(Z))
-        # p = torch.matmul(
-        #     permute_final_dims(left_proj_act, (2, 0, 1)),
-        #     permute_final_dims(right_proj_act, (2, 1, 0)),
-        # )
-        # ab = permute_final_dims(p, (1, 2, 0))
-
-        ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act)
-        ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_ele_dropout_residual(ab,
-                                         self.output_bias,
-                                         g,
-                                         dropout_mask,
-                                         Z_raw,
-                                         prob=self.p_drop)
-
-
-class TriangleMultiplicationIncoming(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=128):
-        super(TriangleMultiplicationIncoming, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-
-        self.layernorm1 = LayerNorm(d_pair)
-        self.left_projection = Linear(d_pair, c)
-        self.right_projection = Linear(d_pair, c)
-        self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-        self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.)
-
-        self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.)
-        self.layernorm2 = LayerNorm(c)
-        self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False)
-        self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-        self.p_drop = p_drop
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        left_proj_act = self.left_projection(Z)
-        right_proj_act = self.right_projection(Z)
-
-        left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z))
-        right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z))
-
-        g = torch.sigmoid(self.output_gate(Z))
-        # p = torch.matmul(
-        #     permute_final_dims(left_proj_act, (2, 1, 0)),
-        #     permute_final_dims(right_proj_act, (2, 0, 1)),
-        # )
-        # ab = permute_final_dims(p, (1, 2, 0))
-
-        ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act)
-        ab = self.output_projection(self.layernorm2(ab))
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_ele_dropout_residual(ab,
-                                         self.output_bias,
-                                         g,
-                                         dropout_mask,
-                                         Z_raw,
-                                         prob=self.p_drop)
-
-
-class TriangleAttentionStartingNode(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=32, n_head=4):
-        super(TriangleAttentionStartingNode, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernorm1 = LayerNorm(d_pair)
-        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
-        self.attention = SelfAttention(qkv_dim=d_pair,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_pair,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-    def forward(self, Z_raw):
-        Z = self.layernorm1(Z_raw)
-        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
-
-        Z = self.attention(Z, b)
-
-        dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype)
-        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
-
-
-class TriangleAttentionEndingNode(nn.Module):
-
-    def __init__(self, d_pair, p_drop, c=32, n_head=4):
-        super(TriangleAttentionEndingNode, self).__init__()
-        self.d_pair = d_pair
-        self.c = c
-        self.n_head = n_head
-        self.p_drop = p_drop
-
-        self.layernorm1 = LayerNorm(d_pair)
-        _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]),
-                                              std=1.0 / math.sqrt(d_pair))
-        self.linear_b_weights = nn.parameter.Parameter(data=_init_weights)
-        self.attention = SelfAttention(qkv_dim=d_pair,
-                                       c=c,
-                                       n_head=n_head,
-                                       out_dim=d_pair,
-                                       gating=True,
-                                       last_bias_fuse=True)
-
-        self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True)
-
-    def forward(self, Z_raw):
-        Z = Z_raw.transpose(-2, -3)
-        Z = self.layernorm1(Z)
-        b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights)
-
-        Z = self.attention(Z, b)
-
-        Z = Z.transpose(-2, -3)
-        dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype)
-        return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop)
-
-
-class PairStack(nn.Module):
-
-    def __init__(self, d_pair, p_drop=0.25):
-        super(PairStack, self).__init__()
-
-        self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop)
-        self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop)
-        self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop)
-        self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop)
-        self.PairTransition = Transition(d=d_pair)
-
-    def forward(self, pair):
-        pair = self.TriangleMultiplicationOutgoing(pair)
-        pair = self.TriangleMultiplicationIncoming(pair)
-        pair = self.TriangleAttentionStartingNode(pair)
-        pair = self.TriangleAttentionEndingNode(pair)
-        pair = self.PairTransition(pair)
-        return pair
diff --git a/tests/test_autochunk/openfold/checkpointing.py b/tests/test_autochunk/openfold/checkpointing.py
deleted file mode 100644
index 83e77c638ec1..000000000000
--- a/tests/test_autochunk/openfold/checkpointing.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.utils.checkpoint
-from typing import Any, Tuple, List, Callable, Optional
-
-
-BLOCK_ARG = Any
-BLOCK_ARGS = List[BLOCK_ARG]
-
-
-def get_checkpoint_fn():
-    checkpoint = torch.utils.checkpoint.checkpoint
-
-    return checkpoint
-
-
-@torch.jit.ignore
-def checkpoint_blocks(
-    blocks: List[Callable],
-    args: BLOCK_ARGS,
-    blocks_per_ckpt: Optional[int],
-) -> BLOCK_ARGS:
-    """
-    Chunk a list of blocks and run each chunk with activation
-    checkpointing. We define a "block" as a callable whose only inputs are
-    the outputs of the previous block.
-
-    Implements Subsection 1.11.8
-
-    Args:
-        blocks:
-            List of blocks
-        args:
-            Tuple of arguments for the first block.
-        blocks_per_ckpt:
-            Size of each chunk. A higher value corresponds to fewer 
-            checkpoints, and trades memory for speed. If None, no checkpointing 
-            is performed.
-    Returns:
-        The output of the final block
-    """
-    def wrap(a):
-        return (a,) if type(a) is not tuple else a
-
-    def exec(b, a):
-        for block in b:
-            a = wrap(block(*a))
-        return a
-
-    def chunker(s, e):
-        def exec_sliced(*a):
-            return exec(blocks[s:e], a)
-
-        return exec_sliced
-
-    # Avoids mishaps when the blocks take just one argument
-    args = wrap(args)
-
-    if blocks_per_ckpt is None:
-        return exec(blocks, args)
-    elif blocks_per_ckpt < 1 or blocks_per_ckpt > len(blocks):
-        raise ValueError("blocks_per_ckpt must be between 1 and len(blocks)")
-
-    checkpoint = get_checkpoint_fn() 
-
-    for s in range(0, len(blocks), blocks_per_ckpt):
-        e = s + blocks_per_ckpt
-        args = checkpoint(chunker(s, e), *args)
-        args = wrap(args)
-
-    return args
diff --git a/tests/test_autochunk/openfold/dropout.py b/tests/test_autochunk/openfold/dropout.py
deleted file mode 100644
index 651b9775ef44..000000000000
--- a/tests/test_autochunk/openfold/dropout.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-import torch.nn as nn
-from functools import partialmethod
-from typing import Union, List
-
-
-class Dropout(nn.Module):
-    """
-    Implementation of dropout with the ability to share the dropout mask
-    along a particular dimension.
-
-    If not in training mode, this module computes the identity function.
-    """
-
-    def __init__(self, r: float, batch_dim: Union[int, List[int]]):
-        """
-        Args:
-            r:
-                Dropout rate
-            batch_dim:
-                Dimension(s) along which the dropout mask is shared
-        """
-        super(Dropout, self).__init__()
-
-        self.r = r
-        if type(batch_dim) == int:
-            batch_dim = [batch_dim]
-        self.batch_dim = batch_dim
-        self.dropout = nn.Dropout(self.r)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x:
-                Tensor to which dropout is applied. Can have any shape
-                compatible with self.batch_dim
-        """
-        shape = list(x.shape)
-        if self.batch_dim is not None:
-            for bd in self.batch_dim:
-                shape[bd] = 1
-        mask = x.new_ones(shape)
-        mask = self.dropout(mask)
-        x *= mask
-        return x
-
-
-class DropoutRowwise(Dropout):
-    """
-    Convenience class for rowwise dropout as described in subsection
-    1.11.6.
-    """
-
-    __init__ = partialmethod(Dropout.__init__, batch_dim=-3)
-
-
-class DropoutColumnwise(Dropout):
-    """
-    Convenience class for columnwise dropout as described in subsection
-    1.11.6.
-    """
-
-    __init__ = partialmethod(Dropout.__init__, batch_dim=-2)
diff --git a/tests/test_autochunk/openfold/evoformer.py b/tests/test_autochunk/openfold/evoformer.py
deleted file mode 100644
index b53ec1aa51e5..000000000000
--- a/tests/test_autochunk/openfold/evoformer.py
+++ /dev/null
@@ -1,431 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import torch
-import torch.nn as nn
-from typing import Tuple, Optional
-from functools import partial
-
-from .primitives import Linear, LayerNorm
-from .dropout import DropoutRowwise, DropoutColumnwise
-from .msa import (
-    MSARowAttentionWithPairBias,
-    MSAColumnAttention,
-    MSAColumnGlobalAttention,
-)
-from .outer_product_mean import OuterProductMean
-from .pair_transition import PairTransition
-from .triangular_attention import (
-    TriangleAttentionStartingNode,
-    TriangleAttentionEndingNode,
-)
-from .triangular_multiplicative_update import (
-    TriangleMultiplicationOutgoing,
-    TriangleMultiplicationIncoming,
-)
-from .checkpointing import checkpoint_blocks, get_checkpoint_fn
-from .tensor_utils import chunk_layer
-
-
-class MSATransition(nn.Module):
-    """
-    Feed-forward network applied to MSA activations after attention.
-
-    Implements Algorithm 9
-    """
-    def __init__(self, c_m, n):
-        """
-        Args:
-            c_m:
-                MSA channel dimension
-            n:
-                Factor multiplied to c_m to obtain the hidden channel
-                dimension
-        """
-        super(MSATransition, self).__init__()
-
-        self.c_m = c_m
-        self.n = n
-
-        self.layer_norm = LayerNorm(self.c_m)
-        self.linear_1 = Linear(self.c_m, self.n * self.c_m, init="relu")
-        self.relu = nn.ReLU()
-        self.linear_2 = Linear(self.n * self.c_m, self.c_m, init="final")
-
-    def _transition(self, m, mask):
-        m = self.linear_1(m)
-        m = self.relu(m)
-        m = self.linear_2(m) * mask
-        return m
-
-    @torch.jit.ignore
-    def _chunk(self,
-        m: torch.Tensor,
-        mask: torch.Tensor,
-        chunk_size: int,
-    ) -> torch.Tensor:
-         return chunk_layer(
-             self._transition,
-             {"m": m, "mask": mask},
-             chunk_size=chunk_size,
-             no_batch_dims=len(m.shape[:-2]),
-         )
-
-    def forward(
-        self,
-        m: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-        chunk_size: Optional[int] = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            m:
-                [*, N_seq, N_res, C_m] MSA activation
-            mask:
-                [*, N_seq, N_res, C_m] MSA mask
-        Returns:
-            m:
-                [*, N_seq, N_res, C_m] MSA activation update
-        """
-
-        # DISCREPANCY: DeepMind forgets to apply the MSA mask here.
-        if mask is None:
-            mask = m.new_ones(m.shape[:-1])
-
-        # [*, N_seq, N_res, 1]
-        mask = mask.unsqueeze(-1)
-
-        m = self.layer_norm(m)
-
-        if chunk_size is not None:
-            m = self._chunk(m, mask, chunk_size)
-        else:
-            m = self._transition(m, mask)
-
-        return m
-
-
-class EvoformerBlockCore(nn.Module):
-    def __init__(
-        self,
-        c_m: int,
-        c_z: int,
-        c_hidden_opm: int,
-        c_hidden_mul: int,
-        c_hidden_pair_att: int,
-        no_heads_msa: int,
-        no_heads_pair: int,
-        transition_n: int,
-        pair_dropout: float,
-        inf: float,
-        eps: float,
-        _is_extra_msa_stack: bool = False,
-        is_multimer: bool = False,
-    ):
-        super(EvoformerBlockCore, self).__init__()
-        self.is_multimer = is_multimer
-        self.msa_transition = MSATransition(
-            c_m=c_m,
-            n=transition_n,
-        )
-
-        self.outer_product_mean = OuterProductMean(
-            c_m,
-            c_z,
-            c_hidden_opm,
-        )
-
-        self.tri_mul_out = TriangleMultiplicationOutgoing(
-            c_z,
-            c_hidden_mul,
-        )
-        self.tri_mul_in = TriangleMultiplicationIncoming(
-            c_z,
-            c_hidden_mul,
-        )
-
-        self.tri_att_start = TriangleAttentionStartingNode(
-            c_z,
-            c_hidden_pair_att,
-            no_heads_pair,
-            inf=inf,
-        )
-        self.tri_att_end = TriangleAttentionEndingNode(
-            c_z,
-            c_hidden_pair_att,
-            no_heads_pair,
-            inf=inf,
-        )
-
-        self.pair_transition = PairTransition(
-            c_z,
-            transition_n,
-        )
-
-        self.ps_dropout_row_layer = DropoutRowwise(pair_dropout)
-        self.ps_dropout_col_layer = DropoutColumnwise(pair_dropout)
-
-    def forward(
-        self,
-        m: torch.Tensor,
-        z: torch.Tensor,
-        chunk_size: Optional[int] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]: 
-        # DeepMind doesn't mask these transitions in the source, so _mask_trans
-        # should be disabled to better approximate the exact activations of
-        # the original.
-
-        m = m + self.msa_transition(
-            m, chunk_size=chunk_size
-        )
-        z = z + self.outer_product_mean(
-            m, chunk_size=chunk_size
-        )
-        z = z + self.ps_dropout_row_layer(self.tri_mul_out(z))
-        z = z + self.ps_dropout_row_layer(self.tri_mul_in(z))
-        z = z + self.ps_dropout_row_layer(
-            self.tri_att_start(z, chunk_size=chunk_size)
-        )
-        z = z + self.ps_dropout_col_layer(
-            self.tri_att_end(z, chunk_size=chunk_size)
-        )
-        z = z + self.pair_transition(
-            z, chunk_size=chunk_size
-        )
-
-        return m, z
-
-
-class EvoformerBlock(nn.Module):
-    def __init__(self,
-        c_m: int,
-        c_z: int,
-        c_hidden_msa_att: int,
-        c_hidden_opm: int,
-        c_hidden_mul: int,
-        c_hidden_pair_att: int,
-        no_heads_msa: int,
-        no_heads_pair: int,
-        transition_n: int,
-        msa_dropout: float,
-        pair_dropout: float,
-        inf: float,
-        eps: float,
-        is_multimer: bool,
-    ):
-        super(EvoformerBlock, self).__init__()
-
-        self.msa_att_row = MSARowAttentionWithPairBias(
-            c_m=c_m,
-            c_z=c_z,
-            c_hidden=c_hidden_msa_att,
-            no_heads=no_heads_msa,
-            inf=inf,
-        )
-
-        self.msa_att_col = MSAColumnAttention(
-            c_m,
-            c_hidden_msa_att,
-            no_heads_msa,
-            inf=inf,
-        )
-
-        self.msa_dropout_layer = DropoutRowwise(msa_dropout)
-
-        self.core = EvoformerBlockCore(
-            c_m=c_m,
-            c_z=c_z,
-            c_hidden_opm=c_hidden_opm,
-            c_hidden_mul=c_hidden_mul,
-            c_hidden_pair_att=c_hidden_pair_att,
-            no_heads_msa=no_heads_msa,
-            no_heads_pair=no_heads_pair,
-            transition_n=transition_n,
-            pair_dropout=pair_dropout,
-            inf=inf,
-            eps=eps,
-        )
-        
-        self.outer_product_mean = OuterProductMean(
-            c_m,
-            c_z,
-            c_hidden_opm,
-        )
-        self.is_multimer = is_multimer
-
-    def forward(self,
-        m: torch.Tensor,
-        z: torch.Tensor,
-        chunk_size: Optional[int] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        m = m + self.msa_dropout_layer(
-            self.msa_att_row(m, z=z, chunk_size=chunk_size)
-        )
-        m = m + self.msa_att_col(m, chunk_size=chunk_size)
-        m, z = self.core(
-            m, 
-            z, 
-            chunk_size=chunk_size, 
-        )
-
-        return m, z
-
-
-class EvoformerStack(nn.Module):
-    """
-    Main Evoformer trunk.
-
-    Implements Algorithm 6.
-    """
-
-    def __init__(
-        self,
-        c_m: int,
-        c_z: int,
-        c_hidden_msa_att: int,
-        c_hidden_opm: int,
-        c_hidden_mul: int,
-        c_hidden_pair_att: int,
-        c_s: int,
-        no_heads_msa: int,
-        no_heads_pair: int,
-        no_blocks: int,
-        transition_n: int,
-        msa_dropout: float,
-        pair_dropout: float,
-        blocks_per_ckpt: int,
-        inf: float,
-        eps: float,
-        clear_cache_between_blocks: bool = False, 
-        is_multimer: bool = False,
-        **kwargs,
-    ):
-        """
-        Args:
-            c_m:
-                MSA channel dimension
-            c_z:
-                Pair channel dimension
-            c_hidden_msa_att:
-                Hidden dimension in MSA attention
-            c_hidden_opm:
-                Hidden dimension in outer product mean module
-            c_hidden_mul:
-                Hidden dimension in multiplicative updates
-            c_hidden_pair_att:
-                Hidden dimension in triangular attention
-            c_s:
-                Channel dimension of the output "single" embedding
-            no_heads_msa:
-                Number of heads used for MSA attention
-            no_heads_pair:
-                Number of heads used for pair attention
-            no_blocks:
-                Number of Evoformer blocks in the stack
-            transition_n:
-                Factor by which to multiply c_m to obtain the MSATransition
-                hidden dimension
-            msa_dropout:
-                Dropout rate for MSA activations
-            pair_dropout:
-                Dropout used for pair activations
-            blocks_per_ckpt:
-                Number of Evoformer blocks in each activation checkpoint
-            clear_cache_between_blocks:
-                Whether to clear CUDA's GPU memory cache between blocks of the
-                stack. Slows down each block but can reduce fragmentation
-        """
-        super(EvoformerStack, self).__init__()
-
-        self.blocks_per_ckpt = blocks_per_ckpt
-        self.clear_cache_between_blocks = clear_cache_between_blocks
-
-        self.blocks = nn.ModuleList()
-
-        for _ in range(no_blocks):
-            block = EvoformerBlock(
-                c_m=c_m,
-                c_z=c_z,
-                c_hidden_msa_att=c_hidden_msa_att,
-                c_hidden_opm=c_hidden_opm,
-                c_hidden_mul=c_hidden_mul,
-                c_hidden_pair_att=c_hidden_pair_att,
-                no_heads_msa=no_heads_msa,
-                no_heads_pair=no_heads_pair,
-                transition_n=transition_n,
-                msa_dropout=msa_dropout,
-                pair_dropout=pair_dropout,
-                inf=inf,
-                eps=eps,
-                is_multimer=is_multimer,
-            )
-            self.blocks.append(block)
-
-        self.linear = Linear(c_m, c_s)
-
-    def forward(self,
-        m: torch.Tensor,
-        z: torch.Tensor,
-        msa_mask: torch.Tensor,
-        pair_mask: torch.Tensor,
-        chunk_size: int,
-        _mask_trans: bool = True,
-    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Args:
-            m:
-                [*, N_seq, N_res, C_m] MSA embedding
-            z:
-                [*, N_res, N_res, C_z] pair embedding
-            msa_mask:
-                [*, N_seq, N_res] MSA mask
-            pair_mask:
-                [*, N_res, N_res] pair mask
-        Returns:
-            m:
-                [*, N_seq, N_res, C_m] MSA embedding
-            z:
-                [*, N_res, N_res, C_z] pair embedding
-            s:
-                [*, N_res, C_s] single embedding (or None if extra MSA stack)
-        """
-        blocks = [
-            partial(
-                b,
-                msa_mask=msa_mask,
-                pair_mask=pair_mask,
-                chunk_size=chunk_size,
-                _mask_trans=_mask_trans,
-            )
-            for b in self.blocks
-        ]
-
-        if(self.clear_cache_between_blocks):
-            def block_with_cache_clear(block, *args):
-                torch.cuda.empty_cache()
-                return block(*args)
-
-            blocks = [partial(block_with_cache_clear, b) for b in blocks]
-
-        m, z = checkpoint_blocks(
-            blocks,
-            args=(m, z),
-            blocks_per_ckpt=self.blocks_per_ckpt if self.training else None,
-        )
-
-        s = self.linear(m[..., 0, :, :])
-        
-        return m, z, s
diff --git a/tests/test_autochunk/openfold/msa.py b/tests/test_autochunk/openfold/msa.py
deleted file mode 100644
index 7c137286feab..000000000000
--- a/tests/test_autochunk/openfold/msa.py
+++ /dev/null
@@ -1,331 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import torch
-import torch.nn as nn
-from typing import Optional, List, Tuple
-
-from .primitives import (
-    Linear, 
-    LayerNorm,
-    Attention, 
-    GlobalAttention, 
-    _attention_chunked_trainable,
-)
-from .checkpointing import get_checkpoint_fn
-from .tensor_utils import (
-    chunk_layer,
-    permute_final_dims,
-    flatten_final_dims,
-)
-
-
-class MSAAttention(nn.Module):
-    def __init__(
-        self,
-        c_in,
-        c_hidden,
-        no_heads,
-        pair_bias=False,
-        c_z=None,
-        inf=1e9,
-    ):
-        """
-        Args:
-            c_in:
-                Input channel dimension
-            c_hidden:
-                Per-head hidden channel dimension
-            no_heads:
-                Number of attention heads
-            pair_bias:
-                Whether to use pair embedding bias
-            c_z:
-                Pair embedding channel dimension. Ignored unless pair_bias
-                is true
-            inf:
-                A large number to be used in computing the attention mask
-        """
-        super(MSAAttention, self).__init__()
-
-        self.c_in = c_in
-        self.c_hidden = c_hidden
-        self.no_heads = no_heads
-        self.pair_bias = pair_bias
-        self.c_z = c_z
-        self.inf = inf
-
-        self.layer_norm_m = LayerNorm(self.c_in)
-
-        self.layer_norm_z = None
-        self.linear_z = None
-        if self.pair_bias:
-            self.layer_norm_z = LayerNorm(self.c_z)
-            self.linear_z = Linear(
-                self.c_z, self.no_heads, bias=False, init="normal"
-            )
-        
-        self.mha = Attention(
-            self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads
-        )
-
-    @torch.jit.ignore
-    def _chunk(self, 
-        m: torch.Tensor,
-        biases: List[torch.Tensor],
-        chunk_size: int,
-    ) -> torch.Tensor:
-        return chunk_layer(
-            self.mha,
-            {"q_x": m, "kv_x": m, "biases": biases},
-            chunk_size=chunk_size,
-            no_batch_dims=len(m.shape[:-2]),
-        )
-
-    def _prep_inputs(self,
-        m: torch.Tensor,
-        z: Optional[torch.Tensor],
-        mask: Optional[torch.Tensor]
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        # [*, N_seq, N_res, C_m]
-        m = self.layer_norm_m(m)
-
-        n_seq, n_res = m.shape[-3:-1]
-        if mask is None:
-            # [*, N_seq, N_res]
-            mask = m.new_ones(
-                m.shape[:-3] + (n_seq, n_res),
-            )
-
-        # [*, N_seq, 1, 1, N_res]
-        mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
-
-        # This step simply returns a larger view of the bias, and does not
-        # consume additional memory.
-        # [*, N_seq, no_heads, N_res, N_res]
-        #bias = bias.expand(
-        #    ((-1,) * len(bias.shape[:-4])) + (-1, self.no_heads, n_res, -1)
-        #)
-
-        if (self.pair_bias and 
-            z is not None and                       # For the 
-            self.layer_norm_z is not None and       # benefit of
-            self.linear_z is not None               # TorchScript
-        ):
-            # [*, N_res, N_res, C_z]
-            z = self.layer_norm_z(z)
-            
-            # [*, N_res, N_res, no_heads]
-            z = self.linear_z(z)
-            
-            # [*, 1, no_heads, N_res, N_res]
-            z = permute_final_dims(z, (2, 0, 1)).unsqueeze(-4)
-
-        return m, mask_bias, z
-
-
-    def forward(self, 
-        m: torch.Tensor, 
-        z: Optional[torch.Tensor] = None, 
-        mask: Optional[torch.Tensor] = None, 
-        chunk_size: Optional[int] = None,
-        _chunk_logits: Optional[int] = None,
-        _checkpoint_chunks: Optional[bool] = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            m:
-                [*, N_seq, N_res, C_m] MSA embedding
-            z:
-                [*, N_res, N_res, C_z] pair embedding. Required only if
-                pair_bias is True
-            mask:
-                [*, N_seq, N_res] MSA mask
-            chunk_size:
-                Size of chunks into which the inputs are split along their
-                batch dimensions. A low value decreases memory overhead at the 
-                cost of slower execution. Chunking is not performed by default.
-                
-        """
-        m, mask_bias, z = self._prep_inputs(m, z, mask)
-
-        biases = [mask_bias]
-        if(z is not None):
-            biases.append(z)
-
-        if chunk_size is not None:
-            m = self._chunk(m, biases, chunk_size)
-        else:
-            m = self.mha(
-                q_x=m, 
-                kv_x=m, 
-                biases=biases 
-            )
-
-        return m
-
-
-class MSARowAttentionWithPairBias(MSAAttention):
-    """
-    Implements Algorithm 7.
-    """
-
-    def __init__(self, c_m, c_z, c_hidden, no_heads, inf=1e9):
-        """
-        Args:
-            c_m:
-                Input channel dimension
-            c_z:
-                Pair embedding channel dimension
-            c_hidden:
-                Per-head hidden channel dimension
-            no_heads:
-                Number of attention heads
-            inf:
-                Large number used to construct attention masks
-        """
-        super(MSARowAttentionWithPairBias, self).__init__(
-            c_m,
-            c_hidden,
-            no_heads,
-            pair_bias=True,
-            c_z=c_z,
-            inf=inf,
-        )
-
-
-class MSAColumnAttention(nn.Module):
-    """
-    Implements Algorithm 8.
-
-    By rights, this should also be a subclass of MSAAttention. Alas,
-    most inheritance isn't supported by TorchScript.
-    """
-
-    def __init__(self, c_m, c_hidden, no_heads, inf=1e9):
-        """
-        Args:
-            c_m:
-                MSA channel dimension
-            c_hidden:
-                Per-head hidden channel dimension
-            no_heads:
-                Number of attention heads
-            inf:
-                Large number used to construct attention masks
-        """
-        super(MSAColumnAttention, self).__init__()
-        
-        self.c_m = c_m
-        self.c_hidden = c_hidden
-        self.no_heads = no_heads
-        self.inf = inf
-
-        self._msa_att = MSAAttention(
-            c_in=c_m,
-            c_hidden=c_hidden,
-            no_heads=no_heads,
-            pair_bias=False,
-            c_z=None,
-            inf=inf,
-        )
-
-    def forward(self, 
-        m: torch.Tensor, 
-        mask: Optional[torch.Tensor] = None, 
-        chunk_size: Optional[int] = None
-    ) -> torch.Tensor:
-        """
-        Args:
-            m:
-                [*, N_seq, N_res, C_m] MSA embedding
-            mask:
-                [*, N_seq, N_res] MSA mask
-            chunk_size:
-                Size of chunks into which the inputs are split along their
-                batch dimensions. A low value decreases memory overhead at the 
-                cost of slower execution. Chunking is not performed by default.
-        """ 
-        # [*, N_res, N_seq, C_in]
-        m = m.transpose(-2, -3)
-
-        m = self._msa_att(m, chunk_size=chunk_size)
-
-        # [*, N_seq, N_res, C_in]
-        m = m.transpose(-2, -3)
-
-        return m
-
-
-class MSAColumnGlobalAttention(nn.Module):
-    def __init__(
-        self, c_in, c_hidden, no_heads, inf=1e9, eps=1e-10,
-    ):
-        super(MSAColumnGlobalAttention, self).__init__()
-
-        self.c_in = c_in
-        self.c_hidden = c_hidden
-        self.no_heads = no_heads
-        self.inf = inf
-        self.eps = eps
-
-        self.layer_norm_m = nn.LayerNorm(c_in)
-
-        self.global_attention = GlobalAttention(
-            c_in=c_in,
-            c_hidden=c_hidden,
-            no_heads=no_heads,
-            inf=inf,
-            eps=eps,
-        )
-
-    @torch.jit.ignore
-    def _chunk(self,
-        m: torch.Tensor,
-        chunk_size: int,
-    ) -> torch.Tensor:
-        mha_input = {
-            "m": m,
-        }
-        return chunk_layer(
-            self.global_attention,
-            mha_input,
-            chunk_size=chunk_size,
-            no_batch_dims=len(m.shape[:-2]),
-        )
-
-    def forward(
-        self, 
-        m: torch.Tensor, 
-        chunk_size: Optional[int] = None,
-    ) -> torch.Tensor:
-        n_seq, n_res, c_in = m.shape[-3:]
-
-        # [*, N_res, N_seq, C_in]
-        m = m.transpose(-2, -3)
-
-        # [*, N_res, N_seq, C_in]
-        m = self.layer_norm_m(m)
-
-        if chunk_size is not None:
-            m = self._chunk(m, chunk_size) 
-        else:
-            m = self.global_attention(m=m)
-
-        # [*, N_seq, N_res, C_in]
-        m = m.transpose(-2, -3)
-
-        return m
diff --git a/tests/test_autochunk/openfold/outer_product_mean.py b/tests/test_autochunk/openfold/outer_product_mean.py
deleted file mode 100644
index daadf1c272cf..000000000000
--- a/tests/test_autochunk/openfold/outer_product_mean.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import partial
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from .primitives import Linear
-from .tensor_utils import chunk_layer
-
-
-class OuterProductMean(nn.Module):
-    """
-    Implements Algorithm 10.
-    """
-
-    def __init__(self, c_m, c_z, c_hidden, eps=1e-3):
-        """
-        Args:
-            c_m:
-                MSA embedding channel dimension
-            c_z:
-                Pair embedding channel dimension
-            c_hidden:
-                Hidden channel dimension
-        """
-        super(OuterProductMean, self).__init__()
-
-        self.c_m = c_m
-        self.c_z = c_z
-        self.c_hidden = c_hidden
-        self.eps = eps
-
-        self.layer_norm = nn.LayerNorm(c_m)
-        self.linear_1 = Linear(c_m, c_hidden)
-        self.linear_2 = Linear(c_m, c_hidden)
-        self.linear_out = Linear(c_hidden ** 2, c_z, init="final")
-
-    def _opm(self, a, b):
-        # [*, N_res, N_res, C, C]
-        outer = torch.einsum("...bac,...dae->...bdce", a, b)
-
-        # [*, N_res, N_res, C * C]
-        outer = outer.reshape(outer.shape[:-2] + (-1,))
-
-        # [*, N_res, N_res, C_z]
-        outer = self.linear_out(outer)
-
-        return outer
-
-    @torch.jit.ignore
-    def _chunk(self, 
-        a: torch.Tensor, 
-        b: torch.Tensor, 
-        chunk_size: int
-    ) -> torch.Tensor:
-        # Since the "batch dim" in this case is not a true batch dimension
-        # (in that the shape of the output depends on it), we need to
-        # iterate over it ourselves
-        a_reshape = a.reshape((-1,) + a.shape[-3:])
-        b_reshape = b.reshape((-1,) + b.shape[-3:])
-        out = []
-        for a_prime, b_prime in zip(a_reshape, b_reshape):
-            outer = chunk_layer(
-                partial(self._opm, b=b_prime),
-                {"a": a_prime},
-                chunk_size=chunk_size,
-                no_batch_dims=1,
-            )
-            out.append(outer)
-        outer = torch.stack(out, dim=0)
-        outer = outer.reshape(a.shape[:-3] + outer.shape[1:])
-
-        return outer
-
-    def forward(self, 
-        m: torch.Tensor, 
-        mask: Optional[torch.Tensor] = None,
-        chunk_size: Optional[int] = None
-    ) -> torch.Tensor:
-        """
-        Args:
-            m:
-                [*, N_seq, N_res, C_m] MSA embedding
-            mask:
-                [*, N_seq, N_res] MSA mask
-        Returns:
-            [*, N_res, N_res, C_z] pair embedding update
-        """
-        if mask is None:
-            mask = m.new_ones(m.shape[:-1])
-
-        # [*, N_seq, N_res, C_m]
-        m = self.layer_norm(m)
-
-        # [*, N_seq, N_res, C]
-        mask = mask.unsqueeze(-1)
-        a = self.linear_1(m) * mask
-        b = self.linear_2(m) * mask
-
-        a = a.transpose(-2, -3)
-        b = b.transpose(-2, -3)
-
-        if chunk_size is not None:
-            outer = self._chunk(a, b, chunk_size)
-        else:
-            outer = self._opm(a, b)
-
-        # [*, N_res, N_res, 1]
-        norm = torch.einsum("...abc,...adc->...bdc", mask, mask)
-
-        # [*, N_res, N_res, C_z]
-        outer = outer / (self.eps + norm)
-
-        return outer
diff --git a/tests/test_autochunk/openfold/pair_transition.py b/tests/test_autochunk/openfold/pair_transition.py
deleted file mode 100644
index 7d09914dc3cc..000000000000
--- a/tests/test_autochunk/openfold/pair_transition.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from .primitives import Linear, LayerNorm
-from .tensor_utils import chunk_layer
-
-
-class PairTransition(nn.Module):
-    """
-    Implements Algorithm 15.
-    """
-
-    def __init__(self, c_z, n):
-        """
-        Args:
-            c_z:
-                Pair transition channel dimension
-            n:
-                Factor by which c_z is multiplied to obtain hidden channel
-                dimension
-        """
-        super(PairTransition, self).__init__()
-
-        self.c_z = c_z
-        self.n = n
-
-        self.layer_norm = LayerNorm(self.c_z)
-        self.linear_1 = Linear(self.c_z, self.n * self.c_z, init="relu")
-        self.relu = nn.ReLU()
-        self.linear_2 = Linear(self.n * self.c_z, c_z, init="final")
-
-    def _transition(self, z, mask):
-        # [*, N_res, N_res, C_hidden]
-        z = self.linear_1(z)
-        z = self.relu(z)
-
-        # [*, N_res, N_res, C_z]
-        z = self.linear_2(z) * mask
-
-        return z
-
-    @torch.jit.ignore
-    def _chunk(self,
-        z: torch.Tensor,
-        mask: torch.Tensor,
-        chunk_size: int,
-    ) -> torch.Tensor:
-        return chunk_layer(
-            self._transition,
-            {"z": z, "mask": mask},
-            chunk_size=chunk_size,
-            no_batch_dims=len(z.shape[:-2]),
-        )
-
-
-    def forward(self, 
-        z: torch.Tensor, 
-        mask: Optional[torch.Tensor] = None,
-        chunk_size: Optional[int] = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            z:
-                [*, N_res, N_res, C_z] pair embedding
-        Returns:
-            [*, N_res, N_res, C_z] pair embedding update
-        """
-        # DISCREPANCY: DeepMind forgets to apply the mask in this module.
-        if mask is None:
-            mask = z.new_ones(z.shape[:-1])
-
-        # [*, N_res, N_res, 1]
-        mask = mask.unsqueeze(-1)
-
-        # [*, N_res, N_res, C_z]
-        z = self.layer_norm(z)
-
-        if chunk_size is not None:
-            z = self._chunk(z, mask, chunk_size)
-        else:
-            z = self._transition(z=z, mask=mask)
-
-        return z
diff --git a/tests/test_autochunk/openfold/primitives.py b/tests/test_autochunk/openfold/primitives.py
deleted file mode 100644
index 32a9d487c441..000000000000
--- a/tests/test_autochunk/openfold/primitives.py
+++ /dev/null
@@ -1,529 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import partial
-import math
-from typing import Optional, Callable, List, Tuple, Sequence
-import numpy as np
-
-import torch
-import torch.nn as nn
-
-from .checkpointing import get_checkpoint_fn
-from .tensor_utils import (
-    permute_final_dims,
-    flatten_final_dims,
-    _chunk_slice,
-)
-
-
-def _prod(nums):
-    out = 1
-    for n in nums:
-        out = out * n
-    return out
-
-
-def _calculate_fan(linear_weight_shape, fan="fan_in"):
-    fan_out, fan_in = linear_weight_shape
-
-    if fan == "fan_in":
-        f = fan_in
-    elif fan == "fan_out":
-        f = fan_out
-    elif fan == "fan_avg":
-        f = (fan_in + fan_out) / 2
-    else:
-        raise ValueError("Invalid fan option")
-
-    return f
-
-
-def glorot_uniform_init_(weights):
-    nn.init.xavier_uniform_(weights, gain=1)
-
-
-def final_init_(weights):
-    with torch.no_grad():
-        weights.fill_(0.0)
-
-
-def gating_init_(weights):
-    with torch.no_grad():
-        weights.fill_(0.0)
-
-
-def normal_init_(weights):
-    torch.nn.init.kaiming_normal_(weights, nonlinearity="linear")
-
-
-def ipa_point_weights_init_(weights):
-    with torch.no_grad():
-        softplus_inverse_1 = 0.541324854612918
-        weights.fill_(softplus_inverse_1)
-
-
-class Linear(nn.Linear):
-    """
-    A Linear layer with built-in nonstandard initializations. Called just
-    like torch.nn.Linear.
-
-    Implements the initializers in 1.11.4, plus some additional ones found
-    in the code.
-    """
-
-    def __init__(
-        self,
-        in_dim: int,
-        out_dim: int,
-        bias: bool = True,
-        init: str = "default",
-        init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None,
-    ):
-        """
-        Args:
-            in_dim:
-                The final dimension of inputs to the layer
-            out_dim:
-                The final dimension of layer outputs
-            bias:
-                Whether to learn an additive bias. True by default
-            init:
-                The initializer to use. Choose from:
-
-                "default": LeCun fan-in truncated normal initialization
-                "relu": He initialization w/ truncated normal distribution
-                "glorot": Fan-average Glorot uniform initialization
-                "gating": Weights=0, Bias=1
-                "normal": Normal initialization with std=1/sqrt(fan_in)
-                "final": Weights=0, Bias=0
-
-                Overridden by init_fn if the latter is not None.
-            init_fn:
-                A custom initializer taking weight and bias as inputs.
-                Overrides init if not None.
-        """
-        super(Linear, self).__init__(in_dim, out_dim, bias=bias)
-
-        if bias:
-            with torch.no_grad():
-                self.bias.fill_(0)
-
-        if init_fn is not None:
-            init_fn(self.weight, self.bias)
-        else:
-            if init == "default":
-                normal_init_(self.weight)
-            elif init == "relu":
-                normal_init_(self.weight)
-            elif init == "glorot":
-                glorot_uniform_init_(self.weight)
-            elif init == "gating":
-                gating_init_(self.weight)
-                if bias:
-                    with torch.no_grad():
-                        self.bias.fill_(1.0)
-            elif init == "normal":
-                normal_init_(self.weight)
-            elif init == "final":
-                final_init_(self.weight)
-            else:
-                raise ValueError("Invalid init string.")
-
-
-class LayerNorm(nn.Module):
-
-    def __init__(self, c_in, eps=1e-5):
-        super(LayerNorm, self).__init__()
-
-        self.c_in = (c_in,)
-        self.eps = eps
-
-        self.weight = nn.Parameter(torch.ones(c_in))
-        self.bias = nn.Parameter(torch.zeros(c_in))
-
-    def forward(self, x):
-        out = nn.functional.layer_norm(
-            x,
-            self.c_in,
-            self.weight,
-            self.bias,
-            self.eps,
-        )
-
-        return out
-
-
-@torch.jit.ignore
-def softmax(t: torch.Tensor, dim: int = -1) -> torch.Tensor:
-    """
-        Softmax, but without automatic casting to fp32 when the input is of
-        type bfloat16
-    """
-    s = torch.nn.functional.softmax(t, dim=dim)
-
-    return s
-
-
-#@torch.jit.script
-def _attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
-               biases: List[torch.Tensor]) -> torch.Tensor:
-    # [*, H, Q, C_hidden]
-    query = permute_final_dims(query, (1, 0, 2))
-
-    # [*, H, C_hidden, K]
-    key = permute_final_dims(key, (1, 2, 0))
-
-    # [*, H, V, C_hidden]
-    value = permute_final_dims(value, (1, 0, 2))
-
-    # [*, H, Q, K]
-    a = torch.matmul(query, key)
-
-    for b in biases:
-        a += b
-
-    a = softmax(a, -1)
-
-    # [*, H, Q, C_hidden]
-    a = torch.matmul(a, value)
-
-    # [*, Q, H, C_hidden]
-    a = a.transpose(-2, -3)
-
-    return a
-
-
-@torch.jit.ignore
-def _attention_chunked_trainable(
-    query,
-    key,
-    value,
-    biases,
-    chunk_size,
-    chunk_dim,
-    checkpoint,
-):
-    if (checkpoint and len(biases) > 2):
-        raise ValueError("Checkpointed version permits only permits two bias terms")
-
-    def _checkpointable_attention(q, k, v, b1, b2):
-        bs = [b for b in [b1, b2] if b is not None]
-        return _attention(q, k, v, bs)
-
-    o_chunks = []
-    checkpoint_fn = get_checkpoint_fn()
-    count = query.shape[chunk_dim]
-    for start in range(0, count, chunk_size):
-        end = start + chunk_size
-        idx = [slice(None)] * len(query.shape)
-        idx[chunk_dim] = slice(start, end)
-        idx_tup = tuple(idx)
-        q_chunk = query[idx_tup]
-        k_chunk = key[idx_tup]
-        v_chunk = value[idx_tup]
-
-        def _slice_bias(b):
-            idx[chunk_dim] = (slice(start, end) if b.shape[chunk_dim] != 1 else slice(None))
-            return b[tuple(idx)]
-
-        if (checkpoint):
-            bias_1_chunk, bias_2_chunk = [
-                _slice_bias(b) if b is not None else None for b in (biases + [None, None])[:2]
-            ]
-
-            o_chunk = checkpoint_fn(_checkpointable_attention, q_chunk, k_chunk, v_chunk,
-                                    bias_1_chunk, bias_2_chunk)
-        else:
-            bias_chunks = [_slice_bias(b) for b in biases]
-
-            o_chunk = _attention(q_chunk, k_chunk, v_chunk, bias_chunks)
-
-        o_chunks.append(o_chunk)
-
-    o = torch.cat(o_chunks, dim=chunk_dim)
-    return o
-
-
-class Attention(nn.Module):
-    """
-    Standard multi-head attention using AlphaFold's default layer
-    initialization. Allows multiple bias vectors.
-    """
-
-    def __init__(
-        self,
-        c_q: int,
-        c_k: int,
-        c_v: int,
-        c_hidden: int,
-        no_heads: int,
-        gating: bool = True,
-    ):
-        """
-        Args:
-            c_q:
-                Input dimension of query data
-            c_k:
-                Input dimension of key data
-            c_v:
-                Input dimension of value data
-            c_hidden:
-                Per-head hidden dimension
-            no_heads:
-                Number of attention heads
-            gating:
-                Whether the output should be gated using query data
-        """
-        super(Attention, self).__init__()
-
-        self.c_q = c_q
-        self.c_k = c_k
-        self.c_v = c_v
-        self.c_hidden = c_hidden
-        self.no_heads = no_heads
-        self.gating = gating
-
-        # DISCREPANCY: c_hidden is not the per-head channel dimension, as
-        # stated in the supplement, but the overall channel dimension.
-
-        self.linear_q = Linear(self.c_q, self.c_hidden * self.no_heads, bias=False, init="glorot")
-        self.linear_k = Linear(self.c_k, self.c_hidden * self.no_heads, bias=False, init="glorot")
-        self.linear_v = Linear(self.c_v, self.c_hidden * self.no_heads, bias=False, init="glorot")
-        self.linear_o = Linear(self.c_hidden * self.no_heads, self.c_q, init="final")
-
-        self.linear_g = None
-        if self.gating:
-            self.linear_g = Linear(self.c_q, self.c_hidden * self.no_heads, init="gating")
-
-        self.sigmoid = nn.Sigmoid()
-
-    def _prep_qkv(self, q_x: torch.Tensor,
-                  kv_x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        # [*, Q/K/V, H * C_hidden]
-        q = self.linear_q(q_x)
-        k = self.linear_k(kv_x)
-        v = self.linear_v(kv_x)
-
-        # [*, Q/K, H, C_hidden]
-        q = q.view(q.shape[:-1] + (self.no_heads, -1))
-        k = k.view(k.shape[:-1] + (self.no_heads, -1))
-        v = v.view(v.shape[:-1] + (self.no_heads, -1))
-
-        q /= math.sqrt(self.c_hidden)
-
-        return q, k, v
-
-    def _wrap_up(self, o: torch.Tensor, q_x: torch.Tensor) -> torch.Tensor:
-        if (self.linear_g is not None):
-            g = self.sigmoid(self.linear_g(q_x))
-
-            # [*, Q, H, C_hidden]
-            g = g.view(g.shape[:-1] + (self.no_heads, -1))
-            o = o * g
-
-        # [*, Q, H * C_hidden]
-        o = flatten_final_dims(o, 2)
-
-        # [*, Q, C_q]
-        o = self.linear_o(o)
-
-        return o
-
-    def forward(
-        self,
-        q_x: torch.Tensor,
-        kv_x: torch.Tensor,
-        biases: Optional[List[torch.Tensor]] = None,
-        use_lma: bool = False,
-        q_chunk_size: Optional[int] = None,
-        kv_chunk_size: Optional[int] = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            q_x:
-                [*, Q, C_q] query data
-            kv_x:
-                [*, K, C_k] key data
-            biases:
-                List of biases that broadcast to [*, H, Q, K]
-            use_lma:
-                Whether to use low-memory attention
-            q_chunk_size:
-                Query chunk size (for LMA)
-            kv_chunk_size:
-                Key/Value chunk size (for LMA)
-        Returns
-            [*, Q, C_q] attention update
-        """
-        if (biases is None):
-            biases = []
-        if (use_lma and (q_chunk_size is None or kv_chunk_size is None)):
-            raise ValueError("If use_lma is specified, q_chunk_size and kv_chunk_size must "
-                             "be provided")
-
-        q, k, v = self._prep_qkv(q_x, kv_x)
-
-        if (use_lma):
-            biases = [b.expand(b.shape[:-2] + (q_x.shape[-2],) + (kv_x.shape[-2],)) for b in biases]
-
-            o = _lma(q, k, v, biases, q_chunk_size, kv_chunk_size)
-        else:
-            o = _attention(q, k, v, biases)
-
-        o = self._wrap_up(o, q_x)
-
-        return o
-
-
-class GlobalAttention(nn.Module):
-
-    def __init__(self, c_in, c_hidden, no_heads, inf, eps):
-        super(GlobalAttention, self).__init__()
-
-        self.c_in = c_in
-        self.c_hidden = c_hidden
-        self.no_heads = no_heads
-        self.inf = inf
-        self.eps = eps
-
-        self.linear_q = Linear(c_in, c_hidden * no_heads, bias=False, init="glorot")
-
-        self.linear_k = Linear(
-            c_in,
-            c_hidden,
-            bias=False,
-            init="glorot",
-        )
-        self.linear_v = Linear(
-            c_in,
-            c_hidden,
-            bias=False,
-            init="glorot",
-        )
-        self.linear_g = Linear(c_in, c_hidden * no_heads, init="gating")
-        self.linear_o = Linear(c_hidden * no_heads, c_in, init="final")
-
-        self.sigmoid = nn.Sigmoid()
-
-    def forward(self, m: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
-        # [*, N_res, C_in]
-        q = torch.sum(m * mask.unsqueeze(-1),
-                      dim=-2) / (torch.sum(mask, dim=-1)[..., None] + self.eps)
-
-        # [*, N_res, H * C_hidden]
-        q = self.linear_q(q)
-        q *= (self.c_hidden**(-0.5))
-
-        # [*, N_res, H, C_hidden]
-        q = q.view(q.shape[:-1] + (self.no_heads, -1))
-
-        # [*, N_res, N_seq, C_hidden]
-        k = self.linear_k(m)
-        v = self.linear_v(m)
-
-        # [*, N_res, H, N_seq]
-        a = torch.matmul(
-            q,
-            k.transpose(-1, -2),  # [*, N_res, C_hidden, N_seq]
-        )
-        bias = (self.inf * (mask - 1))[..., :, None, :]
-        a += bias
-        a = softmax(a)
-
-        # [*, N_res, H, C_hidden]
-        o = torch.matmul(
-            a,
-            v,
-        )
-
-        # [*, N_res, N_seq, C_hidden]
-        g = self.sigmoid(self.linear_g(m))
-
-        # [*, N_res, N_seq, H, C_hidden]
-        g = g.view(g.shape[:-1] + (self.no_heads, -1))
-
-        # [*, N_res, N_seq, H, C_hidden]
-        o = o.unsqueeze(-3) * g
-
-        # [*, N_res, N_seq, H * C_hidden]
-        o = o.reshape(o.shape[:-2] + (-1,))
-
-        # [*, N_res, N_seq, C_in]
-        m = self.linear_o(o)
-
-        return m
-
-
-def _lma(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    biases: List[torch.Tensor],
-    q_chunk_size: int,
-    kv_chunk_size: int,
-):
-    no_q, no_kv = q.shape[-3], k.shape[-3]
-
-    # [*, Q, H, C_hidden]
-    o = q.new_zeros(q.shape)
-    for q_s in range(0, no_q, q_chunk_size):
-        q_chunk = q[..., q_s:q_s + q_chunk_size, :, :]
-        large_bias_chunks = [b[..., q_s:q_s + q_chunk_size, :] for b in biases]
-
-        maxes = []
-        weights = []
-        values = []
-        for kv_s in range(0, no_kv, kv_chunk_size):
-            k_chunk = k[..., kv_s:kv_s + kv_chunk_size, :, :]
-            v_chunk = v[..., kv_s:kv_s + kv_chunk_size, :, :]
-            small_bias_chunks = [b[..., kv_s:kv_s + kv_chunk_size] for b in large_bias_chunks]
-
-            a = torch.einsum(
-                "...qhd,...khd->...hqk",
-                q_chunk,
-                k_chunk,
-            )
-
-            for b in small_bias_chunks:
-                a += b
-
-            a = a.transpose(-2, -3)
-
-            max_a = torch.max(a, dim=-1, keepdim=True)[0]
-            exp_a = torch.exp(a - max_a)
-            exp_v = torch.einsum("...vhf,...qhv->...qhf", v_chunk, exp_a)
-
-            maxes.append(max_a.detach().squeeze(-1))
-            weights.append(torch.sum(exp_a, dim=-1))
-            values.append(exp_v)
-
-        chunk_max = torch.stack(maxes, dim=-3)
-        chunk_weights = torch.stack(weights, dim=-3)
-        chunk_values = torch.stack(values, dim=-4)
-
-        global_max = torch.max(chunk_max, dim=-3, keepdim=True)[0]
-        max_diffs = torch.exp(chunk_max - global_max)
-        chunk_values *= max_diffs.unsqueeze(-1)
-        chunk_weights *= max_diffs
-
-        all_values = torch.sum(chunk_values, dim=-4)
-        all_weights = torch.sum(chunk_weights.unsqueeze(-1), dim=-4)
-
-        q_chunk_out = all_values / all_weights
-
-        o[..., q_s:q_s + q_chunk_size, :, :] = q_chunk_out
-
-    return o
diff --git a/tests/test_autochunk/openfold/tensor_utils.py b/tests/test_autochunk/openfold/tensor_utils.py
deleted file mode 100644
index 384a71fb5ffd..000000000000
--- a/tests/test_autochunk/openfold/tensor_utils.py
+++ /dev/null
@@ -1,408 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import partial
-import torch
-import torch.nn as nn
-from typing import Tuple, List, Callable, Any, Dict, Sequence, Optional
-
-
-def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
-    zero_index = -1 * len(inds)
-    first_inds = list(range(len(tensor.shape[:zero_index])))
-    return tensor.permute(first_inds + [zero_index + i for i in inds])
-
-
-def flatten_final_dims(t: torch.Tensor, no_dims: int):
-    return t.reshape(t.shape[:-no_dims] + (-1,))
-
-
-def masked_mean(mask, value, dim, eps=1e-4):
-    mask = mask.expand(*value.shape)
-    return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim))
-
-
-def pts_to_distogram(pts, min_bin=2.3125, max_bin=21.6875, no_bins=64):
-    boundaries = torch.linspace(
-        min_bin, max_bin, no_bins - 1, device=pts.device
-    )
-    dists = torch.sqrt(
-        torch.sum((pts.unsqueeze(-2) - pts.unsqueeze(-3)) ** 2, dim=-1)
-    )
-    return torch.bucketize(dists, boundaries)
-
-
-def dict_multimap(fn, dicts):
-    first = dicts[0]
-    new_dict = {}
-    for k, v in first.items():
-        all_v = [d[k] for d in dicts]
-        if type(v) is dict:
-            new_dict[k] = dict_multimap(fn, all_v)
-        else:
-            new_dict[k] = fn(all_v)
-
-    return new_dict
-
-
-def one_hot(x, v_bins):
-    reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),))
-    diffs = x[..., None] - reshaped_bins
-    am = torch.argmin(torch.abs(diffs), dim=-1)
-    return nn.functional.one_hot(am, num_classes=len(v_bins)).float()
-
-
-def batched_gather(data, inds, dim=0, no_batch_dims=0):
-    ranges = []
-    for i, s in enumerate(data.shape[:no_batch_dims]):
-        r = torch.arange(s)
-        r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1))))
-        ranges.append(r)
-
-    remaining_dims = [
-        slice(None) for _ in range(len(data.shape) - no_batch_dims)
-    ]
-    remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds
-    ranges.extend(remaining_dims)
-    return data[ranges]
-
-
-# With tree_map, a poor man's JAX tree_map
-def dict_map(fn, dic, leaf_type):
-    new_dict = {}
-    for k, v in dic.items():
-        if type(v) is dict:
-            new_dict[k] = dict_map(fn, v, leaf_type)
-        else:
-            new_dict[k] = tree_map(fn, v, leaf_type)
-
-    return new_dict
-
-
-def tree_map(fn, tree, leaf_type):
-    if isinstance(tree, dict):
-        return dict_map(fn, tree, leaf_type)
-    elif isinstance(tree, list):
-        return [tree_map(fn, x, leaf_type) for x in tree]
-    elif isinstance(tree, tuple):
-        return tuple([tree_map(fn, x, leaf_type) for x in tree])
-    elif isinstance(tree, leaf_type):
-        return fn(tree)
-    else:
-        print(type(tree))
-        raise ValueError("Not supported")
-
-
-tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)
-
-def _fetch_dims(tree):
-    shapes = []
-    tree_type = type(tree)
-    if tree_type is dict:
-        for v in tree.values():
-            shapes.extend(_fetch_dims(v))
-    elif tree_type is list or tree_type is tuple:
-        for t in tree:
-            shapes.extend(_fetch_dims(t))
-    elif tree_type is torch.Tensor:
-        shapes.append(tree.shape)
-    else:
-        raise ValueError("Not supported")
-
-    return shapes
-
-
-@torch.jit.ignore
-def _flat_idx_to_idx(
-    flat_idx: int,
-    dims: Tuple[int],
-) -> Tuple[int]:
-    idx = []
-    for d in reversed(dims):
-        idx.append(flat_idx % d)
-        flat_idx = flat_idx // d
-
-    return tuple(reversed(idx))
-
-
-@torch.jit.ignore
-def _get_minimal_slice_set(
-    start: Sequence[int],
-    end: Sequence[int],
-    dims: int,
-    start_edges: Optional[Sequence[bool]] = None,
-    end_edges: Optional[Sequence[bool]] = None,
-) -> Sequence[Tuple[int]]:
-    """ 
-        Produces an ordered sequence of tensor slices that, when used in
-        sequence on a tensor with shape dims, yields tensors that contain every
-        leaf in the contiguous range [start, end]. Care is taken to yield a 
-        short sequence of slices, and perhaps even the shortest possible (I'm 
-        pretty sure it's the latter).
-         
-        end is INCLUSIVE. 
-    """
-    # start_edges and end_edges both indicate whether, starting from any given
-    # dimension, the start/end index is at the top/bottom edge of the
-    # corresponding tensor, modeled as a tree
-    def reduce_edge_list(ll):
-        tally = 1
-        for i in range(len(ll)):
-            reversed_idx = -1 * (i + 1)
-            ll[reversed_idx] *= tally
-            tally = ll[reversed_idx]
-
-    if(start_edges is None):
-        start_edges = [s == 0 for s in start]
-        reduce_edge_list(start_edges)
-    if(end_edges is None):
-        end_edges = [e == (d - 1) for e,d in zip(end, dims)]
-        reduce_edge_list(end_edges)        
-
-    # Base cases. Either start/end are empty and we're done, or the final,
-    # one-dimensional tensor can be simply sliced
-    if(len(start) == 0):
-        return [tuple()]
-    elif(len(start) == 1):
-        return [(slice(start[0], end[0] + 1),)]
-
-    slices = []
-    path = []
- 
-    # Dimensions common to start and end can be selected directly
-    for s,e in zip(start, end):
-        if(s == e):
-            path.append(slice(s, s + 1))
-        else:
-            break
-
-    path = tuple(path)
-    divergence_idx = len(path)
-
-    # start == end, and we're done
-    if(divergence_idx == len(dims)):
-        return [tuple(path)]
-
-    def upper():
-        sdi = start[divergence_idx]
-        return [
-            path + (slice(sdi, sdi + 1),) + s for s in 
-            _get_minimal_slice_set(
-                start[divergence_idx + 1:],
-                [d - 1 for d in dims[divergence_idx + 1:]],
-                dims[divergence_idx + 1:],
-                start_edges=start_edges[divergence_idx + 1:],
-                end_edges=[1 for _ in end_edges[divergence_idx + 1:]]
-            )
-        ]
-
-    def lower():
-        edi = end[divergence_idx]
-        return [
-            path + (slice(edi, edi + 1),) + s for s in 
-            _get_minimal_slice_set(
-                [0 for _ in start[divergence_idx + 1:]],
-                end[divergence_idx + 1:],
-                dims[divergence_idx + 1:],
-                start_edges=[1 for _ in start_edges[divergence_idx + 1:]],
-                end_edges=end_edges[divergence_idx + 1:],
-            )
-        ]
-
-    # If both start and end are at the edges of the subtree rooted at
-    # divergence_idx, we can just select the whole subtree at once
-    if(start_edges[divergence_idx] and end_edges[divergence_idx]):
-        slices.append(
-            path + (slice(start[divergence_idx], end[divergence_idx] + 1),)
-        )
-    # If just start is at the edge, we can grab almost all of the subtree, 
-    # treating only the ragged bottom edge as an edge case
-    elif(start_edges[divergence_idx]):
-        slices.append(
-            path + (slice(start[divergence_idx], end[divergence_idx]),)
-        )
-        slices.extend(lower())
-    # Analogous to the previous case, but the top is ragged this time
-    elif(end_edges[divergence_idx]):
-        slices.extend(upper())
-        slices.append(
-            path + (slice(start[divergence_idx] + 1, end[divergence_idx] + 1),)
-        )
-    # If both sides of the range are ragged, we need to handle both sides
-    # separately. If there's contiguous meat in between them, we can index it
-    # in one big chunk
-    else:
-        slices.extend(upper())
-        middle_ground = end[divergence_idx] - start[divergence_idx]
-        if(middle_ground > 1):
-            slices.append(
-                path + (slice(start[divergence_idx] + 1, end[divergence_idx]),)
-            )
-        slices.extend(lower())
-
-    return [tuple(s) for s in slices]
-
-
-@torch.jit.ignore
-def _chunk_slice(
-    t: torch.Tensor,
-    flat_start: int,
-    flat_end: int,
-    no_batch_dims: int,
-) -> torch.Tensor:
-    """
-        Equivalent to
-        
-            t.reshape((-1,) + t.shape[no_batch_dims:])[flat_start:flat_end]
-
-        but without the need for the initial reshape call, which can be 
-        memory-intensive in certain situations. The only reshape operations
-        in this function are performed on sub-tensors that scale with
-        (flat_end - flat_start), the chunk size.
-    """
-
-    batch_dims = t.shape[:no_batch_dims]
-    start_idx = list(_flat_idx_to_idx(flat_start, batch_dims))
-    # _get_minimal_slice_set is inclusive
-    end_idx = list(_flat_idx_to_idx(flat_end - 1, batch_dims))
-
-    # Get an ordered list of slices to perform
-    slices = _get_minimal_slice_set(
-        start_idx,
-        end_idx,
-        batch_dims,
-    )
-
-    sliced_tensors = [t[s] for s in slices]
-
-    return torch.cat(
-        [s.view((-1,) + t.shape[no_batch_dims:]) for s in sliced_tensors]
-    )
-
-
-def chunk_layer(
-    layer: Callable,
-    inputs: Dict[str, Any],
-    chunk_size: int,
-    no_batch_dims: int,
-    low_mem: bool = False, 
-) -> Any:
-    """
-    Implements the "chunking" procedure described in section 1.11.8.
-
-    Layer outputs and inputs are assumed to be simple "pytrees,"
-    consisting only of (arbitrarily nested) lists, tuples, and dicts with
-    torch.Tensor leaves.
-
-    Args:
-        layer:
-            The layer to be applied chunk-wise
-        inputs:
-            A (non-nested) dictionary of keyworded inputs. All leaves must
-            be tensors and must share the same batch dimensions.
-        chunk_size:
-            The number of sub-batches per chunk. If multiple batch
-            dimensions are specified, a "sub-batch" is defined as a single
-            indexing of all batch dimensions simultaneously (s.t. the
-            number of sub-batches is the product of the batch dimensions).
-        no_batch_dims:
-            How many of the initial dimensions of each input tensor can
-            be considered batch dimensions.
-        low_mem:
-            Avoids flattening potentially large input tensors. Unnecessary
-            in most cases, and is ever so slightly slower than the default
-            setting.
-    Returns:
-        The reassembled output of the layer on the inputs.
-    """
-    if not (len(inputs) > 0):
-        raise ValueError("Must provide at least one input")
-
-    initial_dims = [shape[:no_batch_dims] for shape in _fetch_dims(inputs)]
-    orig_batch_dims = tuple([max(s) for s in zip(*initial_dims)])
-
-    def _prep_inputs(t):
-        # TODO: make this more memory efficient. This sucks
-        if(not low_mem):
-            if not sum(t.shape[:no_batch_dims]) == no_batch_dims:
-                t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
-            t = t.reshape(-1, *t.shape[no_batch_dims:])
-        else:
-            t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
-        return t
-
-    prepped_inputs = tensor_tree_map(_prep_inputs, inputs)
-
-    flat_batch_dim = 1
-    for d in orig_batch_dims:
-        flat_batch_dim *= d
-
-    no_chunks = flat_batch_dim // chunk_size + (
-        flat_batch_dim % chunk_size != 0
-    )
-
-    i = 0
-    out = None
-    for _ in range(no_chunks):
-        # Chunk the input
-        if(not low_mem):
-            select_chunk = (
-                lambda t: t[i : i + chunk_size] if t.shape[0] != 1 else t
-            )
-        else:
-            select_chunk = (
-                partial(
-                    _chunk_slice, 
-                    flat_start=i, 
-                    flat_end=min(flat_batch_dim, i + chunk_size), 
-                    no_batch_dims=len(orig_batch_dims)
-                )
-            )
-
-        chunks = tensor_tree_map(select_chunk, prepped_inputs)
-
-        # Run the layer on the chunk
-        output_chunk = layer(**chunks)
-
-        # Allocate space for the output
-        if out is None:
-            allocate = lambda t: t.new_zeros((flat_batch_dim,) + t.shape[1:])
-            out = tensor_tree_map(allocate, output_chunk)
-
-        # Put the chunk in its pre-allocated space
-        out_type = type(output_chunk)
-        if out_type is dict:
-            def assign(d1, d2):
-                for k, v in d1.items():
-                    if type(v) is dict:
-                        assign(v, d2[k])
-                    else:
-                        v[i : i + chunk_size] = d2[k]
-
-            assign(out, output_chunk)
-        elif out_type is tuple:
-            for x1, x2 in zip(out, output_chunk):
-                x1[i : i + chunk_size] = x2
-        elif out_type is torch.Tensor:
-            out[i : i + chunk_size] = output_chunk
-        else:
-            raise ValueError("Not supported")
-
-        i += chunk_size
-
-    reshape = lambda t: t.view(orig_batch_dims + t.shape[1:])
-    out = tensor_tree_map(reshape, out)
-
-    return out
diff --git a/tests/test_autochunk/openfold/triangular_attention.py b/tests/test_autochunk/openfold/triangular_attention.py
deleted file mode 100644
index 12d09c502daf..000000000000
--- a/tests/test_autochunk/openfold/triangular_attention.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import partialmethod, partial
-import math
-from typing import Optional, List
-
-import torch
-import torch.nn as nn
-
-from .primitives import Linear, LayerNorm, Attention
-from .tensor_utils import (
-    chunk_layer,
-    permute_final_dims,
-    flatten_final_dims,
-)
-
-
-class TriangleAttention(nn.Module):
-    def __init__(
-        self, c_in, c_hidden, no_heads, starting, inf=1e9
-    ):
-        """
-        Args:
-            c_in:
-                Input channel dimension
-            c_hidden:
-                Overall hidden channel dimension (not per-head)
-            no_heads:
-                Number of attention heads
-        """
-        super(TriangleAttention, self).__init__()
-
-        self.c_in = c_in
-        self.c_hidden = c_hidden
-        self.no_heads = no_heads
-        self.starting = starting
-        self.inf = inf
-
-        self.layer_norm = LayerNorm(self.c_in)
-
-        self.linear = Linear(c_in, self.no_heads, bias=False, init="normal")
-
-        self.mha = Attention(
-            self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads
-        )
-
-    @torch.jit.ignore
-    def _chunk(self,
-        x: torch.Tensor,
-        biases: List[torch.Tensor],
-        chunk_size: int,
-    ) -> torch.Tensor:
-        mha_inputs = {
-            "q_x": x,
-            "kv_x": x,
-            "biases": biases,
-        }
-        return chunk_layer(
-            partial(self.mha),
-            mha_inputs,
-            chunk_size=chunk_size,
-            no_batch_dims=len(x.shape[:-2]),
-        )
-
-    def forward(self, 
-        x: torch.Tensor, 
-        mask: Optional[torch.Tensor] = None,
-        chunk_size: Optional[int] = None
-    ) -> torch.Tensor:
-        """
-        Args:
-            x:
-                [*, I, J, C_in] input tensor (e.g. the pair representation)
-        Returns:
-            [*, I, J, C_in] output tensor
-        """
-        if mask is None:
-            # [*, I, J]
-            mask = x.new_ones(
-                x.shape[:-1],
-            )
-
-        # Shape annotations assume self.starting. Else, I and J are flipped
-        if not self.starting:
-            x = x.transpose(-2, -3)
-            mask = mask.transpose(-1, -2)
-
-        # [*, I, J, C_in]
-        x = self.layer_norm(x)
-
-        # [*, I, 1, 1, J]
-        mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
-
-        # [*, H, I, J]
-        triangle_bias = permute_final_dims(self.linear(x), (2, 0, 1))
-
-        # [*, 1, H, I, J]
-        triangle_bias = triangle_bias.unsqueeze(-4)
-
-        biases = [mask_bias, triangle_bias]
-
-        if chunk_size is not None:
-            x = self._chunk(x, biases, chunk_size)
-        else:
-            x = self.mha(q_x=x, kv_x=x, biases=biases)
-
-        if not self.starting:
-            x = x.transpose(-2, -3)
-
-        return x
-
-
-class TriangleAttentionStartingNode(TriangleAttention):
-    """
-    Implements Algorithm 13.
-    """
-
-    __init__ = partialmethod(TriangleAttention.__init__, starting=True)
-
-
-class TriangleAttentionEndingNode(TriangleAttention):
-    """
-    Implements Algorithm 14.
-    """
-
-    __init__ = partialmethod(TriangleAttention.__init__, starting=False)
diff --git a/tests/test_autochunk/openfold/triangular_multiplicative_update.py b/tests/test_autochunk/openfold/triangular_multiplicative_update.py
deleted file mode 100644
index 29f7062c3212..000000000000
--- a/tests/test_autochunk/openfold/triangular_multiplicative_update.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import partialmethod
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from .primitives import Linear, LayerNorm
-from .tensor_utils import permute_final_dims
-
-
-class TriangleMultiplicativeUpdate(nn.Module):
-    """
-    Implements Algorithms 11 and 12.
-    """
-    def __init__(self, c_z, c_hidden, _outgoing=True):
-        """
-        Args:
-            c_z:
-                Input channel dimension
-            c:
-                Hidden channel dimension
-        """
-        super(TriangleMultiplicativeUpdate, self).__init__()
-        self.c_z = c_z
-        self.c_hidden = c_hidden
-        self._outgoing = _outgoing
-
-        self.linear_a_p = Linear(self.c_z, self.c_hidden)
-        self.linear_a_g = Linear(self.c_z, self.c_hidden, init="gating")
-        self.linear_b_p = Linear(self.c_z, self.c_hidden)
-        self.linear_b_g = Linear(self.c_z, self.c_hidden, init="gating")
-        self.linear_g = Linear(self.c_z, self.c_z, init="gating")
-        self.linear_z = Linear(self.c_hidden, self.c_z, init="final")
-
-        self.layer_norm_in = LayerNorm(self.c_z)
-        self.layer_norm_out = LayerNorm(self.c_hidden)
-
-        self.sigmoid = nn.Sigmoid()
-
-    def _combine_projections(self,
-        a: torch.Tensor,
-        b: torch.Tensor,
-    ) -> torch.Tensor:
-        raise NotImplementedError("This method needs to be overridden")
-
-    def forward(self, 
-        z: torch.Tensor, 
-        mask: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
-        """
-        Args:
-            x:
-                [*, N_res, N_res, C_z] input tensor
-            mask:
-                [*, N_res, N_res] input mask
-        Returns:
-            [*, N_res, N_res, C_z] output tensor
-        """
-        if mask is None:
-            mask = z.new_ones(z.shape[:-1])
-
-        mask = mask.unsqueeze(-1)
-
-        z = self.layer_norm_in(z)
-        a = self.linear_a_p(z) * self.sigmoid(self.linear_a_g(z))
-        a = a * mask
-        b = self.linear_b_p(z) * self.sigmoid(self.linear_b_g(z))
-        b = b * mask
-        x = self._combine_projections(a, b)
-        x = self.layer_norm_out(x)
-        x = self.linear_z(x)
-        g = self.sigmoid(self.linear_g(z))
-        z = x * g
-
-        return z
-
-
-class TriangleMultiplicationOutgoing(TriangleMultiplicativeUpdate):
-    """
-    Implements Algorithm 11.
-    """
-    def _combine_projections(self,
-        a: torch.Tensor,  # [*, N_i, N_k, C]
-        b: torch.Tensor,  # [*, N_j, N_k, C]
-    ):
-        # [*, C, N_i, N_j]
-        p = torch.matmul(
-            permute_final_dims(a, (2, 0, 1)),
-            permute_final_dims(b, (2, 1, 0)),
-        )
-
-        # [*, N_i, N_j, C]
-        return permute_final_dims(p, (1, 2, 0))
-
-
-class TriangleMultiplicationIncoming(TriangleMultiplicativeUpdate):
-    """
-    Implements Algorithm 12.
-    """
-    def _combine_projections(self,
-        a: torch.Tensor,  # [*, N_k, N_i, C]
-        b: torch.Tensor,  # [*, N_k, N_j, C]
-    ):
-        # [*, C, N_i, N_j]
-        p = torch.matmul(
-            permute_final_dims(a, (2, 1, 0)),
-            permute_final_dims(b, (2, 0, 1)),
-        )
-
-        # [*, N_i, N_j, C]
-        return permute_final_dims(p, (1, 2, 0))
-
diff --git a/tests/test_autochunk/test_evoformer_codegen.py b/tests/test_autochunk/test_evoformer_codegen.py
new file mode 100644
index 000000000000..1273bf2fecbf
--- /dev/null
+++ b/tests/test_autochunk/test_evoformer_codegen.py
@@ -0,0 +1,164 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.fx
+import torch.multiprocessing as mp
+
+try:
+    from fastfold.model.nn.evoformer import EvoformerBlock
+    HAS_REPO = True
+except Exception:    # optional dependency; unlike a bare except, Ctrl-C/SystemExit still propagate
+    HAS_REPO = False
+
+import colossalai
+from colossalai.core import global_context as gpc
+from colossalai.fx._compatibility import is_compatible_with_meta
+from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if CODEGEN_AVAILABLE and is_compatible_with_meta():
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair, node_mask, pair_mask):
+    """Run ``model`` and ``gm`` on the same inputs and assert their first two outputs agree (atol=1e-4)."""
+    # for memory test (currently disabled)
+    # NOTE(review): the snippet below calls gm with two inputs while the live check passes four; fix before enabling
+    # torch.cuda.reset_peak_memory_stats()
+    # now_mem = torch.cuda.memory_allocated() / 1024**2
+    # with torch.no_grad():
+    #     node1 = node.clone()
+    #     pair1 = pair.clone()
+    #     gm(node1, pair1)
+    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
+    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    # print("autochunk now mem:%.2f max mem:%.2f" % (new_now_mem - now_mem, new_max_mem - now_mem))
+
+    # test forward: both modules run under no_grad on identical inputs
+    model = model.cuda()
+    with torch.no_grad():
+        non_fx_out = model(node, pair, node_mask, pair_mask)
+        fx_out = gm(node, pair, node_mask, pair_mask)
+
+    # on mismatch the assertion message reports the mean absolute difference
+    assert torch.allclose(non_fx_out[0], fx_out[0],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[0] - fx_out[0]))
+    assert torch.allclose(non_fx_out[1], fx_out[1],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[1] - fx_out[1]))
+
+
+def _build_openfold():
+    """Return an ``EvoformerBlock`` switched to eval mode and moved to CUDA."""
+    return EvoformerBlock(
+        c_m=256,
+        c_z=128,
+        c_hidden_msa_att=32,
+        c_hidden_opm=32,
+        c_hidden_mul=128,
+        c_hidden_pair_att=32,
+        no_heads_msa=8,
+        no_heads_pair=4,
+        transition_n=4,
+        msa_dropout=0.15,
+        pair_dropout=0.15,
+        inf=1e4,
+        eps=1e-4,
+        is_multimer=False,
+    ).eval().cuda()
+
+
+def _test_evoformer_codegen(rank, msa_len, pair_len, max_memory):
+    """Single-process worker: launch colossalai, trace an EvoformerBlock and compare forward outputs."""
+    # launch colossalai with world_size=1 on localhost, using the port returned by free_port()
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+
+    # build model and random CUDA inputs (the masks are drawn from randn as well)
+    model = _build_openfold()
+    node = torch.randn(1, msa_len, pair_len, 256).cuda()
+    node_mask = torch.randn(1, msa_len, pair_len).cuda()
+    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
+    pair_mask = torch.randn(1, pair_len, pair_len).cuda()
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={
+            "m": node.to(torch.device("meta")),
+            "z": pair.to(torch.device("meta")),
+            "msa_mask": node_mask.to(torch.device("meta")),
+            "pair_mask": pair_mask.to(torch.device("meta")),
+        },
+        concrete_args={
+            "chunk_size": None,
+            "_mask_trans": True,
+        },
+    )
+    interp = MetaInfoProp(meta_graph)
+    interp.propagate(
+        MetaTensor(node, fake_device="cuda:0"),
+        MetaTensor(pair, fake_device="cuda:0"),
+        MetaTensor(node_mask, fake_device="cuda:0"),
+        MetaTensor(pair_mask, fake_device="cuda:0"),
+    )
+    # codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory)    # NOTE(review): disabled, so max_memory is unused
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model,
+        meta_args={
+            "m": node.to(torch.device("meta")),
+            "z": pair.to(torch.device("meta")),
+            "msa_mask": node_mask.to(torch.device("meta")),
+            "pair_mask": pair_mask.to(torch.device("meta")),
+        },
+        concrete_args={
+            "chunk_size": None,
+            "_mask_trans": True,
+        },
+    )
+    # graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph)
+    gm.recompile()
+
+    # assert we have inserted chunk (NOTE(review): set_codegen is disabled above -- confirm this still holds)
+    code = graph.python_code("self").src
+    assert "chunk_size" in code
+
+    _test_fwd(model, gm, node, pair, node_mask, pair_mask)
+    gpc.destroy()
+
+
+@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
+                    reason="requires torch >= 1.12.0 and the fastfold package")
+@pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
+@pytest.mark.parametrize("msa_len", [32])
+@pytest.mark.parametrize("pair_len", [64])
+def test_evoformer_codegen(msa_len, pair_len, max_memory):
+    """Run ``_test_evoformer_codegen`` in one spawned process for each parameter combination."""
+    # mp.spawn supplies the process index as the first positional argument, i.e. ``rank``
+    run_func = partial(
+        _test_evoformer_codegen,
+        msa_len=msa_len,
+        pair_len=pair_len,
+        max_memory=max_memory,
+    )
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    _test_evoformer_codegen(0, 32, 64, 25)
diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_simple_evoformer_codegen.py
similarity index 88%
rename from tests/test_autochunk/test_autochunk_codegen.py
rename to tests/test_autochunk/test_simple_evoformer_codegen.py
index 02fa07e2ca00..f1272330fcd9 100644
--- a/tests/test_autochunk/test_autochunk_codegen.py
+++ b/tests/test_autochunk/test_simple_evoformer_codegen.py
@@ -5,6 +5,12 @@
 import torch.fx
 import torch.multiprocessing as mp
 
+try:
+    from simple_evoformer import base_evoformer
+    HAS_REPO = True
+except Exception:    # optional dependency; unlike a bare except, Ctrl-C/SystemExit still propagate
+    HAS_REPO = False
+
 import colossalai
 from colossalai.core import global_context as gpc
 from colossalai.fx import ColoTracer
@@ -13,7 +19,6 @@
 from colossalai.fx.graph_module import ColoGraphModule
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.utils import free_port
-from tests.test_autochunk.evoformer.evoformer import evoformer_base
 
 if CODEGEN_AVAILABLE and is_compatible_with_meta():
     from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
@@ -48,7 +53,7 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
                               torch.abs(non_fx_out[1] - fx_out[1]))
 
 
-def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
+def _test_simple_evoformer_codegen(rank, msa_len, pair_len, max_memory):
     # launch colossalai
     colossalai.launch(
         config={},
@@ -60,7 +65,7 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
     )
 
     # build model and input
-    model = evoformer_base().cuda()
+    model = base_evoformer().cuda()
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
 
@@ -95,13 +100,14 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory):
     gpc.destroy()
 
 
-@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta()), reason='torch version is lower than 1.12.0')
+@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
+                    reason='requires torch >= 1.12.0 and the simple_evoformer package')
 @pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])
-def test_autochunk_codegen(msa_len, pair_len, max_memory):
+def test_simple_evoformer_codegen(msa_len, pair_len, max_memory):
     run_func = partial(
-        _test_autochunk_codegen,
+        _test_simple_evoformer_codegen,
         msa_len=msa_len,
         pair_len=pair_len,
         max_memory=max_memory,
@@ -110,4 +116,4 @@ def test_autochunk_codegen(msa_len, pair_len, max_memory):
 
 
 if __name__ == "__main__":
-    _test_autochunk_codegen(0, 32, 64, 25)
+    _test_simple_evoformer_codegen(0, 32, 64, 25)
diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_simple_evoformer_search.py
similarity index 87%
rename from tests/test_autochunk/test_autochunk_search.py
rename to tests/test_autochunk/test_simple_evoformer_search.py
index 371fce64fdf7..04fb514fbf44 100644
--- a/tests/test_autochunk/test_autochunk_search.py
+++ b/tests/test_autochunk/test_simple_evoformer_search.py
@@ -5,13 +5,18 @@
 import torch.fx
 import torch.multiprocessing as mp
 
+try:
+    from simple_evoformer import base_evoformer
+    HAS_REPO = True
+except Exception:    # optional dependency; unlike a bare except, Ctrl-C/SystemExit still propagate
+    HAS_REPO = False
+
 import colossalai
 from colossalai.core import global_context as gpc
 from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.utils import free_port
-from tests.test_autochunk.evoformer.evoformer import evoformer_base
 
 if CODEGEN_AVAILABLE and is_compatible_with_meta():
     from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
@@ -57,7 +62,7 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
         )
 
 
-def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
+def _test_simple_evoformer_search(rank, msa_len, pair_len, max_memory):
     # launch colossalai
     colossalai.launch(
         config={},
@@ -69,7 +74,7 @@ def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
     )
 
     # build model and input
-    model = evoformer_base().cuda()
+    model = base_evoformer().cuda()
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
 
@@ -84,13 +89,14 @@ def _test_autochunk_search(rank, msa_len, pair_len, max_memory):
     gpc.destroy()
 
 
-@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta()), reason="torch version is lower than 1.12.0")
+@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
+                    reason="requires torch >= 1.12.0 and the simple_evoformer package")
 @pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])
-def test_autochunk_search(msa_len, pair_len, max_memory):
+def test_simple_evoformer_search(msa_len, pair_len, max_memory):
     run_func = partial(
-        _test_autochunk_search,
+        _test_simple_evoformer_search,
         msa_len=msa_len,
         pair_len=pair_len,
         max_memory=max_memory,
@@ -99,4 +105,4 @@ def test_autochunk_search(msa_len, pair_len, max_memory):
 
 
 if __name__ == "__main__":
-    _test_autochunk_search(0, 32, 64, 20)
+    _test_simple_evoformer_search(0, 32, 64, 20)

From fcc6d61d925a6ad3e95e8d71e8f16361595a725f Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Tue, 17 Jan 2023 13:07:25 +0800
Subject: [PATCH 198/503] [example] fix requirements (#2488)

---
 .../language/gpt/experiments/pipeline_parallel/requirements.txt | 2 ++
 examples/language/gpt/gemini/requirements.txt                   | 2 ++
 examples/language/gpt/requirements.txt                          | 1 +
 examples/language/opt/requirements.txt                          | 2 ++
 4 files changed, 7 insertions(+)
 create mode 100644 examples/language/gpt/experiments/pipeline_parallel/requirements.txt
 create mode 100644 examples/language/gpt/gemini/requirements.txt
 create mode 100644 examples/language/opt/requirements.txt

diff --git a/examples/language/gpt/experiments/pipeline_parallel/requirements.txt b/examples/language/gpt/experiments/pipeline_parallel/requirements.txt
new file mode 100644
index 000000000000..137a69e80498
--- /dev/null
+++ b/examples/language/gpt/experiments/pipeline_parallel/requirements.txt
@@ -0,0 +1,2 @@
+colossalai >= 0.1.12
+torch >= 1.8.1
diff --git a/examples/language/gpt/gemini/requirements.txt b/examples/language/gpt/gemini/requirements.txt
new file mode 100644
index 000000000000..137a69e80498
--- /dev/null
+++ b/examples/language/gpt/gemini/requirements.txt
@@ -0,0 +1,2 @@
+colossalai >= 0.1.12
+torch >= 1.8.1
diff --git a/examples/language/gpt/requirements.txt b/examples/language/gpt/requirements.txt
index e1f131468fb8..ef58bb76bfc8 100644
--- a/examples/language/gpt/requirements.txt
+++ b/examples/language/gpt/requirements.txt
@@ -1 +1,2 @@
 transformers >= 4.23
+colossalai
diff --git a/examples/language/opt/requirements.txt b/examples/language/opt/requirements.txt
new file mode 100644
index 000000000000..137a69e80498
--- /dev/null
+++ b/examples/language/opt/requirements.txt
@@ -0,0 +1,2 @@
+colossalai >= 0.1.12
+torch >= 1.8.1

From d565a248495b49e78c10d8f3b74de2f8abb63ece Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Wed, 18 Jan 2023 10:36:10 +0800
Subject: [PATCH 199/503] [zero] add unit testings for hybrid parallelism 
 (#2486)

---
 .../sharded_optim/bookkeeping/bucket_store.py |  12 +-
 .../zero/sharded_optim/low_level_optim.py     | 159 ++++++++----------
 tests/test_tensor/common_utils/_utils.py      |  17 +-
 .../test_zero/low_level_zero/test_zero_tp.py  |  98 +++++++++++
 4 files changed, 188 insertions(+), 98 deletions(-)
 create mode 100644 tests/test_zero/low_level_zero/test_zero_tp.py

diff --git a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
index 9e0c05d8941a..ec322a78bf81 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
@@ -7,7 +7,6 @@ class BucketStore(BaseStore):
 
     def __init__(self, torch_pg: ProcessGroup):
         super().__init__(torch_pg)
-        self._grads = dict()
         self._params = dict()
         self._num_elements_in_bucket = dict()
 
@@ -19,25 +18,24 @@ def num_elements_in_bucket(self, reduce_rank: int = None):
     def add_num_elements_in_bucket(self, num_elements, reduce_rank: int = None):
         self._num_elements_in_bucket[reduce_rank] += num_elements
 
-    def add_grad(self, tensor, reduce_rank: int = None):
-        self._grads[reduce_rank].append(tensor)
-
     def add_param(self, tensor, reduce_rank: int = None):
         self._params[reduce_rank].append(tensor)
 
     def reset(self):
         keys = [None] + list(range(self._world_size))
-        self._grads = {rank: [] for rank in keys}
         self._params = {rank: [] for rank in keys}
         self._num_elements_in_bucket = {rank: 0 for rank in keys}
 
     def reset_by_rank(self, reduce_rank=None):
-        self._grads[reduce_rank] = []
         self._params[reduce_rank] = []
         self._num_elements_in_bucket[reduce_rank] = 0
 
     def get_grad(self, reduce_rank: int = None):
-        return self._grads[reduce_rank]
+        param_list = self.get_param(reduce_rank)
+        for param in param_list:
+            # the param must have grad for reduction
+            assert param.grad is not None, f'Parameter of size ({param.size()}) has None grad, cannot be reduced'
+        return [param.grad for param in param_list]
 
     def get_param(self, reduce_rank: int = None):
         return self._params[reduce_rank]
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index 38736d01afef..f45b5e200a61 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -46,7 +46,7 @@ def __init__(
             reduce_bucket_size: int = 1024 * 1024,    # communication
             communication_dtype: Optional[torch.dtype] = None,
             overlap_communication: bool = False,
-            partition_grad: bool = False,    # stage 2
+            partition_grad: bool = False,    # stage 2 flag
             cpu_offload: bool = False,    # cpu offload
             forced_dtype: Optional[torch.dtype] = None):
 
@@ -248,9 +248,13 @@ def _partition_param_list(self, param_list):
             self._logger.info(f'Number of elements on ranks: {numel_per_rank}', ranks=[0])
         return params_per_rank
 
-    ###########################################################
-    # Backward Reduction Hook
-    ###########################################################
+    ###########################
+    # Backward Reduction Hook #
+    ###########################
+
+    def _grad_handler(self, param, grad, reduce_rank):
+        self._add_to_reduction_bucket(param, reduce_rank)
+        return grad
 
     def _attach_reduction_hook(self):
         # we iterate over the fp16 params
@@ -268,53 +272,61 @@ def _attach_reduction_hook(self):
                     else:
                         reduce_rank = None
 
-                    def _define_and_attach(param, reduce_rank):
-                        # get the AccumulateGrad object of the param itself
-                        accum_grad_obj = get_grad_accumulate_object(param)
-                        self._grad_store.add_accumulate_grad_object(accum_grad_obj)
+                    param.register_hook(partial(self._grad_handler, param, reduce_rank=reduce_rank))
 
-                        reduction_func = partial(self._reduce_and_remove_grads_by_bucket,
-                                                 param=param,
-                                                 reduce_rank=reduce_rank)
+    def _reduce_tensor_bucket(self, bucket: TensorBucket, reduce_rank):
+        if self._overlap_communication:
+            torch.cuda.synchronize()
+            self._param_store.clear_grads_of_previous_reduced_params()
+            stream = self._comm_stream
+        else:
+            stream = torch.cuda.current_stream()
 
-                        # define hook
-                        # NOT IMPORTANT BUT GOOD TO KNOW:
-                        # args here is not grad, but allow_unreacable and accumulate_grad
-                        def reduce_grad_hook(*args):
-                            reduction_func()
+        with torch.cuda.stream(stream):
+            flat = bucket.flatten()
+            reduce_global_rank = None
+            if reduce_rank is not None:
+                reduce_global_rank = self._dp_global_ranks[reduce_rank]
+            reduced_flat = reduce_tensor_dp_group(tensor=flat,
+                                                  dtype=self._communication_dtype,
+                                                  dst_local_rank=reduce_rank,
+                                                  dst_global_rank=reduce_global_rank,
+                                                  group=self._dp_torch_group)
 
-                        accum_grad_obj.register_hook(reduce_grad_hook)
+            # update the reduced tensor
+            if reduce_rank is None or reduce_rank == self._local_rank:
+                bucket.unflatten_and_copy(reduced_flat)
 
-                    _define_and_attach(param, reduce_rank)
+    def _reduce_tensor_list_with_one_dtype(self, tensor_list, bucket_size, reduce_rank):
+        param_bucket = TensorBucket(size=bucket_size)
 
-    def _reduce_and_remove_grads_by_bucket(self, param, reduce_rank=None):
-        param_size = param.numel()
+        for tensor in tensor_list:
+            param_bucket.add_to_bucket(tensor, allow_oversize=True)
 
-        # check if the bucket is full
-        # if full, will reduce the grads already in the bucket
-        # after reduction, the bucket will be empty
-        if self._bucket_store.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size:
-            self._reduce_grads_in_bucket(reduce_rank)
+            if param_bucket.is_full_or_oversized():
+                self._reduce_tensor_bucket(bucket=param_bucket, reduce_rank=reduce_rank)
+                param_bucket.empty()
 
-        # the param must not be reduced to ensure correctness
-        is_param_reduced = self._param_store.is_param_reduced(param)
-        if is_param_reduced:
-            msg = f'Parameter of size ({param.size()}) has already been reduced, ' \
-                  + 'duplicate reduction will lead to arithmetic incorrectness'
-            raise RuntimeError(msg)
+        if not param_bucket.is_empty():
+            self._reduce_tensor_bucket(bucket=param_bucket, reduce_rank=reduce_rank)
 
-        # the param must have grad for reduction
-        assert param.grad is not None, f'Parameter of size ({param.size()}) has None grad, cannot be reduced'
+    def _reduce_grads(self, reduce_rank, grads, bucket_size):
+        grad_buckets_by_dtype = split_half_float_double(grads)
 
-        self._bucket_store.add_num_elements_in_bucket(param_size, reduce_rank)
-        self._bucket_store.add_grad(param.grad, reduce_rank)
-        self._bucket_store.add_param(param, reduce_rank)
+        for tensor_list in grad_buckets_by_dtype:
+            self._reduce_tensor_list_with_one_dtype(tensor_list=tensor_list,
+                                                    bucket_size=bucket_size,
+                                                    reduce_rank=reduce_rank)
+
+    #######################
+    # Reduction Functions #
+    #######################
 
-    def _reduce_grads_in_bucket(self, reduce_rank=None):
+    def _run_reduction(self, reduce_rank=None):
         # reduce grads
-        self._reduce_grads_by_rank(reduce_rank=reduce_rank,
-                                   grads=self._bucket_store.get_grad(reduce_rank=reduce_rank),
-                                   bucket_size=self._bucket_store.num_elements_in_bucket(reduce_rank))
+        self._reduce_grads(reduce_rank=reduce_rank,
+                           grads=self._bucket_store.get_grad(reduce_rank=reduce_rank),
+                           bucket_size=self._bucket_store.num_elements_in_bucket(reduce_rank))
 
         # use communication stream if overlapping
         # communication with computation
@@ -351,50 +363,24 @@ def _reduce_grads_in_bucket(self, reduce_rank=None):
 
         self._bucket_store.reset_by_rank(reduce_rank)
 
-    def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size):
-        grad_buckets_by_dtype = split_half_float_double(grads)
-
-        for tensor_list in grad_buckets_by_dtype:
-            self._reduce_no_retain(tensor_list=tensor_list, bucket_size=bucket_size, reduce_rank=reduce_rank)
-
-    ##############################
-    # Reduction Utility Function #
-    ##############################
-    def _reduce_no_retain(self, tensor_list, bucket_size, reduce_rank):
-        param_bucket = TensorBucket(size=bucket_size)
-
-        for tensor in tensor_list:
-            param_bucket.add_to_bucket(tensor, allow_oversize=True)
-
-            if param_bucket.is_full_or_oversized():
-                self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
-                param_bucket.empty()
-
-        if not param_bucket.is_empty():
-            self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
+    def _add_to_reduction_bucket(self, param, reduce_rank=None):
+        param_size = param.numel()
 
-    def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
-        if self._overlap_communication:
-            torch.cuda.synchronize()
-            self._param_store.clear_grads_of_previous_reduced_params()
-            stream = self._comm_stream
-        else:
-            stream = torch.cuda.current_stream()
+        # check if the bucket is full
+        # if full, will reduce the grads already in the bucket
+        # after reduction, the bucket will be empty
+        if self._bucket_store.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size:
+            self._run_reduction(reduce_rank)
 
-        with torch.cuda.stream(stream):
-            flat = bucket.flatten()
-            reduce_global_rank = None
-            if reduce_rank is not None:
-                reduce_global_rank = self._dp_global_ranks[reduce_rank]
-            reduced_flat = reduce_tensor_dp_group(tensor=flat,
-                                                  dtype=self._communication_dtype,
-                                                  dst_local_rank=reduce_rank,
-                                                  dst_global_rank=reduce_global_rank,
-                                                  group=self._dp_torch_group)
+        # the param must not be reduced to ensure correctness
+        is_param_reduced = self._param_store.is_param_reduced(param)
+        if is_param_reduced:
+            msg = f'Parameter of size ({param.size()}) has already been reduced, ' \
+                  + 'duplicate reduction will lead to arithmetic incorrectness'
+            raise RuntimeError(msg)
 
-            # update the reduced tensor
-            if reduce_rank is None or reduce_rank == self._local_rank:
-                bucket.unflatten_and_copy(reduced_flat)
+        self._bucket_store.add_num_elements_in_bucket(param_size, reduce_rank)
+        self._bucket_store.add_param(param, reduce_rank)
 
     ################################
     # torch.optim.Optimizer methods
@@ -498,8 +484,9 @@ def step(self, closure=None):
         # broadcast the updated model weights
         handles = []
         for group_id in range(self.num_param_groups):
-            for rank in range(self._world_size):
-                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
+            for index in range(self._world_size):
+                rank = self._dp_global_ranks[index]
+                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=index, group_id=group_id)
                 handle = dist.broadcast(fp16_param, src=rank, group=self._dp_torch_group, async_op=True)
                 handles.append(handle)
 
@@ -585,11 +572,11 @@ def _reduce_grad_stage1(self):
                 param_group = self._fp16_param_groups[group_id]
                 for param in param_group:
                     if param.grad is not None:
-                        self._reduce_and_remove_grads_by_bucket(param)
+                        self._add_to_reduction_bucket(param)
 
         # we need to reduce the gradients
         # left in the communication bucket
-        self._reduce_grads_in_bucket()
+        self._run_reduction()
 
     def _reduce_grad_stage2(self):
         # when partition_grads is True, reduction hooks
@@ -597,4 +584,4 @@ def _reduce_grad_stage2(self):
         # only need to reduce the gradients
         # left in the communication bucket
         for reduce_rank in range(self._world_size):
-            self._reduce_grads_in_bucket(reduce_rank)
+            self._run_reduction(reduce_rank)
diff --git a/tests/test_tensor/common_utils/_utils.py b/tests/test_tensor/common_utils/_utils.py
index 6b58aa801d15..b405f8cd2108 100644
--- a/tests/test_tensor/common_utils/_utils.py
+++ b/tests/test_tensor/common_utils/_utils.py
@@ -4,6 +4,7 @@
 import numpy as np
 import torch
 import torch.distributed as dist
+from torch.testing import assert_close
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
@@ -41,14 +42,20 @@ def broadcast_tensor_chunk(tensor, chunk_size=1, local_rank=0):
     return tensor_chunk.clone()
 
 
-def tensor_equal(A, B):
-    return torch.allclose(A, B, rtol=1e-3, atol=1e-1)
+def tensor_equal(t_a: torch.Tensor, t_b: torch.Tensor, rtol: float = 1e-3, atol: float = 1e-1):
+    assert_close(t_a, t_b, rtol=rtol, atol=atol)
+    return True
 
 
-def tensor_shard_equal(tensor: torch.Tensor, shard: torch.Tensor, rank, world_size):
+def tensor_shard_equal(tensor: torch.Tensor,
+                       shard: torch.Tensor,
+                       rank: int,
+                       world_size: int,
+                       rtol: float = 1e-3,
+                       atol: float = 1e-1):
     assert tensor.ndim == shard.ndim
     if tensor.shape == shard.shape:
-        return tensor_equal(tensor, shard)
+        return tensor_equal(tensor, shard, rtol, atol)
     else:
         dims_not_eq = torch.nonzero(torch.tensor(tensor.shape) != torch.tensor(shard.shape))
         if dims_not_eq.numel() == 1:
@@ -58,7 +65,7 @@ def tensor_shard_equal(tensor: torch.Tensor, shard: torch.Tensor, rank, world_si
                 world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
             if rank is None:
                 rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
-            return tensor_equal(tensor.chunk(world_size, dim)[rank], shard)
+            return tensor_equal(tensor.chunk(world_size, dim)[rank], shard, rtol, atol)
         else:
             raise NotImplementedError
 
diff --git a/tests/test_zero/low_level_zero/test_zero_tp.py b/tests/test_zero/low_level_zero/test_zero_tp.py
new file mode 100644
index 000000000000..8ba6e3cb61fd
--- /dev/null
+++ b/tests/test_zero/low_level_zero/test_zero_tp.py
@@ -0,0 +1,98 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.testing import assert_close
+
+import colossalai
+from colossalai.tensor import ProcessGroup
+from colossalai.testing import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port, get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import LowLevelZeroOptimizer
+from tests.test_tensor.common_utils import set_seed, split_param_col_tp1d, split_param_row_tp1d, tensor_shard_equal
+
+
+def strict_shard_equal(tensor, shard, tp_pg, rtol=1e-3, atol=1e-4):
+    return tensor_shard_equal(tensor, shard, tp_pg.tp_local_rank(), tp_pg.tp_world_size(), rtol, atol)
+
+
+class TestModel(nn.Module):
+
+    def __init__(self):
+        super(TestModel, self).__init__()
+        self.linear1 = nn.Linear(32, 128)
+        self.act = nn.GELU()
+        self.linear2 = nn.Linear(128, 32)
+
+    def forward(self, x):
+        y = self.linear1(x)
+        y = self.act(y)
+        y = self.linear2(y)
+        return x + y
+
+
+@parameterize("overlap_flag", [False, True])
+@parameterize("partition_flag", [False, True])
+def exam_zero_with_tp(overlap_flag, partition_flag):
+    set_seed(233010)
+    tp_pg = ProcessGroup(tp_degree=2)
+
+    with ColoInitContext(device=get_current_device(), default_pg=tp_pg):
+        hybrid_model = TestModel()
+    torch_model = TestModel().cuda()
+    for pt, ph in zip(torch_model.parameters(), hybrid_model.parameters()):
+        pt.data.copy_(ph.data)
+
+    for name, param in hybrid_model.named_parameters():
+        if 'linear1' in name:
+            split_param_row_tp1d(param, tp_pg)
+            param.compute_spec.set_output_replicate(False)
+        if 'linear2.weight' in name:
+            split_param_col_tp1d(param, tp_pg)
+
+    torch_model = DDP(torch_model, device_ids=[tp_pg.rank()], process_group=tp_pg.dp_process_group())
+    torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1)
+    hybrid_optim = torch.optim.Adam(hybrid_model.parameters(), lr=1)
+    hybrid_optim = LowLevelZeroOptimizer(hybrid_optim,
+                                         initial_scale=1,
+                                         overlap_communication=overlap_flag,
+                                         partition_grad=partition_flag)
+
+    dp_local_rank = tp_pg.dp_local_rank()
+    set_seed(255 + dp_local_rank)
+
+    data = torch.randn(8, 32, device=get_current_device())
+    torch_loss = torch_model(data).sum()
+    hybrid_loss = hybrid_model(data).sum()
+    assert_close(torch_loss, hybrid_loss)
+
+    torch_loss.backward()
+    hybrid_optim.backward(hybrid_loss)
+    hybrid_optim.sync_grad()
+
+    torch_optim.step()
+    hybrid_optim.step()
+
+    for (name, pt), ph in zip(torch_model.named_parameters(), hybrid_model.parameters()):
+        assert strict_shard_equal(pt.data, ph.data, tp_pg)
+
+
+def run_dist(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
+    exam_zero_with_tp()
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_zero_with_tp():
+    world_size = 4
+    run_func = partial(run_dist, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_zero_with_tp()

From a4b75b78a07254680286431963c527a482bac93c Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Wed, 18 Jan 2023 11:37:16 +0800
Subject: [PATCH 200/503] [hotfix] gpt example titans bug #2493

---
 .../titans/configs/gpt2_small_zero3_pp1d.py   |  4 +-
 .../language/gpt/titans/dataset/webtext.py    | 39 +++++++++++++++++++
 examples/language/gpt/titans/train_gpt.py     |  2 +-
 3 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 examples/language/gpt/titans/dataset/webtext.py

diff --git a/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
index 8ef81cb0a14f..7bf53303948a 100644
--- a/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
+++ b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
@@ -12,11 +12,11 @@
 
 # if you do no want zero, just comment out this dictionary
 zero = dict(model_config=dict(tensor_placement_policy='cuda', shard_strategy=TensorShardStrategy()),
-            optimizer_config=dict(initial_scale=2**16))
+            optimizer_config=dict(initial_scale=2**5))
 
 optimizer = dict(
     type=HybridAdam,
-    lr=0.00015,
+    lr=0.000015,
     weight_decay=1e-2,
 )
 
diff --git a/examples/language/gpt/titans/dataset/webtext.py b/examples/language/gpt/titans/dataset/webtext.py
new file mode 100644
index 000000000000..09d8870b530b
--- /dev/null
+++ b/examples/language/gpt/titans/dataset/webtext.py
@@ -0,0 +1,39 @@
+import json
+import os
+
+import torch
+from torch.utils.data import Dataset
+from transformers import GPT2Tokenizer
+
+from colossalai.registry import DATASETS
+
+
+@DATASETS.register_module
+class WebtextDataset(Dataset):
+
+    def __init__(self, path, seq_len=1024) -> None:
+        super().__init__()
+        root = os.path.dirname(path)
+        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
+        if os.path.isfile(encoded_data_cache_path):
+            seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
+            if seq_len_ == seq_len:
+                self.data = data
+                self.attention_mask = attention_mask
+                return
+        raw_data = []
+        with open(path) as f:
+            for line in f.readlines():
+                raw_data.append(json.loads(line)['text'])
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.unk_token
+        encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
+        self.data = encoded_data['input_ids']
+        self.attention_mask = encoded_data['attention_mask']
+        torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        return {'input_ids': self.data[index], 'attention_mask': self.attention_mask[index]}, self.data[index]
diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
index 1380b4b3a7da..4db7a081fc17 100644
--- a/examples/language/gpt/titans/train_gpt.py
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -30,7 +30,7 @@ def calc_local_model_size(model: torch.nn.Module):
 def main():
     parser = colossalai.get_default_parser()
     parser.add_argument('--from_torch', default=False, action='store_true')
-    parser.add_argument('--use_dummy_dataset', default=True, action='store_true')
+    parser.add_argument('--use_dummy_dataset', default=False, action='store_true')
     args = parser.parse_args()
     disable_existing_loggers()
     if args.from_torch:

From e58cc441e2142f53b61d2b95558974753f9a6e68 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Wed, 18 Jan 2023 12:00:08 +0800
Subject: [PATCH 201/503] polish code and fix dataloader bugs

---
 .../language/gpt/titans/dataset/webtext.py    | 42 +++++++-------
 examples/language/gpt/titans/run.sh           |  3 +-
 examples/language/gpt/titans/train_gpt.py     | 55 ++++---------------
 3 files changed, 35 insertions(+), 65 deletions(-)

diff --git a/examples/language/gpt/titans/dataset/webtext.py b/examples/language/gpt/titans/dataset/webtext.py
index 09d8870b530b..64f5944a97f9 100644
--- a/examples/language/gpt/titans/dataset/webtext.py
+++ b/examples/language/gpt/titans/dataset/webtext.py
@@ -1,5 +1,6 @@
 import json
 import os
+from typing import Optional
 
 import torch
 from torch.utils.data import Dataset
@@ -11,26 +12,29 @@
 @DATASETS.register_module
 class WebtextDataset(Dataset):
 
-    def __init__(self, path, seq_len=1024) -> None:
+    def __init__(self, path: Optional[str] = None, seq_len=1024) -> None:
         super().__init__()
-        root = os.path.dirname(path)
-        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
-        if os.path.isfile(encoded_data_cache_path):
-            seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
-            if seq_len_ == seq_len:
-                self.data = data
-                self.attention_mask = attention_mask
-                return
-        raw_data = []
-        with open(path) as f:
-            for line in f.readlines():
-                raw_data.append(json.loads(line)['text'])
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        tokenizer.pad_token = tokenizer.unk_token
-        encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
-        self.data = encoded_data['input_ids']
-        self.attention_mask = encoded_data['attention_mask']
-        torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)
+        if path is not None:
+            root = os.path.dirname(path)
+            encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
+            if os.path.isfile(encoded_data_cache_path):
+                seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
+                if seq_len_ == seq_len:
+                    self.data = data
+                    self.attention_mask = attention_mask
+                    return
+            raw_data = []
+            with open(path) as f:
+                for line in f.readlines():
+                    raw_data.append(json.loads(line)['text'])
+            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+            tokenizer.pad_token = tokenizer.unk_token
+            encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
+            self.data = encoded_data['input_ids']
+            self.attention_mask = encoded_data['attention_mask']
+        else:
+            self.data = torch.randint(0, 50257, (10240, seq_len))
+            self.attention_mask = torch.ones_like(self.data)
 
     def __len__(self):
         return len(self.data)
diff --git a/examples/language/gpt/titans/run.sh b/examples/language/gpt/titans/run.sh
index 157bd377aa34..a1a7fc737db0 100644
--- a/examples/language/gpt/titans/run.sh
+++ b/examples/language/gpt/titans/run.sh
@@ -1,2 +1,3 @@
 export DATA=/data/scratch/gpt_data/small-gpt-dataset.json
-colossalai run --nproc_per_node=4 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch
+DUMMY_DATA=--use_dummy_dataset
+colossalai run --nproc_per_node=2 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch $DUMMY_DATA
diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
index 4db7a081fc17..66225d6c8044 100644
--- a/examples/language/gpt/titans/train_gpt.py
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -3,6 +3,7 @@
 
 import torch
 import torch.nn as nn
+from dataset.webtext import WebtextDataset
 from titans.model.gpt import GPTLMLoss
 
 import colossalai
@@ -39,52 +40,16 @@ def main():
         colossalai.launch_from_slurm(config=args.config, host=args.host, port=29500, seed=42)
     logger = get_dist_logger()
 
-    if not args.use_dummy_dataset:
-        data_path = os.environ['DATA']
-        logger.info(f'Build data loader from path {data_path}', ranks=[0])
-        from dataset.webtext import WebtextDataset
-        train_ds = WebtextDataset(os.environ['DATA'], seq_len=gpc.config.SEQ_LEN)
-        train_dataloader = utils.get_dataloader(train_ds,
-                                                seed=42,
-                                                batch_size=gpc.config.BATCH_SIZE,
-                                                pin_memory=True,
-                                                shuffle=True,
-                                                drop_last=True)
-    else:
-        # build a dummy train_dataloader
-        logger.info('Build data loader using dummy data', ranks=[0])
-
-        def get_data(batch_size, seq_len, vocab_size):
-            input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
-            attention_mask = torch.ones_like(input_ids)
-            return input_ids, attention_mask
-
-        # 10 iterations
-        input_ids, attn_mask = get_data(gpc.config.BATCH_SIZE * 10, gpc.config.SEQ_LEN, VOCAB_SIZE)
-        from torch.utils.data import DataLoader, Dataset
-
-        class TextSamplerDataset(Dataset):
-
-            def __init__(self, data, seq_len):
-                super().__init__()
-                self.data = data
-                self.seq_len = seq_len
-
-            def __getitem__(self, index):
-                rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
-                full_seq = self.data[rand_start:rand_start + self.seq_len + 1].long()
-                return full_seq.cuda()
-
-            def __len__(self):
-                return self.data.size(0) // self.seq_len
-
-        def cycle(loader):
-            while True:
-                for data in loader:
-                    yield data
+    data_path = None if args.use_dummy_dataset else os.environ['DATA']
+    logger.info(f'Build data loader from path {data_path}', ranks=[0])
 
-        train_dataset = TextSamplerDataset(input_ids, gpc.config.SEQ_LEN)
-        train_dataloader = DataLoader(train_dataset, batch_size=gpc.config.BATCH_SIZE)
+    train_ds = WebtextDataset(path=data_path, seq_len=gpc.config.SEQ_LEN)
+    train_dataloader = utils.get_dataloader(train_ds,
+                                            seed=42,
+                                            batch_size=gpc.config.BATCH_SIZE,
+                                            pin_memory=True,
+                                            shuffle=True,
+                                            drop_last=True)
 
     logger.info('Build model', ranks=[0])
     use_pipeline = is_using_pp()

From e327e95144f4db8875531699e5b048f77cb80eba Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Wed, 18 Jan 2023 12:04:18 +0800
Subject: [PATCH 202/503] [hotfix] gpt example titans bug #2493 (#2494)

---
 .../titans/configs/gpt2_small_zero3_pp1d.py   |  4 +-
 .../language/gpt/titans/dataset/webtext.py    | 43 ++++++++++++++
 examples/language/gpt/titans/run.sh           |  3 +-
 examples/language/gpt/titans/train_gpt.py     | 57 ++++---------------
 4 files changed, 58 insertions(+), 49 deletions(-)
 create mode 100644 examples/language/gpt/titans/dataset/webtext.py

diff --git a/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
index 8ef81cb0a14f..7bf53303948a 100644
--- a/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
+++ b/examples/language/gpt/titans/configs/gpt2_small_zero3_pp1d.py
@@ -12,11 +12,11 @@
 
 # if you do no want zero, just comment out this dictionary
 zero = dict(model_config=dict(tensor_placement_policy='cuda', shard_strategy=TensorShardStrategy()),
-            optimizer_config=dict(initial_scale=2**16))
+            optimizer_config=dict(initial_scale=2**5))
 
 optimizer = dict(
     type=HybridAdam,
-    lr=0.00015,
+    lr=0.000015,
     weight_decay=1e-2,
 )
 
diff --git a/examples/language/gpt/titans/dataset/webtext.py b/examples/language/gpt/titans/dataset/webtext.py
new file mode 100644
index 000000000000..64f5944a97f9
--- /dev/null
+++ b/examples/language/gpt/titans/dataset/webtext.py
@@ -0,0 +1,43 @@
+import json
+import os
+from typing import Optional
+
+import torch
+from torch.utils.data import Dataset
+from transformers import GPT2Tokenizer
+
+from colossalai.registry import DATASETS
+
+
+@DATASETS.register_module
+class WebtextDataset(Dataset):
+
+    def __init__(self, path: Optional[str] = None, seq_len=1024) -> None:
+        super().__init__()
+        if path is not None:
+            root = os.path.dirname(path)
+            encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
+            if os.path.isfile(encoded_data_cache_path):
+                seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
+                if seq_len_ == seq_len:
+                    self.data = data
+                    self.attention_mask = attention_mask
+                    return
+            raw_data = []
+            with open(path) as f:
+                for line in f.readlines():
+                    raw_data.append(json.loads(line)['text'])
+            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+            tokenizer.pad_token = tokenizer.unk_token
+            encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
+            self.data = encoded_data['input_ids']
+            self.attention_mask = encoded_data['attention_mask']
+        else:
+            self.data = torch.randint(0, 50257, (10240, seq_len))
+            self.attention_mask = torch.ones_like(self.data)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        return {'input_ids': self.data[index], 'attention_mask': self.attention_mask[index]}, self.data[index]
diff --git a/examples/language/gpt/titans/run.sh b/examples/language/gpt/titans/run.sh
index 157bd377aa34..a1a7fc737db0 100644
--- a/examples/language/gpt/titans/run.sh
+++ b/examples/language/gpt/titans/run.sh
@@ -1,2 +1,3 @@
 export DATA=/data/scratch/gpt_data/small-gpt-dataset.json
-colossalai run --nproc_per_node=4 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch
+DUMMY_DATA=--use_dummy_dataset
+colossalai run --nproc_per_node=2 train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch $DUMMY_DATA
diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
index 1380b4b3a7da..66225d6c8044 100644
--- a/examples/language/gpt/titans/train_gpt.py
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -3,6 +3,7 @@
 
 import torch
 import torch.nn as nn
+from dataset.webtext import WebtextDataset
 from titans.model.gpt import GPTLMLoss
 
 import colossalai
@@ -30,7 +31,7 @@ def calc_local_model_size(model: torch.nn.Module):
 def main():
     parser = colossalai.get_default_parser()
     parser.add_argument('--from_torch', default=False, action='store_true')
-    parser.add_argument('--use_dummy_dataset', default=True, action='store_true')
+    parser.add_argument('--use_dummy_dataset', default=False, action='store_true')
     args = parser.parse_args()
     disable_existing_loggers()
     if args.from_torch:
@@ -39,52 +40,16 @@ def main():
         colossalai.launch_from_slurm(config=args.config, host=args.host, port=29500, seed=42)
     logger = get_dist_logger()
 
-    if not args.use_dummy_dataset:
-        data_path = os.environ['DATA']
-        logger.info(f'Build data loader from path {data_path}', ranks=[0])
-        from dataset.webtext import WebtextDataset
-        train_ds = WebtextDataset(os.environ['DATA'], seq_len=gpc.config.SEQ_LEN)
-        train_dataloader = utils.get_dataloader(train_ds,
-                                                seed=42,
-                                                batch_size=gpc.config.BATCH_SIZE,
-                                                pin_memory=True,
-                                                shuffle=True,
-                                                drop_last=True)
-    else:
-        # build a dummy train_dataloader
-        logger.info('Build data loader using dummy data', ranks=[0])
-
-        def get_data(batch_size, seq_len, vocab_size):
-            input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
-            attention_mask = torch.ones_like(input_ids)
-            return input_ids, attention_mask
-
-        # 10 iterations
-        input_ids, attn_mask = get_data(gpc.config.BATCH_SIZE * 10, gpc.config.SEQ_LEN, VOCAB_SIZE)
-        from torch.utils.data import DataLoader, Dataset
-
-        class TextSamplerDataset(Dataset):
-
-            def __init__(self, data, seq_len):
-                super().__init__()
-                self.data = data
-                self.seq_len = seq_len
-
-            def __getitem__(self, index):
-                rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
-                full_seq = self.data[rand_start:rand_start + self.seq_len + 1].long()
-                return full_seq.cuda()
-
-            def __len__(self):
-                return self.data.size(0) // self.seq_len
-
-        def cycle(loader):
-            while True:
-                for data in loader:
-                    yield data
+    data_path = None if args.use_dummy_dataset else os.environ['DATA']
+    logger.info(f'Build data loader from path {data_path}', ranks=[0])
 
-        train_dataset = TextSamplerDataset(input_ids, gpc.config.SEQ_LEN)
-        train_dataloader = DataLoader(train_dataset, batch_size=gpc.config.BATCH_SIZE)
+    train_ds = WebtextDataset(path=data_path, seq_len=gpc.config.SEQ_LEN)
+    train_dataloader = utils.get_dataloader(train_ds,
+                                            seed=42,
+                                            batch_size=gpc.config.BATCH_SIZE,
+                                            pin_memory=True,
+                                            shuffle=True,
+                                            drop_last=True)
 
     logger.info('Build model', ranks=[0])
     use_pipeline = is_using_pp()

From 5db3a5bf42a7f8c5fa00141d95fbac633bce4b37 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Wed, 18 Jan 2023 17:02:46 +0800
Subject: [PATCH 203/503] [fx] allow control of ckpt_codegen init (#2498)

* [fx] allow control of ckpt_codegen init

Currently, ColoGraphModule always sets ActivationCheckpointCodeGen in __init__, so no other codegen can be used with it.
This adds a `ckpt_codegen` argument (default True) to control whether ActivationCheckpointCodeGen is set in __init__.

* code style
---
 colossalai/fx/graph_module.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/colossalai/fx/graph_module.py b/colossalai/fx/graph_module.py
index 2d6a71f19e16..ebb9975f27db 100644
--- a/colossalai/fx/graph_module.py
+++ b/colossalai/fx/graph_module.py
@@ -22,8 +22,13 @@
 
     class ColoGraphModule(GraphModule):
 
-        def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, class_name: str = 'GraphModule'):
-            graph.set_codegen(ActivationCheckpointCodeGen())
+        def __init__(self,
+                     root: Union[torch.nn.Module, Dict[str, Any]],
+                     graph: Graph,
+                     class_name: str = 'GraphModule',
+                     ckpt_codegen: bool = True):
+            if ckpt_codegen:
+                graph.set_codegen(ActivationCheckpointCodeGen())
             super().__init__(root, graph, class_name)
 
         def bind(self, ckpt_def, globals):

From 025b482dc17c46df3d622bec5d793d22b9fca584 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Wed, 18 Jan 2023 18:42:56 +0800
Subject: [PATCH 204/503] [example] dreambooth example

---
 .../dreambooth/train_dreambooth_colossalai.py | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index 7c90b939abaa..9c72c06e79fe 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -153,7 +153,8 @@ def parse_args(input_args=None):
         "--gradient_accumulation_steps",
         type=int,
         default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
+        help=
+        "Number of updates steps to accumulate before performing a backward/update pass. If using Gemini, it must be 1",
     )
     parser.add_argument(
         "--gradient_checkpointing",
@@ -361,6 +362,9 @@ def main(args):
     else:
         colossalai.launch_from_torch(config={}, seed=args.seed)
 
+    local_rank = gpc.get_local_rank(ParallelMode.DATA)
+    world_size = gpc.get_world_size(ParallelMode.DATA)
+
     if args.with_prior_preservation:
         class_images_dir = Path(args.class_data_dir)
         if not class_images_dir.exists():
@@ -388,7 +392,7 @@ def main(args):
             for example in tqdm(
                     sample_dataloader,
                     desc="Generating class images",
-                    disable=not gpc.get_local_rank(ParallelMode.DATA) == 0,
+                    disable=not local_rank == 0,
             ):
                 images = pipeline(example["prompt"]).images
 
@@ -400,7 +404,7 @@ def main(args):
             del pipeline
 
     # Handle the repository creation
-    if gpc.get_local_rank(ParallelMode.DATA) == 0:
+    if local_rank == 0:
         if args.push_to_hub:
             if args.hub_model_id is None:
                 repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
@@ -465,8 +469,9 @@ def main(args):
     if args.gradient_checkpointing:
         unet.enable_gradient_checkpointing()
 
+    assert args.gradient_accumulation_steps == 1, "if using ColossalAI gradient_accumulation_steps must be set to 1."
     if args.scale_lr:
-        args.learning_rate = args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * gpc.get_world_size(ParallelMode.DATA)
+        args.learning_rate = args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * world_size
 
     unet = gemini_zero_dpp(unet, args.placement)
 
@@ -555,7 +560,7 @@ def collate_fn(examples):
     args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
     # Train!
-    total_batch_size = args.train_batch_size * gpc.get_world_size(ParallelMode.DATA) * args.gradient_accumulation_steps
+    total_batch_size = args.train_batch_size * world_size * args.gradient_accumulation_steps
 
     logger.info("***** Running training *****", ranks=[0])
     logger.info(f"  Num examples = {len(train_dataset)}", ranks=[0])
@@ -567,7 +572,7 @@ def collate_fn(examples):
     logger.info(f"  Total optimization steps = {args.max_train_steps}", ranks=[0])
 
     # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not gpc.get_local_rank(ParallelMode.DATA) == 0)
+    progress_bar = tqdm(range(args.max_train_steps), disable=not local_rank == 0)
     progress_bar.set_description("Steps")
     global_step = 0
 
@@ -644,7 +649,7 @@ def collate_fn(examples):
             if global_step % args.save_steps == 0:
                 torch.cuda.synchronize()
                 torch_unet = get_static_torch_model(unet)
-                if gpc.get_local_rank(ParallelMode.DATA) == 0:
+                if local_rank == 0:
                     pipeline = DiffusionPipeline.from_pretrained(
                         args.pretrained_model_name_or_path,
                         unet=torch_unet,
@@ -659,7 +664,7 @@ def collate_fn(examples):
     torch.cuda.synchronize()
     unet = get_static_torch_model(unet)
 
-    if gpc.get_local_rank(ParallelMode.DATA) == 0:
+    if local_rank == 0:
         pipeline = DiffusionPipeline.from_pretrained(
             args.pretrained_model_name_or_path,
             unet=unet,

From 32390cbe8fa69e5d9df4228c5e4671257b2cc739 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Thu, 19 Jan 2023 09:46:28 +0800
Subject: [PATCH 205/503] add test_ci.sh to dreambooth

---
 examples/images/dreambooth/test_ci.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 examples/images/dreambooth/test_ci.sh

diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
new file mode 100644
index 000000000000..e69de29bb2d1

From ecccc91f21edb1a9660215c7c1f62dae31fb0629 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Thu, 19 Jan 2023 11:41:00 +0800
Subject: [PATCH 206/503] [autochunk] support autochunk on evoformer (#2497)

---
 colossalai/autochunk/autochunk_codegen.py     |  6 +-
 colossalai/autochunk/estimate_memory.py       | 67 +++++----------
 colossalai/autochunk/search_chunk.py          | 83 +++++--------------
 colossalai/autochunk/trace_flow.py            | 63 +++++++++++---
 colossalai/autochunk/trace_indice.py          | 76 +++++++++++++----
 colossalai/autochunk/utils.py                 | 27 +++---
 .../test_autochunk/test_evoformer_codegen.py  | 21 +++--
 .../test_simple_evoformer_codegen.py          | 39 +++------
 .../test_simple_evoformer_search.py           |  6 +-
 9 files changed, 200 insertions(+), 188 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index ceccb9a9fde2..de5e7356bbfd 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -123,12 +123,13 @@ def _replace_name(context: str, name_from: str, name_to: str) -> str:
     """
     replace node name
     """
-    patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")]
+    patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")"), (" ", ""), ("", " ")]
     for p in patterns:
         source = p[0] + name_from + p[1]
         target = p[0] + name_to + p[1]
         if source in context:
             context = context.replace(source, target)
+            break
     return context
 
 
@@ -138,8 +139,7 @@ def _replace_reshape_size(context: str, node_name: str, reshape_size_dict: Dict)
     """
     if node_name not in reshape_size_dict:
         return context
-    for size_name, size_value in reshape_size_dict[node_name].items():
-        context = context.replace(size_name, size_value)
+    context = context.replace(reshape_size_dict[node_name][0], reshape_size_dict[node_name][1])
     return context
 
 
diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py
index e001423f1fbb..d386253850a7 100644
--- a/colossalai/autochunk/estimate_memory.py
+++ b/colossalai/autochunk/estimate_memory.py
@@ -37,10 +37,10 @@ def _get_output_node_size(self, n):
 
     def _add_active_node(self, n, active_list):
         new_active = self._get_output_node(n)[1]
-        if n.op == "placeholder":
+        if n.op == "placeholder" and get_node_shape(n) is not None:
             new_active.append(n.name)
         for i in new_active:
-            if i not in active_list:
+            if i not in active_list and get_node_shape(n) is not None:
                 active_list.append(i)
 
     def _get_delete_node(self, user, user_to_last_uses, to_keep=None):
@@ -77,15 +77,11 @@ def _remove_deactive_node(self, user, user_to_last_uses, active_list):
             if i in active_list:
                 active_list.remove(i)
 
-    def _get_chunk_inputs_size(
-        self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx
-    ):
+    def _get_chunk_inputs_size(self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx):
         nodes_to_delete = []
         for chunk_input in chunk_inputs + chunk_inputs_non_chunk:
             chunk_input_users = chunk_input.users.keys()
-            chunk_input_users_idx = [
-                find_idx_by_name(i.name, node_list) for i in chunk_input_users
-            ]
+            chunk_input_users_idx = [find_idx_by_name(i.name, node_list) for i in chunk_input_users]
             if all(i <= chunk_end_idx for i in chunk_input_users_idx):
                 if chunk_input not in nodes_to_delete:
                     nodes_to_delete.append(chunk_input)
@@ -112,9 +108,7 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
         not_contiguous_ops = ["permute"]
         inherit_contiguous_ops = ["transpose", "view"]
 
-        if node.op == "call_function" and any(
-            n in node.name for n in ["matmul", "reshape"]
-        ):
+        if node.op == "call_function" and any(n in node.name for n in ["matmul", "reshape"]):
             for n in node.args:
                 if n in not_contiguous_list:
                     # matmul won't change origin tensor, but create a tmp copy
@@ -125,9 +119,7 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
                     # module will just make origin tensor to contiguous
                     if delete:
                         not_contiguous_list.remove(n)
-        elif node.op == "call_method" and any(
-            i in node.name for i in not_contiguous_ops
-        ):
+        elif node.op == "call_method" and any(i in node.name for i in not_contiguous_ops):
             if node not in not_contiguous_list:
                 not_contiguous_list.append(node)
         return mem
@@ -142,9 +134,7 @@ def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size):
         else:
             return float(chunk_size) / node_shape[chunk_dim]
 
-    def _get_chunk_delete_node_size(
-        self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names
-    ):
+    def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names):
         # if any(j in user.name for j in ['transpose', 'permute', 'view']):
         #     return 0
         if user.op in ("placeholder", "output"):
@@ -196,7 +186,7 @@ def estimate_chunk_inference_mem(
         Returns:
             act_memory_peak_log (List): peak memory of every node
             act_memory_after_node_log (List): memory after excuting every node
-            active_node_list_log (List): active nodes of every node. active nodes refer to 
+            active_node_list_log (List): active nodes of every node. active nodes refer to
                 nodes generated but not deleted.
         """
         act_memory = 0.0
@@ -212,7 +202,7 @@ def estimate_chunk_inference_mem(
         use_chunk = True if chunk_infos is not None else False
         chunk_within = False
         chunk_region_idx = None
-        chunk_ratio = 1  # use it to estimate chunk mem
+        chunk_ratio = 1    # use it to estimate chunk mem
         chunk_inputs_names = []
 
         if use_chunk:
@@ -221,23 +211,18 @@ def estimate_chunk_inference_mem(
             chunk_ends = [i[1] for i in chunk_regions]
             chunk_inputs = [i["inputs"] for i in chunk_infos]
             chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
-            chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [
-                j.name for i in chunk_inputs_non_chunk for j in i
-            ]
+            chunk_inputs_names = [j.name for i in chunk_inputs for j in i
+                                 ] + [j.name for i in chunk_inputs_non_chunk for j in i]
             chunk_outputs = [i["outputs"][0] for i in chunk_infos]
             chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos]
-            chunk_sizes = [
-                i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos
-            ]
+            chunk_sizes = [i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos]
 
         for idx, node in enumerate(node_list):
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
             if use_chunk and idx in chunk_starts:
                 chunk_within = True
                 chunk_region_idx = chunk_starts.index(idx)
-                act_memory += self._get_output_node_size(
-                    chunk_outputs[chunk_region_idx]
-                ) / (1024**2)
+                act_memory += self._get_output_node_size(chunk_outputs[chunk_region_idx]) / (1024**2)
 
             # determine chunk ratio for current node
             if chunk_within:
@@ -262,22 +247,13 @@ def estimate_chunk_inference_mem(
             else:
                 # forward memory
                 # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose
-                act_memory += (
-                    self._get_contiguous_memory(node, not_contiguous_list)
-                    * chunk_ratio
-                    / (1024**2)
-                )
-                act_memory += (
-                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
-                )
+                act_memory += (self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024**2))
+                act_memory += (self._get_output_node_size(node) * chunk_ratio / (1024**2))
                 # record max act memory
                 act_memory_peak_log.append(act_memory)
                 # delete useless memory
-                act_memory -= (
-                    self._get_contiguous_memory(node, not_contiguous_list, delete=True)
-                    * chunk_ratio
-                    / (1024**2)
-                )
+                act_memory -= (self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio /
+                               (1024**2))
                 # delete unused vars not in chunk_input_list
                 # we can't delete input nodes until chunk ends
                 if chunk_within:
@@ -288,9 +264,8 @@ def estimate_chunk_inference_mem(
                         chunk_inputs_names,
                     ) / (1024**2)
                 else:
-                    act_memory -= self._get_delete_node_size(
-                        node, user_to_last_uses_no_free_var, chunk_inputs_names
-                    ) / (1024**2)
+                    act_memory -= self._get_delete_node_size(node, user_to_last_uses_no_free_var,
+                                                             chunk_inputs_names) / (1024**2)
 
             # log active node, only effective without chunk
             self._add_active_node(node, active_node_list)
@@ -298,9 +273,7 @@ def estimate_chunk_inference_mem(
 
             # if node in chunk end nodes, restore chunk settings
             if use_chunk and idx in chunk_ends:
-                act_memory -= (
-                    self._get_output_node_size(node) * chunk_ratio / (1024**2)
-                )
+                act_memory -= (self._get_output_node_size(node) * chunk_ratio / (1024**2))
                 act_memory -= self._get_chunk_inputs_size(
                     chunk_inputs[chunk_region_idx],
                     chunk_inputs_non_chunk[chunk_region_idx],
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index c9e5e5172274..236f9697df5d 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -8,11 +8,7 @@
 from .select_chunk import SelectChunk
 from .trace_flow import TraceFlow
 from .trace_indice import TraceIndice
-from .utils import (
-    get_node_shape,
-    is_non_compute_node,
-    is_non_compute_node_except_placeholder,
-)
+from .utils import get_node_shape, is_non_compute_node, is_non_compute_node_except_placeholder
 
 
 class SearchChunk(object):
@@ -73,13 +69,11 @@ def _get_free_var_idx(self) -> List:
         """
         free_var_idx = []
         for idx, n in enumerate(self.trace_indice.node_list):
-            if n.op == "placeholder":
+            if n.op == "placeholder" and get_node_shape(n) is not None:
                 free_var_idx.append(idx)
         return free_var_idx
 
-    def _search_max_chunk_region(
-        self, active_node: List, peak_node: Node, chunk_regions: List
-    ) -> Tuple:
+    def _search_max_chunk_region(self, active_node: List, peak_node: Node, chunk_regions: List) -> Tuple:
         """
         Search max chunk region according to peak memory node
 
@@ -124,15 +118,9 @@ def _search_max_chunk_region(
             region = i["region"]
             if chunk_region_start >= region[0] and chunk_region_end <= region[1]:
                 return None
-            elif (
-                region[0] <= chunk_region_start <= region[1]
-                and chunk_region_end > region[1]
-            ):
+            elif (region[0] <= chunk_region_start <= region[1] and chunk_region_end > region[1]):
                 chunk_region_start = region[1] + 1
-            elif (
-                region[0] <= chunk_region_end <= region[1]
-                and chunk_region_start < region[0]
-            ):
+            elif (region[0] <= chunk_region_end <= region[1] and chunk_region_start < region[0]):
                 chunk_region_end = region[0] - 1
         return chunk_region_start, chunk_region_end
 
@@ -164,25 +152,16 @@ def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> Lis
             for start_node, start_trace in start_traces.items():
                 for start_dim, _ in enumerate(start_trace["indice"]):
                     # dim size cannot be 1
-                    if (
-                        get_node_shape(end_node)[end_dim] == 1
-                        or get_node_shape(start_node)[start_dim] == 1
-                    ):
+                    if (get_node_shape(end_node)[end_dim] == 1 or get_node_shape(start_node)[start_dim] == 1):
                         continue
                     # check index source align
-                    if not self.trace_flow.check_index_source(
-                        start_dim, start_node, start_idx, end_dim, end_node
-                    ):
+                    if not self.trace_flow.check_index_source(start_dim, start_node, start_idx, end_dim, end_node):
                         continue
                     # check index copmute
-                    if not self.trace_flow.check_index_compute(
-                        start_idx, end_dim, end_node, end_idx
-                    ):
+                    if not self.trace_flow.check_index_compute(start_idx, end_dim, end_node, end_idx):
                         continue
                     # flow search
-                    chunk_info = self.trace_flow.flow_search(
-                        start_idx, start_dim, end_idx, end_dim
-                    )
+                    chunk_info = self.trace_flow.flow_search(start_idx, start_dim, end_idx, end_dim)
                     if chunk_info is None:
                         continue
                     # check index copmute
@@ -191,9 +170,7 @@ def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> Lis
                     chunk_infos.append(chunk_info)
         return chunk_infos
 
-    def _search_possible_chunk_regions(
-        self, max_chunk_region: Tuple, peak_node: Node
-    ) -> List:
+    def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Node) -> List:
         """
         Search every possible region within the max chunk region.
 
@@ -206,28 +183,23 @@ def _search_possible_chunk_regions(
         """
         possible_chunk_region = []
         output_trace = copy.deepcopy(self.trace_indice.indice_trace_list)
-        input_trace = []  # trace of a node's input nodes
+        input_trace = []    # trace of a node's input nodes
         for _, n in enumerate(self.trace_indice.node_list):
             cur_trace = {}
             for arg in n.args:
-                if type(arg) == type(n) and not is_non_compute_node_except_placeholder(
-                    arg
-                ):
+                if type(arg) == type(n) and not is_non_compute_node_except_placeholder(arg):
                     cur_trace[arg] = self.trace_indice._find_trace_from_node(arg)
             input_trace.append(cur_trace)
 
         for start_idx in range(max_chunk_region[0], peak_node + 1):
             for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
-                if is_non_compute_node(
-                    self.trace_indice.node_list[start_idx]
-                ) or is_non_compute_node(self.trace_indice.node_list[end_idx]):
+                if is_non_compute_node(self.trace_indice.node_list[start_idx]) or is_non_compute_node(
+                        self.trace_indice.node_list[end_idx]):
                     continue
 
                 # select free dim
-                chunk_info = self._find_chunk_info(
-                    input_trace, output_trace, start_idx, end_idx
-                )
+                chunk_info = self._find_chunk_info(input_trace, output_trace, start_idx, end_idx)
                 if len(chunk_info) > 0:
                     possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
@@ -256,17 +228,12 @@ def _step_search(
             best_chunk_region (Dict)
         """
         peak_node = self._find_peak_node(mem_peak)
-        max_chunk_region = self._search_max_chunk_region(
-            active_node, peak_node, chunk_infos
-        )
+        max_chunk_region = self._search_max_chunk_region(active_node, peak_node, chunk_infos)
         if max_chunk_region == None:
             return None
-        possible_chunk_regions = self._search_possible_chunk_regions(
-            max_chunk_region, peak_node
-        )
-        best_chunk_region = self.select_chunk._select_best_chunk_region(
-            possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
-        )
+        possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node)
+        best_chunk_region = self.select_chunk._select_best_chunk_region(possible_chunk_regions, chunk_infos, peak_node,
+                                                                        max_chunk_region, mem_peak)
         best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region)
         return best_chunk_region
 
@@ -291,9 +258,7 @@ def search_region(self) -> Dict:
             init_mem_peak,
             _,
             active_node,
-        ) = self.estimate_memory.estimate_chunk_inference_mem(
-            self.trace_indice.node_list
-        )
+        ) = self.estimate_memory.estimate_chunk_inference_mem(self.trace_indice.node_list)
         mem_peak = init_mem_peak
 
         while True:
@@ -306,14 +271,10 @@ def search_region(self) -> Dict:
                 mem_peak,
                 _,
                 active_node,
-            ) = self.estimate_memory.estimate_chunk_inference_mem(
-                self.trace_indice.node_list, chunk_infos
-            )
+            ) = self.estimate_memory.estimate_chunk_inference_mem(self.trace_indice.node_list, chunk_infos)
             if self._stop_search(init_mem_peak, mem_peak):
                 break
         if self.print_mem:
             self.print_mem = False
-            self.estimate_memory.estimate_chunk_inference_mem(
-                self.trace_indice.node_list, chunk_infos, print_mem=True
-            )
+            self.estimate_memory.estimate_chunk_inference_mem(self.trace_indice.node_list, chunk_infos, print_mem=True)
         return chunk_infos
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index ec1e012beb17..04fa2b3bb480 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -1,8 +1,13 @@
+from typing import Dict, List, Tuple
+
+from torch.fx.node import Node
+
 from .trace_indice import TraceIndice
 from .utils import (
     find_chunk_all_input_nodes,
     find_chunk_compute_input_and_output_nodes,
     find_idx_by_name,
+    flat_list,
     get_node_shape,
     is_non_compute_node,
     is_non_compute_node_except_placeholder,
@@ -171,7 +176,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                 # get cur node info
                 cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"]
                 cur_node_fix_dim = all_node_info[cur_node]["fix_dim"]
-                if cur_node_chunk_dim:
+                if cur_node_chunk_dim is not None:
                     cur_node_compute = self.trace_indice._find_compute_trace_from_node(cur_node)
                     cur_node_source = self.trace_indice._find_source_trace_from_node(cur_node)
                 else:
@@ -223,15 +228,32 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
             cur_node_list = next_node_list
         return all_node_info
 
-    def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
+    def _get_input_nodes_dim(self, inputs: List[Node], start_idx: int, end_idx: int, all_node_info: Dict) -> Tuple:
+        """
+        Get chunk dim for every input node for their every entry, remove unchunked nodes
+
+        Args:
+            inputs (List[Node]): input nodes
+            all_node_info (Dict): describe all node's chunk dim and fix dim
+            start_idx (int): chunk start idx
+            end_idx (int): chunk end idx
+
+        Returns:
+            inputs (List(Node)): new inputs
+            inputs_dim (List): chunk dim for inputs
+        """
         inputs_dim = []
         remove_inputs = []
         for input_node in inputs:
             input_dict = {}
             input_node_idx = find_idx_by_name(input_node.name, self.trace_indice.node_list)
             for user in input_node.users.keys():
+                # skip non compute
                 if is_non_compute_node(user):
                     continue
+                # untraced node, mostly non compute
+                if user not in all_node_info:
+                    continue
                 user_idx = find_idx_by_name(user.name, self.trace_indice.node_list)
                 if start_idx <= user_idx <= end_idx:
                     chunk_dim = all_node_info[user]["chunk_dim"]
@@ -245,12 +267,24 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info):
                 remove_inputs.append(input_node)
             else:
                 inputs_dim.append(input_dict)
+        # remove unchunked inputs
         for i in remove_inputs:
             if i in inputs:
                 inputs.remove(i)
         return inputs, inputs_dim
 
-    def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
+    def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int) -> List[Node]:
+        """
+        get all useless nodes in chunk region and prepose them
+
+        Args:
+            all_node_info (Dict): describe all node's chunk dim and fix dim
+            start_idx (int): chunk start idx
+            end_idx (int): chunk end idx
+
+        Returns:
+            List[Node]: all nodes to be preposed
+        """
         # get all possible prepose nodes
         maybe_prepose_nodes = []
         for node, node_info in all_node_info.items():
@@ -276,7 +310,7 @@ def _get_prepose_nodes(self, all_node_info, start_idx, end_idx):
                 for cur_prepose_node in tmp_cur_prepose_nodes:
                     if prepose_flag == False:
                         break
-                    for cur_prepose_node_arg in cur_prepose_node.args:
+                    for cur_prepose_node_arg in cur_prepose_node.all_input_nodes:
                         if type(cur_prepose_node_arg) != type(cur_prepose_node):
                             continue
                         # out of loop
@@ -360,19 +394,28 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         return chunk_info
 
     def _reassgin_reshape_size(self, chunk_info):
+        """
+        Some shape args of reshape/view nodes may have changed due to chunking;
+        reassign those changed shapes.
+        """
         chunk_region = chunk_info["region"]
         reshape_size = {}
         chunk_shape = get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]]
         for node in self.trace_indice.node_list[chunk_region[0]:chunk_region[1] + 1]:
             if any(i in node.name for i in ["reshape", "view"]):
-                reshape_args = node.args[1:]
-                reshape_log = self.trace_indice.indice_view_list[node]
+                reshape_args = flat_list(node.args[1:])
                 chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
-                reshape_size[node.name] = {}
+                new_shape = ""
                 for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
-                    if reshape_arg_dim in reshape_log["dim_to"]:
-                        continue
                     if reshape_arg_dim == chunk_dim:
-                        reshape_size[node.name][reshape_arg.name] = ("min(chunk_size, %d - chunk_idx)" % chunk_shape)
+                        new_shape += "min(chunk_size, %d - chunk_idx), " % chunk_shape
+                    else:
+                        if isinstance(reshape_arg, int):
+                            new_shape += "%s, " % str(reshape_arg)
+                        else:
+                            new_shape += "%s, " % reshape_arg.name
+                new_shape = new_shape[:-2]
+                origin_shape = str(reshape_args)[1:-1]
+                reshape_size[node.name] = [origin_shape, new_shape]
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 5a5d15e0a1f4..862cd6b99ccc 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -3,7 +3,7 @@
 
 from torch.fx.node import Node
 
-from .utils import find_first_tensor_arg, find_idx_by_name, get_node_shape, unflat_list
+from .utils import find_first_tensor_arg, find_idx_by_name, flat_list, get_node_shape
 
 
 class TraceIndice(object):
@@ -28,7 +28,7 @@ class TraceIndice(object):
         node_list (List)
     """
 
-    def __init__(self, node_list: List) -> None:
+    def __init__(self, node_list: List[Node]) -> None:
         self.node_list = node_list
         self.indice_trace_list = self._init_indice_trace_list()
         self.indice_view_list = {}
@@ -198,7 +198,7 @@ def _find_compute_trace_from_node(self, node):
         node_idx = find_idx_by_name(node.name, self.node_list)
         return self.indice_trace_list[node_idx]["compute"]
 
-    def _assign_indice_as_input(self, node, node_idx, input_node=None):
+    def _assign_indice_as_input(self, node: Node, node_idx: int, input_node=None):
         """
         Assign node's trace as its input node.
 
@@ -216,7 +216,7 @@ def _assign_indice_as_input(self, node, node_idx, input_node=None):
 
         self._inherit_all_computation(input_node, node)
 
-    def _assign_all_indice(self, node, node_idx):
+    def _assign_all_indice(self, node: Node, node_idx: int):
         """
         Add new indice for all node's dims.
 
@@ -232,7 +232,7 @@ def _assign_all_indice(self, node, node_idx):
             new_trace.append(self._add_indice())
         self.indice_trace_list[node_idx]["indice"] = new_trace
 
-    def _assign_transpose_indice(self, node, node_idx):
+    def _assign_transpose_indice(self, node: Node, node_idx: int):
         """
         Assign indice for transpose op.
         1. swap input's dim according to transpose args
@@ -249,7 +249,7 @@ def _assign_transpose_indice(self, node, node_idx):
         self._inherit_indice(input_node, tranpose_dim[1], node, tranpose_dim[0])
         self._inherit_indice(input_node, tranpose_dim[0], node, tranpose_dim[1])
 
-    def _assign_permute_indice(self, node, node_idx):
+    def _assign_permute_indice(self, node: Node, node_idx: int):
         """
         Assign indice for permute op.
         1. swap input's dim according to permute args
@@ -259,14 +259,14 @@ def _assign_permute_indice(self, node, node_idx):
             node (node)
             node_idx (int)
         """
-        permute_dim = unflat_list(node.args[1:])
+        permute_dim = flat_list(node.args[1:])
         input_node = node.args[0]
 
         self._assign_indice_as_input(node, node_idx, input_node)
         for idx, d in enumerate(permute_dim):
             self._inherit_indice(input_node, d, node, idx)
 
-    def _assign_linear_indice(self, node, node_idx):
+    def _assign_linear_indice(self, node: Node, node_idx: int):
         """
         Assign indice for linear op.
         1. copy trace from input node and change last indice accroding to weight
@@ -287,7 +287,7 @@ def _assign_linear_indice(self, node, node_idx):
 
         self._mark_computation(node, node_idx, [-1])
 
-    def _assign_matmul_indice(self, node, node_idx):
+    def _assign_matmul_indice(self, node: Node, node_idx: int):
         """
         Assign indice for matmul op.
         1. copy trace from matmul_left and change last indice accroding to matmul_right. (assert they have same length)
@@ -393,7 +393,7 @@ def _assign_softmax_indice(self, node, idx):
         self._assign_indice_as_input(node, idx)
         self._mark_computation(node, idx, [node.kwargs["dim"]])
 
-    def _assign_unsqueeze_indice(self, node, node_idx):
+    def _assign_unsqueeze_indice(self, node: Node, node_idx: int):
         """
         Assign indice for unsqueeze op.
         1. assign new indice for unsqueeze dim
@@ -404,9 +404,13 @@ def _assign_unsqueeze_indice(self, node, node_idx):
         """
         self._del_dim(node_idx, -1)
         self._assign_indice_as_input(node, node_idx)
-        self._add_dim(node_idx, node.args[1])
+        dim_idx = node.args[1]
+        # a negative dim counts from the end of this node's shape; convert it to a non-negative index
+        if dim_idx < 0:
+            dim_idx = list(range(len(get_node_shape(node))))[dim_idx]
+        self._add_dim(node_idx, dim_idx)
 
-    def _assign_dropout_indice(self, node, node_idx):
+    def _assign_dropout_indice(self, node: Node, node_idx: int):
         """
         Assign indice for unsqueeze op.
         1. assign new indice for unsqueeze dim
@@ -417,7 +421,7 @@ def _assign_dropout_indice(self, node, node_idx):
         """
         self._assign_indice_as_input(node, node_idx)
 
-    def _assign_ones_like_indice(self, node, node_idx):
+    def _assign_ones_like_indice(self, node: Node, node_idx: int):
         """
         Assign indice for oneslike op.
         1. assign new indice for all dim
@@ -428,7 +432,47 @@ def _assign_ones_like_indice(self, node, node_idx):
         """
         self._assign_all_indice(node, node_idx)
 
-    def _assign_view_reshape_indice(self, node, node_idx):
+    def _assign_getitem_indice(self, node: Node, node_idx: int):
+        """
+        Assign indice for getitem.
+        getitem can act like slice sometimes
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        node_args = flat_list(node.args[1:])
+        if not any(i == str(node_arg) for i in ["None", "Ellipsis"] for node_arg in node_args):
+            return
+
+        # node args should be like [Ellipsis, slice(start, stop, step), None]
+        node_shape = get_node_shape(node)
+        origin_idx_count = 0
+        new_idx_count = 0
+        new_dim_num = sum([1 if str(i) == "None" else 0 for i in node_args])
+        for _ in range(new_dim_num):
+            self._del_dim(node_idx, 0)
+        self._assign_indice_as_input(node, node_idx)
+
+        for _, node_arg in enumerate(node_args):
+            node_arg_str = str(node_arg)
+            # Ellipsis means [..., ]
+            if "Ellipsis" == node_arg_str:
+                shape_gap = len(node_shape) - len(node_args) + 1
+                origin_idx_count += shape_gap
+                new_idx_count += shape_gap
+            # slice(None, None, None) means all indexes, doesn't support other slice
+            elif "slice(None, None, None)" == node_arg_str:
+                origin_idx_count += 1
+                new_idx_count += 1
+            # None means a new dim
+            elif "None" == node_arg_str:
+                self._add_dim(node_idx, new_idx_count)
+                new_idx_count += 1
+            else:
+                raise NotImplementedError()
+
+    def _assign_view_reshape_indice(self, node: Node, node_idx: int):
         """
         Assign indice for view and reshape op.
         1. get origin shape and target shape by meta info.
@@ -447,7 +491,7 @@ def _assign_view_reshape_indice(self, node, node_idx):
         origin_node = node.args[0]
         origin_shape = origin_node.meta["tensor_meta"].shape
         target_shape = []
-        unflated_args = unflat_list(node.args)
+        unflated_args = flat_list(node.args)
         for i in range(1, len(unflated_args)):
             if isinstance(unflated_args[i], int):
                 target_shape.append(unflated_args[i])
@@ -544,6 +588,8 @@ def trace_indice(self):
                     self._assign_einsum_indice(node, idx)
                 elif "layer_norm" in node.name:
                     self._assign_layernorm_indice(node, idx)
+                elif "getitem" in node.name:
+                    self._assign_getitem_indice(node, idx)
                 elif any(i in node.name for i in ["getattr", "getitem", "eq", "_assert"]):
                     continue
                 else:
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index 5f3ea3bf482d..9c2363b544e2 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -3,14 +3,14 @@
 from torch.fx.node import Node
 
 
-def unflat_list(inputs):
+def flat_list(inputs):
     """
-    unflat a list by recursion
+    flat a list by recursion
     """
     res = []
     for i in inputs:
         if isinstance(i, list) or isinstance(i, set) or isinstance(i, tuple):
-            res.extend(unflat_list(i))
+            res.extend(flat_list(i))
         else:
             res.append(i)
     return res
@@ -27,8 +27,13 @@ def find_first_tensor_arg(node):
 
 
 def is_non_compute_node(node):
-    if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(
-            i in node.name for i in ["getitem", "getattr"]):
+    if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(i in node.name for i in ["getattr"]):
+        return True
+    if "getitem" in node.name:
+        node_args = flat_list(node.args[1:])
+        for node_arg in node_args:
+            if any(i == str(node_arg) for i in ["None", "Ellipsis"]):
+                return False
         return True
     return False
 
@@ -40,15 +45,15 @@ def get_node_shape(node):
 
 
 def is_non_compute_node_except_placeholder(node):
-    if any(i in node.op for i in ["get_attr", "output"]) or any(i in node.name for i in ["getitem", "getattr"]):
-        return True
-    return False
+    if "placeholder" in node.op:
+        return False
+    return is_non_compute_node(node)
 
 
 def is_non_compute_node_except_placeholder_output(node):
-    if any(i in node.op for i in ["get_attr"]) or any(i in node.name for i in ["getitem", "getattr"]):
-        return True
-    return False
+    if "output" in node.op:
+        return False
+    return is_non_compute_node_except_placeholder(node)
 
 
 def find_idx_by_name(name, nodes_list):
diff --git a/tests/test_autochunk/test_evoformer_codegen.py b/tests/test_autochunk/test_evoformer_codegen.py
index 1273bf2fecbf..c5a893eda7cc 100644
--- a/tests/test_autochunk/test_evoformer_codegen.py
+++ b/tests/test_autochunk/test_evoformer_codegen.py
@@ -27,18 +27,17 @@
 
 def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair, node_mask, pair_mask):
     # for memory test
+    # model = model.cuda()
     # torch.cuda.reset_peak_memory_stats()
     # now_mem = torch.cuda.memory_allocated() / 1024**2
     # with torch.no_grad():
     #     node1 = node.clone()
     #     pair1 = pair.clone()
-    #     gm(node1, pair1)
-    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
+    #     node_mask1 = node_mask.clone()
+    #     pair_mask1 = pair_mask.clone()
+    #     gm(node1, pair1, node_mask1, pair_mask1)
     # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print(
-    #     "autochunk now mem:%.2f max mem:%.2f"
-    #     % (new_now_mem - now_mem, new_max_mem - now_mem)
-    # )
+    # print("autochunk max mem:%.2f"% (new_max_mem - now_mem))
 
     # test forward
     model = model.cuda()
@@ -113,7 +112,7 @@ def _test_evoformer_codegen(rank, msa_len, pair_len, max_memory):
         MetaTensor(node_mask, fake_device="cuda:0"),
         MetaTensor(pair_mask, fake_device="cuda:0"),
     )
-    # codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory)
+    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory, print_mem=False)
 
     # trace and recompile
     # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
@@ -130,14 +129,14 @@ def _test_evoformer_codegen(rank, msa_len, pair_len, max_memory):
             "_mask_trans": True,
         },
     )
-    # graph.set_codegen(codegen)
+    graph.set_codegen(codegen)
     gm = ColoGraphModule(model, graph)
     gm.recompile()
 
     # assert we have inserted chunk
     code = graph.python_code("self").src
-    assert "chunk_size" in code
     # print(code)
+    assert "chunk_result = None;  chunk_size = None;" in code
 
     _test_fwd(model, gm, node, pair, node_mask, pair_mask)
     gpc.destroy()
@@ -147,7 +146,7 @@ def _test_evoformer_codegen(rank, msa_len, pair_len, max_memory):
     not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
     reason="torch version is lower than 1.12.0",
 )
-@pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
+@pytest.mark.parametrize("max_memory", [None, 24, 28, 32])
 @pytest.mark.parametrize("msa_len", [32])
 @pytest.mark.parametrize("pair_len", [64])
 def test_evoformer_codegen(msa_len, pair_len, max_memory):
@@ -161,4 +160,4 @@ def test_evoformer_codegen(msa_len, pair_len, max_memory):
 
 
 if __name__ == "__main__":
-    _test_evoformer_codegen(0, 32, 64, 25)
+    _test_evoformer_codegen(0, 32, 64, 24)
diff --git a/tests/test_autochunk/test_simple_evoformer_codegen.py b/tests/test_autochunk/test_simple_evoformer_codegen.py
index f1272330fcd9..8ab77024c1b9 100644
--- a/tests/test_autochunk/test_simple_evoformer_codegen.py
+++ b/tests/test_autochunk/test_simple_evoformer_codegen.py
@@ -13,7 +13,7 @@
 
 import colossalai
 from colossalai.core import global_context as gpc
-from colossalai.fx import ColoTracer
+from colossalai.fx import ColoTracer, symbolic_trace
 from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.graph_module import ColoGraphModule
@@ -26,21 +26,6 @@
 
 
 def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
-    # for memory test
-    # torch.cuda.reset_peak_memory_stats()
-    # now_mem = torch.cuda.memory_allocated() / 1024**2
-    # with torch.no_grad():
-    #     node1 = node.clone()
-    #     pair1 = pair.clone()
-    #     gm(node1, pair1)
-    # new_now_mem = torch.cuda.memory_allocated() / 1024**2
-    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print(
-    #     "autochunk now mem:%.2f max mem:%.2f"
-    #     % (new_now_mem - now_mem, new_max_mem - now_mem)
-    # )
-
-    # test forward
     with torch.no_grad():
         non_fx_out = model(node, pair)
         fx_out = gm(node, pair)
@@ -69,6 +54,16 @@ def _test_simple_evoformer_codegen(rank, msa_len, pair_len, max_memory):
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
 
+    # meta info prop
+    meta_graph = symbolic_trace(model,
+                                meta_args={
+                                    "node": node.to(torch.device("meta")),
+                                    "pair": pair.to(torch.device("meta")),
+                                })    # must use symbolic_trace
+    interp = MetaInfoProp(meta_graph)
+    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
+    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory)
+
     # trace the module and replace codegen
     graph = ColoTracer().trace(
         model,
@@ -77,24 +72,14 @@ def _test_simple_evoformer_codegen(rank, msa_len, pair_len, max_memory):
             "pair": pair.to(torch.device("meta")),
         },
     )
-    gm_prop = torch.fx.symbolic_trace(model)    # must use symbolic_trace
-    interp = MetaInfoProp(gm_prop)
-    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
-
-    # now run it twice to get meta info in graph module, not necessary
-    gm = torch.fx.GraphModule(model, graph)
-    interp = MetaInfoProp(gm)
-    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
-
-    codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory)
     graph.set_codegen(codegen)
     gm = ColoGraphModule(model, graph)
     gm.recompile()
 
     # assert we have inserted chunk
     code = graph.python_code("self").src
-    assert "chunk_size" in code
     # print(code)
+    assert "chunk_result = None;  chunk_size = None;" in code
 
     _test_fwd(model, gm, node, pair)
     gpc.destroy()
diff --git a/tests/test_autochunk/test_simple_evoformer_search.py b/tests/test_autochunk/test_simple_evoformer_search.py
index 04fb514fbf44..4c591c48319e 100644
--- a/tests/test_autochunk/test_simple_evoformer_search.py
+++ b/tests/test_autochunk/test_simple_evoformer_search.py
@@ -47,18 +47,18 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
             str(target_regions),
         )
     for region in target_regions:
-        assert (region in found_regions), "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (
+        assert (region in found_regions), "region:%s not in found regions for msa:%d, pair:%d, maxmem:%s" % (
             str(region),
             msa_len,
             pair_len,
-            max_memory,
+            str(max_memory),
         )
     for region in found_regions:
         assert (region in target_regions), "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (
             str(region),
             msa_len,
             pair_len,
-            max_memory,
+            str(max_memory),
         )
 
 
From 99d9713b02664a51861e8ece23b974f5428c4f3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E3=82=A2=E3=83=9E=E3=83=87=E3=82=A6=E3=82=B9?=
 <kurisusnowdeng@users.noreply.github.com>
Date: Thu, 19 Jan 2023 12:23:03 +0800
Subject: [PATCH 207/503] Revert "Update parallel_context.py (#2408)"

This reverts commit 7d5640b9db01b501e95b66e91be9fe27b58d2e58.
---
 colossalai/context/parallel_context.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py
index b7338b53ddde..dd12dad6d347 100644
--- a/colossalai/context/parallel_context.py
+++ b/colossalai/context/parallel_context.py
@@ -375,7 +375,7 @@ def init_global_dist(self, rank: int, world_size: int, backend: str, host: str,
 
         # None will give the default global process group for pytorch dist operations
         ranks = list(range(world_size))
-        cpu_group = dist.new_group(ranks, backend='gloo') if dist.get_backend() == 'gloo' else None
+        cpu_group = dist.new_group(ranks, backend='gloo') if dist.get_backend() != 'gloo' else None
         self._register_dist(rank, world_size, dist.GroupMember.WORLD, cpu_group, ranks, ParallelMode.GLOBAL)
         self.add_global_rank(ParallelMode.GLOBAL, rank)
 

From 0f02b8c6e67e565e41fe2546179209bb63dcd4a9 Mon Sep 17 00:00:00 2001
From: Ziyue Jiang <ziyue.jiang97@gmail.com>
Date: Thu, 19 Jan 2023 13:54:50 +0800
Subject: [PATCH 208/503] add avg partition (#2483)

Co-authored-by: Ziyue Jiang <ziyue.jiang@gmail.com>
---
 .../fx/passes/adding_split_node_pass.py       | 36 +++++++++++++++++++
 colossalai/fx/passes/meta_info_prop.py        |  3 +-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/colossalai/fx/passes/adding_split_node_pass.py b/colossalai/fx/passes/adding_split_node_pass.py
index 373d20c51041..0499769d884d 100644
--- a/colossalai/fx/passes/adding_split_node_pass.py
+++ b/colossalai/fx/passes/adding_split_node_pass.py
@@ -9,6 +9,40 @@ def pipe_split():
     pass
 
 
+def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):
+    """
+    In avgcompute_split_pass, we split module by the fwd flops.
+    """
+    mod_graph = gm.graph
+    # To use avgcompute_split_pass, we need run meta_info_prop interpreter first.
+    # If nodes don't have meta info, this pass will fall back to normal balanced split pass.
+    check_node = list(mod_graph.nodes)[0]
+    if 'tensor_meta' not in check_node.meta:
+        return balanced_split_pass(gm, pp_size)
+
+    total_fwd_flop = 0
+    for node in mod_graph.nodes:
+        total_fwd_flop += node.fwd_flop
+
+    partition_flop = total_fwd_flop // pp_size
+    accumulate_fwd_flop = 0
+    for node in mod_graph.nodes:
+        if pp_size <= 1:
+            break
+        if 'pipe_split' in node.name:
+            continue
+        accumulate_fwd_flop += node.fwd_flop
+        if accumulate_fwd_flop >= partition_flop:
+            total_fwd_flop = total_fwd_flop - accumulate_fwd_flop
+            accumulate_fwd_flop = 0
+            pp_size -= 1
+            partition_flop = total_fwd_flop // pp_size
+            with mod_graph.inserting_after(node):
+                split_node = mod_graph.create_node('call_function', pipe_split)
+    gm.recompile()
+    return gm
+
+
 def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
     """
     In avgnode_split_pass, simpliy split graph by node number.
@@ -104,8 +138,10 @@ def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int):
             continue
         accumulate_node_size += node.node_size
         if accumulate_node_size >= partition_size:
+            total_element_size = total_element_size - accumulate_node_size
             accumulate_node_size = 0
             pp_size -= 1
+            partition_size = total_element_size // pp_size
             with mod_graph.inserting_after(node):
                 split_node = mod_graph.create_node('call_function', pipe_split)
     gm.recompile()
diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py
index 5137494ada6f..281cae41f77d 100644
--- a/colossalai/fx/passes/meta_info_prop.py
+++ b/colossalai/fx/passes/meta_info_prop.py
@@ -112,7 +112,8 @@ def extract_tensor_meta(obj):
         n.meta['tensor_meta'] = tensor_meta
         n.meta = {**n.meta, **asdict(meta_info)}    # extend MetaInfo to `n.meta`
         # TODO: the attribute node_size should be removed in the future
-        setattr(n, 'node_size', activation_size(n.meta.get('fwd_in', 0)) + activation_size(n.meta.get('fwd_tmp', 0)))
+        setattr(n, 'node_size', activation_size(n.meta.get('fwd_out', 0)) + activation_size(n.meta.get('fwd_tmp', 0)))
+        setattr(n, 'fwd_flop', n.meta.get('fwd_flop', 0))
         n.meta['type'] = type(result)
 
         # retain the autograd graph

From 72341e65f4fbeb1884d9cd1ce3d1996ae8642bc8 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Fri, 20 Jan 2023 10:13:03 +0800
Subject: [PATCH 209/503] [auto-chunk] support extramsa (#3) (#2504)

---
 colossalai/autochunk/estimate_memory.py       |   9 +-
 colossalai/autochunk/trace_flow.py            |  43 +++--
 colossalai/autochunk/trace_indice.py          |  56 +++++-
 colossalai/autochunk/utils.py                 |  20 ++-
 .../test_autochunk/test_evoformer_codegen.py  |   2 +-
 tests/test_autochunk/test_extramsa_codegen.py | 164 ++++++++++++++++++
 .../test_simple_evoformer_codegen.py          |   2 +-
 .../test_simple_evoformer_search.py           |  41 ++---
 8 files changed, 283 insertions(+), 54 deletions(-)
 create mode 100644 tests/test_autochunk/test_extramsa_codegen.py

diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py
index d386253850a7..21f34481ba70 100644
--- a/colossalai/autochunk/estimate_memory.py
+++ b/colossalai/autochunk/estimate_memory.py
@@ -6,12 +6,7 @@
 
 from colossalai.fx.profiler import activation_size, parameter_size
 
-from .utils import (
-    delete_free_var_from_last_use,
-    find_idx_by_name,
-    get_node_shape,
-    is_non_compute_node_except_placeholder,
-)
+from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape, is_non_memory_node
 
 
 class EstimateMemory(object):
@@ -240,7 +235,7 @@ def estimate_chunk_inference_mem(
             elif node.op == "output":
                 continue
             # no change for non compute node
-            elif is_non_compute_node_except_placeholder(node):
+            elif is_non_memory_node(node):
                 act_memory_peak_log.append(act_memory)
             # node is a compute op
             # calculate tmp, output node and delete node memory
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 04fa2b3bb480..e657c188ead2 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -118,16 +118,34 @@ def check_index_duplicate(self, chunk_infos, return_dim=False):
 
     def _assgin_single_node_flow(
         self,
-        arg_node,
-        start_idx,
-        end_idx,
-        cur_node_dim,
-        cur_node_compute,
-        cur_node_source,
-        cur_node_fix_dim,
-        all_node_info,
-        next_node_list,
-    ):
+        arg_node: Node,
+        start_idx: int,
+        end_idx: int,
+        cur_node_dim: int,
+        cur_node_compute: Dict,
+        cur_node_source: Dict,
+        cur_node_fix_dim: List,
+        all_node_info: Dict,
+        next_node_list: List,
+    ) -> bool:
+        """
+        Given the current node and one of its arg node,
+        this function finds out arg node's chunk dim and fix dim
+
+        Args:
+            arg_node (Node): input node
+            start_idx (int): chunk region start
+            end_idx (int): chunk region end
+            cur_node_dim (int): current node chunk dim
+            cur_node_compute (Dict): current node compute dict
+            cur_node_source (Dict): current node source dict
+            cur_node_fix_dim (List): current node fix dim
+            all_node_info (Dict): all node chunk info in the chunk region
+            next_node_list (List)
+
+        Returns:
+            bool: True if this node can be added to the flow, vice versa.
+        """
         arg_idx = find_idx_by_name(arg_node.name, self.trace_indice.node_list)
         # arg in chunk range or be inputs
         if not (start_idx <= arg_idx < end_idx):
@@ -142,6 +160,9 @@ def _assgin_single_node_flow(
                 arg_dim = None
             else:
                 arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
+                # chunk dim should be None if shape size is 1
+                if get_node_shape(arg_node)[arg_dim] == 1:
+                    arg_dim = None
         else:
             arg_dim = None
 
@@ -184,7 +205,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
 
                 # get all valid args
                 arg_list = []
-                for arg in cur_node.args:
+                for arg in cur_node.all_input_nodes:
                     if type(arg) != type(cur_node):
                         continue
                     if is_non_compute_node(arg):
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 862cd6b99ccc..5c2e9b5203b5 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -432,6 +432,38 @@ def _assign_ones_like_indice(self, node: Node, node_idx: int):
         """
         self._assign_all_indice(node, node_idx)
 
+    def _assign_cat_indice(self, node: Node, node_idx: int):
+        """
+        Assign indice for cat op.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        nodes_in = flat_list(node.args[0])
+        self._assign_indice_as_input(node, node_idx, input_node=nodes_in[0])
+        for n in nodes_in[1:]:
+            self._mark_computation_from_node(n, node)
+        cat_dim = node.kwargs["dim"]
+        self._del_dim(node_idx, cat_dim)
+        self._add_dim(node_idx, cat_dim)
+
+    def _assign_sum_indice(self, node: Node, node_idx: int):
+        """
+        Assign indice for sum op.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        nodes_in = flat_list(node.args[0])
+        self._add_dim(node_idx, 0)
+        self._assign_indice_as_input(node, node_idx, input_node=nodes_in[0])
+        for n in nodes_in[1:]:
+            self._mark_computation_from_node(n, node)
+        cat_dim = node.kwargs["dim"]
+        self._del_dim(node_idx, cat_dim)
+
     def _assign_getitem_indice(self, node: Node, node_idx: int):
         """
         Assign indice for getitem.
@@ -442,7 +474,16 @@ def _assign_getitem_indice(self, node: Node, node_idx: int):
             node_idx (int)
         """
         node_args = flat_list(node.args[1:])
-        if not any(i == str(node_arg) for i in ["None", "Ellipsis"] for node_arg in node_args):
+        flag = False
+        for node_arg in node_args:
+            node_arg_str = str(node_arg)
+            if any(i == node_arg_str for i in ["None", "Ellipsis"]):
+                flag = True
+                break
+            if "slice" in node_arg_str:
+                flag = True
+                break
+        if flag == False:
             return
 
         # node args should be like [Ellipsis, slice(start, step, end), None]
@@ -461,8 +502,11 @@ def _assign_getitem_indice(self, node: Node, node_idx: int):
                 shape_gap = len(node_shape) - len(node_args) + 1
                 origin_idx_count += shape_gap
                 new_idx_count += shape_gap
-            # slice(None, None, None) means all indexes, doesn't support other slice
-            elif "slice(None, None, None)" == node_arg_str:
+            # slice(None, None, None) means all indexes
+            elif "slice" in node_arg_str:
+                if "slice(None, None, None)" != node_arg_str:
+                    self._del_dim(node_idx, new_idx_count)
+                    self._add_dim(node_idx, new_idx_count)
                 origin_idx_count += 1
                 new_idx_count += 1
             # None means a new dim
@@ -565,7 +609,7 @@ def trace_indice(self):
                     self._assign_view_reshape_indice(node, idx)
                 elif "unsqueeze" in node.name:
                     self._assign_unsqueeze_indice(node, idx)
-                elif any(i in node.name for i in ["to", "contiguous"]):
+                elif any(i in node.name for i in ["to", "contiguous", "clone"]):
                     self._assgin_no_change_indice(node, idx)
                 elif "new_ones" in node.name:
                     self._assign_ones_like_indice(node, idx)
@@ -574,6 +618,8 @@ def trace_indice(self):
             elif node.op == "call_function":
                 if "linear" in node.name:
                     self._assign_linear_indice(node, idx)
+                elif "cat" in node.name:
+                    self._assign_cat_indice(node, idx)
                 elif "matmul" in node.name:
                     self._assign_matmul_indice(node, idx)
                 elif "softmax" in node.name:
@@ -586,6 +632,8 @@ def trace_indice(self):
                     self._assign_dropout_indice(node, idx)
                 elif "einsum" in node.name:
                     self._assign_einsum_indice(node, idx)
+                elif "sum" in node.name:
+                    self._assign_sum_indice(node, idx)
                 elif "layer_norm" in node.name:
                     self._assign_layernorm_indice(node, idx)
                 elif "getitem" in node.name:
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index 9c2363b544e2..ff1a64bc359d 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -3,10 +3,12 @@
 from torch.fx.node import Node
 
 
-def flat_list(inputs):
+def flat_list(inputs: Any) -> List:
     """
     flat a list by recursion
     """
+    if not (isinstance(inputs, list) or isinstance(inputs, set) or isinstance(inputs, tuple)):
+        return [inputs]
     res = []
     for i in inputs:
         if isinstance(i, list) or isinstance(i, set) or isinstance(i, tuple):
@@ -16,7 +18,7 @@ def flat_list(inputs):
     return res
 
 
-def find_first_tensor_arg(node):
+def find_first_tensor_arg(node: Node) -> Node:
     """
     Find the first input tensor arg for a node
     """
@@ -26,7 +28,7 @@ def find_first_tensor_arg(node):
     raise RuntimeError()
 
 
-def is_non_compute_node(node):
+def is_non_compute_node(node: Node) -> bool:
     if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(i in node.name for i in ["getattr"]):
         return True
     if "getitem" in node.name:
@@ -34,16 +36,26 @@ def is_non_compute_node(node):
         for node_arg in node_args:
             if any(i == str(node_arg) for i in ["None", "Ellipsis"]):
                 return False
+            if "slice" in str(node_arg):
+                return False
         return True
     return False
 
 
-def get_node_shape(node):
+def get_node_shape(node: Node) -> List:
     if hasattr(node.meta["tensor_meta"], "shape"):
         return node.meta["tensor_meta"].shape
     return None
 
 
+def is_non_memory_node(node: Node) -> bool:
+    if "getitem" in node.name:
+        return True
+    if "output" in node.op:
+        return True
+    return is_non_compute_node(node)
+
+
 def is_non_compute_node_except_placeholder(node):
     if "placeholder" in node.op:
         return False
diff --git a/tests/test_autochunk/test_evoformer_codegen.py b/tests/test_autochunk/test_evoformer_codegen.py
index c5a893eda7cc..ba6a57a51ce3 100644
--- a/tests/test_autochunk/test_evoformer_codegen.py
+++ b/tests/test_autochunk/test_evoformer_codegen.py
@@ -130,7 +130,7 @@ def _test_evoformer_codegen(rank, msa_len, pair_len, max_memory):
         },
     )
     graph.set_codegen(codegen)
-    gm = ColoGraphModule(model, graph)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
     gm.recompile()
 
     # assert we have inserted chunk
diff --git a/tests/test_autochunk/test_extramsa_codegen.py b/tests/test_autochunk/test_extramsa_codegen.py
new file mode 100644
index 000000000000..2a41452a2ad7
--- /dev/null
+++ b/tests/test_autochunk/test_extramsa_codegen.py
@@ -0,0 +1,164 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.fx
+import torch.multiprocessing as mp
+
+try:
+    from fastfold.model.nn.evoformer import ExtraMSABlock
+    HAS_REPO = True
+except:
+    HAS_REPO = False
+
+import colossalai
+from colossalai.core import global_context as gpc
+from colossalai.fx._compatibility import is_compatible_with_meta
+from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if CODEGEN_AVAILABLE and is_compatible_with_meta():
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair, node_mask, pair_mask):
+    # for memory test
+    # model = model.cuda()
+    # torch.cuda.reset_peak_memory_stats()
+    # now_mem = torch.cuda.memory_allocated() / 1024**2
+    # with torch.no_grad():
+    #     node1 = node.clone()
+    #     pair1 = pair.clone()
+    #     node_mask1 = node_mask.clone()
+    #     pair_mask1 = pair_mask.clone()
+    #     gm(node1, pair1, node_mask1, pair_mask1)
+    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    # print("autochunk max mem:%.2f"% (new_max_mem - now_mem))
+
+    # test forward
+    model = model.cuda()
+    with torch.no_grad():
+        non_fx_out = model(node, pair, node_mask, pair_mask)
+        fx_out = gm(node, pair, node_mask, pair_mask)
+
+    assert torch.allclose(non_fx_out[0], fx_out[0],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[0] - fx_out[0]))
+    assert torch.allclose(non_fx_out[1], fx_out[1],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[1] - fx_out[1]))
+
+
+def _build_openfold():
+    model = ExtraMSABlock(
+        c_m=256,
+        c_z=128,
+        c_hidden_msa_att=32,
+        c_hidden_opm=32,
+        c_hidden_mul=128,
+        c_hidden_pair_att=32,
+        no_heads_msa=8,
+        no_heads_pair=4,
+        transition_n=4,
+        msa_dropout=0.15,
+        pair_dropout=0.15,
+        inf=1e4,
+        eps=1e-4,
+        ckpt=False,
+        is_multimer=False,
+    ).eval().cuda()
+    return model
+
+
+def _test_extramsa_codegen(rank, msa_len, pair_len, max_memory):
+    # launch colossalai
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+
+    # build model and input
+    model = _build_openfold()
+    node = torch.randn(1, msa_len, pair_len, 256).cuda()
+    node_mask = torch.randn(1, msa_len, pair_len).cuda()
+    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
+    pair_mask = torch.randn(1, pair_len, pair_len).cuda()
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={
+            "m": node.to(torch.device("meta")),
+            "z": pair.to(torch.device("meta")),
+            "msa_mask": node_mask.to(torch.device("meta")),
+            "pair_mask": pair_mask.to(torch.device("meta")),
+        },
+        concrete_args={
+            "chunk_size": None,
+            "_chunk_logits": 1024,
+        },
+    )
+    interp = MetaInfoProp(meta_graph)
+    interp.propagate(
+        MetaTensor(node, fake_device="cuda:0"),
+        MetaTensor(pair, fake_device="cuda:0"),
+        MetaTensor(node_mask, fake_device="cuda:0"),
+        MetaTensor(pair_mask, fake_device="cuda:0"),
+    )
+    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory, print_mem=False)
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model,
+        meta_args={
+            "m": node.to(torch.device("meta")),
+            "z": pair.to(torch.device("meta")),
+            "msa_mask": node_mask.to(torch.device("meta")),
+            "pair_mask": pair_mask.to(torch.device("meta")),
+        },
+        concrete_args={
+            "chunk_size": None,
+            "_chunk_logits": 1024,
+        },
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # assert we have inserted chunk
+    code = graph.python_code("self").src
+    # print(code)
+    assert "chunk_result = None;  chunk_size = None;" in code
+
+    _test_fwd(model, gm, node, pair, node_mask, pair_mask)
+    gpc.destroy()
+
+
+@pytest.mark.skipif(
+    not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
+    reason="torch version is lower than 1.12.0",
+)
+@pytest.mark.parametrize("max_memory", [None, 24, 28, 32])
+@pytest.mark.parametrize("msa_len", [32])
+@pytest.mark.parametrize("pair_len", [64])
+def test_extramsa_codegen(msa_len, pair_len, max_memory):
+    run_func = partial(
+        _test_extramsa_codegen,
+        msa_len=msa_len,
+        pair_len=pair_len,
+        max_memory=max_memory,
+    )
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    _test_extramsa_codegen(0, 32, 64, None)
diff --git a/tests/test_autochunk/test_simple_evoformer_codegen.py b/tests/test_autochunk/test_simple_evoformer_codegen.py
index 8ab77024c1b9..7fe149c5784d 100644
--- a/tests/test_autochunk/test_simple_evoformer_codegen.py
+++ b/tests/test_autochunk/test_simple_evoformer_codegen.py
@@ -73,7 +73,7 @@ def _test_simple_evoformer_codegen(rank, msa_len, pair_len, max_memory):
         },
     )
     graph.set_codegen(codegen)
-    gm = ColoGraphModule(model, graph)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
     gm.recompile()
 
     # assert we have inserted chunk
diff --git a/tests/test_autochunk/test_simple_evoformer_search.py b/tests/test_autochunk/test_simple_evoformer_search.py
index 4c591c48319e..89f28d625cbe 100644
--- a/tests/test_autochunk/test_simple_evoformer_search.py
+++ b/tests/test_autochunk/test_simple_evoformer_search.py
@@ -13,6 +13,7 @@
 
 import colossalai
 from colossalai.core import global_context as gpc
+from colossalai.fx import symbolic_trace
 from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
@@ -28,10 +29,10 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
 
     if msa_len == 32 and pair_len == 64:
         if max_memory is None:
-            target_regions = [(142, 154), (366, 373), (233, 283), (301, 351), (127, 134), (204, 228), (167, 191),
-                              (161, 166), (198, 203), (6, 69)]
+            target_regions = [(142, 154), (366, 373), (234, 283), (302, 351), (127, 134), (211, 228), (174, 191),
+                              (161, 166), (198, 203), (7, 57)]
         elif max_memory == 20:
-            target_regions = [(142, 154), (369, 373), (233, 269), (301, 351)]
+            target_regions = [(142, 154), (369, 373), (235, 269), (303, 351), (130, 131)]
         elif max_memory == 25:
             target_regions = [(144, 154), (369, 370)]
         elif max_memory == 30:
@@ -41,25 +42,10 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
     else:
         raise NotImplementedError()
 
-    assert len(found_regions) == len(
-        target_regions), "len of found regions %s doesn't equal len of target regions %s" % (
-            str(found_regions),
-            str(target_regions),
-        )
-    for region in target_regions:
-        assert (region in found_regions), "region:%s not in found regions for msa:%d, pair:%d, maxmem:%s" % (
-            str(region),
-            msa_len,
-            pair_len,
-            str(max_memory),
-        )
-    for region in found_regions:
-        assert (region in target_regions), "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (
-            str(region),
-            msa_len,
-            pair_len,
-            str(max_memory),
-        )
+    assert found_regions == target_regions, "found regions %s doesn't equal target regions %s" % (
+        str(found_regions),
+        str(target_regions),
+    )
 
 
 def _test_simple_evoformer_search(rank, msa_len, pair_len, max_memory):
@@ -78,11 +64,14 @@ def _test_simple_evoformer_search(rank, msa_len, pair_len, max_memory):
     node = torch.randn(1, msa_len, pair_len, 256).cuda()
     pair = torch.randn(1, pair_len, pair_len, 128).cuda()
 
-    gm_prop = torch.fx.symbolic_trace(model)    # must use symbolic_trace
-    interp = MetaInfoProp(gm_prop)
+    meta_graph = symbolic_trace(model,
+                                meta_args={
+                                    "node": node.to(torch.device("meta")),
+                                    "pair": pair.to(torch.device("meta")),
+                                })    # must use symbolic_trace
+    interp = MetaInfoProp(meta_graph)
     interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
-
-    codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory)
+    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory)
     chunk_infos = codegen.chunk_infos
     assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len)
 

From 35c0c0006e84e1f7272a36f84af609c465aa5d83 Mon Sep 17 00:00:00 2001
From: Super Daniel <78588128+super-dainiu@users.noreply.github.com>
Date: Fri, 20 Jan 2023 10:49:00 +0800
Subject: [PATCH 210/503] [utils] lazy init. (#2148)

* [utils] lazy init.

* [utils] remove description.

* [utils] complete.

* [utils] finalize.

* [utils] fix names.
---
 colossalai/fx/profiler/tensor.py       |  45 ++-
 colossalai/utils/model/experimental.py | 440 +++++++++++++++++++++++++
 2 files changed, 461 insertions(+), 24 deletions(-)
 create mode 100644 colossalai/utils/model/experimental.py

diff --git a/colossalai/fx/profiler/tensor.py b/colossalai/fx/profiler/tensor.py
index 43165305f010..7606f17cf9d5 100644
--- a/colossalai/fx/profiler/tensor.py
+++ b/colossalai/fx/profiler/tensor.py
@@ -1,6 +1,4 @@
 import uuid
-from copy import deepcopy
-from typing import Optional
 
 import torch
 from torch.types import _bool, _device, _dtype
@@ -28,8 +26,6 @@ class MetaTensor(torch.Tensor):
 
     _tensor: torch.Tensor
 
-    __slots__ = ['_tensor']
-
     @staticmethod
     def __new__(cls, elem, fake_device=None):
         # Avoid multiple wrapping
@@ -47,7 +43,7 @@ def __new__(cls, elem, fake_device=None):
             storage_offset=elem.storage_offset(),
             dtype=elem.dtype,
             layout=elem.layout,
-            device=fake_device if fake_device is not None else elem.device,
+            device=fake_device if fake_device is not None else torch.device('cpu'),
             requires_grad=elem.requires_grad)    # deceive the frontend for aten selections
         r._tensor = elem
         # ...the real tensor is held as an element on the tensor.
@@ -59,8 +55,8 @@ def __new__(cls, elem, fake_device=None):
 
     def __repr__(self):
         if self.grad_fn:
-            return f"MetaTensor({self._tensor}, fake_device='{self.device}', grad_fn={self.grad_fn})"
-        return f"MetaTensor({self._tensor}, fake_device='{self.device}')"
+            return f"MetaTensor(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype}, grad_fn={self.grad_fn})"
+        return f"MetaTensor(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype})"
 
     @classmethod
     def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
@@ -76,13 +72,13 @@ def unwrap(x):
                 x = x.to(torch.device('meta'))
             return x
 
+        args = tree_map(unwrap, args)
+        kwargs = tree_map(unwrap, kwargs)
+
         if 'device' in kwargs:
             fake_device = kwargs['device']
             kwargs['device'] = torch.device('meta')
 
-        args = tree_map(unwrap, args)
-        kwargs = tree_map(unwrap, kwargs)
-
         # run aten for backend=CPU but actually on backend=Meta
         out = func(*args, **kwargs)
 
@@ -118,23 +114,24 @@ def to(self, *args, **kwargs) -> torch.Tensor:
             MetaTensor(tensor(..., device='meta', size=(10,)), fake_device='vulkan')
         """
         # this imitates c++ function in the way of @overload
-        device = None
-        for arg in args:
-            if isinstance(arg, str) or isinstance(arg, _device):
-                device = arg
-        if 'device' in kwargs:
-            device = kwargs['device']
-        result = super().to(*args, **kwargs)
-        if device is not None:
-            result = MetaTensor(result, fake_device=device)
-        return result
+        fake_device = None
+
+        def replace(x):
+            nonlocal fake_device
+            if isinstance(x, str) or isinstance(x, _device):
+                fake_device = x
+                return 'meta'
+            return x
+
+        elem = self._tensor.to(*tree_map(replace, args), **tree_map(replace, kwargs))
+        return MetaTensor(elem, fake_device=fake_device)
 
     def cpu(self, *args, **kwargs):
         if self.device.type == 'cpu':
             return self.to(*args, **kwargs)
         return self.to(*args, device='cpu', **kwargs)
 
-    def cuda(self, *args, **kwargs):
-        if self.device.type == 'cuda':
-            return self.to(*args, **kwargs)
-        return self.to(*args, device='cuda', **kwargs)
+    def cuda(self, device=None, non_blocking=False):
+        if device is not None:
+            return self.to(device=device, non_blocking=non_blocking)
+        return self.to(device='cuda:0', non_blocking=non_blocking)
diff --git a/colossalai/utils/model/experimental.py b/colossalai/utils/model/experimental.py
new file mode 100644
index 000000000000..8291227b7ba2
--- /dev/null
+++ b/colossalai/utils/model/experimental.py
@@ -0,0 +1,440 @@
+import contextlib
+import copy
+import gc
+import pprint
+from typing import Callable, List, Optional, Union
+
+import torch
+import torch.nn as nn
+from torch.utils._pytree import tree_map
+
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx.profiler import MetaTensor
+from colossalai.tensor.shape_consistency import ShapeConsistencyManager
+from colossalai.tensor.sharding_spec import ShardingSpec
+
+# reference: https://pytorch.org/cppdocs/notes/tensor_creation.html
+_TorchFactoryMethod = [
+    "arange",
+    "empty",
+    "eye",
+    "full",
+    "linspace",
+    "logspace",
+    "ones",
+    "rand",
+    "randn",
+    "randint",
+    "randperm",
+    "zeros",
+    "tensor",
+]
+
+orig_empty = torch.empty    # avoid override
+
+scm = ShapeConsistencyManager()
+
+
+class LazyTensor(torch.Tensor):
+    """A naive implementation of LazyTensor (https://arxiv.org/pdf/2102.13267.pdf).
+
+    Usage:
+        1. Use ``LazyTensor`` instead of ``torch.Tensor``.
+        >>> x = LazyTensor(torch.zeros, 2, 3)
+        >>> x += 1
+        >>> y = x * x
+        >>> y = y.cuda().half()
+        >>> y[0, 0] = 0
+        >>> y = y.materialize()     # materialize the tensor
+        >>> print(y)
+        tensor([[0., 1., 1.],
+                [1., 1., 1.]], device='cuda:0', dtype=torch.float16)
+
+        2. Generate ``MetaTensor`` from ``LazyTensor``
+        >>> x = LazyTensor(torch.zeros, 2, 3)
+        >>> x = x.reshape(3, 2)
+        >>> x = x.traceable()    # generate ``MetaTensor``
+        >>> print(x)
+        MetaTensor(..., size=(3, 2), device='cpu', dtype=torch.float32)
+
+        3. Use ``LazyTensor`` to generate sharded ``nn.Parameter``.
+        >>> x = LazyTensor(torch.zeros, 2, 3)
+        >>> x.spec = ...    # some ``ShardingSpec``
+        >>> x.distribute()    # distribute the tensor according to the ``ShardingSpec``
+
+    Warnings:
+        1. Cases that ``LazyTensor`` can't deal with.
+        >>> x = LazyTensor(torch.ones, 2, 3)
+        >>> x[0, 0] = -x[0, 0]    # this will cause infinite recursion
+
+        2. ``LazyTensor.materialize()`` can't be called multiple times.
+        >>> x = LazyTensor(torch.ones, 2, 3)
+        >>> x.materialize()
+        >>> x.materialize()    # this is disallowed
+    """
+
+    _repr = True
+    _meta_data: Optional[MetaTensor] = None    # shape, dtype, device
+    _cached_data: Optional[torch.Tensor] = None    # materialized data
+
+    @staticmethod
+    def __new__(cls, func, *args, dtype=None, device=None, **kwargs):
+        elem = func(*args, dtype=dtype, device='meta', **kwargs)
+        r = torch.Tensor._make_wrapper_subclass(cls,
+                                                elem.size(),
+                                                strides=elem.stride(),
+                                                storage_offset=elem.storage_offset(),
+                                                dtype=elem.dtype,
+                                                layout=elem.layout,
+                                                device=device if device is not None else torch.device('cpu'),
+                                                requires_grad=elem.requires_grad)
+        r._meta_data = MetaTensor(elem, fake_device=device)
+        return r
+
+    def __init__(self, func, *args, dtype=None, device=None, **kwargs):
+        self._factory_method = (func, args, {'dtype': dtype, 'device': device, **kwargs})    # (func, args, kwargs)
+        self._cached_buffer = list()    # (func, args, kwargs)
+        self._spec = None
+        self._data = self
+
+    def __repr__(self):
+        if self._repr:
+            # avoid recursive representation
+            self.__class__._repr = False
+            s = f'LazyTensor(..., size={tuple(self._meta_data.shape)}, device={self._meta_data.device}, dtype={self._meta_data.dtype})\n'\
+                f'factory method: {self._factory_method}\n'\
+                f'cached: {pprint.pformat(self._cached_buffer) if self._cached_data is None else self._cached_data}\n'\
+                f'spec: {self._spec}'
+            self.__class__._repr = True
+            return s
+        else:
+            return 'LazyTensor(...)'
+
+    def materialize(self) -> torch.Tensor:
+        """Materialize the ``LazyTensor`` to ``torch.Tensor``.
+
+        Warnings:
+            Calling ``self.materialize()`` will clear all cached sequence and factory method,
+            because we don't allow materialize the same ``LazyTensor`` twice.
+            This is mentioned in the paper: https://arxiv.org/pdf/2102.13267.pdf (Part 4.3).
+
+        Returns:
+            torch.Tensor: The materialized tensor.
+        """
+        target = self._data._realize_cached_data()
+        if isinstance(self, nn.Parameter):
+            target = nn.Parameter(target, requires_grad=self.requires_grad)
+        self._clear_all()
+        return target
+
+    def traceable(self) -> MetaTensor:
+        """Generate ``MetaTensor`` from ``LazyTensor``. (Mostly for tracing)
+
+        Returns:
+            MetaTensor: The generated ``MetaTensor``.
+        """
+        if isinstance(self, nn.Parameter):
+            return nn.Parameter(self._meta_data, requires_grad=self.requires_grad)
+        else:
+            return self._meta_data
+
+    def distribute(self) -> torch.Tensor:
+        """Distribute the ``LazyTensor`` according to the ``ShardingSpec``.
+
+        Returns:
+            torch.Tensor: The sharded tensor.
+        """
+        if self._spec is None:
+            raise RuntimeError(f'ShardingSpec is not set for\n{self}')    # f-string: interpolate the tensor repr
+        spec, device_mesh = self._spec, self._spec.device_mesh
+        target = self.materialize()
+
+        # TODO(some man): better not be coupled with auto-parallel
+        target.data = scm.apply_for_autoparallel_runtime(target.data, ShardingSpec(device_mesh, target.shape, {}),
+                                                         spec).detach().clone()
+        return target
+
+    def _realize_cached_data(self) -> torch.Tensor:
+        # self._cached_data should be generated after the first call of this function
+        if self._cached_data is None:
+            if self._factory_method is not None:
+                # apply factory method
+                func, args, kwargs = self._factory_method
+
+                # apply cached sequence
+                self._cached_data = self._apply_cache_buffer(func(*args, **kwargs))
+            else:
+                # apply cached sequence only
+                self._cached_data = self._apply_cache_buffer()
+        return self._cached_data
+
+    def _apply_cache_buffer(self, target=None) -> torch.Tensor:
+        # dump all cached sequence
+        # super-dainiu: support methods for single Tensor only
+        def replace(x):
+            if x is self:
+                return target
+            elif isinstance(x, LazyTensor):
+                return x._realize_cached_data()
+            return x
+
+        packed = None
+
+        for (func, args, kwargs) in self._cached_buffer:
+            if func == torch.Tensor.requires_grad_:
+                packed = func, args, kwargs    # requires grad should be set at last
+            else:
+                o = func(*tree_map(replace, args), **tree_map(replace, kwargs))
+                target = o if isinstance(o, torch.Tensor) else target    # if func returns non-Tensor, discard the value
+
+        # super-dainiu: set requires_grad after all inplace-ops are done
+        if packed is not None:
+            func, args, kwargs = packed
+            func(*tree_map(replace, args), **tree_map(replace, kwargs))
+
+        return target
+
+    # clear all means:
+    #   1. clear factory method
+    #   2. clear cached sequence
+    #   3. clear cached data
+    def _clear_all(self):
+        self._cached_data = None
+        self._cached_buffer = None
+        self._data = None
+        gc.collect()    # avoid memory leak
+
+    # cache everything with __torch_function__
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        if kwargs is None:
+            kwargs = {}
+        target = None
+
+        if isinstance(func, torch._C.ScriptMethod):
+
+            def unwrap(x):
+                if isinstance(x, LazyTensor):
+                    return x._meta_data
+                return x
+
+            target: LazyTensor = args[0].clone()
+            target._cached_buffer.append((func, args, kwargs))
+            o = getattr(target._meta_data, func.name)(*tree_map(unwrap, args[1:]),
+                                                      **tree_map(unwrap, kwargs))    # bind ``o`` for the shared tail
+
+        else:
+
+            def unwrap(x):
+                nonlocal target
+                if isinstance(x, LazyTensor):
+                    target = x if (func.__name__.endswith('_') and not (func.__name__.endswith('__'))
+                                   or func.__name__ == "__setitem__") else x.clone()
+                    target._cached_buffer.append((func, args, kwargs))
+                    return x._meta_data
+                return x
+
+            args = tree_map(unwrap, args)
+            kwargs = tree_map(unwrap, kwargs)
+            o = func(*args, **kwargs)
+
+        if isinstance(o, MetaTensor):    # both branches reach here with ``o`` bound to the meta result
+            target._meta_data = o
+            return target
+        else:
+            return o
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        pass    # skip
+
+    def clone(self) -> "LazyTensor":
+        """Create a new ``LazyTensor`` with same cached sequence and factory method.
+
+        Returns:
+            LazyTensor: the new ``LazyTensor``
+        """
+        target = LazyTensor(orig_empty, 0, dtype=self._meta_data.dtype, device=self._meta_data.device)
+        target._factory_method = None
+        target._cached_buffer = list()
+        target._meta_data = self._meta_data.clone()
+        target._cached_data = self._cached_data.clone() if self._cached_data is not None else None
+        target._spec = copy.deepcopy(self._spec)
+        return target
+
+    def detach(self) -> "LazyTensor":
+        target = self.clone()
+        target._cached_buffer.append((torch.Tensor.detach_, (self,), {}))
+        return target
+
+    @property
+    def spec(self) -> ShardingSpec:
+        return self._spec
+
+    @spec.setter
+    def spec(self, other: ShardingSpec):
+        self._spec = other
+
+    @property
+    def data(self) -> "LazyTensor":
+        return self._data.detach()
+
+    @data.setter
+    def data(self, other: "LazyTensor") -> "LazyTensor":
+        """This avoid the following infinite recursion, which is very common in ``nn.Module`` initialization.
+
+        Usage:
+            >>> a = LazyTensor(torch.empty, 0, dtype=torch.float32, device='cpu')
+            >>> b = a.cuda()
+            >>> a.data = b
+        """
+        self._data = other
+
+
+class LazyInitContext():
+    """Context manager for lazy initialization. Enables initializing the model without allocating real memory.
+
+    Usage:
+        1. The model is initialized, but no real memory is allocated.
+        >>> ctx = LazyInitContext()
+        >>> with ctx:
+        >>>     model = MyModel().cuda()
+
+        2. The model is initialized with ``MetaTensor`` as weights, but still no real memory is allocated.
+        >>> with ctx.traceable(model):
+        >>>     gm = symbolic_trace(model, meta_args=meta_args)
+        >>> # Solve the execution strategy and apply the strategy to the model
+        >>> strategy = StrategyAndSpec()
+
+        3. The model is initialized with ``torch.Tensor`` as weights, and real memory is allocated. (single device)
+        >>> model = ctx.materialize(model)
+
+        4. The model is initialized with sharded ``torch.Tensor`` as weights, and real memory is allocated. (distributed scenario)
+        >>> model = apply_strategy_to_all_params(model, strategy)
+        >>> model = ctx.distribute(model)
+
+    Warnings:
+        This API is still experimental and further modifications can be made to it.
+        For example:
+            1. Quantization strategies can be applied before allocating real memory.
+            2. Lazy initialization seems slower than normal initialization.
+    """
+
+    def __init__(self):
+        self.overrides = {}
+
+    def __enter__(self):
+
+        def wrap_factory_method(target):
+            # factory functions (eg. torch.empty())
+            def wrapper(*args, **kwargs):
+                return LazyTensor(target, *args, **kwargs)
+
+            return wrapper, target
+
+        def wrap_factory_like_method(orig_target, target):
+            # factory_like functions (eg. torch.empty_like())
+            def wrapper(*args, **kwargs):
+                orig_t = args[0]
+                return LazyTensor(orig_target, *args[1:], device=orig_t.device, dtype=orig_t.dtype, **kwargs)
+
+            return wrapper, target
+
+        self.overrides = {
+            target: wrap_factory_method(getattr(torch, target))
+            for target in _TorchFactoryMethod
+            if callable(getattr(torch, target, None))
+        }
+
+        self.overrides.update({
+            target + '_like': wrap_factory_like_method(getattr(torch, target), getattr(torch, target + '_like'))
+            for target in _TorchFactoryMethod
+            if callable(getattr(torch, target + '_like', None))
+        })
+
+        for name, (wrapper, orig) in self.overrides.items():
+            setattr(torch, name, wrapper)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        for name, (wrapper, orig) in self.overrides.items():
+            setattr(torch, name, orig)
+
+    @staticmethod
+    def materialize(module: torch.nn.Module):
+        """Initialize all ``nn.Parameter`` from ``LazyTensor``.
+
+        Args:
+            module (torch.nn.Module): Target ``nn.Module``
+        """
+
+        @torch.no_grad()
+        def init_recursively(module: nn.Module):
+            # recursively initialize the module
+            for mod in module.children():
+                init_recursively(mod)
+
+            # initialize tensors directly attached to the current module
+            for name, param in module.named_parameters(recurse=False):
+                setattr(module, name, param.materialize())
+
+            for name, buf in module.named_buffers(recurse=False):
+                setattr(module, name, buf.materialize())
+
+        init_recursively(module)
+        return module
+
+    @staticmethod
+    def distribute(module: torch.nn.Module):
+        """Initialize and shard all ``nn.Parameter`` from ``LazyTensor``.
+
+        Args:
+            module (torch.nn.Module): Sharded target ``nn.Module``
+        """
+
+        @torch.no_grad()
+        def init_recursively(module: nn.Module):
+            # recursively initialize the module
+            for mod in module.children():
+                init_recursively(mod)
+
+            # initialize tensors directly attached to the current module
+            for name, param in module.named_parameters(recurse=False):
+                setattr(module, name, param.distribute())
+
+            for name, buf in module.named_buffers(recurse=False):
+                setattr(module, name, buf.distribute())
+
+        init_recursively(module)
+        return module
+
+    @staticmethod
+    @contextlib.contextmanager
+    def traceable(module: torch.nn.Module):
+        """Initialize all ``nn.Parameters`` as ``MetaTensor``. This enables ``ColoTracer`` with control flow.
+
+        Args:
+            module (torch.nn.Module): Traceable ``nn.Module`` with ``MetaTensor`` as parameters.
+        """
+        orig_val = dict()
+
+        def init_recursively(module: nn.Module):
+            # recursively initialize the module
+            for mod in module.children():
+                init_recursively(mod)
+
+            # initialize tensors directly attached to the current module
+            for name, param in module.named_parameters(recurse=False):
+                setattr(module, name, param.traceable())
+                orig_val[(module, name)] = param
+
+            for name, buf in module.named_buffers(recurse=False):
+                setattr(module, name, buf.traceable())
+                orig_val[(module, name)] = buf
+
+        try:
+            init_recursively(module)
+            yield
+        finally:
+            # restore original values, even if the swap or the ``with`` body raised
+            for (module, name), val in orig_val.items():
+                setattr(module, name, val)

From c04f183237da31033bfc2b0b69f2fcc22270fb0b Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Fri, 20 Jan 2023 11:18:17 +0800
Subject: [PATCH 211/503] [autochunk] support parsing blocks (#2506)

---
 colossalai/autochunk/autochunk_codegen.py     |  12 +-
 colossalai/autochunk/estimate_memory.py       |  27 +++
 colossalai/autochunk/search_chunk.py          |  75 ++++++--
 colossalai/autochunk/trace_flow.py            |   5 +-
 colossalai/autochunk/trace_indice.py          |  46 +++++
 colossalai/autochunk/utils.py                 |   8 +
 .../test_evoformer_stack_codegen.py           | 163 ++++++++++++++++++
 7 files changed, 314 insertions(+), 22 deletions(-)
 create mode 100644 tests/test_autochunk/test_evoformer_stack_codegen.py

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index de5e7356bbfd..8c3155a60685 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -22,7 +22,7 @@
 from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
 
 from .search_chunk import SearchChunk
-from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape
+from .utils import delete_free_var_from_last_use, find_idx_by_name, get_logger, get_node_shape
 
 
 def _gen_chunk_slice_dim(chunk_dim: int, chunk_indice_name: str, shape: List) -> str:
@@ -276,11 +276,17 @@ def emit_code_with_chunk(
 
     class AutoChunkCodeGen(CodeGen):
 
-        def __init__(self, meta_graph, max_memory=None, print_mem=False):
+        def __init__(self,
+                     meta_graph,
+                     max_memory: int = None,
+                     print_mem: bool = False,
+                     print_progress: bool = False) -> None:
             super().__init__()
             # find the chunk regions
-            self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem)
+            self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem, print_progress)
             self.chunk_infos = self.search_chunk.search_region()
+            if print_progress:
+                get_logger().info("AutoChunk start codegen")
 
         def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
             free_vars: List[str] = []
diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py
index 21f34481ba70..a03a5413bc34 100644
--- a/colossalai/autochunk/estimate_memory.py
+++ b/colossalai/autochunk/estimate_memory.py
@@ -43,6 +43,8 @@ def _get_delete_node(self, user, user_to_last_uses, to_keep=None):
         delete_node = []
         if user.op not in ("output",):
             nodes_to_delete = user_to_last_uses.get(user, [])
+            if len(user.users) == 0:
+                nodes_to_delete.append(user)
             if to_keep is not None:
                 keep_list = []
                 for n in nodes_to_delete:
@@ -135,6 +137,8 @@ def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, chun
         if user.op in ("placeholder", "output"):
             return 0
         nodes_to_delete = user_to_last_uses.get(user, [])
+        if len(user.users) == 0:
+            nodes_to_delete.append(user)
         delete_size = 0
         for n in nodes_to_delete:
             if n.name in chunk_inputs_names:
@@ -294,3 +298,26 @@ def estimate_chunk_inference_mem(
         # param_memory = parameter_size(gm)
         # all_memory = act_memory + param_memory
         return act_memory_peak_log, act_memory_after_node_log, active_node_list_log
+
+    def get_active_nodes(self, node_list: List) -> List:
+        """
+        Get active nodes for every node
+
+        Args:
+            node_list (List): nodes to walk through, in list order
+
+        Returns:
+            active_node_list_log (List): active nodes of every node. active nodes refer to
+                nodes generated but not deleted.
+        """
+        active_node_list = []
+        active_node_list_log = []
+        user_to_last_uses = self._get_last_usr(node_list)
+        user_to_last_uses_no_free_var = self._get_last_usr(node_list)
+        delete_free_var_from_last_use(user_to_last_uses_no_free_var)    # NOTE(review): result is not read below - confirm it is needed
+        for _, node in enumerate(node_list):
+            # log active node, only effective without chunk
+            self._add_active_node(node, active_node_list)
+            self._remove_deactive_node(node, user_to_last_uses, active_node_list)
+            active_node_list_log.append(copy.deepcopy(active_node_list))
+        return active_node_list_log
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 236f9697df5d..a8619671268b 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -8,7 +8,7 @@
 from .select_chunk import SelectChunk
 from .trace_flow import TraceFlow
 from .trace_indice import TraceIndice
-from .utils import get_node_shape, is_non_compute_node, is_non_compute_node_except_placeholder
+from .utils import get_logger, get_node_shape, is_non_compute_node, is_non_compute_node_except_placeholder
 
 
 class SearchChunk(object):
@@ -40,14 +40,14 @@ class SearchChunk(object):
         print_mem (bool): print estimated memory
     """
 
-    def __init__(self, gm, max_memory=None, print_mem=False) -> None:
-        self.gm = gm
+    def __init__(self, gm, max_memory=None, print_mem=False, print_progress=False) -> None:
         self.print_mem = print_mem
+        self.print_progress = print_progress
         self.trace_indice = TraceIndice(list(gm.graph.nodes))
-        self.trace_indice.trace_indice()
+        self.estimate_memory = EstimateMemory()
+        self._init_trace()
         self.trace_flow = TraceFlow(self.trace_indice)
         self.reorder_graph = ReorderGraph(self.trace_indice)
-        self.estimate_memory = EstimateMemory()
         self.select_chunk = SelectChunk(
             self.trace_indice,
             self.estimate_memory,
@@ -55,7 +55,33 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None:
             max_memory=max_memory,
         )
 
-    def _find_peak_node(self, mem_peak):
+    def _init_trace(self) -> None:
+        """
+        find the max trace range for every node
+        reduce the computation complexity of trace_indice
+        """
+        # find all max ranges
+        active_nodes = self.estimate_memory.get_active_nodes(self.trace_indice.node_list)
+        cur_node_idx = len(self._get_free_var_idx())
+        max_chunk_region_list = []
+        while True:
+            max_chunk_region = self._search_max_chunk_region(active_nodes, cur_node_idx)
+            cur_node_idx = max_chunk_region[1]
+            if cur_node_idx == len(active_nodes) - 1:
+                break
+            max_chunk_region_list.append(max_chunk_region)
+
+        # nothing to limit for the first range
+        max_chunk_region_list = max_chunk_region_list[1:]
+        max_chunk_region_list[0] = (0, max_chunk_region_list[0][1])
+
+        # set trace range and do the trace
+        if self.print_progress:
+            get_logger().info("AutoChunk start tracing indice")
+        self.trace_indice.set_trace_range(max_chunk_region_list, active_nodes)
+        self.trace_indice.trace_indice()
+
+    def _find_peak_node(self, mem_peak: List) -> int:
         max_value = max(mem_peak)
         max_idx = mem_peak.index(max_value)
         return max_idx
@@ -73,7 +99,7 @@ def _get_free_var_idx(self) -> List:
                 free_var_idx.append(idx)
         return free_var_idx
 
-    def _search_max_chunk_region(self, active_node: List, peak_node: Node, chunk_regions: List) -> Tuple:
+    def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_regions: List = None) -> Tuple:
         """
         Search max chunk region according to peak memory node
 
@@ -81,7 +107,7 @@ def _search_max_chunk_region(self, active_node: List, peak_node: Node, chunk_reg
 
         Args:
             active_node (List): active node status for every node
-            peak_node (Node): peak memory node
+            peak_node_idx (int): peak memory node idx
             chunk_regions (List): chunk region infos
 
         Returns:
@@ -97,7 +123,7 @@ def _search_max_chunk_region(self, active_node: List, peak_node: Node, chunk_reg
         # from peak_node to free_var
         inside_flag = False
         chunk_region_start = free_var_num
-        for i in range(peak_node, -1, -1):
+        for i in range(peak_node_idx, -1, -1):
             if active_node_num[i] <= threshold:
                 inside_flag = True
             if inside_flag and active_node_num[i] > threshold:
@@ -107,21 +133,23 @@ def _search_max_chunk_region(self, active_node: List, peak_node: Node, chunk_reg
         # from peak_node to len-2
         inside_flag = False
         chunk_region_end = len(active_node) - 1
-        for i in range(peak_node, len(active_node)):
+        for i in range(peak_node_idx, len(active_node)):
             if active_node_num[i] <= threshold:
                 inside_flag = True
             if inside_flag and active_node_num[i] > threshold:
                 chunk_region_end = i
                 break
 
-        for i in chunk_regions:
-            region = i["region"]
-            if chunk_region_start >= region[0] and chunk_region_end <= region[1]:
-                return None
-            elif (region[0] <= chunk_region_start <= region[1] and chunk_region_end > region[1]):
-                chunk_region_start = region[1] + 1
-            elif (region[0] <= chunk_region_end <= region[1] and chunk_region_start < region[0]):
-                chunk_region_end = region[0] - 1
+        # avoid chunk regions overlap
+        if chunk_regions is not None:
+            for i in chunk_regions:
+                region = i["region"]
+                if chunk_region_start >= region[0] and chunk_region_end <= region[1]:
+                    return None
+                elif (region[0] <= chunk_region_start <= region[1] and chunk_region_end > region[1]):
+                    chunk_region_start = region[1] + 1
+                elif (region[0] <= chunk_region_end <= region[1] and chunk_region_start < region[0]):
+                    chunk_region_end = region[0] - 1
         return chunk_region_start, chunk_region_end
 
     def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> List:
@@ -154,6 +182,9 @@ def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> Lis
                     # dim size cannot be 1
                     if (get_node_shape(end_node)[end_dim] == 1 or get_node_shape(start_node)[start_dim] == 1):
                         continue
+                    # must have users
+                    if len(end_node.users) == 0:
+                        continue
                     # check index source align
                     if not self.trace_flow.check_index_source(start_dim, start_node, start_idx, end_dim, end_node):
                         continue
@@ -253,6 +284,9 @@ def search_region(self) -> Dict:
         Returns:
             chunk_infos (Dict)
         """
+        if self.print_progress:
+            get_logger().info("AutoChunk start searching chunk regions")
+
         chunk_infos = []
         (
             init_mem_peak,
@@ -272,6 +306,11 @@ def search_region(self) -> Dict:
                 _,
                 active_node,
             ) = self.estimate_memory.estimate_chunk_inference_mem(self.trace_indice.node_list, chunk_infos)
+
+            if self.print_progress:
+                get_logger().info("AutoChunk find chunk region %d = (%d, %d)" %
+                                  (len(chunk_infos), chunk_info["region"][0], chunk_info["region"][1]))
+
             if self._stop_search(init_mem_peak, mem_peak):
                 break
         if self.print_mem:
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index e657c188ead2..830b4629ec1e 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -281,7 +281,10 @@ def _get_input_nodes_dim(self, inputs: List[Node], start_idx: int, end_idx: int,
                     if chunk_dim is not None:
                         user_source = self.trace_indice._find_source_trace_from_node(user)[chunk_dim]
                         if input_node_idx in user_source:
-                            input_dict[user_idx] = user_source[input_node_idx]
+                            if get_node_shape(input_node)[user_source[input_node_idx][0]] == 1:
+                                input_dict[user_idx] = [None]
+                            else:
+                                input_dict[user_idx] = user_source[input_node_idx]
                         else:
                             return None, None
             if len(input_dict) == 0:
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 5c2e9b5203b5..827f60d8b53d 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -33,6 +33,8 @@ def __init__(self, node_list: List[Node]) -> None:
         self.indice_trace_list = self._init_indice_trace_list()
         self.indice_view_list = {}
         self.indice_count = -1
+        self.trace_range = []
+        self.active_node_list = []
 
     def _init_indice_trace_list(self):
         indice_trace_list = []
@@ -48,6 +50,10 @@ def _init_indice_trace_list(self):
             indice_trace_list.append(cur_trace)
         return indice_trace_list
 
+    def set_trace_range(self, trace_range: List, active_node_list: List) -> None:
+        self.trace_range = trace_range
+        self.active_node_list = active_node_list
+
     def _add_indice(self):
         """
         Update the count and return it. To record the idx number.
@@ -493,6 +499,9 @@ def _assign_getitem_indice(self, node: Node, node_idx: int):
         new_dim_num = sum([1 if str(i) == "None" else 0 for i in node_args])
         for _ in range(new_dim_num):
             self._del_dim(node_idx, 0)
+        delete_dim_num = sum([1 if str(i) == "0" else 0 for i in node_args])
+        for _ in range(delete_dim_num):
+            self._add_dim(node_idx, 0)
         self._assign_indice_as_input(node, node_idx)
 
         for _, node_arg in enumerate(node_args):
@@ -513,6 +522,9 @@ def _assign_getitem_indice(self, node: Node, node_idx: int):
             elif "None" == node_arg_str:
                 self._add_dim(node_idx, new_idx_count)
                 new_idx_count += 1
+            elif "0" == node_arg_str:
+                self._del_dim(node_idx, new_idx_count)
+                origin_idx_count += 1
             else:
                 raise NotImplementedError()
 
@@ -596,6 +608,37 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int):
         }
         self.indice_view_list[node] = view_dict
 
+    def _clear_trace(self, node_idx: int) -> None:
+        """
+        clear too far trace to speed up computation
+        does nothing unless node_idx is the end of one of the ranges in self.trace_range
+        """
+        trace_range = None
+        for region in self.trace_range:
+            if region[1] == node_idx:
+                trace_range = (region[0], region[1])
+                break
+            if region[1] > node_idx:
+                break
+        if trace_range is None:
+            return
+
+        # indices of the nodes listed as active inside the range; a set keeps the checks below O(1)
+        active_nodes = flat_list(self.active_node_list[trace_range[0]:trace_range[1] + 1])
+        active_nodes = {find_idx_by_name(name, self.node_list) for name in set(active_nodes)}
+        for trace_idx in range(trace_range[0], trace_range[1] + 1):
+            trace = self.indice_trace_list[trace_idx]
+            # clear compute, walking backwards so pop() cannot shift entries still to be checked
+            for dim_compute in trace["compute"]:
+                for pos in range(len(dim_compute) - 1, -1, -1):
+                    if dim_compute[pos] < trace_range[0] and dim_compute[pos] not in active_nodes:
+                        dim_compute.pop(pos)
+            # clear source
+            for dim_source in trace["source"]:
+                for k in list(dim_source.keys()):
+                    if k < trace_range[0] and k not in active_nodes:
+                        dim_source.pop(k)
+
     def trace_indice(self):
         for idx, node in enumerate(self.node_list):
             if node.op == "placeholder":
@@ -655,3 +698,6 @@ def trace_indice(self):
                 continue
             else:
                 raise NotImplementedError(node.op, "op not implemented yet!")
+
+            # limit trace range
+            self._clear_trace(idx)
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index ff1a64bc359d..e870685122e3 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -2,6 +2,14 @@
 
 from torch.fx.node import Node
 
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def get_logger():
+    return logger
+
 
 def flat_list(inputs: Any) -> List:
     """
diff --git a/tests/test_autochunk/test_evoformer_stack_codegen.py b/tests/test_autochunk/test_evoformer_stack_codegen.py
new file mode 100644
index 000000000000..5fabb27028f9
--- /dev/null
+++ b/tests/test_autochunk/test_evoformer_stack_codegen.py
@@ -0,0 +1,163 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.fx
+import torch.multiprocessing as mp
+
+try:
+    from fastfold.model.nn.evoformer import EvoformerStack
+    HAS_REPO = True
+except Exception:    # any import failure disables the test; KeyboardInterrupt/SystemExit still propagate
+    HAS_REPO = False
+
+import colossalai
+from colossalai.core import global_context as gpc
+from colossalai.fx._compatibility import is_compatible_with_meta
+from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if CODEGEN_AVAILABLE and is_compatible_with_meta():
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair, node_mask, pair_mask):
+    # for memory test
+    # model = model.cuda()
+    # torch.cuda.reset_peak_memory_stats()
+    # now_mem = torch.cuda.memory_allocated() / 1024**2
+    # with torch.no_grad():
+    #     node1 = node.clone()
+    #     pair1 = pair.clone()
+    #     node_mask1 = node_mask.clone()
+    #     pair_mask1 = pair_mask.clone()
+    #     gm(node1, pair1, node_mask1, pair_mask1, None)
+    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+    # print("autochunk max mem:%.2f"% (new_max_mem - now_mem))
+
+    # test forward
+    model = model.cuda()
+    with torch.no_grad():
+        non_fx_out = model(node, pair, node_mask, pair_mask, None)
+        fx_out = gm(node, pair, node_mask, pair_mask, None)
+
+    assert torch.allclose(non_fx_out[0], fx_out[0],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[0] - fx_out[0]))
+    assert torch.allclose(non_fx_out[1], fx_out[1],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(non_fx_out[1] - fx_out[1]))
+
+
+def _build_openfold():
+    model = EvoformerStack(
+        c_m=256,
+        c_z=128,
+        c_hidden_msa_att=32,
+        c_hidden_opm=32,
+        c_hidden_mul=128,
+        c_hidden_pair_att=32,
+        c_s=384,
+        no_heads_msa=8,
+        no_heads_pair=4,
+        no_blocks=2,    # 48
+        transition_n=4,
+        msa_dropout=0.15,
+        pair_dropout=0.25,
+        blocks_per_ckpt=None,
+        inf=1000000000.0,
+        eps=1e-08,
+        clear_cache_between_blocks=False,
+        is_multimer=False,
+    ).eval().cuda()
+    return model
+
+
+def _test_evoformer_stack_codegen(rank, msa_len, pair_len, max_memory):
+    # launch colossalai
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+
+    # build model and input
+    model = _build_openfold()
+    node = torch.randn(1, msa_len, pair_len, 256).cuda()
+    node_mask = torch.randn(1, msa_len, pair_len).cuda()
+    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
+    pair_mask = torch.randn(1, pair_len, pair_len).cuda()
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={
+            "m": node.to(torch.device("meta")),
+            "z": pair.to(torch.device("meta")),
+            "msa_mask": node_mask.to(torch.device("meta")),
+            "pair_mask": pair_mask.to(torch.device("meta")),
+        },
+        concrete_args={
+            "chunk_size": None,
+            "_mask_trans": True,
+        },
+    )
+    interp = MetaInfoProp(meta_graph)
+    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"),
+                     MetaTensor(node_mask, fake_device="cuda:0"), MetaTensor(pair_mask, fake_device="cuda:0"), None)
+    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory, print_mem=False, print_progress=False)
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model,
+        meta_args={
+            "m": node.to(torch.device("meta")),
+            "z": pair.to(torch.device("meta")),
+            "msa_mask": node_mask.to(torch.device("meta")),
+            "pair_mask": pair_mask.to(torch.device("meta")),
+        },
+        concrete_args={
+            "chunk_size": None,
+            "_mask_trans": True,
+        },
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # assert we have inserted chunk
+    code = graph.python_code("self").src
+    # print(code)
+    assert "chunk_result = None;  chunk_size = None;" in code
+
+    _test_fwd(model, gm, node, pair, node_mask, pair_mask)
+    gpc.destroy()
+
+
+@pytest.mark.skipif(
+    not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
+    reason="requires torch >= 1.12.0, meta compatibility and the fastfold package",
+)
+@pytest.mark.parametrize("max_memory", [None, 24, 28, 32])
+@pytest.mark.parametrize("msa_len", [32])
+@pytest.mark.parametrize("pair_len", [64])
+def test_evoformer_stack_codegen(msa_len, pair_len, max_memory):
+    """
+    Run _test_evoformer_stack_codegen in a single spawned worker process.
+
+    mp.spawn passes the process index as the first positional argument (the worker's rank).
+    """
+    run_func = partial(_test_evoformer_stack_codegen, msa_len=msa_len, pair_len=pair_len, max_memory=max_memory)
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    _test_evoformer_stack_codegen(0, 32, 64, None)

From 2d1a7dfe5f9f469ee75ec9c1a17f129766751fbb Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Fri, 20 Jan 2023 14:04:38 +0800
Subject: [PATCH 212/503] [zero] add strict ddp mode (#2508)

* [zero] add strict ddp mode

* [polish] add comments for strict ddp mode

* [zero] fix test error
---
 colossalai/nn/parallel/data_parallel.py           | 12 ++++++++++--
 colossalai/nn/parallel/gemini_parallel.py         |  3 ++-
 examples/language/gpt/gemini/commons/model_zoo.py | 12 ++++++++++++
 examples/language/gpt/gemini/train_gpt_demo.py    | 10 ++++++----
 tests/test_tensor/test_tp_with_zero.py            |  2 +-
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index 28a10c4b6c92..a742946f4c50 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -12,6 +12,7 @@
 from colossalai.logging import get_dist_logger
 from colossalai.nn.parallel.utils import get_temp_total_chunk_on_cuda
 from colossalai.tensor import ProcessGroup as ColoProcessGroup
+from colossalai.tensor import ReplicaSpec
 from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.utils import get_current_device, is_ddp_ignored
@@ -200,14 +201,18 @@ class ZeroDDP(ColoDDP):
         gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous momery space.
             For more details, see the API reference of ``GeminiManager``.
         pin_memory (bool): Chunks on CPU Memory use pin-memory.
-        force_outputs_fp32 (bool): If set to True, outputs will be fp32. Otherwise, outputs will be fp16.  Defaults to False.
+        force_outputs_fp32 (bool): If set to True, outputs will be fp32. Otherwise, outputs will be fp16.
+            Defaults to False.
+        strict_ddp_mode (bool): If set to True, there is no tensor sharding, each tensor is replicated.
+            Defaults to False. Users can set it to True, when they clearly know that they only need DDP.
     """
 
     def __init__(self,
                  module: torch.nn.Module,
                  gemini_manager: GeminiManager,
                  pin_memory: bool = False,
-                 force_outputs_fp32: bool = False) -> None:
+                 force_outputs_fp32: bool = False,
+                 strict_ddp_mode: bool = False) -> None:
         super().__init__(module, process_group=ColoProcessGroup())
         self.gemini_manager = gemini_manager
         self.chunk_manager: ChunkManager = gemini_manager.chunk_manager
@@ -232,6 +237,9 @@ def __init__(self,
         for p in param_order.generate():
             assert isinstance(p, ColoParameter)
 
+            if strict_ddp_mode and not p.is_replicate():
+                p.set_dist_spec(ReplicaSpec())
+
             if is_ddp_ignored(p):
                 p.data = p.data.to(device=get_current_device(), dtype=torch.float16)
                 continue
diff --git a/colossalai/nn/parallel/gemini_parallel.py b/colossalai/nn/parallel/gemini_parallel.py
index cd5ef424a1d9..868a3960fd3c 100644
--- a/colossalai/nn/parallel/gemini_parallel.py
+++ b/colossalai/nn/parallel/gemini_parallel.py
@@ -17,6 +17,7 @@ def __init__(self,
                  placement_policy: str = "cpu",
                  pin_memory: bool = False,
                  force_outputs_fp32: bool = False,
+                 strict_ddp_mode: bool = False,
                  search_range_mb: int = 32,
                  hidden_dim: Optional[int] = None,
                  min_chunk_size_mb: Optional[float] = None,
@@ -54,4 +55,4 @@ def __init__(self,
                                            search_range_mb=search_range_mb,
                                            min_chunk_size_mb=min_chunk_size_mb)
         gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
-        super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32)
+        super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode)
diff --git a/examples/language/gpt/gemini/commons/model_zoo.py b/examples/language/gpt/gemini/commons/model_zoo.py
index c31b3fa6d103..65124d9e4884 100644
--- a/examples/language/gpt/gemini/commons/model_zoo.py
+++ b/examples/language/gpt/gemini/commons/model_zoo.py
@@ -53,6 +53,14 @@ def gpt2_24b(checkpoint=True):
     return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)
 
 
+def gpt2_30b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=37, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_40b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=50, num_attention_heads=16, checkpoint=checkpoint)
+
+
 def model_builder(model_size: str) -> callable:
     if model_size == "gpt2_medium":
         return gpt2_medium
@@ -66,6 +74,10 @@ def model_builder(model_size: str) -> callable:
         return gpt2_20b
     elif model_size == "gpt2_24b":
         return gpt2_24b
+    elif model_size == "gpt2_30b":
+        return gpt2_30b
+    elif model_size == "gpt2_40b":
+        return gpt2_40b
     else:
         raise TypeError(f"model_builder {model_size}")
 
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 713de6f9fb45..285706596e8f 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -187,17 +187,18 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 
 
 # Gemini + ZeRO DDP
-def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"):
+def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto", ddp_flag: bool = True):
     fp16_init_scale = 2**5
     gpu_margin_mem_ratio_for_auto = 0
 
     if version.parse(CAI_VERSION) > version.parse("0.1.10"):
         model = GeminiDDP(model,
+                          strict_ddp_mode=ddp_flag,
                           device=get_current_device(),
                           placement_policy=placement_policy,
                           pin_memory=True,
                           hidden_dim=model.config.n_embd,
-                          search_range_mb=64)
+                          search_range_mb=128)
         # configure the const policy
         if placement_policy == 'const':
             model.gemini_manager._placement_policy.set_const_memory_boundary(2 * 1024)
@@ -279,11 +280,12 @@ def main():
         tp_pg = ProcessGroup(tp_degree=args.tp_degree)
         # Tensor Parallelism (TP)
         # You should notice that v0.1.10 is not compatible with TP degree > 1
-        tensor_parallelize(model, tp_pg)
+        if args.tp_degree > 1:
+            tensor_parallelize(model, tp_pg)
 
         # build a Gemini model and a highly optimized cpu optimizer
         # Gemini + ZeRO DP, Note it must be used after TP
-        model, optimizer = build_gemini(model, tp_pg, args.placement)
+        model, optimizer = build_gemini(model, tp_pg, args.placement, args.tp_degree == 1)
 
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
diff --git a/tests/test_tensor/test_tp_with_zero.py b/tests/test_tensor/test_tp_with_zero.py
index 7e611e8a14f9..83645bc6ebfd 100644
--- a/tests/test_tensor/test_tp_with_zero.py
+++ b/tests/test_tensor/test_tp_with_zero.py
@@ -93,7 +93,7 @@ def run_gpt(placement_policy, tp_init_spec_func=None):
     else:
         init_device = None
 
-    model = GeminiDDP(model, init_device, placement_policy, True, False, 32)
+    model = GeminiDDP(model, init_device, placement_policy, True, False)
     # The same as the following 3 lines
     # chunk_manager = ChunkManager(config_dict, init_device=init_device)
     # gemini_manager = GeminiManager(placement_policy, chunk_manager)

From a6a10616ec2dafca6640efd2d2e4029e2469512c Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Fri, 20 Jan 2023 17:29:13 +0800
Subject: [PATCH 213/503] [doc] update opt and tutorial links (#2509)

---
 README-zh-Hans.md | 6 ++++--
 README.md         | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 6b1848c4bdd7..5ad22785cef5 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -145,7 +145,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_update.png" width=800/>
 
 - [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), 由Meta发布的1750亿语言模型，由于完全公开了预训练参数权重，因此促进了下游任务和应用部署的发展。
-- 加速45%，仅用几行代码以低成本微调OPT。[[样例]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[在线推理]](https://service.colossalai.org/opt)
+- 加速45%，仅用几行代码以低成本微调OPT。[[样例]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[在线推理]](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/zh-Hans/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md)
 
 请访问我们的 [文档](https://www.colossalai.org/) 和 [例程](https://github.com/hpcaitech/ColossalAI-Examples) 以了解详情。
 
@@ -199,7 +199,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_serving.png" width=800/>
 </p>
 
-- [OPT推理服务](https://service.colossalai.org/opt): 无需注册，免费体验1750亿参数OPT在线推理服务
+- [OPT推理服务](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/zh-Hans/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md): 无需注册，免费体验1750亿参数OPT在线推理服务
 
 <p id="BLOOM-Inference" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
@@ -365,4 +365,6 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash
 }
 ```
 
+Colossal-AI 已被 [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/) 等顶级会议录取为官方教程。
+
 <p align="right">(<a href="#top">返回顶端</a>)</p>
diff --git a/README.md b/README.md
index 396260e97399..01e7b0ec50e4 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,7 @@ distributed training and inference in a few lines.
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_update.png" width=800/>
 
 - [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-Billion parameter AI language model released by Meta, which stimulates AI programmers to perform various downstream tasks and application deployments because public pretrained model weights.
-- 45% speedup fine-tuning OPT at low cost in lines. [[Example]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[Online Serving]](https://service.colossalai.org/opt)
+- 45% speedup fine-tuning OPT at low cost in lines. [[Example]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[Online Serving]](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/en/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md)
 
 Please visit our [documentation](https://www.colossalai.org/) and [examples](https://github.com/hpcaitech/ColossalAI-Examples) for more details.
 
@@ -202,7 +202,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_serving.png" width=800/>
 </p>
 
-- [OPT Serving](https://service.colossalai.org/opt): Try 175-billion-parameter OPT online services for free, without any registration whatsoever.
+- [OPT Serving](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/en/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md): Try 175-billion-parameter OPT online services for free, without any registration whatsoever.
 
 <p id="BLOOM-Inference" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
@@ -369,4 +369,6 @@ We leverage the power of [GitHub Actions](https://github.com/features/actions) t
 }
 ```
 
+Colossal-AI has been accepted as official tutorials by top conferences [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), etc.
+
 <p align="right">(<a href="#top">back to top</a>)</p>

From 0af793836c82ae5e9ee056caab91d9aafa781a43 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 26 Jan 2023 16:34:19 +0800
Subject: [PATCH 214/503] [workflow] fixed changed file detection (#2515)

---
 .github/workflows/auto_example_check.yml | 17 +++++++++++++++++
 .github/workflows/build.yml              | 11 +++++++++++
 .github/workflows/pre_commit.yml         | 14 ++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml
index 5e4022f7f0ea..df413f646c2c 100644
--- a/.github/workflows/auto_example_check.yml
+++ b/.github/workflows/auto_example_check.yml
@@ -25,9 +25,21 @@ jobs:
         with:
           fetch-depth: 0
           ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Locate base commit
+        id: locate-base-sha
+        run: |
+            curBranch=$(git rev-parse --abbrev-ref HEAD)
+            commonCommit=$(git merge-base origin/main $curBranch)
+            echo $commonCommit
+            echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
+
       - name: Get all changed example files
         id: changed-files
         uses: tj-actions/changed-files@v35
+        with:
+          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
+
       - name: setup matrix
         id: setup-matrix
         run: |
@@ -67,9 +79,11 @@ jobs:
     timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
+
       - name: Install Colossal-AI
         run: |
           pip install -v .
+
       - name: Test the example
         run: |
           example_dir=${{ matrix.directory }}
@@ -90,6 +104,7 @@ jobs:
     steps:
     - name: 📚 Checkout
       uses: actions/checkout@v3
+
     - name: setup matrix
       id: setup-matrix
       run: |
@@ -113,9 +128,11 @@ jobs:
     steps:
       - name: 📚 Checkout
         uses: actions/checkout@v3
+
       - name: Install Colossal-AI
         run: |
           pip install -v .
+
       - name: Traverse all files
         run: |
           example_dir=${{ matrix.diretory }}
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 30b932729019..8f334d599124 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -21,14 +21,25 @@ jobs:
         with:
           fetch-depth: 0
           ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Locate base commit
+        id: locate-base-sha
+        run: |
+            curBranch=$(git rev-parse --abbrev-ref HEAD)
+            commonCommit=$(git merge-base origin/main $curBranch)
+            echo $commonCommit
+            echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
+
       - name: Find the changed files
         id: find-changed-files
         uses: tj-actions/changed-files@v35
         with:
+          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
           files: |
             op_builder/**
             colossalai/kernel/**
             setup.py
+
       - name: List changed files
         run: |
           for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
diff --git a/.github/workflows/pre_commit.yml b/.github/workflows/pre_commit.yml
index 113f50ee0569..3e71be2fc611 100644
--- a/.github/workflows/pre_commit.yml
+++ b/.github/workflows/pre_commit.yml
@@ -12,9 +12,23 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.pull_request.head.sha }}
 
+    # The PR branch and the hpcaitech/colossal-ai main branch must share
+    # a common commit (their merge base), i.e. the commit the PR branch was
+    # checked out or forked from. We locate that commit so that we can
+    # look for the files changed since then.
+    - name: Locate base commit
+      id: locate-base-sha
+      run: |
+          curBranch=$(git rev-parse --abbrev-ref HEAD)
+          commonCommit=$(git merge-base origin/main $curBranch)
+          echo $commonCommit
+          echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
+
     - name: Find the changed files
       id: find-changed-files
       uses: tj-actions/changed-files@v35
+      with:
+        base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
 
     - name: List all changed files
       run: |

From 707b11d4a031564f7a126114b82d7f716c89de34 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Sat, 28 Jan 2023 14:35:25 +0800
Subject: [PATCH 215/503] [gemini] update ddp strict mode (#2518)

* [zero] add strict ddp mode for chunk init

* [gemini] update gpt example
---
 colossalai/gemini/chunk/search_utils.py       | 47 +++++++++---
 colossalai/gemini/chunk/utils.py              | 28 ++------
 colossalai/nn/parallel/data_parallel.py       |  7 +-
 colossalai/nn/parallel/gemini_parallel.py     |  5 +-
 colossalai/tensor/colo_tensor.py              |  6 ++
 .../language/gpt/gemini/train_gpt_demo.py     |  2 +-
 tests/test_ddp/test_ddp_ignore_params.py      |  2 +-
 tests/test_gemini/update/test_fwd_bwd.py      |  2 +-
 .../test_gemini/update/test_gemini_use_rmt.py |  2 +-
 tests/test_gemini/update/test_grad_clip.py    |  2 +-
 tests/test_gemini/update/test_inference.py    |  2 +-
 tests/test_gemini/update/test_optim.py        |  2 +-
 tests/test_gemini/update/test_search.py       | 72 +++++++++++++++++--
 .../update/test_zeroddp_state_dict.py         |  4 +-
 .../update/test_zerooptim_state_dict.py       |  2 +-
 tests/test_tensor/test_tp_with_zero.py        |  2 +-
 16 files changed, 133 insertions(+), 54 deletions(-)

diff --git a/colossalai/gemini/chunk/search_utils.py b/colossalai/gemini/chunk/search_utils.py
index 572c3d94531f..57a708135708 100644
--- a/colossalai/gemini/chunk/search_utils.py
+++ b/colossalai/gemini/chunk/search_utils.py
@@ -2,6 +2,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import numpy as np
+import torch.distributed as dist
 import torch.nn as nn
 
 from colossalai.gemini.memory_tracer import MemStats, OrderedParamGenerator
@@ -13,8 +14,14 @@ def _filter_exlarge_params(model: nn.Module, size_dict: Dict[int, List[int]]) ->
     """
     Filter those parameters whose size is too large (more than 3x standard deviations) from others.
     """
-    params_size = [p.numel() for p in model.parameters() if not is_ddp_ignored(p)]
-    params_size_arr = np.array(params_size)
+    agg_size_list = []
+    for key in size_dict:
+        agg_size_list.extend(size_dict[key])
+
+    if len(agg_size_list) == 0:
+        return
+
+    params_size_arr = np.array(agg_size_list)
 
     std = np.std(params_size_arr)
     mean = np.mean(params_size_arr)
@@ -38,7 +45,15 @@ def _get_unused_byte(size_list: List[int], chunk_size: int) -> int:
     return left + acc
 
 
-def classify_params_by_dp_degree(param_order: OrderedParamGenerator) -> Dict[int, List[ColoParameter]]:
+def _tensor_numel(local_param: ColoParameter, strict_ddp_flag: bool):
+    """Return ``numel_global()`` when ``strict_ddp_flag`` is set, otherwise ``numel()``."""
+    if not strict_ddp_flag:
+        return local_param.numel()
+    return local_param.numel_global()
+
+
+def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
+                                 strict_ddp_flag: bool = False) -> Dict[int, List[ColoParameter]]:
     """classify_params_by_dp_degree
 
     Classify the parameters by their dp degree
@@ -56,7 +71,10 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator) -> Dict[int
         if is_ddp_ignored(param):
             continue
 
-        param_key = param.process_group.dp_world_size()
+        if strict_ddp_flag:
+            param_key = dist.get_world_size()
+        else:
+            param_key = param.process_group.dp_world_size()
 
         if param_key not in params_dict:
             params_dict[param_key] = []
@@ -71,14 +89,18 @@ def search_chunk_configuration(
         search_interval_byte: int,    # hidden size is the best value for the interval
         min_chunk_size_mb: float = 32,
         filter_exlarge_params: bool = True,
-        memstas: Optional[MemStats] = None) -> Tuple[Dict, int]:
+        strict_ddp_flag: bool = False,
+        memstas: Optional[MemStats] = None) -> Tuple[Dict, int, int]:
     """search_chunk_configuration
 
     Args:
         model (nn.Module): torch module
         search_range_mb (float): searching range in mega byte.
         search_interval_byte (int): searching interval in byte.
+        min_chunk_size_mb (float, optional): the minimum size of a distributed chunk.
         filter_exlarge_params (bool, optional): filter extreme large parameters. Defaults to True.
+        strict_ddp_flag (bool, optional): whether to enable the strict ddp mode.
+            all parameters keep replicated in this mode.
 
     Returns:
         Tuple[Dict, int]: chunk config (a dict of dp_degree -> chunk init args) and its memory chunk waste in byte.
@@ -96,17 +118,20 @@ def search_chunk_configuration(
     min_chunk_size_byte = round(min_chunk_size_mb * 1024**2)
     assert search_range_byte >= 0
 
-    params_dict = classify_params_by_dp_degree(param_order)
+    params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
     config_dict: Dict[int, Dict] = dict()
+    total_param_size = 0
 
     size_dict: Dict[int, List[int]] = dict()
     for dp_degree in params_dict:
         params_list = params_dict[dp_degree]
-        size_list = [p.numel() for p in params_list]
+        size_list = [_tensor_numel(p, strict_ddp_flag) for p in params_list]
+        group_acc_size = sum(size_list)
+        total_param_size += group_acc_size
+
         # let small parameters keep gathered in CUDA all the time
-        total_size = sum(size_list)
-        if total_size < min_chunk_size_byte:
-            config_dict[dp_degree] = dict(chunk_size=total_size, keep_gathered=True)
+        if group_acc_size < min_chunk_size_byte:
+            config_dict[dp_degree] = dict(chunk_size=group_acc_size, keep_gathered=True)
         else:
             size_dict[dp_degree] = size_list
 
@@ -134,4 +159,4 @@ def search_chunk_configuration(
             continue
         config_dict[dp_degree] = dict(chunk_size=best_chunk_size, keep_gathered=False)
 
-    return config_dict, min_chunk_waste
+    return config_dict, total_param_size, min_chunk_waste
diff --git a/colossalai/gemini/chunk/utils.py b/colossalai/gemini/chunk/utils.py
index ebfdee778979..83512b8e0ee5 100644
--- a/colossalai/gemini/chunk/utils.py
+++ b/colossalai/gemini/chunk/utils.py
@@ -19,38 +19,24 @@ def safe_div(a, b):
 def init_chunk_manager(model: nn.Module,
                        init_device: Optional[torch.device] = None,
                        hidden_dim: Optional[int] = None,
-                       search_range_mb: Optional[float] = None,
-                       min_chunk_size_mb: Optional[float] = None,
-                       filter_exlarge_params: Optional[bool] = None) -> ChunkManager:
-    kwargs_dict = dict()
-
+                       **kwargs) -> ChunkManager:
     if hidden_dim:
         search_interval_byte = hidden_dim
     else:
-        search_interval_byte = 1024    # 1kb
-    kwargs_dict["search_interval_byte"] = search_interval_byte
-
-    if search_range_mb:
-        kwargs_dict["search_range_mb"] = search_range_mb
-
-    if min_chunk_size_mb:
-        kwargs_dict["min_chunk_size_mb"] = min_chunk_size_mb
-
-    if filter_exlarge_params:
-        kwargs_dict["filter_exlarge_params"] = filter_exlarge_params
-
-    params_sizes = [p.numel() for p in model.parameters() if not is_ddp_ignored(p)]
-    total_size = sum(params_sizes) / 1024**2
+        search_interval_byte = 1024    # defaults to 1kb
+    kwargs["search_interval_byte"] = search_interval_byte
 
     dist.barrier()
     begin = time()
 
-    config_dict, wasted_size = search_chunk_configuration(model, **kwargs_dict)
+    config_dict, total_size, wasted_size = search_chunk_configuration(model, **kwargs)
 
     dist.barrier()
     end = time()
     span_s = end - begin
-    wasted_size /= 1024**2
+    mb_size = 1024**2
+    total_size /= mb_size
+    wasted_size /= mb_size
 
     if dist.get_rank() == 0:
         print("searching chunk configuration is completed in {:.2f} s.\n".format(span_s),
diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index a742946f4c50..24d59e177b80 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -234,11 +234,14 @@ def __init__(self,
             for p in module.parameters():
                 param_order.append(p)
 
+        ddp_pg = ColoProcessGroup()
         for p in param_order.generate():
             assert isinstance(p, ColoParameter)
 
-            if strict_ddp_mode and not p.is_replicate():
-                p.set_dist_spec(ReplicaSpec())
+            if strict_ddp_mode:
+                if not p.is_replicate():
+                    p.set_dist_spec(ReplicaSpec())
+                p.set_process_group(pg=ddp_pg)
 
             if is_ddp_ignored(p):
                 p.data = p.data.to(device=get_current_device(), dtype=torch.float16)
diff --git a/colossalai/nn/parallel/gemini_parallel.py b/colossalai/nn/parallel/gemini_parallel.py
index 868a3960fd3c..636f1ec7486e 100644
--- a/colossalai/nn/parallel/gemini_parallel.py
+++ b/colossalai/nn/parallel/gemini_parallel.py
@@ -20,7 +20,7 @@ def __init__(self,
                  strict_ddp_mode: bool = False,
                  search_range_mb: int = 32,
                  hidden_dim: Optional[int] = None,
-                 min_chunk_size_mb: Optional[float] = None,
+                 min_chunk_size_mb: float = 32,
                  memstats: Optional[MemStats] = None) -> None:
         """
         A torch.Module warpper using ZeRO-DP and Genimi.
@@ -53,6 +53,7 @@ def __init__(self,
                                            init_device=device,
                                            hidden_dim=hidden_dim,
                                            search_range_mb=search_range_mb,
-                                           min_chunk_size_mb=min_chunk_size_mb)
+                                           min_chunk_size_mb=min_chunk_size_mb,
+                                           strict_ddp_flag=strict_ddp_mode)
         gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
         super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode)
diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py
index 3712d6a0acea..b27f5dea76a9 100644
--- a/colossalai/tensor/colo_tensor.py
+++ b/colossalai/tensor/colo_tensor.py
@@ -1,3 +1,4 @@
+import math
 from copy import copy
 from functools import lru_cache
 from typing import Callable, Optional, Set
@@ -303,6 +304,11 @@ def size_global(self, *args) -> torch.Size:
         else:
             return size_list[args[0]]
 
+    def numel_global(self):
+        """Return the number of elements the tensor holds when it is replicated."""
+        global_size = self.size_global()
+        return math.prod(global_size)
+
     # Some API for dist spec check
 
     def is_replicate(self):
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 285706596e8f..02857ae9ce12 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -263,7 +263,7 @@ def main():
     if args.distplan == "colossalai":
         # all param must use the same process group.
         world_size = torch.distributed.get_world_size()
-        shard_pg = ProcessGroup(tp_degree=world_size)
+        shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
         default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
 
         # build GPT model
diff --git a/tests/test_ddp/test_ddp_ignore_params.py b/tests/test_ddp/test_ddp_ignore_params.py
index 2be962e1a2e5..679c8b0f6afe 100644
--- a/tests/test_ddp/test_ddp_ignore_params.py
+++ b/tests/test_ddp/test_ddp_ignore_params.py
@@ -35,7 +35,7 @@ def init_ddp(module: torch.nn.Module) -> ColoDDP:
 
 
 def init_ddpv2(module: torch.nn.Module) -> ZeroDDP:
-    chunk_config, _ = search_chunk_configuration(module, 4, 1024)
+    chunk_config, *_ = search_chunk_configuration(module, 4, 1024)
     chunk_manager = ChunkManager(chunk_config)
     gemini_manager = GeminiManager('cuda', chunk_manager)
     return ZeroDDP(module, gemini_manager)
diff --git a/tests/test_gemini/update/test_fwd_bwd.py b/tests/test_gemini/update/test_fwd_bwd.py
index af98878e9e70..0d35ba83d2e9 100644
--- a/tests/test_gemini/update/test_fwd_bwd.py
+++ b/tests/test_gemini/update/test_fwd_bwd.py
@@ -58,7 +58,7 @@ def exam_gpt_fwd_bwd(placement_policy,
         torch_p.data.copy_(p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gather
     chunk_manager = ChunkManager(config_dict)
diff --git a/tests/test_gemini/update/test_gemini_use_rmt.py b/tests/test_gemini/update/test_gemini_use_rmt.py
index 7fce84a5099a..8cf17a0a726e 100644
--- a/tests/test_gemini/update/test_gemini_use_rmt.py
+++ b/tests/test_gemini/update/test_gemini_use_rmt.py
@@ -62,7 +62,7 @@ def run_gemini_use_rmt(placement_policy, keep_gather, model_name: str, use_grad_
                 assert len(step_list) == 4
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gather
     chunk_manager = ChunkManager(config_dict)
diff --git a/tests/test_gemini/update/test_grad_clip.py b/tests/test_gemini/update/test_grad_clip.py
index fda1cf8cfd14..d97ba94399c0 100644
--- a/tests/test_gemini/update/test_grad_clip.py
+++ b/tests/test_gemini/update/test_grad_clip.py
@@ -58,7 +58,7 @@ def exam_grad_clipping(placement_policy, model_name: str):
         p.data.copy_(torch_p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':
diff --git a/tests/test_gemini/update/test_inference.py b/tests/test_gemini/update/test_inference.py
index aec945fc9243..443155865667 100644
--- a/tests/test_gemini/update/test_inference.py
+++ b/tests/test_gemini/update/test_inference.py
@@ -57,7 +57,7 @@ def exam_inference(placement_policy, model_name: str):
         p.data.copy_(torch_p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':
diff --git a/tests/test_gemini/update/test_optim.py b/tests/test_gemini/update/test_optim.py
index 07e6e65f2cd4..cd3aa6051d78 100644
--- a/tests/test_gemini/update/test_optim.py
+++ b/tests/test_gemini/update/test_optim.py
@@ -63,7 +63,7 @@ def exam_model_step(placement_policy, model_name: str):
         p.data.copy_(torch_p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':
diff --git a/tests/test_gemini/update/test_search.py b/tests/test_gemini/update/test_search.py
index e0b4e207f16f..2fcdd5380906 100644
--- a/tests/test_gemini/update/test_search.py
+++ b/tests/test_gemini/update/test_search.py
@@ -6,7 +6,7 @@
 import torch.multiprocessing as mp
 
 import colossalai
-from colossalai.gemini.chunk import search_chunk_configuration
+from colossalai.gemini.chunk import init_chunk_manager, search_chunk_configuration
 from colossalai.tensor import ComputePattern, ComputeSpec, ProcessGroup, ShardSpec
 from colossalai.testing import rerun_if_address_is_in_use
 from colossalai.utils import free_port, get_current_device
@@ -23,7 +23,6 @@ def init_1d_row_spec(model, pg: ProcessGroup):
 
 
 def exam_search_chunk_size():
-
     world_size = torch.distributed.get_world_size()
     pg_tp = ProcessGroup(tp_degree=world_size)
 
@@ -34,11 +33,11 @@ def exam_search_chunk_size():
     with ColoInitContext(device=get_current_device()):
         model = model_builder()
     init_1d_row_spec(model, pg_tp)
-    config_dict, _ = search_chunk_configuration(model,
-                                                search_range_mb=1,
-                                                search_interval_byte=16,
-                                                min_chunk_size_mb=0,
-                                                filter_exlarge_params=True)
+    config_dict, *_ = search_chunk_configuration(model,
+                                                 search_range_mb=1,
+                                                 search_interval_byte=16,
+                                                 min_chunk_size_mb=0,
+                                                 filter_exlarge_params=True)
 
     for key in config_dict:
         chunk_size = config_dict[key]['chunk_size']
@@ -48,9 +47,68 @@ def exam_search_chunk_size():
             assert chunk_size == 1024
 
 
+def exam_search_strict_ddp():
+    world_size = torch.distributed.get_world_size()
+    default_shard_pg = ProcessGroup(tp_degree=world_size)
+    default_shard_spec = ShardSpec([-1], [world_size])
+    # Check that the strict ddp search on a sharded model matches the plain search on a replicated model.
+    get_components_func = non_distributed_component_funcs.get_callable('gpt2')
+    model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
+    # baseline: model built with the default init context, searched with strict_ddp_flag=False
+    with ColoInitContext(device=get_current_device()):
+        ddp_model = model_builder()
+    re_dict, re_total, re_wasted = search_chunk_configuration(ddp_model,
+                                                              search_range_mb=1,
+                                                              search_interval_byte=16,
+                                                              min_chunk_size_mb=0,
+                                                              filter_exlarge_params=True,
+                                                              strict_ddp_flag=False)
+    # same builder, but parameters are created sharded over world_size and searched with strict_ddp_flag=True
+    with ColoInitContext(device=get_current_device(), default_pg=default_shard_pg,
+                         default_dist_spec=default_shard_spec):
+        sharded_ddp_model = model_builder()
+    sh_dict, sh_total, sh_wasted = search_chunk_configuration(sharded_ddp_model,
+                                                              search_range_mb=1,
+                                                              search_interval_byte=16,
+                                                              min_chunk_size_mb=0,
+                                                              filter_exlarge_params=True,
+                                                              strict_ddp_flag=True)
+    assert re_dict == sh_dict    # both searches must yield the same chunk config
+    for key in re_dict:
+        assert re_dict[key] == sh_dict[key]    # implied by the dict comparison above; kept as a per-key check
+
+    assert re_total == sh_total    # second return value: total parameter size
+    assert re_wasted == sh_wasted    # third return value: chunk waste
+
+
+def exam_chunk_manager():
+    world_size = torch.distributed.get_world_size()
+    default_shard_pg = ProcessGroup(tp_degree=world_size)
+    default_shard_spec = ShardSpec([-1], [world_size])
+
+    get_components_func = non_distributed_component_funcs.get_callable('gpt2')
+    model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
+    # Build the model sharded over world_size, then run init_chunk_manager with strict_ddp_flag=True.
+    with ColoInitContext(device=get_current_device(), default_pg=default_shard_pg,
+                         default_dist_spec=default_shard_spec):
+        sharded_ddp_model = model_builder()
+    chunk_manager = init_chunk_manager(sharded_ddp_model,
+                                       get_current_device(),
+                                       hidden_dim=16,
+                                       search_range_mb=1,
+                                       min_chunk_size_mb=0,
+                                       filter_exlarge_params=True,
+                                       strict_ddp_flag=True)
+    config_dict = chunk_manager.dp_degree_chunk_size_dict
+    assert len(config_dict) == 1    # exactly one entry is expected, keyed by world_size (checked below)
+    assert config_dict[world_size] == 31616    # NOTE(review): presumably the chunk size found for gpt2 here; confirm
+
+
 def run_dist(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
     exam_search_chunk_size()
+    exam_search_strict_ddp()
+    exam_chunk_manager()
 
 
 @pytest.mark.dist
diff --git a/tests/test_gemini/update/test_zeroddp_state_dict.py b/tests/test_gemini/update/test_zeroddp_state_dict.py
index 266b8eab121b..00d835842f79 100644
--- a/tests/test_gemini/update/test_zeroddp_state_dict.py
+++ b/tests/test_gemini/update/test_zeroddp_state_dict.py
@@ -41,7 +41,7 @@ def exam_state_dict(placement_policy, keep_gathered, model_name: str):
         torch_p.data.copy_(p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gathered
     chunk_manager = ChunkManager(config_dict)
@@ -73,7 +73,7 @@ def exam_load_state_dict(placement_policy, keep_gathered, model_name: str):
     torch_model = model_builder()    # get a different model
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gathered
 
diff --git a/tests/test_gemini/update/test_zerooptim_state_dict.py b/tests/test_gemini/update/test_zerooptim_state_dict.py
index 7f53415bf22c..dc3dda9d6df4 100644
--- a/tests/test_gemini/update/test_zerooptim_state_dict.py
+++ b/tests/test_gemini/update/test_zerooptim_state_dict.py
@@ -33,7 +33,7 @@ def exam_zero_optim_state_dict(placement_policy, keep_gathered):
     torch_model = model_builder()    # get a different model
 
     world_size = torch.distributed.get_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gathered
 
diff --git a/tests/test_tensor/test_tp_with_zero.py b/tests/test_tensor/test_tp_with_zero.py
index 83645bc6ebfd..1a6d23f6a2eb 100644
--- a/tests/test_tensor/test_tp_with_zero.py
+++ b/tests/test_tensor/test_tp_with_zero.py
@@ -85,7 +85,7 @@ def run_gpt(placement_policy, tp_init_spec_func=None):
         tp_init_spec_func(model, pg)
 
     dp_world_size = pg.dp_world_size()
-    config_dict, _ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
     config_dict[dp_world_size]['chunk_size'] = 5000
     config_dict[dp_world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':

From a360b9bc44fcf8b4b357b74f8a72d7695f1b8d14 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Sun, 29 Jan 2023 10:53:57 +0800
Subject: [PATCH 216/503] [doc] update example link (#2520)

* [doc] update example link

* [doc] update example link
---
 README-zh-Hans.md | 2 +-
 README.md         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 5ad22785cef5..9931d434f50c 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -7,7 +7,7 @@
 
    <h3> <a href="https://arxiv.org/abs/2110.14883"> 论文 </a> |
    <a href="https://www.colossalai.org/"> 文档 </a> |
-   <a href="https://github.com/hpcaitech/ColossalAI-Examples"> 例程 </a> |
+   <a href="https://github.com/hpcaitech/ColossalAI/tree/main/examples"> 例程 </a> |
    <a href="https://github.com/hpcaitech/ColossalAI/discussions"> 论坛 </a> |
    <a href="https://medium.com/@hpcaitech"> 博客 </a></h3>
 
diff --git a/README.md b/README.md
index 01e7b0ec50e4..5f230e627efe 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 
    <h3> <a href="https://arxiv.org/abs/2110.14883"> Paper </a> |
    <a href="https://www.colossalai.org/"> Documentation </a> |
-   <a href="https://github.com/hpcaitech/ColossalAI-Examples"> Examples </a> |
+   <a href="https://github.com/hpcaitech/ColossalAI/tree/main/examples"> Examples </a> |
    <a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
    <a href="https://medium.com/@hpcaitech"> Blog </a></h3>
 

From aa0f6686f90f3f5aad3b6c30efd0b5f97be42443 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Sun, 29 Jan 2023 11:13:15 +0800
Subject: [PATCH 217/503] [autoparallel] accelerate gpt2 training (#2495)

---
 .../passes/runtime_preparation_pass.py             | 14 ++++++++------
 .../tensor_shard/node_handler/matmul_handler.py    |  2 ++
 .../strategy/matmul_strategy_generator.py          | 14 +++++++-------
 colossalai/device/device_mesh.py                   |  2 +-
 colossalai/tensor/comm_spec.py                     |  6 +++---
 5 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index 1c25e4c94f24..98897095753d 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -387,14 +387,15 @@ def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
                     # register hook to the parameters
                     if operation_data.type == OperationDataType.PARAM and operation_data.name == name and comm_action.comm_type == CommType.HOOK:
 
-                        def wrapper(param, comm_spec):
+                        def wrapper(param, comm_spec, stream):
 
                             def hook_fn(grad):
-                                _all_reduce(grad, comm_spec, async_op=False)
+                                with torch.cuda.stream(stream):
+                                    _all_reduce(grad, comm_spec, async_op=True)
 
                             param.register_hook(hook_fn)
 
-                        wrapper(param, comm_spec_to_use)
+                        wrapper(param, comm_spec_to_use, reduction_stream)
 
             sharded_buffer_dict = {}
             # apply the sharding spec of buffers
@@ -440,14 +441,15 @@ def hook_fn(grad):
                 # register hook to the parameters
                 if isinstance(node._meta_data, torch.nn.parameter.Parameter) and comm_action.comm_type == CommType.HOOK:
 
-                    def wrapper(param, comm_spec):
+                    def wrapper(param, comm_spec, stream):
 
                         def hook_fn(grad):
-                            _all_reduce(grad, comm_spec, async_op=False)
+                            with torch.cuda.stream(stream):
+                                _all_reduce(grad, comm_spec, async_op=True)
 
                         param.register_hook(hook_fn)
 
-                    wrapper(target, comm_spec_to_use)
+                    wrapper(target, comm_spec_to_use, reduction_stream)
     return gm
 
 
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
index d3f9fd01d891..131c35156dcd 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
@@ -483,4 +483,6 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li
                         raise TypeError(
                             f"Found unexpected output type {type(output)} from the recover method of BmmTransform")
                 strategies = recovered_stragies
+            for index, recovered_strategy in enumerate(strategies):    # must not rebind `strategies` (returned below)
+                recovered_strategy.name = f"{recovered_strategy.name}_{index}"
             return strategies
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
index fa2246f952a9..9aa95b43a966 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
@@ -247,12 +247,12 @@ def collate_strategies(self) -> List[ShardingStrategy]:
         strategies.append(self.split_rhs_space_both_contract(1, 0))
 
         # RR= RS x SR
-        strategies.append(self.recompute_split_both_contract(0))
-        strategies.append(self.recompute_split_both_contract(1))
+        # strategies.append(self.recompute_split_both_contract(0))
+        # strategies.append(self.recompute_split_both_contract(1))
 
-        # RS = RR x RS
-        strategies.append(self.split_rhs_space_only(0))
-        strategies.append(self.split_rhs_space_only(1))
+        # # RS = RR x RS
+        # strategies.append(self.split_rhs_space_only(0))
+        # strategies.append(self.split_rhs_space_only(1))
 
         # S01R = S01R x RR
         strategies.append(self.split_lhs_1st_dim_1d(0, 1))
@@ -263,8 +263,8 @@ def collate_strategies(self) -> List[ShardingStrategy]:
         # RS01 = RR x RS01
         strategies.append(self.split_rhs_2nd_dim_1d(0, 1))
 
-        # RR = RR x RR
-        strategies.append(self.non_split())
+        # # RR = RR x RR
+        # strategies.append(self.non_split())
 
         return strategies
 
diff --git a/colossalai/device/device_mesh.py b/colossalai/device/device_mesh.py
index b5a97eded90c..22a01dddb869 100644
--- a/colossalai/device/device_mesh.py
+++ b/colossalai/device/device_mesh.py
@@ -98,7 +98,7 @@ def flatten(self):
         return DeviceMesh(self.physical_mesh_id,
                           tuple(flatten_mesh_shape),
                           mesh_alpha=[max(self.mesh_alpha)] * (flatten_mesh_shape_size - 1),
-                          mesh_beta=[min(self.mesh_beta)] * (flatten_mesh_shape_size - 1),
+                          mesh_beta=[max(self.mesh_beta)] * (flatten_mesh_shape_size - 1),
                           init_process_group=self.init_process_group,
                           need_flatten=False)
 
diff --git a/colossalai/tensor/comm_spec.py b/colossalai/tensor/comm_spec.py
index 3c9e0fd56696..b31c06994190 100644
--- a/colossalai/tensor/comm_spec.py
+++ b/colossalai/tensor/comm_spec.py
@@ -463,7 +463,7 @@ def get_comm_cost(self):
         if self.comm_pattern == CollectiveCommPattern.GATHER_FWD_SPLIT_BWD:
             forward_communication_cost = self.device_mesh.all_gather_cost(comm_size, self.logical_process_axis)
             # give a tiny cost to shard
-            backward_communication_cost = 10
+            backward_communication_cost = 100
 
         if self.comm_pattern == CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD:
             forward_communication_cost = self.device_mesh.all_to_all_cost(comm_size, self.logical_process_axis)
@@ -481,13 +481,13 @@ def get_comm_cost(self):
 
         if self.comm_pattern == CollectiveCommPattern.SPLIT_FWD_GATHER_BWD:
             # give a tiny cost to shard
-            forward_communication_cost = 10
+            forward_communication_cost = 100
             backward_communication_cost = self.device_mesh.all_gather_cost(comm_size, self.logical_process_axis)
 
         if self.comm_pattern == CollectiveCommPattern.MIXGATHER_FWD_SPLIT_BWD:
             # no need for axis because all devices are used in mix_gather
             forward_communication_cost = self.device_mesh.mix_gather_cost(comm_size)
-            backward_communication_cost = 10
+            backward_communication_cost = 100
 
         if self.forward_only:
             cost_dict["forward"] = forward_communication_cost

From fd8d19a6e7ed6f0f3e8516788a3d6ddb4ef52aa8 Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Sun, 29 Jan 2023 13:52:15 +0800
Subject: [PATCH 218/503] [example] update lightning dependency for stable
 diffusion (#2522)

---
 examples/images/diffusion/docker/Dockerfile | 7 -------
 examples/images/diffusion/requirements.txt  | 1 +
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/examples/images/diffusion/docker/Dockerfile b/examples/images/diffusion/docker/Dockerfile
index 17cc8bc8bbc7..e8e6957b79e1 100644
--- a/examples/images/diffusion/docker/Dockerfile
+++ b/examples/images/diffusion/docker/Dockerfile
@@ -18,13 +18,6 @@ RUN git clone https://github.com/NVIDIA/apex && \
 RUN pip install colossalai==0.1.12+torch1.12cu11.3 -f https://release.colossalai.org
 
 
-# install our lightning, it will be merged to Lightning official repo.
-RUN git clone https://github.com/1SAA/lightning.git && \
-    cd lightning && \
-    git checkout strategy/colossalai && \
-    export PACKAGE_NAME=pytorch && \
-    pip install --no-cache-dir .
-
 # install titans
 RUN pip install --no-cache-dir titans
 
diff --git a/examples/images/diffusion/requirements.txt b/examples/images/diffusion/requirements.txt
index 60c4b903e01f..1a9233d578ef 100644
--- a/examples/images/diffusion/requirements.txt
+++ b/examples/images/diffusion/requirements.txt
@@ -13,6 +13,7 @@ transformers==4.19.2
 webdataset==0.2.5
 open-clip-torch==2.7.0
 gradio==3.11
+lightning==1.9.0
 datasets
 colossalai
 -e .

From 077a5cdde409cc89b726240b4788717fba1e62c4 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Sun, 29 Jan 2023 15:09:57 +0800
Subject: [PATCH 219/503] [zero] fix gradient clipping in hybrid parallelism
 (#2521)

* [zero] fix gradient clipping in hybrid parallelism

* [testing] change model name to avoid pytest warning

* [hotfix] fix unit testing
---
 .../grad_scaler/dynamic_grad_scaler.py        | 20 +++++++++++++++++--
 colossalai/zero/sharded_optim/_utils.py       | 11 +++++-----
 .../test_zero/low_level_zero/test_grad_acc.py |  8 ++++----
 .../test_zero/low_level_zero/test_zero1_2.py  |  8 ++++----
 .../low_level_zero/test_zero_init.py          |  8 ++++----
 .../test_zero/low_level_zero/test_zero_tp.py  | 16 ++++++++-------
 6 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py b/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
index 6d6f2f287e32..e899b9ca4c89 100644
--- a/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
@@ -58,10 +58,12 @@ def _sanity_checks(self) -> None:
 
         if self._min_scale:
             assert self._min_scale > 0, 'The minimum gradient scale cannot be zero or negative'
+            assert self._min_scale <= self._scale, 'The minimum gradient scale cannot be greater than the current scale'
         if self._max_scale:
-            assert self._min_scale > 0, 'The maximum gradient scale cannot be zero or negative'
+            assert self._max_scale > 0, 'The maximum gradient scale cannot be zero or negative'
+            assert self._max_scale >= self._scale, 'The maximum gradient scale cannot be smaller than the current scale'
         assert self._growth_factor > 1, 'The growth factor cannot be equal or smaller than 1'
-        assert self._backoff_factor < 1 and self._backoff_factor > 0, 'The backoff factor must be between 0 and 1'
+        assert 0 < self._backoff_factor < 1, 'The backoff factor must be between 0 and 1'
         assert self._hysteresis >= 0, 'The hysteresis cannot be negative'
 
     def update(self, overflow: bool) -> None:
@@ -103,3 +105,17 @@ def _grow_scale(self) -> None:
         self._scale = self._scale * self._growth_factor
         if self._max_scale:
             self._scale = torch.min(self._scale, self._max_scale)
+
+    def state_dict(self):
+        """Snapshot the scale, growth factor, backoff factor and hysteresis as a plain dict."""
+        return dict(
+            scale=self._scale,
+            growth_factor=self._growth_factor,
+            backoff_factor=self._backoff_factor,
+            hysteresis=self._hysteresis)
+
+    def load_state_dict(self, state_dict):
+        """Restore the fields written by `state_dict`, moving the saved scale to the current CUDA device."""
+        self._scale = state_dict['scale'].cuda(torch.cuda.current_device())
+        for key in ('growth_factor', 'backoff_factor', 'hysteresis'):
+            setattr(self, f'_{key}', state_dict[key])
diff --git a/colossalai/zero/sharded_optim/_utils.py b/colossalai/zero/sharded_optim/_utils.py
index 70d9c040cb53..e674344018d4 100644
--- a/colossalai/zero/sharded_optim/_utils.py
+++ b/colossalai/zero/sharded_optim/_utils.py
@@ -6,9 +6,7 @@
 from torch._six import inf
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.tensor import ProcessGroup
+from colossalai.tensor import ColoParameter
 from colossalai.utils import is_model_parallel_parameter
 
 
@@ -225,7 +223,10 @@ def compute_norm(gradients, params, dp_group, mp_group, norm_type=2):
 
         for g, p in zip(gradients, params):
             # Pipeline parallelism may replicate parameters. Avoid multi-counting.
-            if is_model_parallel_parameter(p) or mp_rank == 0:
+            tp_param_flag = False
+            if is_model_parallel_parameter(p) or (isinstance(p, ColoParameter) and not p.is_replicate()):
+                tp_param_flag = True
+            if tp_param_flag or mp_rank == 0:
                 param_norm = g.data.double().norm(2)
                 total_norm += param_norm.item()**2
 
@@ -234,7 +235,7 @@ def compute_norm(gradients, params, dp_group, mp_group, norm_type=2):
         torch.distributed.all_reduce(total_norm_cuda, op=torch.distributed.ReduceOp.SUM, group=dp_group)
 
         if mp_group is not None:
-            dist.all_reduce(tensor=total_norm_cuda, op=torch.distributed.ReduceOp.SUM)
+            dist.all_reduce(tensor=total_norm_cuda, op=torch.distributed.ReduceOp.SUM, group=mp_group)
 
         total_norm = total_norm_cuda[0].item()**(1. / norm_type)
 
diff --git a/tests/test_zero/low_level_zero/test_grad_acc.py b/tests/test_zero/low_level_zero/test_grad_acc.py
index 69795ed6a2e5..1e157c70a7ef 100644
--- a/tests/test_zero/low_level_zero/test_grad_acc.py
+++ b/tests/test_zero/low_level_zero/test_grad_acc.py
@@ -15,10 +15,10 @@
 from colossalai.zero import LowLevelZeroOptimizer
 
 
-class TestModel(nn.Module):
+class MlpModel(nn.Module):
 
     def __init__(self):
-        super(TestModel, self).__init__()
+        super(MlpModel, self).__init__()
         self.linear1 = nn.Linear(128, 256)
         self.linear2 = nn.Linear(256, 512)
 
@@ -33,7 +33,7 @@ def exam_zero_1_2_grad_acc():
     seed_all(2009)
 
     # create model
-    zero1_model = TestModel().cuda()
+    zero1_model = MlpModel().cuda()
     zero2_model = copy.deepcopy(zero1_model)
     # create optimizer
     zero1_optimizer = torch.optim.Adam(zero1_model.parameters(), lr=1)
@@ -89,7 +89,7 @@ def exam_zero_1_grad_acc():
     seed_all(2008)
 
     # create models
-    zero_model = TestModel()
+    zero_model = MlpModel()
     torch_model = copy.deepcopy(zero_model)
 
     seed_all(2008)
diff --git a/tests/test_zero/low_level_zero/test_zero1_2.py b/tests/test_zero/low_level_zero/test_zero1_2.py
index 8771bfbe6049..4949630725ca 100644
--- a/tests/test_zero/low_level_zero/test_zero1_2.py
+++ b/tests/test_zero/low_level_zero/test_zero1_2.py
@@ -14,10 +14,10 @@
 from colossalai.zero import LowLevelZeroOptimizer
 
 
-class TestModel(nn.Module):
+class MlpModel(nn.Module):
 
     def __init__(self):
-        super(TestModel, self).__init__()
+        super(MlpModel, self).__init__()
         self.linear1 = nn.Linear(128, 256)
         self.linear2 = nn.Linear(256, 512)
 
@@ -55,7 +55,7 @@ def exam_zero_1_2():
     seed_all(2001)
 
     # create model
-    zero1_model = TestModel().cuda()
+    zero1_model = MlpModel().cuda()
     zero2_model = copy.deepcopy(zero1_model)
 
     # create optimizer
@@ -111,7 +111,7 @@ def exam_zero_1_torch_ddp():
     seed_all(1453)
 
     # create models
-    zero_model = TestModel()
+    zero_model = MlpModel()
     torch_model = copy.deepcopy(zero_model)
 
     zero_model = zero_model.cuda().half()
diff --git a/tests/test_zero/low_level_zero/test_zero_init.py b/tests/test_zero/low_level_zero/test_zero_init.py
index 84d7b8c514b6..1305da5df9c5 100644
--- a/tests/test_zero/low_level_zero/test_zero_init.py
+++ b/tests/test_zero/low_level_zero/test_zero_init.py
@@ -13,10 +13,10 @@
 from colossalai.zero import LowLevelZeroOptimizer
 
 
-class TestModel(nn.Module):
+class MlpModel(nn.Module):
 
     def __init__(self):
-        super(TestModel, self).__init__()
+        super(MlpModel, self).__init__()
         self.linear1 = nn.Linear(128, 256)
         self.linear2 = nn.Linear(256, 512)
 
@@ -28,9 +28,9 @@ def forward(self, x):
 
 def exam_zero_init():
     dp_2_tp_2_pg = ProcessGroup(dp_degree=2, tp_degree=2)
-    model1 = TestModel().cuda()
+    model1 = MlpModel().cuda()
     with ColoInitContext(device=get_current_device(), default_pg=dp_2_tp_2_pg):
-        model2 = TestModel()
+        model2 = MlpModel()
     optimizer1 = LowLevelZeroOptimizer(torch.optim.Adam(model1.parameters(), lr=1))
     optimizer2 = LowLevelZeroOptimizer(torch.optim.Adam(model2.parameters(), lr=1))
 
diff --git a/tests/test_zero/low_level_zero/test_zero_tp.py b/tests/test_zero/low_level_zero/test_zero_tp.py
index 8ba6e3cb61fd..ea8e3a0a3369 100644
--- a/tests/test_zero/low_level_zero/test_zero_tp.py
+++ b/tests/test_zero/low_level_zero/test_zero_tp.py
@@ -20,10 +20,10 @@ def strict_shard_equal(tensor, shard, tp_pg, rtol=1e-3, atol=1e-4):
     return tensor_shard_equal(tensor, shard, tp_pg.tp_local_rank(), tp_pg.tp_world_size(), rtol, atol)
 
 
-class TestModel(nn.Module):
+class MlpModel(nn.Module):
 
     def __init__(self):
-        super(TestModel, self).__init__()
+        super(MlpModel, self).__init__()
         self.linear1 = nn.Linear(32, 128)
         self.act = nn.GELU()
         self.linear2 = nn.Linear(128, 32)
@@ -42,8 +42,8 @@ def exam_zero_with_tp(overlap_flag, partition_flag):
     tp_pg = ProcessGroup(tp_degree=2)
 
     with ColoInitContext(device=get_current_device(), default_pg=tp_pg):
-        hybrid_model = TestModel()
-    torch_model = TestModel().cuda()
+        hybrid_model = MlpModel()
+    torch_model = MlpModel().cuda()
     for pt, ph in zip(torch_model.parameters(), hybrid_model.parameters()):
         pt.data.copy_(ph.data)
 
@@ -55,10 +55,11 @@ def exam_zero_with_tp(overlap_flag, partition_flag):
             split_param_col_tp1d(param, tp_pg)
 
     torch_model = DDP(torch_model, device_ids=[tp_pg.rank()], process_group=tp_pg.dp_process_group())
-    torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1)
-    hybrid_optim = torch.optim.Adam(hybrid_model.parameters(), lr=1)
+    torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-2)    # set to 1e-2 for torch-1.11
+    hybrid_optim = torch.optim.Adam(hybrid_model.parameters(), lr=1e-2)
     hybrid_optim = LowLevelZeroOptimizer(hybrid_optim,
-                                         initial_scale=1,
+                                         initial_scale=2,
+                                         clip_grad_norm=1.0,
                                          overlap_communication=overlap_flag,
                                          partition_grad=partition_flag)
 
@@ -71,6 +72,7 @@ def exam_zero_with_tp(overlap_flag, partition_flag):
     assert_close(torch_loss, hybrid_loss)
 
     torch_loss.backward()
+    torch.nn.utils.clip_grad_norm_(torch_model.parameters(), 1.0)
     hybrid_optim.backward(hybrid_loss)
     hybrid_optim.sync_grad()
 

From c198c7c0b067ab4216b31f7c70e0626e91ba949e Mon Sep 17 00:00:00 2001
From: Super Daniel <78588128+super-dainiu@users.noreply.github.com>
Date: Sun, 29 Jan 2023 16:28:10 +0800
Subject: [PATCH 220/503] [hotfix] meta tensor default device. (#2510)

---
 colossalai/fx/profiler/tensor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/fx/profiler/tensor.py b/colossalai/fx/profiler/tensor.py
index 7606f17cf9d5..2ee5e5c47750 100644
--- a/colossalai/fx/profiler/tensor.py
+++ b/colossalai/fx/profiler/tensor.py
@@ -43,7 +43,7 @@ def __new__(cls, elem, fake_device=None):
             storage_offset=elem.storage_offset(),
             dtype=elem.dtype,
             layout=elem.layout,
-            device=fake_device if fake_device is not None else torch.device('cpu'),
+            device=fake_device or (elem.device if elem.device.type != 'meta' else torch.device('cpu')),
             requires_grad=elem.requires_grad)    # deceive the frontend for aten selections
         r._tensor = elem
         # ...the real tensor is held as an element on the tensor.

From b528eea0f05162bfedcd06381b953193c2a91b82 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Sun, 29 Jan 2023 17:52:58 +0800
Subject: [PATCH 221/503] [zero] add zero wrappers (#2523)

* [zero] add zero wrappers

* change names

* add wrapper functions to init
---
 colossalai/nn/optimizer/zero_optimizer.py     |   3 +-
 colossalai/nn/parallel/__init__.py            |   3 +-
 colossalai/nn/parallel/zero_wrapper.py        | 106 ++++++++++++++++++
 .../zero/sharded_optim/low_level_optim.py     |   9 +-
 .../test_zero/low_level_zero/test_grad_acc.py |  13 +--
 .../test_zero/low_level_zero/test_zero1_2.py  |  12 +-
 .../test_zero/low_level_zero/test_zero_tp.py  |   1 -
 7 files changed, 128 insertions(+), 19 deletions(-)
 create mode 100644 colossalai/nn/parallel/zero_wrapper.py

diff --git a/colossalai/nn/optimizer/zero_optimizer.py b/colossalai/nn/optimizer/zero_optimizer.py
index 9f761efdb12c..402e28ce8458 100644
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@@ -65,7 +65,8 @@ def __init__(self,
                  **defaults: Any):
         super().__init__(optim)
         assert isinstance(module, ZeroDDP)
-        assert type(optim) in _AVAIL_OPTIM_LIST, "you should use the optimizer in the available list"
+        assert type(optim) in _AVAIL_OPTIM_LIST, "You should use an optimizer in the available list:\n" \
+            f"{_AVAIL_OPTIM_LIST}"
         self.module = module
         self.gemini_manager = module.gemini_manager
         self.chunk_manager: ChunkManager = self.gemini_manager.chunk_manager
diff --git a/colossalai/nn/parallel/__init__.py b/colossalai/nn/parallel/__init__.py
index 0c369bfce22f..2afc8f18c36f 100644
--- a/colossalai/nn/parallel/__init__.py
+++ b/colossalai/nn/parallel/__init__.py
@@ -1,4 +1,5 @@
 from .data_parallel import ColoDDP, ZeroDDP
 from .gemini_parallel import GeminiDDP
+from .zero_wrapper import zero_model_wrapper, zero_optim_wrapper
 
-__all__ = ['ColoDDP', 'ZeroDDP', 'GeminiDDP']
+__all__ = ['ColoDDP', 'ZeroDDP', 'GeminiDDP', 'zero_model_wrapper', 'zero_optim_wrapper']
diff --git a/colossalai/nn/parallel/zero_wrapper.py b/colossalai/nn/parallel/zero_wrapper.py
new file mode 100644
index 000000000000..504625e62d30
--- /dev/null
+++ b/colossalai/nn/parallel/zero_wrapper.py
@@ -0,0 +1,106 @@
+from copy import copy
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+
+from .gemini_parallel import GeminiDDP
+
+
+def zero_model_wrapper(model: nn.Module, zero_stage: int = 1, gemini_config: Optional[Dict] = None):
+    """Wrap your training model for ZeRO DDP and record the chosen ZeRO stage on it.
+
+    Example:
+
+        >>> with ColoInitContext():
+        >>>     my_model = Bert()
+        >>> my_optim = SGD(my_model.parameters(), lr = 1e-3)
+        >>> zero_model = zero_model_wrapper(my_model, zero_stage=1)
+        >>> zero_optim = zero_optim_wrapper(zero_model, my_optim)
+
+    Args:
+        model (nn.Module): The model used in ZeRO DDP.
+        zero_stage (int, optional): The stage of ZeRO DDP (1, 2 or 3). See https://arxiv.org/abs/1910.02054
+        gemini_config (dict, optional): The configuration dictionary of `GeminiDDP`. `GeminiDDP` is enabled
+            when the stage is set to 3. You can set the arguments of `GeminiDDP` in the gemini_config.
+            Here is an example where we set the device of the model, the placement policy of Gemini, and the
+            size of hidden dimension to help Gemini find out a unified chunk size.
+
+            Example:
+
+                >>> config_dict = dict(device=torch.cuda.current_device(), hidden_dim=1024, placement_policy='auto')
+                >>> model = zero_model_wrapper(model, zero_stage=3, gemini_config=config_dict)
+
+    Returns:
+        `model` itself for stage 1 or 2, or `GeminiDDP(model, **gemini_config)` for stage 3.
+    """
+    # validate before tagging so that an invalid stage never leaves a stale `_colo_zero_stage` on the model
+    assert zero_stage in [1, 2, 3], "The stage of ZeRO should be 1, 2 or 3"
+    setattr(model, "_colo_zero_stage", zero_stage)
+
+    if zero_stage in [1, 2]:
+        return model
+
+    return GeminiDDP(model, **(gemini_config or dict()))
+
+
+def zero_optim_wrapper(model: nn.Module,
+                       optimizer: torch.optim.Optimizer,
+                       initial_scale: float = 2**16,
+                       growth_factor: float = 2,
+                       backoff_factor: float = 0.5,
+                       growth_interval: int = 1000,
+                       hysteresis: int = 2,
+                       min_scale: float = 1,
+                       max_scale: float = 2**32,
+                       max_norm: float = 0.0,
+                       norm_type: float = 2.0,
+                       optim_config: Optional[Dict] = None):
+    """This wrapper function is used to wrap your training optimizer for ZeRO DDP.
+
+    Args:
+        model (nn.Module): Your model wrapped by `zero_model_wrapper`
+        optimizer (torch.optim.Optimizer): Your initialized optimizer
+        initial_scale (float, optional): initial_scale used by DynamicGradScaler.
+        min_scale (float, optional): min_scale used by DynamicGradScaler.
+        growth_factor (float, optional): growth_factor used by DynamicGradScaler.
+        backoff_factor (float, optional): backoff_factor used by DynamicGradScaler.
+        growth_interval (int, optional): growth_interval used by DynamicGradScaler.
+        hysteresis (int, optional): hysteresis used by DynamicGradScaler.
+        max_scale (float, optional): max_scale used by DynamicGradScaler.
+        max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do
+            clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm.
+        norm_type (float, optional): norm_type used for `clip_grad_norm`.
+        optim_config (dict, optional): The configuration used for the ZeRO optimizer.
+            Example:
+
+                >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
+                >>> optim = zero_optim_wrapper(model, optim, optim_config=zero2_config)
+
+    Returns:
+        A `LowLevelZeroOptimizer` for stage 1 or 2, or a `ZeroOptimizer` for stage 3.
+    """
+    assert hasattr(model, "_colo_zero_stage"), "You should use `zero_model_wrapper` first"
+    zero_stage = getattr(model, "_colo_zero_stage")
+    assert norm_type == 2.0, "Current ZeRO optimizers only support 'norm_type=2'"
+
+    config_dict = dict() if optim_config is None else copy(optim_config)
+
+    # the explicit scaler arguments take precedence over same-named keys in optim_config
+    config_dict['initial_scale'] = initial_scale
+    config_dict['growth_factor'] = growth_factor
+    config_dict['backoff_factor'] = backoff_factor
+    config_dict['growth_interval'] = growth_interval
+    config_dict['hysteresis'] = hysteresis
+    config_dict['min_scale'] = min_scale
+    config_dict['max_scale'] = max_scale
+
+    if zero_stage in [1, 2]:
+        from colossalai.zero.sharded_optim.low_level_optim import LowLevelZeroOptimizer
+        config_dict['partition_grad'] = zero_stage == 2
+        config_dict['clip_grad_norm'] = max_norm
+        return LowLevelZeroOptimizer(optimizer, **config_dict)
+    else:
+        from colossalai.nn.optimizer.zero_optimizer import ZeroOptimizer
+        config_dict['clipping_norm'] = max_norm
+        return ZeroOptimizer(optimizer, model, **config_dict)
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index f45b5e200a61..d174fc6ac138 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -17,7 +17,6 @@
     calculate_global_norm_from_list,
     compute_norm,
     flatten,
-    get_grad_accumulate_object,
     has_inf_or_nan,
     reduce_tensor_dp_group,
     release_param_grad,
@@ -386,7 +385,7 @@ def _add_to_reduction_bucket(self, param, reduce_rank=None):
     # torch.optim.Optimizer methods
     ################################
 
-    def backward(self, loss, retain_graph=False):
+    def backward(self, loss, retain_graph=False, sync_grad=True):
         loss = self.loss_scale * loss
         loss.backward(retain_graph=retain_graph)
 
@@ -402,6 +401,10 @@ def backward(self, loss, retain_graph=False):
             torch.cuda.synchronize()
             self._param_store.clear_grads_of_previous_reduced_params()
 
+        # gradient synchronization
+        if sync_grad:
+            self._sync_grad()
+
     def zero_grad(self, set_to_none=True):
         """
         Set parameter gradients to zero. If set_to_none = True, gradient
@@ -537,7 +540,7 @@ def _unscale_and_clip_grads(self, grad_groups_flat, total_norm):
     # Gradient Synchronization #
     ############################
 
-    def sync_grad(self):
+    def _sync_grad(self):
         # update param already reduced flag
         reduction_states = self._param_store.get_param_reduction_states()
         for tensor, state in reduction_states.items():
diff --git a/tests/test_zero/low_level_zero/test_grad_acc.py b/tests/test_zero/low_level_zero/test_grad_acc.py
index 1e157c70a7ef..504df202e168 100644
--- a/tests/test_zero/low_level_zero/test_grad_acc.py
+++ b/tests/test_zero/low_level_zero/test_grad_acc.py
@@ -9,7 +9,6 @@
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.tensor import ProcessGroup
 from colossalai.testing.random import seed_all
 from colossalai.utils import free_port
 from colossalai.zero import LowLevelZeroOptimizer
@@ -60,16 +59,16 @@ def fwd_bwd_func(number, cur_data):
         assert torch.equal(zero1_output, zero2_output)
 
         # zero-dp backward
-        zero1_optimizer.backward(zero1_output.sum().float())
-        zero2_optimizer.backward(zero2_output.sum().float())
+        zero1_optimizer.backward(zero1_output.sum().float(), sync_grad=False)
+        zero2_optimizer.backward(zero2_output.sum().float(), sync_grad=False)
 
         for (n, z1p), z2p in zip(zero1_model.named_parameters(), zero2_model.parameters()):
             if z2p.grad is not None:
                 # print(local_rank, n, z1p.shape, torch.max(z2p.grad), torch.max(torch.abs(z1p.grad - z2p.grad)))
                 assert torch.equal(z1p.grad, z2p.grad)
 
-        zero1_optimizer.sync_grad()
-        zero2_optimizer.sync_grad()
+        zero1_optimizer._sync_grad()
+        zero2_optimizer._sync_grad()
 
     fwd_bwd_func(0, input_data1)
     fwd_bwd_func(1, input_data2)
@@ -124,7 +123,7 @@ def fwd_bwd_func(number, cur_data, check_flag):
         assert torch.equal(zero_output, torch_output)
 
         # zero-dp backward
-        zero_optimizer.backward(zero_output.sum().float())
+        zero_optimizer.backward(zero_output.sum().float(), sync_grad=False)
         # torch-ddp backward
         torch_output.sum().backward()
 
@@ -135,7 +134,7 @@ def fwd_bwd_func(number, cur_data, check_flag):
                 # print(n, p.shape, torch.max(torch.abs(p.grad - unscale_grad)))
                 assert torch.equal(p.grad, unscale_grad)
 
-        zero_optimizer.sync_grad()
+        zero_optimizer._sync_grad()
 
     fwd_bwd_func(0, input_data1, True)
     fwd_bwd_func(1, input_data2, False)
diff --git a/tests/test_zero/low_level_zero/test_zero1_2.py b/tests/test_zero/low_level_zero/test_zero1_2.py
index 4949630725ca..930b6129174e 100644
--- a/tests/test_zero/low_level_zero/test_zero1_2.py
+++ b/tests/test_zero/low_level_zero/test_zero1_2.py
@@ -78,16 +78,16 @@ def exam_zero_1_2():
     assert torch.equal(zero1_output, zero2_output)
 
     # zero-dp backward
-    zero1_optimizer.backward(zero1_output.mean().float())
-    zero2_optimizer.backward(zero2_output.mean().float())
+    zero1_optimizer.backward(zero1_output.mean().float(), sync_grad=False)
+    zero2_optimizer.backward(zero2_output.mean().float(), sync_grad=False)
 
     for (n, z1p), z2p in zip(zero1_model.named_parameters(), zero2_model.parameters()):
         if z2p.grad is not None:
             # print(local_rank, n, z1p.shape, torch.max(z2p.grad), torch.max(torch.abs(z1p.grad - z2p.grad)))
             assert torch.equal(z1p.grad, z2p.grad)
 
-    zero1_optimizer.sync_grad()
-    zero2_optimizer.sync_grad()
+    zero1_optimizer._sync_grad()
+    zero2_optimizer._sync_grad()
 
     # step
     zero1_optimizer.step()
@@ -146,7 +146,7 @@ def exam_zero_1_torch_ddp():
     half_close(zero_output, torch_output, loose=True)
 
     # zero-dp backward
-    zero_optimizer.backward(zero_output.mean().float())
+    zero_optimizer.backward(zero_output.mean().float(), sync_grad=False)
 
     # torch-ddp backward
     torch_output.mean().backward()
@@ -156,7 +156,7 @@ def exam_zero_1_torch_ddp():
         half_close(p.grad, z1p.grad, loose=True)
 
     # zero-dp step
-    zero_optimizer.sync_grad()
+    zero_optimizer._sync_grad()
     zero_optimizer.step()
 
     # torch ddp step
diff --git a/tests/test_zero/low_level_zero/test_zero_tp.py b/tests/test_zero/low_level_zero/test_zero_tp.py
index ea8e3a0a3369..15d3530ff90a 100644
--- a/tests/test_zero/low_level_zero/test_zero_tp.py
+++ b/tests/test_zero/low_level_zero/test_zero_tp.py
@@ -74,7 +74,6 @@ def exam_zero_with_tp(overlap_flag, partition_flag):
     torch_loss.backward()
     torch.nn.utils.clip_grad_norm_(torch_model.parameters(), 1.0)
     hybrid_optim.backward(hybrid_loss)
-    hybrid_optim.sync_grad()
 
     torch_optim.step()
     hybrid_optim.step()

From af151032f27615ee15e18d9d594305cdd574444c Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 30 Jan 2023 10:02:13 +0800
Subject: [PATCH 222/503] [workflow] fixed the precommit CI (#2525)

* [workflow] fixed the precommit CI

* polish file

* polish file
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dc9087af334c..725d266375ef 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 repos:
 
   - repo: https://github.com/pycqa/isort
-    rev: 5.10.1
+    rev: 5.12.0
     hooks:
       - id: isort
         name: sort all imports (python)

From ecbad93b6556e38cc240c422b9b0c6aa19682f6e Mon Sep 17 00:00:00 2001
From: LuGY <74758262+Gy-Lu@users.noreply.github.com>
Date: Mon, 30 Jan 2023 17:08:18 +0800
Subject: [PATCH 223/503] [example] Add fastfold tutorial (#2528)

* add fastfold example

* pre-commit polish

* pre-commit polish readme and add empty test ci

* Add test_ci and reduce the default sequence length
---
 examples/tutorial/fastfold/README.md    |  47 ++++++++
 examples/tutorial/fastfold/inference.py | 153 ++++++++++++++++++++++++
 examples/tutorial/fastfold/test_ci.sh   |  10 ++
 3 files changed, 210 insertions(+)
 create mode 100644 examples/tutorial/fastfold/README.md
 create mode 100644 examples/tutorial/fastfold/inference.py
 create mode 100644 examples/tutorial/fastfold/test_ci.sh

diff --git a/examples/tutorial/fastfold/README.md b/examples/tutorial/fastfold/README.md
new file mode 100644
index 000000000000..5c74c737d4b0
--- /dev/null
+++ b/examples/tutorial/fastfold/README.md
@@ -0,0 +1,47 @@
+# FastFold Inference
+
+## Table of contents
+
+- [Overview](#📚-overview)
+- [Quick Start](#🚀-quick-start)
+- [Dive into FastFold](#🔍-dive-into-fastfold)
+
+## 📚 Overview
+
+This example lets you quickly try out FastFold inference.
+
+**NOTE: We use random data and random parameters in this example.**
+
+
+## 🚀 Quick Start
+
+1. Install FastFold
+
+We highly recommend setting up an Anaconda or Miniconda environment and installing PyTorch with conda.
+
+```
+git clone https://github.com/hpcaitech/FastFold
+cd FastFold
+conda env create --name=fastfold -f environment.yml
+conda activate fastfold
+python setup.py install
+```
+
+2. Run the inference scripts.
+
+```bash
+python inference.py --gpus=1 --n_res=256 --chunk_size=None --inplace
+```
++ `gpus` is the DAP size
++ `n_res` is the length of the residue sequence
++ `chunk_size` enables a memory-saving technique at the cost of speed; `None` disables it, and 16 may be a good trade-off for long sequences.
++ `inplace` enables another memory-saving technique at no extra cost; drop `--inplace` if you do not want it.
+
+## 🔍 Dive into FastFold
+
+FastFold also offers other features, such as:
++ more efficient kernels based on Triton
++ much faster data processing based on Ray
++ training support
+
+More detailed information can be seen [here](https://github.com/hpcaitech/FastFold/).
diff --git a/examples/tutorial/fastfold/inference.py b/examples/tutorial/fastfold/inference.py
new file mode 100644
index 000000000000..ccfa78256b19
--- /dev/null
+++ b/examples/tutorial/fastfold/inference.py
@@ -0,0 +1,153 @@
+# Copyright 2023 HPC-AI Tech Inc.
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+
+import fastfold
+import numpy as np
+import torch
+import torch.multiprocessing as mp
+from fastfold.config import model_config
+from fastfold.data import data_transforms
+from fastfold.model.fastnn import set_chunk_size
+from fastfold.model.hub import AlphaFold
+from fastfold.utils.inject_fastnn import inject_fastnn
+from fastfold.utils.tensor_utils import tensor_tree_map
+
+if tuple(int(v) for v in torch.__version__.split(".")[:2]) >= (1, 12):  # compare (major, minor) so torch 2.x also qualifies
+    torch.backends.cuda.matmul.allow_tf32 = True
+
+
+def random_template_feats(n_templ, n):
+    b = []
+    batch = {
+        "template_mask": np.random.randint(0, 2, (*b, n_templ)),
+        "template_pseudo_beta_mask": np.random.randint(0, 2, (*b, n_templ, n)),
+        "template_pseudo_beta": np.random.rand(*b, n_templ, n, 3),
+        "template_aatype": np.random.randint(0, 22, (*b, n_templ, n)),
+        "template_all_atom_mask": np.random.randint(0, 2, (*b, n_templ, n, 37)),
+        "template_all_atom_positions": np.random.rand(*b, n_templ, n, 37, 3) * 10,
+        "template_torsion_angles_sin_cos": np.random.rand(*b, n_templ, n, 7, 2),
+        "template_alt_torsion_angles_sin_cos": np.random.rand(*b, n_templ, n, 7, 2),
+        "template_torsion_angles_mask": np.random.rand(*b, n_templ, n, 7),
+    }
+    batch = {k: v.astype(np.float32) for k, v in batch.items()}
+    batch["template_aatype"] = batch["template_aatype"].astype(np.int64)
+    return batch
+
+
+def random_extra_msa_feats(n_extra, n):
+    b = []
+    batch = {
+        "extra_msa": np.random.randint(0, 22, (*b, n_extra, n)).astype(np.int64),
+        "extra_has_deletion": np.random.randint(0, 2, (*b, n_extra, n)).astype(np.float32),
+        "extra_deletion_value": np.random.rand(*b, n_extra, n).astype(np.float32),
+        "extra_msa_mask": np.random.randint(0, 2, (*b, n_extra, n)).astype(np.float32),
+    }
+    return batch
+
+
+def generate_batch(n_res):
+    batch = {}
+    tf = torch.randint(21, size=(n_res,))
+    batch["target_feat"] = torch.nn.functional.one_hot(tf, 22).float()
+    batch["aatype"] = torch.argmax(batch["target_feat"], dim=-1)
+    batch["residue_index"] = torch.arange(n_res)
+    batch["msa_feat"] = torch.rand((128, n_res, 49))
+    t_feats = random_template_feats(4, n_res)
+    batch.update({k: torch.tensor(v) for k, v in t_feats.items()})
+    extra_feats = random_extra_msa_feats(5120, n_res)
+    batch.update({k: torch.tensor(v) for k, v in extra_feats.items()})
+    batch["msa_mask"] = torch.randint(low=0, high=2, size=(128, n_res)).float()
+    batch["seq_mask"] = torch.randint(low=0, high=2, size=(n_res,)).float()
+    batch.update(data_transforms.make_atom14_masks(batch))
+    batch["no_recycling_iters"] = torch.tensor(2.)
+
+    add_recycling_dims = lambda t: (t.unsqueeze(-1).expand(*t.shape, 3))
+    batch = tensor_tree_map(add_recycling_dims, batch)
+
+    return batch
+
+
+def inference_model(rank, world_size, result_q, batch, args):
+    os.environ['RANK'] = str(rank)
+    os.environ['LOCAL_RANK'] = str(rank)
+    os.environ['WORLD_SIZE'] = str(world_size)
+    # init distributed for Dynamic Axial Parallelism
+    fastfold.distributed.init_dap()
+    torch.cuda.set_device(rank)
+    config = model_config(args.model_name)
+    if args.chunk_size:
+        config.globals.chunk_size = args.chunk_size
+
+    config.globals.inplace = args.inplace
+    config.globals.is_multimer = False
+    model = AlphaFold(config)
+
+    model = inject_fastnn(model)
+    model = model.eval()
+    model = model.cuda()
+
+    set_chunk_size(model.globals.chunk_size)
+
+    with torch.no_grad():
+        batch = {k: torch.as_tensor(v).cuda() for k, v in batch.items()}
+        t = time.perf_counter()
+        out = model(batch)
+        print(f"Inference time: {time.perf_counter() - t}")
+    out = tensor_tree_map(lambda x: np.array(x.cpu()), out)
+
+    result_q.put(out)
+
+    torch.distributed.barrier()
+    torch.cuda.synchronize()
+
+
+def inference_monomer_model(args):
+    batch = generate_batch(args.n_res)
+    manager = mp.Manager()
+    result_q = manager.Queue()
+    torch.multiprocessing.spawn(inference_model, nprocs=args.gpus, args=(args.gpus, result_q, batch, args))
+    out = result_q.get()
+
+    # get unrelaxed pdb and save
+    # batch = tensor_tree_map(lambda x: np.array(x[..., -1].cpu()), batch)
+    # plddt = out["plddt"]
+    # plddt_b_factors = np.repeat(plddt[..., None], residue_constants.atom_type_num, axis=-1)
+    # unrelaxed_protein = protein.from_prediction(features=batch,
+    #                                             result=out,
+    #                                             b_factors=plddt_b_factors)
+    # with open('demo_unrelex.pdb', 'w+') as fp:
+    #     fp.write(unrelaxed_protein)
+
+
+def main(args):
+    inference_monomer_model(args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--gpus", type=int, default=1, help="""Number of GPUs with which to run inference""")
+    parser.add_argument("--n_res", type=int, default=50, help="virtual residue number of random data")
+    parser.add_argument("--model_name", type=str, default="model_1", help="model name of alphafold")
+    parser.add_argument('--chunk_size', type=int, default=None)
+    parser.add_argument('--inplace', default=False, action='store_true')
+
+    args = parser.parse_args()
+
+    main(args)
diff --git a/examples/tutorial/fastfold/test_ci.sh b/examples/tutorial/fastfold/test_ci.sh
new file mode 100644
index 000000000000..ef9ab919e307
--- /dev/null
+++ b/examples/tutorial/fastfold/test_ci.sh
@@ -0,0 +1,10 @@
+set -euxo pipefail
+
+git clone https://github.com/hpcaitech/FastFold
+cd FastFold
+pip install -r requirements/requirements.txt
+python setup.py install
+pip install -r requirements/test_requirements.txt
+cd ..
+
+python inference.py

From 66dfcf5281edeaee9015372b081536631a2efe6e Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Mon, 30 Jan 2023 17:58:05 +0800
Subject: [PATCH 224/503] [gemini] update the gpt example (#2527)

---
 colossalai/nn/parallel/zero_wrapper.py        |   9 +-
 .../language/gpt/gemini/benchmark_gemini.sh   |   2 +-
 examples/language/gpt/gemini/run_gemini.sh    |  12 +-
 .../language/gpt/gemini/train_gpt_demo.py     | 150 +++++++-----------
 4 files changed, 75 insertions(+), 98 deletions(-)

diff --git a/colossalai/nn/parallel/zero_wrapper.py b/colossalai/nn/parallel/zero_wrapper.py
index 504625e62d30..be8d1da7c24e 100644
--- a/colossalai/nn/parallel/zero_wrapper.py
+++ b/colossalai/nn/parallel/zero_wrapper.py
@@ -32,16 +32,19 @@ def zero_model_wrapper(model: nn.Module, zero_stage: int = 1, gemini_config: Opt
                 >>> config_dict = dict(device=torch.cuda.current_device(), hidden_dim=1024, placement_policy='auto')
                 >>> model = zero_model_wrapper(model, zero_stage=3, gemini_config=config_dict)
     """
-    setattr(model, "_colo_zero_stage", zero_stage)
     assert zero_stage in [1, 2, 3], "The stage of ZeRO should be 1, 2 or 3"
 
     if gemini_config is None:
         gemini_config = dict()
 
     if zero_stage in [1, 2]:
-        return model
+        wrapped_model = model
     else:
-        return GeminiDDP(model, **gemini_config)
+        wrapped_model = GeminiDDP(model, **gemini_config)
+
+    setattr(wrapped_model, "_colo_zero_stage", zero_stage)
+
+    return wrapped_model
 
 
 def zero_optim_wrapper(model: nn.Module,
diff --git a/examples/language/gpt/gemini/benchmark_gemini.sh b/examples/language/gpt/gemini/benchmark_gemini.sh
index 9a630b2ffe23..3a42e13645f6 100644
--- a/examples/language/gpt/gemini/benchmark_gemini.sh
+++ b/examples/language/gpt/gemini/benchmark_gemini.sh
@@ -1,5 +1,5 @@
 for MODEL_TYPE in "gpt2_medium"; do
-  for DISTPLAN in "colossalai"; do
+  for DISTPLAN in "CAI_Gemini"; do
     for BATCH_SIZE in 16; do
       for GPUNUM in 1 2 4 8; do
         for TPDEGREE in 1 2 4 8; do
diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh
index 6f0710d54f01..ad4e9419c1bd 100644
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
@@ -1,6 +1,6 @@
 set -x
-# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPLAN=${DISTPLAN:-"colossalai"}
+# distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]
+export DISTPLAN=${DISTPLAN:-"CAI_Gemini"}
 
 # The following options only valid when DISTPLAN="colossalai"
 export GPUNUM=${GPUNUM:-1}
@@ -12,6 +12,12 @@ export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
 export TRAIN_STEP=${TRAIN_STEP:-10}
 # export PYTHONPATH=$PWD:$PYTHONPATH
 
+if [ "${USE_SHARD_INIT:-}" = "True" ]; then  # quoted: safe when the variable is empty or unset
+  USE_SHARD_INIT="--shardinit"
+else
+  USE_SHARD_INIT=""
+fi
+
 mkdir -p gemini_logs
 
 torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
@@ -19,7 +25,7 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --model_type=${MODEL_TYPE} \
 --batch_size=${BATCH_SIZE} \
 --placement=${PLACEMENT} \
---shardinit=${USE_SHARD_INIT} \
+${USE_SHARD_INIT} \
 --distplan=${DISTPLAN} \
 --train_step=${TRAIN_STEP} \
 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 02857ae9ce12..ab8a65e625cf 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -12,26 +12,21 @@
 
 import colossalai
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.parallel import ZeroDDP
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
 
 CAI_VERSION = colossalai.__version__
 
-if version.parse(CAI_VERSION) > version.parse("0.1.10"):
-    # These are added after 0.1.10
-    from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
-    from colossalai.nn.parallel import GeminiDDP
-    from colossalai.zero.sharded_optim import LowLevelZeroOptimizer
-
 
 def parse_args():
     parser = colossalai.get_default_parser()
     parser.add_argument(
         "--distplan",
         type=str,
-        default='colossalai',
+        default='CAI_Gemini',
         help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
     )
     parser.add_argument(
@@ -48,8 +43,7 @@ def parse_args():
     )
     parser.add_argument(
         "--shardinit",
-        type=bool,
-        default=False,
+        action='store_true',
         help=
         "Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
     )
@@ -186,57 +180,16 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
             param.visited = True
 
 
-# Gemini + ZeRO DDP
-def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto", ddp_flag: bool = True):
-    fp16_init_scale = 2**5
-    gpu_margin_mem_ratio_for_auto = 0
-
-    if version.parse(CAI_VERSION) > version.parse("0.1.10"):
-        model = GeminiDDP(model,
-                          strict_ddp_mode=ddp_flag,
-                          device=get_current_device(),
-                          placement_policy=placement_policy,
-                          pin_memory=True,
-                          hidden_dim=model.config.n_embd,
-                          search_range_mb=128)
-        # configure the const policy
-        if placement_policy == 'const':
-            model.gemini_manager._placement_policy.set_const_memory_boundary(2 * 1024)
-        # build a highly optimized cpu optimizer
-        optimizer = GeminiAdamOptimizer(model,
-                                        lr=1e-3,
-                                        initial_scale=fp16_init_scale,
-                                        gpu_margin_mem_ratio=gpu_margin_mem_ratio_for_auto)
-    elif version.parse("0.1.9") <= version.parse(CAI_VERSION) <= version.parse("0.1.10"):
-        from colossalai.gemini import ChunkManager, GeminiManager
-        from colossalai.nn.optimizer import HybridAdam
-        from colossalai.zero import ZeroOptimizer
-        chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 1024, filter_exlarge_params=True)
-        chunk_manager = ChunkManager(chunk_size,
-                                     pg,
-                                     enable_distributed_storage=True,
-                                     init_device=GeminiManager.get_default_device(placement_policy))
-        gemini_manager = GeminiManager(placement_policy, chunk_manager)
-        model = ZeroDDP(model, gemini_manager)
-        optimizer = HybridAdam(model.parameters(), lr=1e-3)
-        optimizer = ZeroOptimizer(optimizer,
-                                  model,
-                                  initial_scale=fp16_init_scale,
-                                  gpu_margin_mem_ratio=gpu_margin_mem_ratio_for_auto)
-    else:
-        raise NotImplemented(f"CAI version {CAI_VERSION} is not supported")
-    return model, optimizer
-
-
 def main():
     # version check
-    # this example is supposed to work for versions greater than 0.1.9
-    assert version.parse(CAI_VERSION) >= version.parse("0.1.9")
+    # this example is supposed to work for versions 0.2.0 or later
+    assert version.parse(CAI_VERSION) >= version.parse("0.2.0")
 
     set_cpu_maximum_parallelism()
     args = parse_args()
 
-    if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
+    # if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
+    if args.distplan not in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]:
         raise TypeError(f"{args.distplan} is error")
 
     # batch size per DP degree
@@ -260,22 +213,21 @@ def main():
     criterion = GPTLMLoss()
 
     torch.manual_seed(123)
-    if args.distplan == "colossalai":
+    if args.distplan.startswith("CAI"):
         # all param must use the same process group.
         world_size = torch.distributed.get_world_size()
         shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
         default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
 
+        if args.shardinit and args.distplan != "CAI_Gemini":
+            raise RuntimeError("You can only use shardinit with CAI_Gemini")
+
         # build GPT model
-        if version.parse(CAI_VERSION) > version.parse("0.1.10"):
-            with ColoInitContext(device=get_current_device(),
-                                 dtype=torch.half,
-                                 default_dist_spec=default_dist_spec,
-                                 default_pg=shard_pg):
-                model = model_builder(args.model_type)(checkpoint=True)
-        else:
-            with ColoInitContext(device=get_current_device()):
-                model = model_builder(args.model_type)(checkpoint=True)
+        with ColoInitContext(device=get_current_device(),
+                             dtype=torch.half,
+                             default_dist_spec=default_dist_spec,
+                             default_pg=shard_pg):
+            model = model_builder(args.model_type)(checkpoint=True)
 
         tp_pg = ProcessGroup(tp_degree=args.tp_degree)
         # Tensor Parallelism (TP)
@@ -283,34 +235,49 @@ def main():
         if args.tp_degree > 1:
             tensor_parallelize(model, tp_pg)
 
-        # build a Gemini model and a highly optimized cpu optimizer
-        # Gemini + ZeRO DP, Note it must be used after TP
-        model, optimizer = build_gemini(model, tp_pg, args.placement, args.tp_degree == 1)
+        # assign running configurations
+        gemini_config = None
+        if args.distplan.startswith("CAI_ZeRO"):
+            optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
+        elif args.distplan == "CAI_Gemini":
+            gemini_config = dict(strict_ddp_mode=args.tp_degree == 1,
+                                 device=get_current_device(),
+                                 placement_policy=args.placement,
+                                 pin_memory=True,
+                                 hidden_dim=model.config.n_embd,
+                                 search_range_mb=128)
+            optim_config = dict(gpu_margin_mem_ratio=0.)
+        else:
+            raise RuntimeError
+
+        # build a highly optimized gpu/cpu optimizer
+        optimizer = HybridAdam(model.parameters(), lr=1e-3)
+
+        if args.distplan == "CAI_ZeRO1":
+            zero_stage = 1
+        elif args.distplan == "CAI_ZeRO2":
+            zero_stage = 2
+        elif args.distplan == "CAI_Gemini":
+            zero_stage = 3
+        else:
+            raise RuntimeError
+
+        # wrap your model and optimizer
+        model = zero_model_wrapper(model, zero_stage, gemini_config)
+        optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)
 
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
-    else:
+    elif args.distplan.startswith("Pytorch"):
         assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples."
         model = model_builder(args.model_type)(checkpoint=True).cuda()
-
-    if args.distplan.startswith("torch"):
         model = DDP(model)
-        if args.distplan.endswith("ddp"):
-            optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
-        elif args.distplan.endswith("zero"):
+        if args.distplan.endswith("DDP"):
+            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+        elif args.distplan.endswith("ZeRO"):
             from torch.distributed.optim import ZeroRedundancyOptimizer
-            optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
-    elif args.distplan.startswith("zero"):
-        model = model.half()
-        partition_flag = (args.distplan == "zero2")
-        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
-
-        optimizer = LowLevelZeroOptimizer(
-            optimizer,
-            reduce_bucket_size=12 * 1024 * 1024,
-            overlap_communication=True,
-            partition_grad=partition_flag,
-            verbose=True,
-        )
+            optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3)
+    else:
+        raise RuntimeError
 
     # model is shared after TP
     numel = get_model_size(model)
@@ -338,17 +305,18 @@ def main():
         fwd_time = fwd_end - start
         logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0])
 
-        if args.distplan in ["colossalai", "zero1", "zero2"]:
+        if args.distplan.startswith("CAI"):
             optimizer.backward(loss)
-        elif args.distplan in ["torch_ddp", "torch_zero"]:
+        elif args.distplan.startswith("Pytorch"):
             loss.backward()
+        else:
+            raise RuntimeError
+
         torch.cuda.synchronize()
         bwd_end = time()
         bwd_time = bwd_end - fwd_end
         logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Backward '), ranks=[0])
 
-        if args.distplan in ["zero1", "zero2"]:
-            optimizer.sync_grad()
         optimizer.step()
         torch.cuda.synchronize()
         optim_time = time() - bwd_end

From b55deb0662005e5db37075163a38487ff006eb68 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 30 Jan 2023 21:28:27 +0800
Subject: [PATCH 225/503] [workflow] only report coverage for changed files
 (#2524)

* [workflow] only report coverage for changed files

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file

* polish file
---
 .github/workflows/build.yml                | 92 +++++++++++++++++-----
 .github/workflows/report_test_coverage.yml | 35 ++++----
 tests/test_amp/test_naive_fp16.py          |  1 -
 3 files changed, 92 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 8f334d599124..3c163e774a5c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -6,15 +6,17 @@ on:
 
 jobs:
   detect:
-    name: Detect kernel-related file change
+    name: Detect file change
     if: |
         github.event.pull_request.draft == false &&
         github.base_ref == 'main' &&
         github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' &&
         contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
     outputs:
-      changedFiles: ${{ steps.find-changed-files.outputs.changedFiles }}
-      anyChanged: ${{ steps.find-changed-files.outputs.any_changed }}
+      changedExtenisonFiles: ${{ steps.find-extension-change.outputs.all_changed_files }}
+      anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
+      changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
+      anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
@@ -30,8 +32,8 @@ jobs:
             echo $commonCommit
             echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
 
-      - name: Find the changed files
-        id: find-changed-files
+      - name: Find the changed extension-related files
+        id: find-extension-change
         uses: tj-actions/changed-files@v35
         with:
           base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
@@ -40,9 +42,23 @@ jobs:
             colossalai/kernel/**
             setup.py
 
+      - name: Find the changed library-related files
+        id: find-lib-change
+        uses: tj-actions/changed-files@v35
+        with:
+          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
+          files: |
+            **/*.py
+            **/*.h
+            **/*.cpp
+            **/*.cu
+
       - name: List changed files
         run: |
-          for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
+          for file in ${{ steps.find-extension-change.outputs.all_changed_files }}; do
+            echo "$file was changed"
+          done
+          for file in ${{ steps.find-lib-change.outputs.all_changed_files }}; do
             echo "$file was changed"
           done
 
@@ -55,38 +71,58 @@ jobs:
       image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 40
+    defaults:
+      run:
+        shell: bash
     steps:
-      - uses: actions/checkout@v2
+      - name: Checkout TensorNVMe
+        uses: actions/checkout@v2
         with:
           repository: hpcaitech/TensorNVMe
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
           path: TensorNVMe
 
-      - name: Install tensornvme
+      - name: Restore TensorNVMe Cache
+        run: |
+          if [ -n "$(ls -A /github/home/tensornvme_cache/ 2>/dev/null)" ]; then cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe; fi
+
+      - name: Install TensorNVMe
         run: |
           cd TensorNVMe
           conda install cmake
           pip install -r requirements.txt
           pip install -v .
 
-      - uses: actions/checkout@v2
+      - name: Store TensorNVMe Cache
+        run: |
+          mkdir -p /github/home/tensornvme_cache && cp -p -r ./TensorNVMe/build /github/home/tensornvme_cache/
+
+      - name: Checkout Colossal-AI
+        uses: actions/checkout@v2
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
 
-      - name: Restore cache
-        if: needs.detect.outputs.anyChanged != 'true'
+      - name: Restore Colossal-AI Cache
+        if: needs.detect.outputs.anyExtensionFileChanged != 'true'
         run: |
           # -p flag is required to preserve the file timestamp to avoid ninja rebuild
           [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
 
       - name: Install Colossal-AI
+        if: needs.detect.outputs.anyLibraryFileChanged == 'true'
         run: |
           CUDA_EXT=1 pip install -v -e .
           pip install -r requirements/requirements-test.txt
 
-      - name: Unit Testing
+      - name: Store Colossal-AI Cache
+        run: |
+          # -p flag is required to preserve the file timestamp to avoid ninja rebuild
+          cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
+
+      - name: Execute Unit Testing
+        if: needs.detect.outputs.anyLibraryFileChanged == 'true'
         run: |
-          PYTHONPATH=$PWD pytest --cov=. --cov-report xml tests
+          PYTHONPATH=$PWD pytest --cov=. --cov-report xml tests/
         env:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
@@ -95,18 +131,36 @@ jobs:
       - name: Collate artifact
         env:
           PR_NUMBER: ${{ github.event.number }}
+          changedLibraryFiles: ${{ needs.detect.outputs.changedLibraryFiles }}
+          anyLibraryFileChanged: ${{ needs.detect.outputs.anyLibraryFileChanged }}
+          changedExtenisonFiles: ${{ needs.detect.outputs.changedExtenisonFiles }}
         run: |
           mkdir report
           echo $PR_NUMBER > ./report/pr_number
-          mv coverage.xml ./report
+
+          # generate coverage.xml if any
+          if [ "$anyLibraryFileChanged" == "true" ]; then
+            allFiles=""
+            for file in $changedLibraryFiles; do
+              if [ "$allFiles" == "" ]; then
+                allFiles=$file
+              else
+                allFiles=$allFiles,$file
+              fi
+            done
+
+            coverage report --data-file .coverage --include $allFiles > ./coverage.txt
+
+            covPercentage=$(tail -n 1 coverage.txt  | grep -o '[0-9]*%$')  # trailing "NN%" on the last line; digits include 0
+            covNum=${covPercentage::-1}
+            mv coverage.txt ./report
+            echo $covNum > ./report/cov_number
+          else
+            echo "No coverage report is generated"
+          fi
 
       - name: Upload test coverage artifact
         uses: actions/upload-artifact@v3
         with:
           name: report
           path: report/
-
-      - name: Store Cache
-        run: |
-          # -p flag is required to preserve the file timestamp to avoid ninja rebuild
-          cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
index dc3fe395f00b..c58527361181 100644
--- a/.github/workflows/report_test_coverage.yml
+++ b/.github/workflows/report_test_coverage.yml
@@ -32,28 +32,31 @@ jobs:
             fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/report.zip`, Buffer.from(download.data));
 
       - name: 'Unzip artifact'
+        id: unzip
         run: |
           unzip report.zip
-
-      - name: Code Coverage Report
-        uses: irongut/CodeCoverageSummary@v1.3.0
-        with:
-          filename: coverage.xml
-          badge: true
-          format: markdown
-          hide_branch_rate: false
-          hide_complexity: false
-          indicators: true
-          output: both
-          thresholds: '80 90'
+          if [ -f "coverage.txt" ]; then
+            echo "hasReport=true" >> $GITHUB_OUTPUT
+          else
+            echo "hasReport=false" >> $GITHUB_OUTPUT
+          fi
 
       - name: Make Coverage Report Collapsable
+        if: steps.unzip.outputs.hasReport == 'true'
         run: |
-          sed -i '2 i <details>' code-coverage-results.md
-          sed -i '3 i <summary>Click me to view the complete report</summary>' code-coverage-results.md
-          echo "</details>" >> code-coverage-results.md
+          covNum=$(cat cov_number)
+          title="The code coverage for the changed files is ${covNum}%."
+          (echo $title; cat coverage.txt) > coverage_tmp.txt
+          mv coverage_tmp.txt coverage.txt
+          sed -i '2 i <details>' coverage.txt
+          sed -i '3 i <summary>Click me to view the complete report</summary>' coverage.txt
+          sed -i '4 i \n' coverage.txt
+          sed -i '5 i \`\`\`text' coverage.txt
+          echo "\`\`\`" >> coverage.txt
+          echo "</details>" >> coverage.txt
 
       - name: 'Comment on PR'
+        if: steps.unzip.outputs.hasReport == 'true'
         uses: actions/github-script@v6
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -64,7 +67,7 @@ jobs:
             let repo = context.repo.repo;
             let run_id = context.payload.workflow_run.id;
             let run_url = `https://github.com/${owner}/${repo}/actions/runs/${run_id}`
-            let body = fs.readFileSync('./code-coverage-results.md', {encoding:'utf8', flag:'r'})
+            let body = fs.readFileSync('./coverage.txt', {encoding:'utf8', flag:'r'})
 
             await github.rest.issues.createComment({
               owner: owner,
diff --git a/tests/test_amp/test_naive_fp16.py b/tests/test_amp/test_naive_fp16.py
index 7f6f0c86ad8e..c01de469b8f1 100644
--- a/tests/test_amp/test_naive_fp16.py
+++ b/tests/test_amp/test_naive_fp16.py
@@ -24,7 +24,6 @@ def run_naive_amp():
     In this test, we compare the naive fp16 optimizer implemented in colossalai
     and fp32 torch optimizer
     """
-
     torch.backends.cudnn.benchmark = False
     torch.backends.cudnn.deterministic = True
 

From a4ed9125ac0ed4c3ad707b2e2a2fc566aacbf903 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Tue, 31 Jan 2023 10:40:39 +0800
Subject: [PATCH 226/503] [hotfix] fix lightning error (#2529)

---
 colossalai/nn/parallel/data_parallel.py   | 198 ++++++++++++----------
 colossalai/nn/parallel/gemini_parallel.py |   4 +
 colossalai/nn/parallel/utils.py           |  19 +--
 3 files changed, 119 insertions(+), 102 deletions(-)

diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index 24d59e177b80..a30416ab9fdf 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -5,6 +5,7 @@
 
 import torch
 import torch.distributed as dist
+import torch.nn as nn
 
 from colossalai.gemini.chunk import Chunk, ChunkManager, TensorState
 from colossalai.gemini.gemini_mgr import GeminiManager
@@ -218,11 +219,15 @@ def __init__(self,
         self.chunk_manager: ChunkManager = gemini_manager.chunk_manager
         self.force_outputs_fp32 = force_outputs_fp32
         self.param_op_hook = GeminiZeROHook(gemini_manager)
-        self.fp32_params: List[ColoTensor] = []
+        self.fp32_params: List[ColoTensor] = list()
+        self.fp16_params: List[ColoParameter] = list()
         self.overflow_counter = 0
-        self.grads_device: Dict[torch.Tensor, torch.device] = {}
+        self.grads_device: Dict[torch.Tensor, torch.device] = dict()
+        self.param2name: Dict[nn.Parameter, str] = dict()
+        self.name2param: Dict[str, nn.Parameter] = dict()
 
-        cpu_offload = self.gemini_manager.policy_name != 'cuda'
+        self._cast_buffers()
+        self._logger = get_dist_logger()
 
         if self.gemini_manager._premade_memstats_:
             # build chunk in param runtime visited order.
@@ -234,50 +239,17 @@ def __init__(self,
             for p in module.parameters():
                 param_order.append(p)
 
-        ddp_pg = ColoProcessGroup()
-        for p in param_order.generate():
-            assert isinstance(p, ColoParameter)
-
-            if strict_ddp_mode:
-                if not p.is_replicate():
-                    p.set_dist_spec(ReplicaSpec())
-                p.set_process_group(pg=ddp_pg)
+        self._init_chunks(param_order=param_order,
+                          strict_ddp_mode=strict_ddp_mode,
+                          cpu_offload=self.gemini_manager.policy_name != 'cuda',
+                          pin_memory=pin_memory)
 
-            if is_ddp_ignored(p):
-                p.data = p.data.to(device=get_current_device(), dtype=torch.float16)
-                continue
-
-            fp32_data = p.data.float()
-            fp32_p = ColoTensor(fp32_data, spec=ColoTensorSpec(p.process_group))
-            p.data = p.data.half()
-            dp_world_size = p.process_group.dp_world_size()
-            self.chunk_manager.register_tensor(tensor=p,
-                                               group_type='fp16_param',
-                                               config_key=dp_world_size,
-                                               cpu_offload=cpu_offload,
-                                               pin_memory=pin_memory)
-            self.chunk_manager.register_tensor(tensor=fp32_p,
-                                               group_type='fp32_param',
-                                               config_key=dp_world_size,
-                                               cpu_offload=cpu_offload,
-                                               pin_memory=pin_memory)
-            self.fp32_params.append(fp32_p)
-            self.grads_device[p] = self.gemini_manager.default_device
-
-        self.chunk_manager.close_all_groups()
-        self._cast_buffers()
-
-        params_list = [p for p in param_order.generate() if not is_ddp_ignored(p)]
-        for p, fp32_p in zip(params_list, self.fp32_params):
-            chunk_16 = self.chunk_manager.get_chunk(p)
-            chunk_32 = self.chunk_manager.get_chunk(fp32_p)
-            chunk_32.init_pair(chunk_16)
-
-            # keep gathered chunks are in CUDA
-            if chunk_16.keep_gathered:
-                self.grads_device[p] = get_current_device()
-
-        self._logger = get_dist_logger()
+        for name, param in module.named_parameters():
+            self.param2name[param] = name
+        for m_name, m_var in module.named_modules():
+            for p_name, p_var in m_var.named_parameters(recurse=False):
+                param_name = m_name + '.' + p_name if m_name else p_name
+                self.name2param[param_name] = p_var
 
     def _post_forward(self):
         """This function is only triggered for inference.
@@ -318,10 +290,23 @@ def _setup_grads_ptr(self):
                 continue
             p.grad = None
 
+    def _pre_bacward(self):
+        # set a visit label for all parameters
+        # the label is used to check whether the parameter is correctly reduced
+        for param in self.param2name:
+            if not is_ddp_ignored(param):
+                setattr(param, "_gemini_reduced", False)
+
     def _post_backward(self):
         if self.chunk_manager.accessed_mem != 0:
+            error_params = ["Reduction failed at followed parameters:"]
+            for param in self.param2name:
+                if not is_ddp_ignored(param) and not getattr(param, "_gemini_reduced"):
+                    error_params.append(self.param2name[param])
+            error_str = "\n\t".join(error_params)
             raise RuntimeError("ZERO DDP error: the synchronization of gradients doesn't exit properly.",
-                               "The most possible reason is that the model is not compatible with ZeroDDP.")
+                               "The most possible reason is that the model is not compatible with ZeroDDP.\n",
+                               f"{error_str}")
         self._setup_grads_ptr()
         self._logger.debug(
             f'comp cuda demand time: {self.gemini_manager._comp_cuda_demand_time}, layout time: {self.gemini_manager._layout_time}, evict time: {self.gemini_manager._evict_time}, CPU->CUDA vol: {self.gemini_manager._h2d_volume}B, CUDA->CPU vol: {self.gemini_manager._d2h_volume}'
@@ -329,6 +314,7 @@ def _post_backward(self):
         self.gemini_manager.post_iter()
 
     def backward(self, loss: torch.Tensor):
+        self._pre_bacward()
         with self.param_op_hook.switch_to_backward(), ColoParamOpHookManager.use_hooks(self.param_op_hook):
             loss.backward()
         self._post_backward()
@@ -343,7 +329,9 @@ def grad_handle(self, p, grad):
         free_storage(empty_grad)
         with torch._C.DisableTorchFunction():
             chunk = self.chunk_manager.get_chunk(p)
-            assert chunk.tensors_info[p].state == TensorState.HOLD_AFTER_BWD
+            if chunk.tensors_info[p].state != TensorState.HOLD_AFTER_BWD:
+                raise RuntimeError(f"Parameter `{self.param2name[p]}` failed at the gradient reduction. "
+                                   "Some unsupported torch function is operated upon this parameter.")
             self.chunk_manager.trans_tensor_state(p, TensorState.READY_FOR_REDUCE)
             chunk.copy_tensor_to_chunk_slice(p, grad)
             reduced = self.chunk_manager.reduce_chunk(chunk)
@@ -367,30 +355,7 @@ def set_chunk_grad_device(self, chunk: Chunk, device: torch.device) -> None:
         for tensor in chunk.get_tensors():
             self.grads_device[tensor] = device
 
-    def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True, strict: bool = True):
-        """
-        Args:
-            strict (bool): whether to reture the whole model state as the pytorch `Module.state_dict()`
-
-        Returns:
-            dict:
-                a dictionary containing a whole state of the module
-
-        Example:
-
-            >>> module.state_dict().keys()
-            ['bias', 'weight']
-        """
-        if strict:
-            assert keep_vars is False, "`state_dict` with parameter, `keep_vars=True`, is not supported now."
-            torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_rank_0)
-            return torch_model.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
-        return self._non_strict_state_dict(destination=destination,
-                                           prefix=prefix,
-                                           keep_vars=keep_vars,
-                                           only_rank_0=only_rank_0)
-
-    def _non_strict_state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True):
+    def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True):
         """Returns a dictionary containing a whole state of the module.
 
         Both parameters and persistent buffers (e.g. running averages) are included.
@@ -461,19 +426,24 @@ def _save_to_state_dict(self, destination, prefix, keep_vars, only_rank_0=True):
         """
         assert keep_vars is False, "`state_dict` with parameter, `keep_vars=True`, is not supported now."
 
+        # get copies of fp32 parameters in CPU
         param_to_save_data = self._get_param_to_save_data(self.fp32_params, only_rank_0)
-        ddp_param_list = []
-        for name, param in self.named_parameters():
-            if is_ddp_ignored(param):
-                # deal with ddp ignored parameters
-                destination[prefix + name] = param if keep_vars else param.detach()
-            else:
-                ddp_param_list.append((name, param))
-        for (name, p), fp32_p in zip(ddp_param_list, self.fp32_params):
-            if p is not None:
-                assert fp32_p in param_to_save_data, "Parameter '{}' is neglected in the chunk list".format(name)
-                record_parameter = param_to_save_data[fp32_p]
-                destination[prefix + name] = record_parameter
+        # get the mapping between copies and fp16 parameters
+        p_mapping = dict()
+        for p, fp32_p in zip(self.fp16_params, self.fp32_params):
+            name = self.param2name[p]
+            assert fp32_p in param_to_save_data, "Parameter '{}' is neglected in the chunk list".format(name)
+            record_parameter = param_to_save_data[fp32_p]
+            p_mapping[p] = record_parameter
+        for name, param in self.name2param.items():
+            if param is not None:
+                if is_ddp_ignored(param):
+                    # deal with ddp ignored parameters
+                    destination[prefix + name] = param if keep_vars else param.detach()
+                else:
+                    destination[prefix + name] = p_mapping[param]
+        del p_mapping
+        del param_to_save_data
 
         # save all buffers
         for name, buf in self.named_buffers():
@@ -605,17 +575,15 @@ def load(param_name, dest_tensor, copy_func):
         def load_fp32_parameter(chunk_slice, data):
             chunk_slice.copy_(data.flatten())
 
-        ddp_param_list = []
         for name, param in self.named_parameters():
             if is_ddp_ignored(param):
                 # deal with ddp ignored parameters
                 load(name, param, param.copy_)
-            else:
-                ddp_param_list.append((name, param))
 
         fp32_to_name = dict()
-        for (name, p), fp32_p in zip(ddp_param_list, self.fp32_params):
+        for p, fp32_p in zip(self.fp16_params, self.fp32_params):
             if p is not None:
+                name = self.param2name[p]
                 fp32_to_name[fp32_p] = name
 
         chunk_list = self.chunk_manager.get_chunks(self.fp32_params)
@@ -662,6 +630,60 @@ def load_fp32_parameter(chunk_slice, data):
                     if input_name not in local_state:
                         unexpected_keys.append(key)
 
+    def _init_chunks(self, param_order, strict_ddp_mode: bool, cpu_offload: bool, pin_memory: bool):
+        ddp_pg = ColoProcessGroup()
+        for p in param_order.generate():
+            assert isinstance(p, ColoParameter)
+
+            # gather sharded parameters in the strict ddp mode
+            if strict_ddp_mode:
+                if not p.is_replicate():
+                    p.set_dist_spec(ReplicaSpec())
+                p.set_process_group(pg=ddp_pg)
+
+            # ignore the parameters with no gradient
+            if not p.requires_grad:
+                self.set_params_to_ignore([p])
+
+            # move ignored parameters to CUDA
+            if is_ddp_ignored(p):
+                p.data = p.data.to(device=get_current_device(), dtype=torch.float16)
+                continue
+
+            # create a fp32 parameter
+            fp32_data = p.data.float()
+            fp32_p = ColoTensor(fp32_data, spec=ColoTensorSpec(p.process_group))
+            # create a fp16 parameter
+            p.data = p.data.half()
+
+            # register the fp16 parameter and fp32 parameter in the chunk manager
+            dp_world_size = p.process_group.dp_world_size()
+            self.chunk_manager.register_tensor(tensor=p,
+                                               group_type='fp16_param',
+                                               config_key=dp_world_size,
+                                               cpu_offload=cpu_offload,
+                                               pin_memory=pin_memory)
+            self.chunk_manager.register_tensor(tensor=fp32_p,
+                                               group_type='fp32_param',
+                                               config_key=dp_world_size,
+                                               cpu_offload=cpu_offload,
+                                               pin_memory=pin_memory)
+
+            self.fp16_params.append(p)
+            self.fp32_params.append(fp32_p)
+            self.grads_device[p] = self.gemini_manager.default_device
+
+        self.chunk_manager.close_all_groups()
+
+        for p, fp32_p in zip(self.fp16_params, self.fp32_params):
+            chunk_16 = self.chunk_manager.get_chunk(p)
+            chunk_32 = self.chunk_manager.get_chunk(fp32_p)
+            chunk_32.init_pair(chunk_16)
+
+            # keep gathered chunks are in CUDA
+            if chunk_16.keep_gathered:
+                self.grads_device[p] = get_current_device()
+
     def _cast_buffers(self):
         for buffer in self.module.buffers():
             buffer.data = buffer.cuda()
diff --git a/colossalai/nn/parallel/gemini_parallel.py b/colossalai/nn/parallel/gemini_parallel.py
index 636f1ec7486e..2c6e15d91736 100644
--- a/colossalai/nn/parallel/gemini_parallel.py
+++ b/colossalai/nn/parallel/gemini_parallel.py
@@ -49,6 +49,10 @@ def __init__(self,
                 all parameters will be compacted into one small chunk.
             memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
         """
+        # some ugly hotfix for the compatibility with Lightning
+        if search_range_mb is None:
+            search_range_mb = 32
+
         chunk_manager = init_chunk_manager(model=module,
                                            init_device=device,
                                            hidden_dim=hidden_dim,
diff --git a/colossalai/nn/parallel/utils.py b/colossalai/nn/parallel/utils.py
index d323556d5f72..08fdb6026e38 100644
--- a/colossalai/nn/parallel/utils.py
+++ b/colossalai/nn/parallel/utils.py
@@ -80,13 +80,11 @@ def get_static_torch_model(zero_ddp_model,
     from colossalai.nn.parallel import ZeroDDP
     assert isinstance(zero_ddp_model, ZeroDDP)
 
-    state_dict = zero_ddp_model.state_dict(only_rank_0=only_rank_0, strict=False)
+    state_dict = zero_ddp_model.state_dict(only_rank_0=only_rank_0)
     colo_model = zero_ddp_model.module
     torch_model = _get_shallow_copy_model(colo_model)
 
     if not only_rank_0 or dist.get_rank() == 0:
-        # record the mapping relationship between colo parameters and torch parameters
-        colo_to_torch = dict()
         for (name, colo_module), (_, torch_module) in \
                 zip(_get_dfs_module_list(colo_model), _get_dfs_module_list(torch_model)):
             # clean the parameter list of the new torch module
@@ -94,17 +92,10 @@ def get_static_torch_model(zero_ddp_model,
             for sufix_param_name, param in colo_module.named_parameters(recurse=False):
                 # get the full name of the parameter
                 full_param_name = name + ('.' if name else '') + sufix_param_name
-
-                if full_param_name not in state_dict:
-                    # this means the parameter is shared by multiple modules
-                    # we should use colo_to_torch to get the torch parameter created before
-                    assert param in colo_to_torch, f"can not find parameter `{full_param_name}` in the GeminiDDP module"
-                    torch_param = colo_to_torch[param]
-                else:
-                    # we meet the parameter the first time, just use the state dict to get the data
-                    state_param = state_dict[full_param_name]
-                    torch_param = torch.nn.Parameter(state_param.data.to(device=device, dtype=dtype))
-                    colo_to_torch[param] = torch_param
+                assert full_param_name in state_dict, \
+                    f"Can not find parameter `{full_param_name}` in the GeminiDDP module"
+                state_param = state_dict[full_param_name]
+                torch_param = torch.nn.Parameter(state_param.data.to(device=device, dtype=dtype))
 
                 setattr(torch_module, sufix_param_name, torch_param)
     dist.barrier()

From f35326881c84398bf40078f401ca95a14df66be1 Mon Sep 17 00:00:00 2001
From: Fazzie <1240419984@qq.com>
Date: Tue, 31 Jan 2023 10:00:37 +0800
Subject: [PATCH 227/503] fix README

---
 examples/images/diffusion/README.md           |  28 +++-
 .../Teyvat/train_colossalai_teyvat.yaml       |   1 +
 .../diffusion/configs/train_colossalai.yaml   |   1 +
 .../configs/train_colossalai_cifar10.yaml     |   1 +
 .../diffusion/configs/train_pokemon.yaml      | 120 ------------------
 examples/images/diffusion/scripts/txt2img.sh  |   5 +-
 examples/images/diffusion/train_colossalai.sh |   8 +-
 7 files changed, 33 insertions(+), 131 deletions(-)
 delete mode 100644 examples/images/diffusion/configs/train_pokemon.yaml

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index ddc7e2d97128..b68347c00b6e 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -1,6 +1,5 @@
 # ColoDiffusion: Stable Diffusion with Colossal-AI
 
-
 Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) and [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion).
 
 <p id="diffusion_train" align="center">
@@ -57,14 +56,19 @@ pip install transformers==4.19.2 diffusers invisible-watermark
 pip install -e .
 ```
 
-##### Step 2: install lightning
+#### Step 2: install lightning
 
 Install Lightning version later than 2022.01.04. We suggest you install lightning from source.
 
-https://github.com/Lightning-AI/lightning.git
+```
+git clone https://github.com/Lightning-AI/lightning.git && cd lightning
+pip install -r requirements.txt
+python setup.py install
+```
 
+#### Step 3: Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
 
-##### Step 3:Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
+##### From pip
 
 For example, you can install  v0.1.12 from our official website.
 
@@ -72,6 +76,16 @@ For example, you can install  v0.1.12 from our official website.
 pip install colossalai==0.1.12+torch1.12cu11.3 -f https://release.colossalai.org
 ```
 
+##### From source
+
+```
+git clone https://github.com/hpcaitech/ColossalAI.git
+cd ColossalAI
+
+# install colossalai
+CUDA_EXT=1 pip install .
+```
+
 ### Option #2: Use Docker
 
 To use the stable diffusion Docker image, you can either build using the provided the [Dockerfile](./docker/Dockerfile) or pull a Docker image from our Docker hub.
@@ -122,6 +136,12 @@ It is important for you to configure your volume mapping in order to get the bes
 
 ## Download the model checkpoint from pretrained
 
+### stable-diffusion-v2-base
+
+```
+wget https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt
+```
+
 ### stable-diffusion-v1-4
 
 Our default model config use the weight from [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4?text=A+mecha+robot+in+a+favela+in+expressionist+style)
diff --git a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml
index d466c1c56259..8a8250c5d300 100644
--- a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml
+++ b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml
@@ -110,6 +110,7 @@ lightning:
         enable_distributed_storage: True
         placement_policy: cuda
         force_outputs_fp32: true
+        min_chunk_size: 64
 
     log_every_n_steps: 2
     logger: True
diff --git a/examples/images/diffusion/configs/train_colossalai.yaml b/examples/images/diffusion/configs/train_colossalai.yaml
index 0354311f84b6..88432e978a0f 100644
--- a/examples/images/diffusion/configs/train_colossalai.yaml
+++ b/examples/images/diffusion/configs/train_colossalai.yaml
@@ -107,6 +107,7 @@ lightning:
         enable_distributed_storage: True
         placement_policy: cuda
         force_outputs_fp32: true
+        min_chunk_size: 64
 
     log_every_n_steps: 2
     logger: True
diff --git a/examples/images/diffusion/configs/train_colossalai_cifar10.yaml b/examples/images/diffusion/configs/train_colossalai_cifar10.yaml
index 0273ca862bf8..0ba06f832178 100644
--- a/examples/images/diffusion/configs/train_colossalai_cifar10.yaml
+++ b/examples/images/diffusion/configs/train_colossalai_cifar10.yaml
@@ -111,6 +111,7 @@ lightning:
         enable_distributed_storage: True
         placement_policy: cuda
         force_outputs_fp32: true
+        min_chunk_size: 64
 
     log_every_n_steps: 2
     logger: True
diff --git a/examples/images/diffusion/configs/train_pokemon.yaml b/examples/images/diffusion/configs/train_pokemon.yaml
deleted file mode 100644
index aadb5f2a0870..000000000000
--- a/examples/images/diffusion/configs/train_pokemon.yaml
+++ /dev/null
@@ -1,120 +0,0 @@
-model:
-  base_learning_rate: 1.0e-4
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    parameterization: "v"
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: txt
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False # we set this to false because this is an inference only config
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1.e-4 ]
-        f_min: [ 1.e-10 ]
-
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        use_checkpoint: True
-        use_fp16: True
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 32
-    wrap: False
-    train:
-      target: ldm.data.pokemon.PokemonDataset
-      # params:
-        # file_path: "/data/scratch/diffuser/laion_part0/"
-        # world_size: 1
-        # rank: 0
-
-lightning:
-  trainer:
-    accelerator: 'gpu'
-    devices: 1
-    log_gpu_memory: all
-    max_epochs: 2
-    precision: 16
-    auto_select_gpus: False
-    strategy:
-      target: strategies.ColossalAIStrategy
-      params:
-        use_chunk: True
-        enable_distributed_storage: True
-        placement_policy: cuda
-        force_outputs_fp32: true
-
-    log_every_n_steps: 2
-    logger: True
-    default_root_dir: "/tmp/diff_log/"
-    # profiler: pytorch
-
-  logger_config:
-    wandb:
-      target: loggers.WandbLogger
-      params:
-          name: nowname
-          save_dir: "/tmp/diff_log/"
-          offline: opt.debug
-          id: nowname
diff --git a/examples/images/diffusion/scripts/txt2img.sh b/examples/images/diffusion/scripts/txt2img.sh
index 549bb03a6885..53041cb8df6d 100755
--- a/examples/images/diffusion/scripts/txt2img.sh
+++ b/examples/images/diffusion/scripts/txt2img.sh
@@ -1,6 +1,5 @@
 python scripts/txt2img.py --prompt "Teyvat, Name:Layla, Element: Cryo, Weapon:Sword, Region:Sumeru, Model type:Medium Female, Description:a woman in a blue outfit holding a sword" --plms \
     --outdir ./output \
-    --config /home/lcmql/data2/Genshin/2022-11-18T16-38-46_train_colossalai_teyvattest/checkpoints/last.ckpt \
-    --ckpt /home/lcmql/data2/Genshin/2022-11-18T16-38-46_train_colossalai_teyvattest/configs/2022-11-18T16-38-46-project.yaml  \
+    --ckpt /tmp/2022-11-18T16-38-46_train_colossalai/checkpoints/last.ckpt \
+    --config /tmp/2022-11-18T16-38-46_train_colossalai/configs/2022-11-18T16-38-46-project.yaml  \
     --n_samples 4
-
diff --git a/examples/images/diffusion/train_colossalai.sh b/examples/images/diffusion/train_colossalai.sh
index 4223a69412fb..dcaeeb0c6595 100755
--- a/examples/images/diffusion/train_colossalai.sh
+++ b/examples/images/diffusion/train_colossalai.sh
@@ -1,5 +1,5 @@
-HF_DATASETS_OFFLINE=1 
-TRANSFORMERS_OFFLINE=1 
-DIFFUSERS_OFFLINE=1 
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
+DIFFUSERS_OFFLINE=1
 
-python main.py --logdir /tmp  -t -b /configs/train_colossalai.yaml
+python main.py --logdir /tmp -t -b configs/train_colossalai.yaml

From 6e0faa70e0fb15299b3bcb1e7f6418e2865ae049 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Tue, 31 Jan 2023 14:21:22 +0800
Subject: [PATCH 228/503] [gemini] add profiler in the demo (#2534)

---
 examples/language/gpt/gemini/commons/utils.py | 29 +++++++++++++++++++
 .../language/gpt/gemini/train_gpt_demo.py     | 18 ++++++++++--
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/examples/language/gpt/gemini/commons/utils.py b/examples/language/gpt/gemini/commons/utils.py
index 782f546dc26c..7bd098c1927c 100644
--- a/examples/language/gpt/gemini/commons/utils.py
+++ b/examples/language/gpt/gemini/commons/utils.py
@@ -1,4 +1,17 @@
+import time
+from contextlib import nullcontext
+
 import torch
+from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler
+
+
+class DummyProfiler:
+
+    def __init__(self):
+        self.step_number = 0
+
+    def step(self):
+        self.step_number += 1
 
 
 # Randomly Generated Data
@@ -10,3 +23,19 @@ def get_data(batch_size, seq_len, vocab_size):
 
 def get_tflops(model_numel, batch_size, seq_len, step_time):
     return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
+
+
+def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir):
+    if enable_flag:
+        return profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                       schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps),
+                       on_trace_ready=tensorboard_trace_handler(save_dir),
+                       record_shapes=True,
+                       profile_memory=True)
+    else:
+        return nullcontext(DummyProfiler())
+
+
+def get_time_stamp():
+    cur_time = time.strftime("%d-%H:%M", time.localtime())
+    return cur_time
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index ab8a65e625cf..f46226bce2b5 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -6,7 +6,7 @@
 import torch
 import torch.nn as nn
 from commons.model_zoo import model_builder
-from commons.utils import get_data, get_tflops
+from commons.utils import get_data, get_profile_context, get_tflops, get_time_stamp
 from packaging import version
 from torch.nn.parallel import DistributedDataParallel as DDP
 
@@ -201,7 +201,8 @@ def main():
 
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
-    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
+    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median"
+    PROF_FLAG = False    # The flag of profiling, False by default
 
     disable_existing_loggers()
     colossalai.launch_from_torch(config={})
@@ -292,7 +293,8 @@ def main():
     torch.cuda.synchronize()
     model.train()
     tflops_list = []
-    for n in range(NUM_STEPS):
+
+    def train_step():
         # we just use randomly generated data here
         input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
         optimizer.zero_grad()
@@ -331,6 +333,16 @@ def main():
         if n >= WARMUP_STEPS:
             tflops_list.append(step_tflops)
 
+    demo_profiler = get_profile_context(PROF_FLAG,
+                                        WARMUP_STEPS,
+                                        NUM_STEPS - WARMUP_STEPS,
+                                        save_dir=f"profile/{get_time_stamp()}-demo")
+
+    with demo_profiler as prof:
+        for n in range(NUM_STEPS):
+            train_step()
+            prof.step()
+
     tflops_list.sort()
     median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS
     logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")

From 63199c668792cf86f24e8583363e8625154ed9d5 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Tue, 31 Jan 2023 16:00:06 +0800
Subject: [PATCH 229/503] [autochunk] support transformer (#2526)

---
 colossalai/autochunk/autochunk_codegen.py     |   7 +-
 colossalai/autochunk/search_chunk.py          |  66 +---
 colossalai/autochunk/select_chunk.py          | 124 +++----
 colossalai/autochunk/trace_flow.py            | 112 +++---
 colossalai/autochunk/trace_indice.py          | 341 +++++++++++++-----
 colossalai/autochunk/utils.py                 |  50 ++-
 .../benchmark_simple_evoformer.py             |  94 -----
 .../test_alphafold/test_alphafold_utils.py    | 122 +++++++
 .../test_alphafold/test_evoformer_block.py    |  95 +++++
 .../test_alphafold/test_evoformer_stack.py    |  90 +++++
 .../test_alphafold/test_extramsa_block.py     |  96 +++++
 .../test_diffuser/test_diffuser_utils.py      | 120 ++++++
 .../test_autochunk/test_diffuser/test_unet.py |  70 ++++
 .../test_autochunk/test_evoformer_codegen.py  | 163 ---------
 .../test_evoformer_stack_codegen.py           | 163 ---------
 tests/test_autochunk/test_extramsa_codegen.py | 164 ---------
 .../test_simple_evoformer_codegen.py          | 104 ------
 .../test_simple_evoformer_search.py           |  97 -----
 .../test_transformer/test_autochunk_gpt.py    |  65 ++++
 .../test_transformer_utils.py                 | 123 +++++++
 20 files changed, 1198 insertions(+), 1068 deletions(-)
 delete mode 100644 tests/test_autochunk/benchmark_simple_evoformer.py
 create mode 100644 tests/test_autochunk/test_alphafold/test_alphafold_utils.py
 create mode 100644 tests/test_autochunk/test_alphafold/test_evoformer_block.py
 create mode 100644 tests/test_autochunk/test_alphafold/test_evoformer_stack.py
 create mode 100644 tests/test_autochunk/test_alphafold/test_extramsa_block.py
 create mode 100644 tests/test_autochunk/test_diffuser/test_diffuser_utils.py
 create mode 100644 tests/test_autochunk/test_diffuser/test_unet.py
 delete mode 100644 tests/test_autochunk/test_evoformer_codegen.py
 delete mode 100644 tests/test_autochunk/test_evoformer_stack_codegen.py
 delete mode 100644 tests/test_autochunk/test_extramsa_codegen.py
 delete mode 100644 tests/test_autochunk/test_simple_evoformer_codegen.py
 delete mode 100644 tests/test_autochunk/test_simple_evoformer_search.py
 create mode 100644 tests/test_autochunk/test_transformer/test_autochunk_gpt.py
 create mode 100644 tests/test_autochunk/test_transformer/test_transformer_utils.py

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 8c3155a60685..ddf64dc8ff49 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -3,9 +3,12 @@
 import torch
 
 import colossalai
+from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
 
-if CODEGEN_AVAILABLE:
+AUTOCHUNK_AVAILABLE = CODEGEN_AVAILABLE and is_compatible_with_meta()
+
+if AUTOCHUNK_AVAILABLE:
     from torch.fx.graph import (
         CodeGen,
         PythonCode,
@@ -272,7 +275,7 @@ def emit_code_with_chunk(
         node_idx += 1
 
 
-if CODEGEN_AVAILABLE:
+if AUTOCHUNK_AVAILABLE:
 
     class AutoChunkCodeGen(CodeGen):
 
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index a8619671268b..720f3d92553a 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -8,7 +8,13 @@
 from .select_chunk import SelectChunk
 from .trace_flow import TraceFlow
 from .trace_indice import TraceIndice
-from .utils import get_logger, get_node_shape, is_non_compute_node, is_non_compute_node_except_placeholder
+from .utils import (
+    find_chunk_compute_input_and_output_nodes,
+    get_logger,
+    get_node_shape,
+    is_non_compute_node,
+    is_non_compute_node_except_placeholder,
+)
 
 
 class SearchChunk(object):
@@ -114,6 +120,12 @@ def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_
             chunk_region_start (int)
             chunk_region_end (int)
         """
+        # check if peak node already in chunkinfo
+        if chunk_regions is not None:
+            for i in chunk_regions:
+                if i["region"][0] < peak_node_idx <= i["region"][1]:
+                    return None
+
         free_vars = self._get_free_var_idx()
         free_var_num = len(free_vars)
         active_node_num = [len(i) for i in active_node]
@@ -152,55 +164,6 @@ def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_
                     chunk_region_end = region[0] - 1
         return chunk_region_start, chunk_region_end
 
-    def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> List:
-        """
-        Find chunk info for a region.
-
-        We are given the region start and region end, and need to find out all chunk info for it.
-        We first loop every dim of start node and end node, to see if we can find dim pair,
-        which is linked in a flow and not computed.
-        If found, we then search flow in the whole region to find out all chunk infos.
-
-        Args:
-            input_trace (List): node's input trace in region
-            output_trace (List): node's output trace in region
-            start_idx (int): region start node index
-            end_idx (int): region end node index
-
-        Returns:
-            chunk_infos: possible regions found
-        """
-        start_traces = input_trace[start_idx]
-        end_trace = output_trace[end_idx]
-        end_node = self.trace_indice.node_list[end_idx]
-        chunk_infos = []
-        for end_dim, _ in enumerate(end_trace["indice"]):
-            if len(start_traces) > 1:
-                continue
-            for start_node, start_trace in start_traces.items():
-                for start_dim, _ in enumerate(start_trace["indice"]):
-                    # dim size cannot be 1
-                    if (get_node_shape(end_node)[end_dim] == 1 or get_node_shape(start_node)[start_dim] == 1):
-                        continue
-                    # must have users
-                    if len(end_node.users) == 0:
-                        continue
-                    # check index source align
-                    if not self.trace_flow.check_index_source(start_dim, start_node, start_idx, end_dim, end_node):
-                        continue
-                    # check index copmute
-                    if not self.trace_flow.check_index_compute(start_idx, end_dim, end_node, end_idx):
-                        continue
-                    # flow search
-                    chunk_info = self.trace_flow.flow_search(start_idx, start_dim, end_idx, end_dim)
-                    if chunk_info is None:
-                        continue
-                    # check index copmute
-                    if not self.trace_flow.check_index_duplicate(chunk_info):
-                        continue
-                    chunk_infos.append(chunk_info)
-        return chunk_infos
-
     def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Node) -> List:
         """
         Search every possible region within the max chunk region.
@@ -228,9 +191,8 @@ def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Nod
                 if is_non_compute_node(self.trace_indice.node_list[start_idx]) or is_non_compute_node(
                         self.trace_indice.node_list[end_idx]):
                     continue
-
                 # select free dim
-                chunk_info = self._find_chunk_info(input_trace, output_trace, start_idx, end_idx)
+                chunk_info = self.trace_flow.find_chunk_info(input_trace, output_trace, start_idx, end_idx)
                 if len(chunk_info) > 0:
                     possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index f0612e45a8e6..1f3a95727054 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -5,6 +5,7 @@
 
 
 class SelectChunk(object):
+
     def __init__(
         self,
         trace_indice: TraceIndice,
@@ -17,13 +18,11 @@ def __init__(
         self.reorder_graph = reorder_graph
         if max_memory is not None:
             self.stratge = "fit_memory"
-            self.max_memory = max_memory  # MB
+            self.max_memory = max_memory    # MB
         else:
             self.stratge = "min_memory"
 
-    def _select_best_chunk_region(
-        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
-    ):
+    def _select_best_chunk_region(self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak):
         if self.stratge == "min_memory":
             best_region = self._select_min_memory_chunk_region(
                 possible_chunk_regions,
@@ -44,9 +43,8 @@ def _select_best_chunk_region(
             raise RuntimeError()
         return best_region
 
-    def _select_fit_memory_chunk_region(
-        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
-    ):
+    def _select_fit_memory_chunk_region(self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region,
+                                        mem_peak):
         # stop chunk if max memory satisfy memory limit
         if max(mem_peak) < self.max_memory:
             return None
@@ -63,33 +61,26 @@ def _select_fit_memory_chunk_region(
         if len(possible_chunk_regions) == 0:
             return None
 
+        max_possible_chunk_region = (min([i["region"][0] for i in possible_chunk_regions]),
+                                     max([i["region"][1] for i in possible_chunk_regions]))
+
         # get mem for chunk region
         regions_dict = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
-            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(
-                self.trace_indice.node_list, cur_region
-            )
+            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(self.trace_indice.node_list, cur_region)
             cur_chunk_infos = chunk_infos + [cur_region]
-            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
-                cur_node_list, cur_chunk_infos
-            )[0]
-            cur_chunk_region_peak = cur_mem_peak[
-                max_chunk_region[0] : max_chunk_region[1] + 1
-            ]
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(cur_node_list, cur_chunk_infos)[0]
+            cur_chunk_region_peak = cur_mem_peak[max_possible_chunk_region[0]:max_possible_chunk_region[1] + 1]
             cur_chunk_region_max_peak = max(cur_chunk_region_peak)
             if cur_chunk_region_max_peak < self.max_memory:
-                regions_dict.append(
-                    {
-                        "chunk_info": region,
-                        "chunk_max_mem": cur_chunk_region_max_peak,
-                        "chunk_len": self._get_compute_node_num(
-                            region["region"][0], region["region"][1]
-                        ),
-                        "reorder_chunk_info": cur_region,
-                        "reorder_node_list": cur_node_list,
-                    }
-                )
+                regions_dict.append({
+                    "chunk_info": region,
+                    "chunk_max_mem": cur_chunk_region_max_peak,
+                    "chunk_len": self._get_compute_node_num(region["region"][0], region["region"][1]),
+                    "reorder_chunk_info": cur_region,
+                    "reorder_node_list": cur_node_list,
+                })
         # no region found
         if len(regions_dict) == 0:
             raise RuntimeError("Search failed. Try a larger memory threshold.")
@@ -113,20 +104,13 @@ def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
             chunk_size *= 2
             reorder_chunk_info["chunk_size"] = chunk_size
             cur_chunk_infos = chunk_infos + [reorder_chunk_info]
-            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
-                chunk_region_dict["reorder_node_list"], cur_chunk_infos
-            )[0]
-            cur_chunk_max_mem = max(
-                cur_mem_peak[
-                    reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1]
-                    + 1
-                ]
-            )
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(chunk_region_dict["reorder_node_list"],
+                                                                             cur_chunk_infos)[0]
+            cur_chunk_max_mem = max(cur_mem_peak[reorder_chunk_info["region"][0]:reorder_chunk_info["region"][1] + 1])
         # search exact size
         chunk_info = chunk_region_dict["chunk_info"]
-        chunk_info["chunk_size"] = self._chunk_size_binary_search(
-            chunk_size // 2, chunk_size, chunk_region_dict, chunk_infos
-        )
+        chunk_info["chunk_size"] = self._chunk_size_binary_search(chunk_size // 2, chunk_size, chunk_region_dict,
+                                                                  chunk_infos)
         return chunk_info
 
     def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos):
@@ -139,12 +123,9 @@ def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos)
             mid = int((left + right) / 2 + 0.5)
             chunk_info["chunk_size"] = mid
             cur_chunk_infos = chunk_infos + [chunk_info]
-            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
-                chunk_region_dict["reorder_node_list"], cur_chunk_infos
-            )[0]
-            cur_chunk_max_mem = max(
-                cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
-            )
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(chunk_region_dict["reorder_node_list"],
+                                                                             cur_chunk_infos)[0]
+            cur_chunk_max_mem = max(cur_mem_peak[chunk_info["region"][0]:chunk_info["region"][1] + 1])
             if cur_chunk_max_mem >= self.max_memory:
                 right = mid - gap
             else:
@@ -153,14 +134,13 @@ def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos)
 
     def _get_compute_node_num(self, start, end):
         count = 0
-        for i in self.trace_indice.node_list[start : end + 1]:
+        for i in self.trace_indice.node_list[start:end + 1]:
             if not is_non_compute_node(i):
                 count += 1
         return count
 
-    def _select_min_memory_chunk_region(
-        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
-    ):
+    def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region,
+                                        mem_peak):
         # remove illegal regions
         illegal_regions = []
         for i in possible_chunk_regions:
@@ -173,37 +153,31 @@ def _select_min_memory_chunk_region(
         if len(possible_chunk_regions) == 0:
             return None
 
+        # get max possible chunk region
+        max_possible_chunk_region = (min([i["region"][0] for i in possible_chunk_regions]),
+                                     max([i["region"][1] for i in possible_chunk_regions]))
+
         # get mem for chunk region
-        regions_dict = []
+        regions_dict_list = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
-            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(
-                self.trace_indice.node_list, cur_region
-            )
+            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(self.trace_indice.node_list, cur_region)
             cur_chunk_infos = chunk_infos + [cur_region]
-            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(
-                cur_node_list, cur_chunk_infos
-            )[0]
-            cur_chunk_region_peak = cur_mem_peak[
-                max_chunk_region[0] : max_chunk_region[1] + 1
-            ]
+            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(cur_node_list, cur_chunk_infos)[0]
+            cur_chunk_region_peak = cur_mem_peak[max_possible_chunk_region[0]:max_possible_chunk_region[1] + 1]
             cur_chunk_region_max_peak = max(cur_chunk_region_peak)
-            regions_dict.append(
-                {
-                    "chunk_info": region,
-                    "chunk_max_mem": cur_chunk_region_max_peak,
-                    "chunk_len": self._get_compute_node_num(
-                        region["region"][0], region["region"][1]
-                    ),
-                    "reorder_chunk_info": cur_region,
-                    "reorder_node_list": cur_node_list,
-                }
-            )
+            regions_dict_list.append({
+                "chunk_info": region,
+                "chunk_max_mem": cur_chunk_region_max_peak,
+                "chunk_len": self._get_compute_node_num(region["region"][0], region["region"][1]),
+                "reorder_chunk_info": cur_region,
+                "reorder_node_list": cur_node_list,
+            })
 
         # select the min mem
-        chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict]
+        chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict_list]
         best_region_idx = chunk_max_mem.index(min(chunk_max_mem))
-        best_region = regions_dict[best_region_idx]["chunk_info"]
+        best_region = regions_dict_list[best_region_idx]["chunk_info"]
         if best_region is not None:
             best_region["chunk_size"] = 1
         return best_region
@@ -216,9 +190,7 @@ def _is_legal_region(self, cur_chunk_info, chunk_infos):
             return False
         for i in chunk_infos:
             region = i["region"]
-            if not (
-                (chunk_region_start > region[1] and chunk_region_end > region[1])
-                or (chunk_region_start < region[0] and chunk_region_end < region[0])
-            ):
+            if not ((chunk_region_start > region[1] and chunk_region_end > region[1]) or
+                    (chunk_region_start < region[0] and chunk_region_end < region[0])):
                 return False
         return True
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 830b4629ec1e..df7343764d05 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -8,9 +8,9 @@
     find_chunk_compute_input_and_output_nodes,
     find_idx_by_name,
     flat_list,
+    get_node_name,
     get_node_shape,
     is_non_compute_node,
-    is_non_compute_node_except_placeholder,
 )
 
 
@@ -79,43 +79,6 @@ def _find_inherit_dim(self, input_node, input_dim, node):
                 return node_dim
         return None
 
-    def check_index_duplicate(self, chunk_infos, return_dim=False):
-        input_dim_after_node = {}
-        for input_node_idx, input_node in enumerate(chunk_infos["inputs"]):
-            for k, v in chunk_infos["inputs_dim"][input_node_idx].items():
-                inherit_dim = self._find_inherit_dim(input_node, v, self.trace_indice.node_list[k])
-                if inherit_dim:
-                    input_dim_after_node[k] = inherit_dim
-
-        for node in self.trace_indice.node_list[chunk_infos["region"][0]:chunk_infos["region"][1] + 1]:
-            if is_non_compute_node_except_placeholder(node):
-                continue
-            count = 0
-            duplicate_dims = []
-            node_trace_source = self.trace_indice._find_source_trace_from_node(node)
-            for node_dim in range(len(get_node_shape(node))):
-                duplicate_dim = []
-                duplicate_flag = False
-                dim_source = node_trace_source[node_dim]
-                for k, v in dim_source.items():
-                    if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]:
-                        if k in input_dim_after_node and input_dim_after_node[k] in v:
-                            duplicate_flag = True
-                            duplicate_dim.append((k, v))
-                duplicate_dims.append(duplicate_dim)
-                if duplicate_flag:
-                    count += 1
-
-            if count > 1:
-                if return_dim:
-                    return False, duplicate_dims
-                else:
-                    return False
-        if return_dim:
-            return True, None
-        else:
-            return True
-
     def _assgin_single_node_flow(
         self,
         arg_node: Node,
@@ -225,9 +188,12 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                     if flow_flag == False:
                         return None
 
-                if len(arg_list) == 2:
-                    if any(i in cur_node.name for i in ["add", "mul", "truediv"]):
+                if len(arg_list) >= 2:
+                    # need to mark fix dim
+                    if any(i == get_node_name(cur_node) for i in ["add", "mul", "truediv", "sub", "where"]):
                         for arg in arg_list:
+                            if get_node_shape(arg) is None:
+                                continue
                             if not (start_idx <= find_idx_by_name(arg.name, self.trace_indice.node_list) < end_idx):
                                 continue
                             arg_chunk_dim = all_node_info[arg]["chunk_dim"]
@@ -240,9 +206,8 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                                         return None
                                     if i not in arg_fix_dim:
                                         arg_fix_dim.append(i)
-                    elif "einsum" in cur_node.name:
-                        pass
-                    elif "matmul" in cur_node.name:
+                    elif any(i == get_node_name(cur_node)
+                             for i in ["einsum", "matmul", "view", "to", "getitem", "tensor", "type"]):
                         pass
                     else:
                         raise NotImplementedError()
@@ -426,7 +391,7 @@ def _reassgin_reshape_size(self, chunk_info):
         reshape_size = {}
         chunk_shape = get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]]
         for node in self.trace_indice.node_list[chunk_region[0]:chunk_region[1] + 1]:
-            if any(i in node.name for i in ["reshape", "view"]):
+            if any(i == get_node_name(node) for i in ["reshape", "view"]):
                 reshape_args = flat_list(node.args[1:])
                 chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
                 new_shape = ""
@@ -443,3 +408,62 @@ def _reassgin_reshape_size(self, chunk_info):
                 reshape_size[node.name] = [origin_shape, new_shape]
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
+
+    def find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> List:
+        """
+        Find chunk info for a region.
+
+        We are given the region start and region end, and need to find out all chunk info for it.
+        We first loop every dim of start node and end node, to see if we can find dim pair,
+        which is linked in a flow and not computed.
+        If found, we then search flow in the whole region to find out all chunk infos.
+
+        Args:
+            input_trace (List): node's input trace in region
+            output_trace (List): node's output trace in region
+            start_idx (int): region start node index
+            end_idx (int): region end node index
+
+        Returns:
+            chunk_infos: possible regions found
+        """
+        start_traces = input_trace[start_idx]
+        if len(start_traces) > 1:    # TODO need to be removed
+            return []
+        end_trace = output_trace[end_idx]
+        end_node = self.trace_indice.node_list[end_idx]
+
+        chunk_infos = []
+        for end_dim, _ in enumerate(end_trace["indice"]):
+            for start_node, start_trace in start_traces.items():
+                for start_dim, _ in enumerate(start_trace["indice"]):
+                    if not self._check_region_start_end(start_node, start_dim, start_idx, end_node, end_dim, end_idx):
+                        continue
+                    # flow search
+                    chunk_info = self.flow_search(start_idx, start_dim, end_idx, end_dim)
+                    if chunk_info is None:
+                        continue
+                    chunk_infos.append(chunk_info)
+        return chunk_infos
+
+    def _check_region_start_end(self, start_node: Node, start_dim: int, start_idx: int, end_node: Node, end_dim: int,
+                                end_idx: int) -> bool:
+        """
+        check if region start and end are legal
+        """
+        # dim cannot be None
+        if (get_node_shape(end_node) is None or get_node_shape(start_node) is None):
+            return False
+        # dim size cannot be 1
+        if (get_node_shape(end_node)[end_dim] == 1 or get_node_shape(start_node)[start_dim] == 1):
+            return False
+        # must have users
+        if len(end_node.users) == 0:
+            return False
+        # check index source align
+        if not self.check_index_source(start_dim, start_node, start_idx, end_dim, end_node):
+            return False
+        # check index compute
+        if not self.check_index_compute(start_idx, end_dim, end_node, end_idx):
+            return False
+        return True
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 827f60d8b53d..8f517cf2cdeb 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -3,7 +3,14 @@
 
 from torch.fx.node import Node
 
-from .utils import find_first_tensor_arg, find_idx_by_name, flat_list, get_node_shape
+from .utils import (
+    find_first_tensor_arg,
+    find_idx_by_name,
+    flat_list,
+    get_module_node_name,
+    get_node_name,
+    get_node_shape,
+)
 
 
 class TraceIndice(object):
@@ -36,7 +43,7 @@ def __init__(self, node_list: List[Node]) -> None:
         self.trace_range = []
         self.active_node_list = []
 
-    def _init_indice_trace_list(self):
+    def _init_indice_trace_list(self) -> List:
         indice_trace_list = []
         for n in self.node_list:
             if get_node_shape(n) != None:
@@ -54,7 +61,7 @@ def set_trace_range(self, trace_range: List, active_node_list: List) -> None:
         self.trace_range = trace_range
         self.active_node_list = active_node_list
 
-    def _add_indice(self):
+    def _add_indice(self) -> int:
         """
         Update the count and return it. To record the idx number.
 
@@ -64,39 +71,30 @@ def _add_indice(self):
         self.indice_count += 1
         return self.indice_count
 
-    def _del_dim(self, idx, dim_idx):
+    def _del_dim(self, idx: int, dim_idx: int) -> None:
+        """
+        delete a dim for indice, compute and source
+        """
         self.indice_trace_list[idx]["indice"].pop(dim_idx)
         self.indice_trace_list[idx]["compute"].pop(dim_idx)
         self.indice_trace_list[idx]["source"].pop(dim_idx)
 
-    def _add_dim(self, node_idx, dim_idx):
+    def _add_dim(self, node_idx: int, dim_idx: int) -> None:
+        """
+        add a dim for indice, compute and source
+        """
         self.indice_trace_list[node_idx]["indice"].insert(dim_idx, self._add_indice())
         self.indice_trace_list[node_idx]["compute"].insert(dim_idx, [])
         self.indice_trace_list[node_idx]["source"].insert(dim_idx, {})
 
-    def _transform_indice(self, node, node_dim):
-        node_idx = self._find_indice_trace_from_node(node)
-        dims = list(range(len(node_idx)))
-        return dims[node_dim]
-
-    def _inherit_indice(self, node_from, node_from_dim, node_to, node_to_dim):
-        node_from_dim = self._transform_indice(node_from, node_from_dim)
-        node_to_dim = self._transform_indice(node_to, node_to_dim)
-        node_from_trace = self._find_trace_from_node(node_from)
-        node_to_trace = self._find_trace_from_node(node_to)
-        node_to_trace["indice"][node_to_dim] = node_from_trace["indice"][node_from_dim]
-        node_to_trace["compute"][node_to_dim] = copy.deepcopy(node_from_trace["compute"][node_from_dim])
-        self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True)
-
-    def _inherit_all_computation(self, node_from, node_to):
-        node_from_compute = self._find_compute_trace_from_node(node_from)
-        node_to_compute = self._find_compute_trace_from_node(node_to)
-        assert len(node_from_compute) == len(node_to_compute)
-        for i in range(len(node_from_compute)):
-            self._add_source(node_from, i, node_to, i)
-            node_to_compute[i] = copy.deepcopy(node_from_compute[i])
-
-    def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False):
+    def _add_source(
+        self,
+        node_from: Node,
+        node_from_dim: int,
+        node_to: Node,
+        node_to_dim: int,
+        init=False,
+    ) -> None:
         node_from_dim = self._transform_indice(node_from, node_from_dim)
         node_from_trace_source = self._find_source_trace_from_node(node_from)
         node_to_dim = self._transform_indice(node_to, node_to_dim)
@@ -119,7 +117,50 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False
                     if d not in node_to_trace_source[node_to_dim][node_idx]:
                         node_to_trace_source[node_to_dim][node_idx].append(d)
 
-    def _mark_computation_from_node(self, node_from, node_to, exclude=None):
+    def _transform_indice(self, node: Node, node_dim: int) -> int:
+        node_idx = self._find_indice_trace_from_node(node)
+        dims = list(range(len(node_idx)))
+        return dims[node_dim]
+
+    def _inherit_indice(
+        self,
+        node_from: Node,
+        node_from_dim: int,
+        node_to: Node,
+        node_to_dim: int,
+        init: bool = True,
+    ) -> None:
+        """
+        node_to's node_to_dim inherit node_from's node_from_dim by indice, compute and source
+        """
+        node_from_dim = self._transform_indice(node_from, node_from_dim)
+        node_to_dim = self._transform_indice(node_to, node_to_dim)
+        node_from_trace = self._find_trace_from_node(node_from)
+        node_to_trace = self._find_trace_from_node(node_to)
+        if init:
+            node_to_trace["indice"][node_to_dim] = node_from_trace["indice"][node_from_dim]
+            node_to_trace["compute"][node_to_dim] = copy.deepcopy(node_from_trace["compute"][node_from_dim])
+        else:
+            for j in node_from_trace["compute"][node_from_dim]:
+                if j not in node_to_trace["compute"][node_to_dim]:
+                    node_to_trace["compute"][node_to_dim].append(j)
+        self._add_source(node_from, node_from_dim, node_to, node_to_dim, init)
+
+    def _inherit_all_indice(self, node_from: Node, node_to: Node) -> None:
+        """
+        inherit all dims with init
+        """
+        # find indice just for assert length
+        node_from_indice = self._find_indice_trace_from_node(node_from)
+        node_to_indice = self._find_indice_trace_from_node(node_to)
+        assert len(node_from_indice) == len(node_to_indice)
+        for i in range(len(node_from_indice)):
+            self._inherit_indice(node_from, i, node_to, i, init=True)
+
+    def _inherit_more_indice_from_node(self, node_from: Node, node_to: Node, exclude: List = None) -> None:
+        """
+        inherit indice from node without init
+        """
         if exclude == None:
             exclude = []
         else:
@@ -130,12 +171,9 @@ def _mark_computation_from_node(self, node_from, node_to, exclude=None):
         for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1):
             if self._transform_indice(node_to, i) in exclude:
                 continue
-            self._add_source(node_from, i, node_to, i)
-            for j in node_from_compute[i]:
-                if j not in node_to_compute[i]:
-                    node_to_compute[i].append(j)
+            self._inherit_indice(node_from, i, node_to, i, init=False)
 
-    def _mark_computation(self, node, idx, dim):
+    def _mark_computation(self, node: Node, idx: int, dim: int) -> None:
         """
         Mark some dims of node as computed.
 
@@ -152,7 +190,7 @@ def _mark_computation(self, node, idx, dim):
             if idx not in self.indice_trace_list[idx]["compute"][cur_dim]:
                 self.indice_trace_list[idx]["compute"][cur_dim].append(idx)
 
-    def _find_trace_from_node(self, node):
+    def _find_trace_from_node(self, node: Node) -> Dict:
         """
         Find node idx and compute trace by the node.
 
@@ -166,7 +204,7 @@ def _find_trace_from_node(self, node):
         node_dict = self.indice_trace_list[node_idx]
         return node_dict
 
-    def _find_source_trace_from_node(self, node):
+    def _find_source_trace_from_node(self, node: Node) -> List:
         """
         Find node source trace by the node.
 
@@ -180,7 +218,7 @@ def _find_source_trace_from_node(self, node):
         node_dict = self.indice_trace_list[node_idx]
         return node_dict["source"]
 
-    def _find_indice_trace_from_node(self, node):
+    def _find_indice_trace_from_node(self, node) -> List:
         """
         Find node idx trace by the node.
 
@@ -192,7 +230,7 @@ def _find_indice_trace_from_node(self, node):
         node_idx = find_idx_by_name(node.name, self.node_list)
         return self.indice_trace_list[node_idx]["indice"]
 
-    def _find_compute_trace_from_node(self, node):
+    def _find_compute_trace_from_node(self, node: Node) -> List:
         """
         Find node compute trace by the node.
 
@@ -204,7 +242,7 @@ def _find_compute_trace_from_node(self, node):
         node_idx = find_idx_by_name(node.name, self.node_list)
         return self.indice_trace_list[node_idx]["compute"]
 
-    def _assign_indice_as_input(self, node: Node, node_idx: int, input_node=None):
+    def _assign_indice_as_input(self, node: Node, node_idx: int, input_node=None) -> None:
         """
         Assign node's trace as its input node.
 
@@ -214,15 +252,9 @@ def _assign_indice_as_input(self, node: Node, node_idx: int, input_node=None):
         """
         if input_node == None:
             input_node = find_first_tensor_arg(node)
-        input_node_idx = find_idx_by_name(input_node.name, self.node_list)
-        input_node_idx_trace = self.indice_trace_list[input_node_idx]["indice"]
+        self._inherit_all_indice(input_node, node)
 
-        new_idx_trace = copy.deepcopy(input_node_idx_trace)
-        self.indice_trace_list[node_idx]["indice"] = new_idx_trace
-
-        self._inherit_all_computation(input_node, node)
-
-    def _assign_all_indice(self, node: Node, node_idx: int):
+    def _assign_all_indice(self, node: Node, node_idx: int) -> None:
         """
         Add new indice for all node's dims.
 
@@ -238,7 +270,7 @@ def _assign_all_indice(self, node: Node, node_idx: int):
             new_trace.append(self._add_indice())
         self.indice_trace_list[node_idx]["indice"] = new_trace
 
-    def _assign_transpose_indice(self, node: Node, node_idx: int):
+    def _assign_transpose_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for transpose op.
         1. swap input's dim according to transpose args
@@ -255,7 +287,7 @@ def _assign_transpose_indice(self, node: Node, node_idx: int):
         self._inherit_indice(input_node, tranpose_dim[1], node, tranpose_dim[0])
         self._inherit_indice(input_node, tranpose_dim[0], node, tranpose_dim[1])
 
-    def _assign_permute_indice(self, node: Node, node_idx: int):
+    def _assign_permute_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for permute op.
         1. swap input's dim according to permute args
@@ -272,7 +304,7 @@ def _assign_permute_indice(self, node: Node, node_idx: int):
         for idx, d in enumerate(permute_dim):
             self._inherit_indice(input_node, d, node, idx)
 
-    def _assign_linear_indice(self, node: Node, node_idx: int):
+    def _assign_linear_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for linear op.
         1. copy trace from input node and change last indice accroding to weight
@@ -293,7 +325,23 @@ def _assign_linear_indice(self, node: Node, node_idx: int):
 
         self._mark_computation(node, node_idx, [-1])
 
-    def _assign_matmul_indice(self, node: Node, node_idx: int):
+    def _assign_addmm_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for addmm op, whose args are unpacked as (bias, input, weight).
+
+        Args:
+            node (node): the addmm node
+            node_idx (int): index of the node in the node list
+        """
+        bias, input_node, weight = node.args
+        # pass the matrix input explicitly; the default would pick the first tensor arg (the bias)
+        self._assign_indice_as_input(node, node_idx, input_node)
+        self._inherit_indice(weight, 1, node, -1)
+        self._inherit_indice(bias, -1, node, -1)
+        # the last dim of the output is marked as computed by this node
+        self._mark_computation(node, node_idx, [-1])
+
+    def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for matmul op.
         1. copy trace from matmul_left and change last indice accroding to matmul_right. (assert they have same length)
@@ -310,7 +358,7 @@ def _assign_matmul_indice(self, node: Node, node_idx: int):
         self._assign_indice_as_input(node, node_idx, matmul_left)
         self._inherit_indice(matmul_right, -1, node, -1)
 
-        self._mark_computation_from_node(matmul_right, node, [-1, -2])
+        self._inherit_more_indice_from_node(matmul_right, node, [-1, -2])
         self._mark_computation(node, node_idx, [-1])
 
     def _assign_layernorm_indice(self, node, idx):
@@ -341,14 +389,13 @@ def _assign_elementwise_indice(self, node, idx):
         for node_in in node.args:
             if type(node_in) == type(node):
                 nodes_in.append(node_in)
-                self._mark_computation_from_node(node_in, node)
-        assert len(nodes_in) <= 2
+                self._inherit_more_indice_from_node(node_in, node)
 
     def _assgin_no_change_indice(self, node, idx):
         self._assign_indice_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) == type(node):
-                self._mark_computation_from_node(node_in, node)
+                self._inherit_more_indice_from_node(node_in, node)
 
     def _assign_einsum_indice(self, node, idx):
         """
@@ -365,7 +412,7 @@ def _assign_einsum_indice(self, node, idx):
         left, right = patterns.split("->")
         left = left.split(",")
 
-        if '...' in right:
+        if "..." in right:
             replace_list = "!@#$%^&*"
             target_len = len(get_node_shape(node))
             add_len = target_len - len(right) + 3
@@ -399,24 +446,22 @@ def _assign_softmax_indice(self, node, idx):
         self._assign_indice_as_input(node, idx)
         self._mark_computation(node, idx, [node.kwargs["dim"]])
 
-    def _assign_unsqueeze_indice(self, node: Node, node_idx: int):
+    def _assign_split_indice(self, node: Node, node_idx: int) -> None:
         """
-        Assign indice for unsqueeze op.
-        1. assign new indice for unsqueeze dim
+        Assign indice for split op.
 
         Args:
             node (node)
             node_idx (int)
         """
-        self._del_dim(node_idx, -1)
+        for _ in range(len(get_node_shape(node.args[0]))):
+            self._add_dim(node_idx, 0)
         self._assign_indice_as_input(node, node_idx)
-        dim_idx = node.args[1]
-        # unsqueeze(-1) = unsqueeze(shape_num + 1)
-        if dim_idx < 0:
-            dim_idx = list(range(len(get_node_shape(node))))[dim_idx]
+        dim_idx = node.kwargs["dim"]
+        self._del_dim(node_idx, dim_idx)
         self._add_dim(node_idx, dim_idx)
 
-    def _assign_dropout_indice(self, node: Node, node_idx: int):
+    def _assign_unsqueeze_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for unsqueeze op.
         1. assign new indice for unsqueeze dim
@@ -425,9 +470,15 @@ def _assign_dropout_indice(self, node: Node, node_idx: int):
             node (node)
             node_idx (int)
         """
+        self._del_dim(node_idx, -1)
         self._assign_indice_as_input(node, node_idx)
+        dim_idx = node.args[1]
+        # unsqueeze(-1) = unsqueeze(shape_num + 1)
+        if dim_idx < 0:
+            dim_idx = list(range(len(get_node_shape(node))))[dim_idx]
+        self._add_dim(node_idx, dim_idx)
 
-    def _assign_ones_like_indice(self, node: Node, node_idx: int):
+    def _assign_ones_like_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for oneslike op.
         1. assign new indice for all dim
@@ -438,7 +489,7 @@ def _assign_ones_like_indice(self, node: Node, node_idx: int):
         """
         self._assign_all_indice(node, node_idx)
 
-    def _assign_cat_indice(self, node: Node, node_idx: int):
+    def _assign_cat_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for cat op.
 
@@ -449,12 +500,12 @@ def _assign_cat_indice(self, node: Node, node_idx: int):
         nodes_in = flat_list(node.args[0])
         self._assign_indice_as_input(node, node_idx, input_node=nodes_in[0])
         for n in nodes_in[1:]:
-            self._mark_computation_from_node(n, node)
+            self._inherit_more_indice_from_node(n, node)
         cat_dim = node.kwargs["dim"]
         self._del_dim(node_idx, cat_dim)
         self._add_dim(node_idx, cat_dim)
 
-    def _assign_sum_indice(self, node: Node, node_idx: int):
+    def _assign_sum_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for sum op.
 
@@ -466,11 +517,46 @@ def _assign_sum_indice(self, node: Node, node_idx: int):
         self._add_dim(node_idx, 0)
         self._assign_indice_as_input(node, node_idx, input_node=nodes_in[0])
         for n in nodes_in[1:]:
-            self._mark_computation_from_node(n, node)
+            self._inherit_more_indice_from_node(n, node)
         cat_dim = node.kwargs["dim"]
         self._del_dim(node_idx, cat_dim)
 
-    def _assign_getitem_indice(self, node: Node, node_idx: int):
+    def _assign_arange_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for arange op by delegating to _assign_all_indice (new indice for every dim).
+
+        Args:
+            node (node): the arange node
+            node_idx (int): index of the node in the node list
+        """
+        self._assign_all_indice(node, node_idx)
+
+    def _assign_tensor_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for tensor op. Only nodes with an empty shape are accepted
+        (nothing is assigned for them); any other shape raises NotImplementedError.
+
+        Args:
+            node (node): the tensor node
+            node_idx (int): index of the node (not used here)
+        """
+        shape = get_node_shape(node)
+        if len(shape) != 0:
+            raise NotImplementedError()
+
+    def _assign_embedding_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for embedding op.
+        Inherits the input's trace, then adds a dim at -1 (presumably the embedding dim - TODO confirm).
+        Args:
+            node (node): the embedding module node
+            node_idx (int): index of the node in the node list
+        """
+        self._del_dim(node_idx, -1)
+        self._assign_indice_as_input(node, node_idx)
+        self._add_dim(node_idx, -1)
+
+    def _assign_getitem_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for getitem.
         getitem can act like slice sometimes
@@ -480,6 +566,19 @@ def _assign_getitem_indice(self, node: Node, node_idx: int):
             node_idx (int)
         """
         node_args = flat_list(node.args[1:])
+
+        # deal with split
+        if get_node_name(node.args[0]) == "split":
+            self._assign_indice_as_input(node, node_idx)
+            self._del_dim(node_idx, node.args[0].kwargs["dim"])
+            self._add_dim(node_idx, node.args[0].kwargs["dim"])
+            return
+
+        # skip non tensor
+        if get_node_shape(node) is None:
+            return
+
+        # find if slice
         flag = False
         for node_arg in node_args:
             node_arg_str = str(node_arg)
@@ -528,7 +627,7 @@ def _assign_getitem_indice(self, node: Node, node_idx: int):
             else:
                 raise NotImplementedError()
 
-    def _assign_view_reshape_indice(self, node: Node, node_idx: int):
+    def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for view and reshape op.
         1. get origin shape and target shape by meta info.
@@ -536,7 +635,7 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int):
         3. determine changed dim, and assgin indice for generated dim.
         4. log changed dim and generated dim for restore
         5. inherit computation.
-        6. TODO: look into view list to see whether the view is associated with other,
+        6. look into view list to see whether the view is associated with other,
            if so assgin equal dim according to previous view.
 
         Args:
@@ -552,7 +651,7 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int):
             if isinstance(unflated_args[i], int):
                 target_shape.append(unflated_args[i])
             else:
-                target_shape.append(unflated_args[i].meta["fwd_out"][0])
+                target_shape.extend(unflated_args[i].meta["fwd_out"])
 
         # compute the value of -1
         if -1 in target_shape:
@@ -579,17 +678,36 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int):
             dim_from = [dim_equal.index(False)]
             dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
             self._del_dim(node_idx, -1)
+        elif len_diff == 0:
+            # dim equal
+            dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])]
+            dim_from = []
+            dim_to = []
         else:
             raise NotImplementedError("shape" + str(origin_shape) + "and" + str(target_shape) + "view not implemented")
 
         # get new indice
         origin_trace = self._find_indice_trace_from_node(origin_node)
         self._assign_indice_as_input(node, node_idx, origin_node)
+        idx_from = [origin_trace[i] for i in dim_from]
         dim_from.reverse()
         for i in dim_from:
             self._del_dim(node_idx, i)
         for i in dim_to:
             self._add_dim(node_idx, i)
+        dim_from.reverse()
+
+        # search view list
+        for view_node, view_dict in self.indice_view_list.items():
+            if (view_dict["idx_to"] == idx_from and view_dict["dim_to"] == dim_from
+                    and view_dict["dim_from"] == dim_to):
+                # inherit indice from current node
+                for dim_to_i in dim_to:
+                    for dim_from_i in dim_from:
+                        self._inherit_indice(origin_node, dim_from_i, node, dim_to_i, init=False)
+                # inherit indice from input node of last view
+                for dim_to_i in dim_to:
+                    self._inherit_indice(view_node.args[0], dim_to_i, node, dim_to_i, init=False)
 
         # inherit computation
         compute_log = self._find_compute_trace_from_node(origin_node)
@@ -630,7 +748,7 @@ def _clear_trace(self, node_idx: int) -> None:
             # clear compute
             for dim_compute in trace["compute"]:
                 for i in range(len(dim_compute) - 1, -1, -1):
-                    if dim_compute[i] < trace_range[0] and dim_compute[i] not in active_nodes:
+                    if (dim_compute[i] < trace_range[0] and dim_compute[i] not in active_nodes):
                         dim_compute.pop(i)
                 continue
             # clear source
@@ -639,59 +757,82 @@ def _clear_trace(self, node_idx: int) -> None:
                     if k < trace_range[0] and k not in active_nodes:
                         dim_source.pop(k)
 
-    def trace_indice(self):
+    def trace_indice(self) -> None:
         for idx, node in enumerate(self.node_list):
+            node_name = get_node_name(node)
             if node.op == "placeholder":
                 self._assign_all_indice(node, idx)
             elif node.op == "call_method":
-                if "transpose" in node.name:
+                if "transpose" == node_name:
                     self._assign_transpose_indice(node, idx)
-                elif "permute" in node.name:
+                elif "permute" == node_name:
                     self._assign_permute_indice(node, idx)
-                elif "view" in node.name or "reshape" in node.name:
+                elif "view" == node_name or "reshape" == node_name:
                     self._assign_view_reshape_indice(node, idx)
-                elif "unsqueeze" in node.name:
+                elif "unsqueeze" == node_name:
                     self._assign_unsqueeze_indice(node, idx)
-                elif any(i in node.name for i in ["to", "contiguous", "clone"]):
+                elif "split" == node_name:
+                    self._assign_split_indice(node, idx)
+                elif any(i == node_name for i in ["to", "contiguous", "clone", "type"]):
                     self._assgin_no_change_indice(node, idx)
-                elif "new_ones" in node.name:
+                elif "new_ones" == node_name:
                     self._assign_ones_like_indice(node, idx)
+                elif any(i == node_name for i in ["size"]):
+                    continue
                 else:
-                    raise NotImplementedError(node.name, "method not implemented yet!")
+                    raise NotImplementedError(node_name, "method not implemented yet!")
             elif node.op == "call_function":
-                if "linear" in node.name:
+                if "linear" == node_name:
                     self._assign_linear_indice(node, idx)
-                elif "cat" in node.name:
+                elif "cat" == node_name:
                     self._assign_cat_indice(node, idx)
-                elif "matmul" in node.name:
+                elif "matmul" == node_name:
                     self._assign_matmul_indice(node, idx)
-                elif "softmax" in node.name:
+                elif "softmax" == node_name:
                     self._assign_softmax_indice(node, idx)
-                elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu", "sub", "truediv"]):
+                elif any(n == node_name for n in [
+                        "mul",
+                        "add",
+                        "sigmoid",
+                        "relu",
+                        "sub",
+                        "truediv",
+                        "pow",
+                        "dropout",
+                        "where",
+                        "tanh",
+                ]):
                     self._assign_elementwise_indice(node, idx)
-                elif "ones_like" in node.name:
+                elif "ones_like" == node_name:
                     self._assign_ones_like_indice(node, idx)
-                elif "dropout" in node.name:
-                    self._assign_dropout_indice(node, idx)
-                elif "einsum" in node.name:
+                elif "einsum" == node_name:
                     self._assign_einsum_indice(node, idx)
-                elif "sum" in node.name:
+                elif "sum" == node_name:
                     self._assign_sum_indice(node, idx)
-                elif "layer_norm" in node.name:
+                elif "layer_norm" == node_name:
                     self._assign_layernorm_indice(node, idx)
-                elif "getitem" in node.name:
+                elif "getitem" == node_name:
                     self._assign_getitem_indice(node, idx)
-                elif any(i in node.name for i in ["getattr", "getitem", "eq", "_assert"]):
+                elif "addmm" == node_name:
+                    self._assign_addmm_indice(node, idx)
+                elif "arange" == node_name:
+                    self._assign_arange_indice(node, idx)
+                elif "tensor" == node_name:
+                    self._assign_arange_indice(node, idx)
+                elif any(i == node_name for i in ["getattr", "eq", "_assert_is_none", "_assert", "finfo"]):
                     continue
                 else:
-                    raise NotImplementedError(node.name, "function not implemented yet!")
+                    raise NotImplementedError(node_name, "function not implemented yet!")
             elif node.op == "call_module":
-                if any(n in node.name for n in ["layernorm", "norm"]):
+                node_name = get_module_node_name(node)
+                if "layernorm" == node_name:
                     self._assign_layernorm_indice(node, idx)
-                elif any(n in node.name for n in ["sigmoid", "dropout", "relu"]):
+                elif "embedding" == node_name:
+                    self._assign_embedding_indice(node, idx)
+                elif any(n == node_name for n in ["sigmoid", "dropout", "relu"]):
                     self._assign_elementwise_indice(node, idx)
                 else:
-                    raise NotImplementedError(node.name, "module not implemented yet!")
+                    raise NotImplementedError(node_name, "module not implemented yet!")
             elif node.op == "get_attr":
                 self._assign_all_indice(node, idx)    # get param
             elif node.op == "output":
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index e870685122e3..de081b41c26e 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -1,13 +1,15 @@
-from typing import Any, Callable, Dict, Iterable, List, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
 
 from torch.fx.node import Node
 
 from colossalai.logging import get_dist_logger
 
+NON_COMPUTE_OP = ["placeholder", "get_attr", "output"]
+NON_COMPUTE_NAME = ["getattr", "eq", "_assert_is_none", "_assert", "finfo", "size"]
 logger = get_dist_logger()
 
 
-def get_logger():
+def get_logger() -> Any:
     return logger
 
 
@@ -37,7 +39,7 @@ def find_first_tensor_arg(node: Node) -> Node:
 
 
 def is_non_compute_node(node: Node) -> bool:
-    if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any(i in node.name for i in ["getattr"]):
+    if any(i == node.op for i in NON_COMPUTE_OP) or any(i == get_node_name(node) for i in NON_COMPUTE_NAME):
         return True
     if "getitem" in node.name:
         node_args = flat_list(node.args[1:])
@@ -64,33 +66,33 @@ def is_non_memory_node(node: Node) -> bool:
     return is_non_compute_node(node)
 
 
-def is_non_compute_node_except_placeholder(node):
+def is_non_compute_node_except_placeholder(node: Node) -> bool:
     if "placeholder" in node.op:
         return False
     return is_non_compute_node(node)
 
 
-def is_non_compute_node_except_placeholder_output(node):
+def is_non_compute_node_except_placeholder_output(node: Node) -> bool:
     if "output" in node.op:
         return False
     return is_non_compute_node_except_placeholder(node)
 
 
-def find_idx_by_name(name, nodes_list):
+def find_idx_by_name(name: str, nodes_list: List) -> int:
     for idx, node in enumerate(nodes_list):
         if node.name == name:
             return idx
     raise RuntimeError("name %s not found in node list" % name)
 
 
-def delete_free_var_from_last_use(user_to_last_uses):
+def delete_free_var_from_last_use(user_to_last_uses: Dict) -> None:
     for key, value in user_to_last_uses.items():
         for n in value:
             if n.op == "placeholder":
                 user_to_last_uses[key].remove(n)
 
 
-def find_chunk_all_input_nodes(nodes: List[Node]):
+def find_chunk_all_input_nodes(nodes: List[Node]) -> List:
     """
     Find non-compute input and output node names.
     input nodes are nodes used in the list
@@ -104,7 +106,7 @@ def find_chunk_all_input_nodes(nodes: List[Node]):
     return input_nodes
 
 
-def find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
+def find_chunk_compute_input_and_output_nodes(nodes: List[Node]) -> Union[List, List]:
     """
     Find non-compute input and output node names.
     input nodes are nodes used in the list
@@ -130,3 +132,33 @@ def find_chunk_compute_input_and_output_nodes(nodes: List[Node]):
                 output_nodes.append(node)
 
     return input_nodes, output_nodes
+
+
+def get_module_node_name(node: Node) -> str:
+    """
+    Return the lower-cased class name of the submodule that a call_module node targets,
+    e.g. "layernorm" for torch.nn.LayerNorm.
+    """
+    module = node.graph.owning_module
+    # node.target is a dotted attribute path; walk it one attribute at a time
+    for attr in node.target.split("."):
+        module = getattr(module, attr)
+    # __name__ avoids slicing the "<class '...'>" repr, which breaks for classes without a module prefix
+    return module.__class__.__name__.lower()
+
+
+def get_node_name(node: Node) -> str:
+    """
+    Return node.name with one trailing "_<digits>" suffix removed, so that e.g.
+    "add_1" becomes "add". A bare trailing "_" is removed as well. Names whose
+    part after the last "_" contains anything but digits (e.g. "layer_norm")
+    are returned unchanged.
+    """
+    digits = "0123456789"
+    full_name = node.name
+    # split at the last underscore; separator is "" when the name has none
+    base, separator, suffix = full_name.rpartition("_")
+    suffix_is_numeric = all(ch in digits for ch in suffix)
+    if separator and suffix_is_numeric:
+        return base
+    return full_name
diff --git a/tests/test_autochunk/benchmark_simple_evoformer.py b/tests/test_autochunk/benchmark_simple_evoformer.py
deleted file mode 100644
index 8b5d8a8bee77..000000000000
--- a/tests/test_autochunk/benchmark_simple_evoformer.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import time
-
-import torch
-import torch.fx
-from simple_evoformer import base_evoformer, openfold_evoformer
-
-from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
-from colossalai.fx import ColoTracer
-from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.fx.profiler import MetaTensor
-
-
-def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None):
-    torch.cuda.reset_peak_memory_stats()
-    now_mem = torch.cuda.memory_allocated() / 1024**2
-
-    loop = 3
-    with torch.no_grad():
-        for _ in range(loop // 2 + 1):
-            if chunk_size:
-                model(node, pair, chunk_size)
-            else:
-                model(node, pair)
-        torch.cuda.synchronize()
-        time1 = time.time()
-        for _ in range(loop):
-            if chunk_size:
-                model(node, pair, chunk_size)
-            else:
-                model(node, pair)
-        torch.cuda.synchronize()
-        time2 = time.time()
-
-    new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    print("%s: time %.4fs, mem %dMB" % (title, (time2 - time1) / loop, new_max_mem - now_mem))
-
-
-def _build_autochunk(model, max_memory, node, pair):
-    # trace the module and replace codegen
-    graph = ColoTracer().trace(
-        model,
-        meta_args={
-            "node": node.to(torch.device("meta")),
-            "pair": pair.to(torch.device("meta")),
-        },
-    )
-
-    gm_prop = torch.fx.symbolic_trace(model)    # must use symbolic_trace
-    interp = MetaInfoProp(gm_prop)
-    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
-
-    # now run it twice to get meta info in graph module, not necessary
-    gm = torch.fx.GraphModule(model, graph)
-    interp = MetaInfoProp(gm)
-    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
-
-    # set code_gen
-    codegen = AutoChunkCodeGen(gm_prop, max_memory, print_mem=False)
-    graph.set_codegen(codegen)
-    gm = ColoGraphModule(model, graph)
-    gm.recompile()
-
-    # print
-    # code = graph.python_code("self").src
-    # print(code)
-    return gm
-
-
-def benchmark_evoformer():
-    # init data and model
-    msa_len = 128
-    pair_len = 256
-    node = torch.randn(1, msa_len, pair_len, 256).cuda()
-    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-    model = base_evoformer().cuda()
-
-    # build autochunk model
-    # max_memory = 1000  # MB, fit memory mode
-    max_memory = None    # min memory mode
-    autochunk = _build_autochunk(base_evoformer().cuda(), max_memory, node, pair)
-
-    # build openfold
-    chunk_size = 64
-    openfold = openfold_evoformer().cuda()
-
-    # benchmark
-    _benchmark_evoformer(model, node, pair, "base")
-    _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size)
-    _benchmark_evoformer(autochunk, node, pair, "autochunk")
-
-
-if __name__ == "__main__":
-    benchmark_evoformer()
diff --git a/tests/test_autochunk/test_alphafold/test_alphafold_utils.py b/tests/test_autochunk/test_alphafold/test_alphafold_utils.py
new file mode 100644
index 000000000000..b05191d2bde4
--- /dev/null
+++ b/tests/test_autochunk/test_alphafold/test_alphafold_utils.py
@@ -0,0 +1,122 @@
+from typing import Any, Dict, List
+
+import torch
+import torch.fx
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+from colossalai.autochunk.utils import flat_list
+from colossalai.core import global_context as gpc
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if AUTOCHUNK_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def assert_codegen_run(
+    model: Any,
+    meta_args: List,
+    concrete_args: List = None,
+    max_memory: int = None,
+    print_mem: bool = False,
+    print_progress: bool = False,
+    print_code: bool = False,
+) -> List[Dict]:
+    """Generate autochunk code for model, check it reproduces model's outputs and return the chunk infos."""
+    # meta_args and concrete_args are lists of (name, value) pairs; values are passed positionally, meta first
+    concrete_args = [] if concrete_args is None else concrete_args
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args=dict(concrete_args),
+    )
+    interp = MetaInfoProp(meta_graph)
+    meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args] + [i[1] for i in concrete_args]
+    interp.propagate(*meta_tensors)
+    codegen = AutoChunkCodeGen(
+        meta_graph,
+        max_memory=max_memory,
+        print_mem=print_mem,
+        print_progress=print_progress,
+    )
+    chunks = codegen.chunk_infos
+
+    # trace and recompile; MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args=dict(concrete_args),
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # assert chunk in code
+    code = graph.python_code("self").src
+    if print_code:
+        print(code)
+    assert "chunk_result = None;  chunk_size = None;" in code
+
+    # assert result
+    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    model.cuda()
+    with torch.no_grad():
+        out_gm = flat_list(gm(*inputs))
+        out_model = flat_list(model(*inputs))
+    # zip() stops at the shorter list, so compare the number of outputs first
+    assert len(out_gm) == len(out_model), "fx_out has %d outputs, expected %d" % (len(out_gm), len(out_model))
+    for out_gm_i, out_model_i in zip(out_gm, out_model):
+        assert torch.allclose(out_gm_i, out_model_i,
+                              atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                                  torch.abs(out_gm_i - out_model_i))
+
+    return chunks
+
+
+def run_test(
+    rank: int,
+    data_args: tuple,
+    max_memory: int,
+    get_model: Any,
+    get_data: Any,
+    print_code: bool,
+    print_mem: bool,
+    print_progress: bool,
+    get_chunk_target: Any = None,
+) -> None:
+    """Launch colossalai with world_size=1 on this rank, then run assert_codegen_run on the built model."""
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+
+    # build model and input
+    model = get_model()
+    meta_args, concrete_args = get_data(*data_args)
+    chunks = assert_codegen_run(
+        model,
+        meta_args=meta_args,
+        concrete_args=concrete_args,
+        max_memory=max_memory,
+        print_code=print_code,
+        print_mem=print_mem,
+        print_progress=print_progress,
+    )
+    # optionally compare the chunk regions found with the expected ones for this max_memory
+    if get_chunk_target is not None:
+        chunk_found = [i["region"] for i in chunks]
+        chunk_target = get_chunk_target()[max_memory]
+        assert chunk_found == chunk_target, "found regions %s doesn't equal target regions %s" % (
+            str(chunk_found),
+            str(chunk_target),
+        )
diff --git a/tests/test_autochunk/test_alphafold/test_evoformer_block.py b/tests/test_autochunk/test_alphafold/test_evoformer_block.py
new file mode 100644
index 000000000000..787067daac8d
--- /dev/null
+++ b/tests/test_autochunk/test_alphafold/test_evoformer_block.py
@@ -0,0 +1,95 @@
+from functools import partial
+from typing import Dict, List, Tuple
+
+import pytest
+import torch
+import torch.fx
+import torch.multiprocessing as mp
+
+try:
+    from fastfold.model.nn.evoformer import EvoformerBlock
+    HAS_REPO = True
+except Exception:    # fastfold is optional; do not swallow KeyboardInterrupt/SystemExit
+    HAS_REPO = False
+
+from test_alphafold_utils import run_test
+
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+
+
+def get_model():
+    """Build the EvoformerBlock under test, switched to eval mode and moved to CUDA."""
+    hparams = dict(
+        c_m=256,
+        c_z=128,
+        c_hidden_msa_att=32,
+        c_hidden_opm=32,
+        c_hidden_mul=128,
+        c_hidden_pair_att=32,
+        no_heads_msa=8,
+        no_heads_pair=4,
+        transition_n=4,
+        msa_dropout=0.15,
+        pair_dropout=0.15,
+        inf=1e4, eps=1e-4,
+        is_multimer=False,
+    )
+    return EvoformerBlock(**hparams).eval().cuda()
+
+
+def get_data(msa_len: int, pair_len: int) -> Tuple[List, List]:
+    """Create random CUDA inputs and return ``(meta_args, concrete_args)`` as lists of (name, value) pairs."""
+    # The four randn draws stay in their original order so a seeded run produces the same tensors.
+    msa = torch.randn(1, msa_len, pair_len, 256).cuda()
+    msa_mask = torch.randn(1, msa_len, pair_len).cuda()
+    pair_rep = torch.randn(1, pair_len, pair_len, 128).cuda()
+    pair_rep_mask = torch.randn(1, pair_len, pair_len).cuda()
+    tensor_args = [("m", msa), ("z", pair_rep)]
+    tensor_args += [("msa_mask", msa_mask), ("pair_mask", pair_rep_mask)]
+    const_args = [
+        ("chunk_size", None),
+        ("_mask_trans", True),
+    ]
+    return tensor_args, const_args
+
+
+def get_chunk_target() -> Dict:
+    """Return the expected chunk regions, keyed by the ``max_memory`` values the test is parametrized with."""
+    target_none = [(118, 123), (219, 237), (264, 289), (302, 309), (97, 104), (144, 152), (185, 193), (241, 242), (21, 46)]
+    target_20 = [(118, 123), (230, 237), (275, 282), (305, 306), (100, 101), (32, 39), (73, 79)]
+    target_24 = [(118, 123)]
+    return {None: target_none, 20: target_20, 24: target_24}
+
+
+@pytest.mark.skipif(
+    not (AUTOCHUNK_AVAILABLE and HAS_REPO),
+    reason="torch version is lower than 1.12.0 or fastfold is not installed",
+)
+@pytest.mark.parametrize("max_memory", [None, 20, 24])
+@pytest.mark.parametrize("data_args", [(32, 64)])    # (msa_len, pair_len)
+def test_evoformer_block(data_args, max_memory):
+    """Spawn one worker that runs the autochunk codegen check on EvoformerBlock and compares chunk regions."""
+    run_func = partial(
+        run_test,
+        data_args=data_args,
+        max_memory=max_memory,
+        get_model=get_model,
+        get_data=get_data,
+        get_chunk_target=get_chunk_target,
+        print_code=False, print_mem=False, print_progress=False,
+    )
+    # mp.spawn passes the process index as the first positional argument, i.e. run_test's ``rank``.
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    # Manual entry point: call the worker directly as rank 0 instead of going through mp.spawn.
+    debug_flags = dict(print_code=False, print_mem=False, print_progress=False)
+    run_test(
+        rank=0,
+        data_args=(32, 64),    # (msa_len, pair_len)
+        max_memory=20,
+        get_model=get_model,
+        get_data=get_data,
+        **debug_flags,
+    )
diff --git a/tests/test_autochunk/test_alphafold/test_evoformer_stack.py b/tests/test_autochunk/test_alphafold/test_evoformer_stack.py
new file mode 100644
index 000000000000..45d8e7ac8a84
--- /dev/null
+++ b/tests/test_autochunk/test_alphafold/test_evoformer_stack.py
@@ -0,0 +1,90 @@
+from functools import partial
+from typing import List, Tuple
+
+import pytest
+import torch
+import torch.fx
+import torch.multiprocessing as mp
+
+try:
+    from fastfold.model.nn.evoformer import EvoformerStack
+    HAS_REPO = True
+except Exception:    # fastfold is optional; do not swallow KeyboardInterrupt/SystemExit
+    HAS_REPO = False
+
+from test_alphafold_utils import run_test
+
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+
+
+def get_model():
+    """Build a two-block EvoformerStack, switched to eval mode and moved to CUDA."""
+    stack = EvoformerStack(
+        c_m=256, c_z=128,
+        c_hidden_msa_att=32,
+        c_hidden_opm=32,
+        c_hidden_mul=128,
+        c_hidden_pair_att=32,
+        c_s=384,
+        no_heads_msa=8,
+        no_heads_pair=4,
+        no_blocks=2,    # 48
+        transition_n=4,
+        msa_dropout=0.15,
+        pair_dropout=0.25,
+        blocks_per_ckpt=None,
+        inf=1000000000.0,
+        eps=1e-08,
+        clear_cache_between_blocks=False,
+        is_multimer=False,
+    )
+    return stack.eval().cuda()
+
+
+def get_data(msa_len: int, pair_len: int) -> Tuple[List, List]:
+    """Create random CUDA inputs and return ``(meta_args, concrete_args)`` as lists of (name, value) pairs."""
+    # The four randn draws stay in their original order so a seeded run produces the same tensors.
+    msa = torch.randn(1, msa_len, pair_len, 256).cuda()
+    msa_mask = torch.randn(1, msa_len, pair_len).cuda()
+    pair_rep = torch.randn(1, pair_len, pair_len, 128).cuda()
+    pair_rep_mask = torch.randn(1, pair_len, pair_len).cuda()
+    tensor_args = [("m", msa), ("z", pair_rep)]
+    tensor_args += [("msa_mask", msa_mask), ("pair_mask", pair_rep_mask)]
+    const_args = [
+        ("chunk_size", None),
+        ("_mask_trans", True),
+    ]
+    return tensor_args, const_args
+
+
+@pytest.mark.skipif(
+    not (AUTOCHUNK_AVAILABLE and HAS_REPO),
+    reason="torch version is lower than 1.12.0 or fastfold is not installed",
+)
+@pytest.mark.parametrize("max_memory", [None, 20, 24])
+@pytest.mark.parametrize("data_args", [(32, 64)])    # (msa_len, pair_len)
+def test_evoformer_stack(data_args, max_memory):
+    """Spawn one worker that runs the autochunk codegen check on EvoformerStack (no region comparison)."""
+    run_func = partial(
+        run_test,
+        data_args=data_args,
+        max_memory=max_memory,
+        get_model=get_model,
+        get_data=get_data,
+        print_code=False, print_mem=False, print_progress=False,
+    )
+    # mp.spawn passes the process index as the first positional argument, i.e. run_test's ``rank``.
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    # Manual entry point: call the worker directly as rank 0 instead of going through mp.spawn.
+    debug_flags = dict(print_code=False, print_mem=False, print_progress=False)
+    run_test(
+        rank=0,
+        data_args=(32, 64),    # (msa_len, pair_len)
+        max_memory=20,
+        get_model=get_model,
+        get_data=get_data,
+        **debug_flags,
+    )
diff --git a/tests/test_autochunk/test_alphafold/test_extramsa_block.py b/tests/test_autochunk/test_alphafold/test_extramsa_block.py
new file mode 100644
index 000000000000..a2b72ed1a803
--- /dev/null
+++ b/tests/test_autochunk/test_alphafold/test_extramsa_block.py
@@ -0,0 +1,96 @@
+from functools import partial
+from typing import Dict, List, Tuple
+
+import pytest
+import torch
+import torch.fx
+import torch.multiprocessing as mp
+
+try:
+    from fastfold.model.nn.evoformer import ExtraMSABlock
+    HAS_REPO = True
+except Exception:    # fastfold is optional; do not swallow KeyboardInterrupt/SystemExit
+    HAS_REPO = False
+from test_alphafold_utils import run_test
+
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+
+
+def get_model():
+    """Build the ExtraMSABlock under test, switched to eval mode and moved to CUDA."""
+    block = ExtraMSABlock(
+        c_m=256, c_z=128,
+        c_hidden_msa_att=32,
+        c_hidden_opm=32,
+        c_hidden_mul=128,
+        c_hidden_pair_att=32,
+        no_heads_msa=8,
+        no_heads_pair=4,
+        transition_n=4,
+        msa_dropout=0.15,
+        pair_dropout=0.15,
+        inf=1e4,
+        eps=1e-4,
+        ckpt=False,
+        is_multimer=False,
+    )
+    return block.eval().cuda()
+
+
+def get_data(msa_len: int, pair_len: int) -> Tuple[List, List]:
+    """Create random CUDA inputs and return ``(meta_args, concrete_args)`` as lists of (name, value) pairs."""
+    # The four randn draws stay in their original order so a seeded run produces the same tensors.
+    msa = torch.randn(1, msa_len, pair_len, 256).cuda()
+    msa_mask = torch.randn(1, msa_len, pair_len).cuda()
+    pair_rep = torch.randn(1, pair_len, pair_len, 128).cuda()
+    pair_rep_mask = torch.randn(1, pair_len, pair_len).cuda()
+    tensor_args = [("m", msa), ("z", pair_rep)]
+    tensor_args += [("msa_mask", msa_mask), ("pair_mask", pair_rep_mask)]
+    const_args = [
+        ("chunk_size", None),
+        ("_chunk_logits", 1024),
+    ]
+    return tensor_args, const_args
+
+
+def get_chunk_target() -> Dict:
+    """Return the expected chunk regions, keyed by ``max_memory`` (None, 20 and 24)."""
+    target_none = [(126, 131), (227, 245), (272, 297), (310, 317), (105, 112), (152, 160), (193, 201), (249, 250),
+                   (33, 46)]
+    target_20 = [(126, 131), (238, 245), (283, 290), (313, 314), (108, 109), (35, 46)]
+    target_24 = [(126, 131)]
+    return {None: target_none, 20: target_20, 24: target_24}
+
+
+@pytest.mark.skipif(
+    not (AUTOCHUNK_AVAILABLE and HAS_REPO),
+    reason="torch version is lower than 1.12.0 or fastfold is not installed",
+)
+@pytest.mark.parametrize("max_memory", [None, 20, 24])
+@pytest.mark.parametrize("data_args", [(32, 64)])    # (msa_len, pair_len)
+def test_extramsa_block(data_args, max_memory):
+    """Spawn one worker that runs the autochunk codegen check on ExtraMSABlock."""
+    run_func = partial(
+        run_test,
+        data_args=data_args,
+        max_memory=max_memory,
+        get_model=get_model,
+        get_data=get_data,    # NOTE(review): get_chunk_target is not passed here, unlike in __main__ - confirm intent
+        print_code=False, print_mem=False, print_progress=False,
+    )
+    # mp.spawn passes the process index as the first positional argument, i.e. run_test's ``rank``.
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    # Manual entry point: call the worker directly as rank 0 instead of going through mp.spawn.
+    debug_flags = dict(print_code=False, print_mem=False, print_progress=False)
+    run_test(
+        rank=0,
+        data_args=(32, 64),    # (msa_len, pair_len)
+        max_memory=20,
+        get_model=get_model,
+        get_data=get_data,
+        get_chunk_target=get_chunk_target,
+        **debug_flags,
+    )
diff --git a/tests/test_autochunk/test_diffuser/test_diffuser_utils.py b/tests/test_autochunk/test_diffuser/test_diffuser_utils.py
new file mode 100644
index 000000000000..0f3d22dc51e2
--- /dev/null
+++ b/tests/test_autochunk/test_diffuser/test_diffuser_utils.py
@@ -0,0 +1,120 @@
+from typing import Any, Dict, List
+
+import torch
+import torch.fx
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+from colossalai.core import global_context as gpc
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if AUTOCHUNK_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def assert_codegen_run(model: Any, meta_args: List, concrete_args: List = None, max_memory: int = None,
+                       print_mem: bool = False, print_progress: bool = False,
+                       print_code: bool = False) -> List[Dict]:
+    """Apply autochunk codegen to ``model`` and check the generated module against the original one.
+
+    ``model`` is a zero-argument callable (e.g. a class) that builds the module; ``meta_args`` and
+    ``concrete_args`` are lists of ``(name, value)`` pairs. Returns ``codegen.chunk_infos``. Raises
+    AssertionError if the chunk marker is missing from the code or the ``"sample"`` outputs differ.
+    """
+    if concrete_args is None:
+        concrete_args = []
+    model = model()
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args={k: v for k, v in concrete_args},
+    )
+    interp = MetaInfoProp(meta_graph)
+    meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args] + [i[1] for i in concrete_args]
+    interp.propagate(*meta_tensors)
+    codegen = AutoChunkCodeGen(
+        meta_graph,
+        max_memory=max_memory,
+        print_mem=print_mem,
+        print_progress=print_progress,
+    )
+    chunks = codegen.chunk_infos
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model.cuda(),
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args={k: v for k, v in concrete_args},
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # assert chunk in code
+    code = graph.python_code("self").src
+    if print_code:
+        print(code)
+    assert "chunk_result = None;  chunk_size = None;" in code
+
+    # assert result; the model runs on CUDA, so tensor inputs are moved there too (non-tensors pass through)
+    inputs = [i[1].cuda() if isinstance(i[1], torch.Tensor) else i[1] for i in [*meta_args, *concrete_args]]
+    model.cuda().eval()
+    gm.eval()
+    with torch.no_grad():
+        out_gm = gm(*inputs)
+        out_model = model(*inputs)
+    assert torch.allclose(out_gm["sample"], out_model["sample"],
+                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(out_gm["sample"] - out_model["sample"]))
+
+    return chunks
+
+
+def run_test(rank: int, model: Any, data: tuple, max_memory: int, print_code: bool, print_mem: bool,
+             print_progress: bool, get_chunk_target: Any = None) -> None:
+    """Worker entry point: launch colossalai on one rank and run the autochunk codegen check.
+
+    Args:
+        rank: rank handed to ``colossalai.launch`` (world size is fixed to 1).
+        model: forwarded unchanged to ``assert_codegen_run``.
+        data: ``(meta_args, concrete_args)`` pair.
+        max_memory: forwarded to ``assert_codegen_run``; also the key looked up in the chunk target.
+        print_code, print_mem, print_progress: forwarded to ``assert_codegen_run``.
+        get_chunk_target: optional zero-argument callable returning a mapping indexed by ``max_memory``
+            whose value must equal the ``"region"`` entries of the chunks that were found.
+
+    ``gpc.destroy()`` runs last, so it is only reached when none of the checks raised.
+    """
+    # launch a single-rank colossalai context over nccl on a free local port
+    colossalai.launch(config={}, rank=rank, world_size=1, host="localhost", port=free_port(), backend="nccl")
+
+    # split the prepared inputs and run the codegen check
+    tensor_args, const_args = data
+    found_chunks = assert_codegen_run(
+        model,
+        meta_args=tensor_args,
+        concrete_args=const_args,
+        max_memory=max_memory,
+        print_code=print_code,
+        print_mem=print_mem,
+        print_progress=print_progress,
+    )
+
+    # optional comparison of the regions that were found with the expected ones
+    if get_chunk_target is not None:
+        found_regions = [info["region"] for info in found_chunks]
+        wanted_regions = get_chunk_target()[max_memory]
+        assert found_regions == wanted_regions, "found regions %s doesn't equal target regions %s" % (
+            str(found_regions),
+            str(wanted_regions),
+        )
+
+    # destroy colossalai's global context before the worker exits
+    gpc.destroy()
diff --git a/tests/test_autochunk/test_diffuser/test_unet.py b/tests/test_autochunk/test_diffuser/test_unet.py
new file mode 100644
index 000000000000..db154b4bba60
--- /dev/null
+++ b/tests/test_autochunk/test_diffuser/test_unet.py
@@ -0,0 +1,70 @@
+from functools import partial
+from typing import List, Tuple
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+
+try:
+    from diffusers import UNet2DModel
+    MODELS = [UNet2DModel]
+    HAS_REPO = True
+except Exception:    # diffusers is optional; do not swallow KeyboardInterrupt/SystemExit
+    MODELS = []
+    HAS_REPO = False
+
+from test_diffuser_utils import run_test
+
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+
+BATCH_SIZE = 2
+SEQ_LENGTH = 5    # NOTE(review): not referenced anywhere in this file - confirm whether it is still needed
+HEIGHT = 224
+WIDTH = 224
+IN_CHANNELS = 3
+LATENTS_SHAPE = (BATCH_SIZE, IN_CHANNELS, HEIGHT // 7, WIDTH // 7)    # evaluates to (2, 3, 32, 32)
+
+
+def get_data(shape: tuple) -> Tuple[List, List]:
+    """Return ``(meta_args, concrete_args)``: one random ``sample`` tensor of ``shape`` and a fixed ``timestep``."""
+    tensor_args = [("sample", torch.randn(shape))]
+    const_args = [
+        ("timestep", 50),
+    ]
+    return tensor_args, const_args
+
+
+@pytest.mark.skipif(
+    True,
+    reason="not implemented",
+)
+@pytest.mark.skipif(
+    not (AUTOCHUNK_AVAILABLE and HAS_REPO),
+    reason="torch version is lower than 1.12.0 or diffusers is not installed",
+)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("shape", [LATENTS_SHAPE])
+@pytest.mark.parametrize("max_memory", [64])
+def test_evoformer_block(model, shape, max_memory):
+    """Run the autochunk check on a diffusers model. NOTE(review): name looks copied from the evoformer tests."""
+    run_func = partial(
+        run_test,
+        max_memory=max_memory,
+        model=model,
+        data=get_data(shape),
+        print_code=False, print_mem=False, print_progress=False,
+    )
+    # mp.spawn passes the process index as the first positional argument, i.e. run_test's ``rank``.
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    # Manual entry point: call the worker directly as rank 0 instead of going through mp.spawn.
+    debug_flags = dict(print_code=False, print_mem=False, print_progress=False)
+    run_test(
+        rank=0,
+        data=get_data(LATENTS_SHAPE),
+        max_memory=64,
+        model=UNet2DModel,
+        **debug_flags,
+    )
diff --git a/tests/test_autochunk/test_evoformer_codegen.py b/tests/test_autochunk/test_evoformer_codegen.py
deleted file mode 100644
index ba6a57a51ce3..000000000000
--- a/tests/test_autochunk/test_evoformer_codegen.py
+++ /dev/null
@@ -1,163 +0,0 @@
-from functools import partial
-
-import pytest
-import torch
-import torch.fx
-import torch.multiprocessing as mp
-
-try:
-    from fastfold.model.nn.evoformer import EvoformerBlock
-    HAS_REPO = True
-except:
-    HAS_REPO = False
-
-import colossalai
-from colossalai.core import global_context as gpc
-from colossalai.fx._compatibility import is_compatible_with_meta
-from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
-from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.utils import free_port
-
-if CODEGEN_AVAILABLE and is_compatible_with_meta():
-    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
-    from colossalai.fx.profiler import MetaTensor
-    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
-
-
-def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair, node_mask, pair_mask):
-    # for memory test
-    # model = model.cuda()
-    # torch.cuda.reset_peak_memory_stats()
-    # now_mem = torch.cuda.memory_allocated() / 1024**2
-    # with torch.no_grad():
-    #     node1 = node.clone()
-    #     pair1 = pair.clone()
-    #     node_mask1 = node_mask.clone()
-    #     pair_mask1 = pair_mask.clone()
-    #     gm(node1, pair1, node_mask1, pair_mask1)
-    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print("autochunk max mem:%.2f"% (new_max_mem - now_mem))
-
-    # test forward
-    model = model.cuda()
-    with torch.no_grad():
-        non_fx_out = model(node, pair, node_mask, pair_mask)
-        fx_out = gm(node, pair, node_mask, pair_mask)
-
-    assert torch.allclose(non_fx_out[0], fx_out[0],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[0] - fx_out[0]))
-    assert torch.allclose(non_fx_out[1], fx_out[1],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[1] - fx_out[1]))
-
-
-def _build_openfold():
-    model = EvoformerBlock(
-        c_m=256,
-        c_z=128,
-        c_hidden_msa_att=32,
-        c_hidden_opm=32,
-        c_hidden_mul=128,
-        c_hidden_pair_att=32,
-        no_heads_msa=8,
-        no_heads_pair=4,
-        transition_n=4,
-        msa_dropout=0.15,
-        pair_dropout=0.15,
-        inf=1e4,
-        eps=1e-4,
-        is_multimer=False,
-    ).eval().cuda()
-    return model
-
-
-def _test_evoformer_codegen(rank, msa_len, pair_len, max_memory):
-    # launch colossalai
-    colossalai.launch(
-        config={},
-        rank=rank,
-        world_size=1,
-        host="localhost",
-        port=free_port(),
-        backend="nccl",
-    )
-
-    # build model and input
-    model = _build_openfold()
-    node = torch.randn(1, msa_len, pair_len, 256).cuda()
-    node_mask = torch.randn(1, msa_len, pair_len).cuda()
-    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-    pair_mask = torch.randn(1, pair_len, pair_len).cuda()
-
-    # trace the meta graph and setup codegen
-    meta_graph = symbolic_trace(
-        model,
-        meta_args={
-            "m": node.to(torch.device("meta")),
-            "z": pair.to(torch.device("meta")),
-            "msa_mask": node_mask.to(torch.device("meta")),
-            "pair_mask": pair_mask.to(torch.device("meta")),
-        },
-        concrete_args={
-            "chunk_size": None,
-            "_mask_trans": True,
-        },
-    )
-    interp = MetaInfoProp(meta_graph)
-    interp.propagate(
-        MetaTensor(node, fake_device="cuda:0"),
-        MetaTensor(pair, fake_device="cuda:0"),
-        MetaTensor(node_mask, fake_device="cuda:0"),
-        MetaTensor(pair_mask, fake_device="cuda:0"),
-    )
-    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory, print_mem=False)
-
-    # trace and recompile
-    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
-    graph = ColoTracer().trace(
-        model,
-        meta_args={
-            "m": node.to(torch.device("meta")),
-            "z": pair.to(torch.device("meta")),
-            "msa_mask": node_mask.to(torch.device("meta")),
-            "pair_mask": pair_mask.to(torch.device("meta")),
-        },
-        concrete_args={
-            "chunk_size": None,
-            "_mask_trans": True,
-        },
-    )
-    graph.set_codegen(codegen)
-    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
-    gm.recompile()
-
-    # assert we have inserted chunk
-    code = graph.python_code("self").src
-    # print(code)
-    assert "chunk_result = None;  chunk_size = None;" in code
-
-    _test_fwd(model, gm, node, pair, node_mask, pair_mask)
-    gpc.destroy()
-
-
-@pytest.mark.skipif(
-    not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
-    reason="torch version is lower than 1.12.0",
-)
-@pytest.mark.parametrize("max_memory", [None, 24, 28, 32])
-@pytest.mark.parametrize("msa_len", [32])
-@pytest.mark.parametrize("pair_len", [64])
-def test_evoformer_codegen(msa_len, pair_len, max_memory):
-    run_func = partial(
-        _test_evoformer_codegen,
-        msa_len=msa_len,
-        pair_len=pair_len,
-        max_memory=max_memory,
-    )
-    mp.spawn(run_func, nprocs=1)
-
-
-if __name__ == "__main__":
-    _test_evoformer_codegen(0, 32, 64, 24)
diff --git a/tests/test_autochunk/test_evoformer_stack_codegen.py b/tests/test_autochunk/test_evoformer_stack_codegen.py
deleted file mode 100644
index 5fabb27028f9..000000000000
--- a/tests/test_autochunk/test_evoformer_stack_codegen.py
+++ /dev/null
@@ -1,163 +0,0 @@
-from functools import partial
-
-import pytest
-import torch
-import torch.fx
-import torch.multiprocessing as mp
-
-try:
-    from fastfold.model.nn.evoformer import EvoformerStack
-    HAS_REPO = True
-except:
-    HAS_REPO = False
-
-import colossalai
-from colossalai.core import global_context as gpc
-from colossalai.fx._compatibility import is_compatible_with_meta
-from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
-from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.utils import free_port
-
-if CODEGEN_AVAILABLE and is_compatible_with_meta():
-    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
-    from colossalai.fx.profiler import MetaTensor
-    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
-
-
-def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair, node_mask, pair_mask):
-    # for memory test
-    # model = model.cuda()
-    # torch.cuda.reset_peak_memory_stats()
-    # now_mem = torch.cuda.memory_allocated() / 1024**2
-    # with torch.no_grad():
-    #     node1 = node.clone()
-    #     pair1 = pair.clone()
-    #     node_mask1 = node_mask.clone()
-    #     pair_mask1 = pair_mask.clone()
-    #     gm(node1, pair1, node_mask1, pair_mask1, None)
-    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print("autochunk max mem:%.2f"% (new_max_mem - now_mem))
-
-    # test forward
-    model = model.cuda()
-    with torch.no_grad():
-        non_fx_out = model(node, pair, node_mask, pair_mask, None)
-        fx_out = gm(node, pair, node_mask, pair_mask, None)
-
-    assert torch.allclose(non_fx_out[0], fx_out[0],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[0] - fx_out[0]))
-    assert torch.allclose(non_fx_out[1], fx_out[1],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[1] - fx_out[1]))
-
-
-def _build_openfold():
-    model = EvoformerStack(
-        c_m=256,
-        c_z=128,
-        c_hidden_msa_att=32,
-        c_hidden_opm=32,
-        c_hidden_mul=128,
-        c_hidden_pair_att=32,
-        c_s=384,
-        no_heads_msa=8,
-        no_heads_pair=4,
-        no_blocks=2,    # 48
-        transition_n=4,
-        msa_dropout=0.15,
-        pair_dropout=0.25,
-        blocks_per_ckpt=None,
-        inf=1000000000.0,
-        eps=1e-08,
-        clear_cache_between_blocks=False,
-        is_multimer=False,
-    ).eval().cuda()
-    return model
-
-
-def _test_evoformer_stack_codegen(rank, msa_len, pair_len, max_memory):
-    # launch colossalai
-    colossalai.launch(
-        config={},
-        rank=rank,
-        world_size=1,
-        host="localhost",
-        port=free_port(),
-        backend="nccl",
-    )
-
-    # build model and input
-    model = _build_openfold()
-    node = torch.randn(1, msa_len, pair_len, 256).cuda()
-    node_mask = torch.randn(1, msa_len, pair_len).cuda()
-    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-    pair_mask = torch.randn(1, pair_len, pair_len).cuda()
-
-    # trace the meta graph and setup codegen
-    meta_graph = symbolic_trace(
-        model,
-        meta_args={
-            "m": node.to(torch.device("meta")),
-            "z": pair.to(torch.device("meta")),
-            "msa_mask": node_mask.to(torch.device("meta")),
-            "pair_mask": pair_mask.to(torch.device("meta")),
-        },
-        concrete_args={
-            "chunk_size": None,
-            "_mask_trans": True,
-        },
-    )
-    interp = MetaInfoProp(meta_graph)
-    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"),
-                     MetaTensor(node_mask, fake_device="cuda:0"), MetaTensor(pair_mask, fake_device="cuda:0"), None)
-    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory, print_mem=False, print_progress=False)
-
-    # trace and recompile
-    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
-    graph = ColoTracer().trace(
-        model,
-        meta_args={
-            "m": node.to(torch.device("meta")),
-            "z": pair.to(torch.device("meta")),
-            "msa_mask": node_mask.to(torch.device("meta")),
-            "pair_mask": pair_mask.to(torch.device("meta")),
-        },
-        concrete_args={
-            "chunk_size": None,
-            "_mask_trans": True,
-        },
-    )
-    graph.set_codegen(codegen)
-    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
-    gm.recompile()
-
-    # assert we have inserted chunk
-    code = graph.python_code("self").src
-    # print(code)
-    assert "chunk_result = None;  chunk_size = None;" in code
-
-    _test_fwd(model, gm, node, pair, node_mask, pair_mask)
-    gpc.destroy()
-
-
-@pytest.mark.skipif(
-    not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
-    reason="torch version is lower than 1.12.0",
-)
-@pytest.mark.parametrize("max_memory", [None, 24, 28, 32])
-@pytest.mark.parametrize("msa_len", [32])
-@pytest.mark.parametrize("pair_len", [64])
-def test_evoformer_stack_codegen(msa_len, pair_len, max_memory):
-    run_func = partial(
-        _test_evoformer_stack_codegen,
-        msa_len=msa_len,
-        pair_len=pair_len,
-        max_memory=max_memory,
-    )
-    mp.spawn(run_func, nprocs=1)
-
-
-if __name__ == "__main__":
-    _test_evoformer_stack_codegen(0, 32, 64, None)
diff --git a/tests/test_autochunk/test_extramsa_codegen.py b/tests/test_autochunk/test_extramsa_codegen.py
deleted file mode 100644
index 2a41452a2ad7..000000000000
--- a/tests/test_autochunk/test_extramsa_codegen.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from functools import partial
-
-import pytest
-import torch
-import torch.fx
-import torch.multiprocessing as mp
-
-try:
-    from fastfold.model.nn.evoformer import ExtraMSABlock
-    HAS_REPO = True
-except:
-    HAS_REPO = False
-
-import colossalai
-from colossalai.core import global_context as gpc
-from colossalai.fx._compatibility import is_compatible_with_meta
-from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
-from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.utils import free_port
-
-if CODEGEN_AVAILABLE and is_compatible_with_meta():
-    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
-    from colossalai.fx.profiler import MetaTensor
-    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
-
-
-def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair, node_mask, pair_mask):
-    # for memory test
-    # model = model.cuda()
-    # torch.cuda.reset_peak_memory_stats()
-    # now_mem = torch.cuda.memory_allocated() / 1024**2
-    # with torch.no_grad():
-    #     node1 = node.clone()
-    #     pair1 = pair.clone()
-    #     node_mask1 = node_mask.clone()
-    #     pair_mask1 = pair_mask.clone()
-    #     gm(node1, pair1, node_mask1, pair_mask1)
-    # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
-    # print("autochunk max mem:%.2f"% (new_max_mem - now_mem))
-
-    # test forward
-    model = model.cuda()
-    with torch.no_grad():
-        non_fx_out = model(node, pair, node_mask, pair_mask)
-        fx_out = gm(node, pair, node_mask, pair_mask)
-
-    assert torch.allclose(non_fx_out[0], fx_out[0],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[0] - fx_out[0]))
-    assert torch.allclose(non_fx_out[1], fx_out[1],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[1] - fx_out[1]))
-
-
-def _build_openfold():
-    model = ExtraMSABlock(
-        c_m=256,
-        c_z=128,
-        c_hidden_msa_att=32,
-        c_hidden_opm=32,
-        c_hidden_mul=128,
-        c_hidden_pair_att=32,
-        no_heads_msa=8,
-        no_heads_pair=4,
-        transition_n=4,
-        msa_dropout=0.15,
-        pair_dropout=0.15,
-        inf=1e4,
-        eps=1e-4,
-        ckpt=False,
-        is_multimer=False,
-    ).eval().cuda()
-    return model
-
-
-def _test_extramsa_codegen(rank, msa_len, pair_len, max_memory):
-    # launch colossalai
-    colossalai.launch(
-        config={},
-        rank=rank,
-        world_size=1,
-        host="localhost",
-        port=free_port(),
-        backend="nccl",
-    )
-
-    # build model and input
-    model = _build_openfold()
-    node = torch.randn(1, msa_len, pair_len, 256).cuda()
-    node_mask = torch.randn(1, msa_len, pair_len).cuda()
-    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-    pair_mask = torch.randn(1, pair_len, pair_len).cuda()
-
-    # trace the meta graph and setup codegen
-    meta_graph = symbolic_trace(
-        model,
-        meta_args={
-            "m": node.to(torch.device("meta")),
-            "z": pair.to(torch.device("meta")),
-            "msa_mask": node_mask.to(torch.device("meta")),
-            "pair_mask": pair_mask.to(torch.device("meta")),
-        },
-        concrete_args={
-            "chunk_size": None,
-            "_chunk_logits": 1024,
-        },
-    )
-    interp = MetaInfoProp(meta_graph)
-    interp.propagate(
-        MetaTensor(node, fake_device="cuda:0"),
-        MetaTensor(pair, fake_device="cuda:0"),
-        MetaTensor(node_mask, fake_device="cuda:0"),
-        MetaTensor(pair_mask, fake_device="cuda:0"),
-    )
-    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory, print_mem=False)
-
-    # trace and recompile
-    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
-    graph = ColoTracer().trace(
-        model,
-        meta_args={
-            "m": node.to(torch.device("meta")),
-            "z": pair.to(torch.device("meta")),
-            "msa_mask": node_mask.to(torch.device("meta")),
-            "pair_mask": pair_mask.to(torch.device("meta")),
-        },
-        concrete_args={
-            "chunk_size": None,
-            "_chunk_logits": 1024,
-        },
-    )
-    graph.set_codegen(codegen)
-    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
-    gm.recompile()
-
-    # assert we have inserted chunk
-    code = graph.python_code("self").src
-    # print(code)
-    assert "chunk_result = None;  chunk_size = None;" in code
-
-    _test_fwd(model, gm, node, pair, node_mask, pair_mask)
-    gpc.destroy()
-
-
-@pytest.mark.skipif(
-    not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
-    reason="torch version is lower than 1.12.0",
-)
-@pytest.mark.parametrize("max_memory", [None, 24, 28, 32])
-@pytest.mark.parametrize("msa_len", [32])
-@pytest.mark.parametrize("pair_len", [64])
-def test_extramsa_codegen(msa_len, pair_len, max_memory):
-    run_func = partial(
-        _test_extramsa_codegen,
-        msa_len=msa_len,
-        pair_len=pair_len,
-        max_memory=max_memory,
-    )
-    mp.spawn(run_func, nprocs=1)
-
-
-if __name__ == "__main__":
-    _test_extramsa_codegen(0, 32, 64, None)
diff --git a/tests/test_autochunk/test_simple_evoformer_codegen.py b/tests/test_autochunk/test_simple_evoformer_codegen.py
deleted file mode 100644
index 7fe149c5784d..000000000000
--- a/tests/test_autochunk/test_simple_evoformer_codegen.py
+++ /dev/null
@@ -1,104 +0,0 @@
-from functools import partial
-
-import pytest
-import torch
-import torch.fx
-import torch.multiprocessing as mp
-
-try:
-    from simple_evoformer import base_evoformer
-    HAS_REPO = True
-except:
-    HAS_REPO = False
-
-import colossalai
-from colossalai.core import global_context as gpc
-from colossalai.fx import ColoTracer, symbolic_trace
-from colossalai.fx._compatibility import is_compatible_with_meta
-from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
-from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.utils import free_port
-
-if CODEGEN_AVAILABLE and is_compatible_with_meta():
-    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
-    from colossalai.fx.profiler import MetaTensor
-
-
-def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair):
-    with torch.no_grad():
-        non_fx_out = model(node, pair)
-        fx_out = gm(node, pair)
-
-    assert torch.allclose(non_fx_out[0], fx_out[0],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[0] - fx_out[0]))
-    assert torch.allclose(non_fx_out[1], fx_out[1],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                              torch.abs(non_fx_out[1] - fx_out[1]))
-
-
-def _test_simple_evoformer_codegen(rank, msa_len, pair_len, max_memory):
-    # launch colossalai
-    colossalai.launch(
-        config={},
-        rank=rank,
-        world_size=1,
-        host="localhost",
-        port=free_port(),
-        backend="nccl",
-    )
-
-    # build model and input
-    model = base_evoformer().cuda()
-    node = torch.randn(1, msa_len, pair_len, 256).cuda()
-    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-
-    # meta info prop
-    meta_graph = symbolic_trace(model,
-                                meta_args={
-                                    "node": node.to(torch.device("meta")),
-                                    "pair": pair.to(torch.device("meta")),
-                                })    # must use symbolic_trace
-    interp = MetaInfoProp(meta_graph)
-    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
-    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory)
-
-    # trace the module and replace codegen
-    graph = ColoTracer().trace(
-        model,
-        meta_args={
-            "node": node.to(torch.device("meta")),
-            "pair": pair.to(torch.device("meta")),
-        },
-    )
-    graph.set_codegen(codegen)
-    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
-    gm.recompile()
-
-    # assert we have inserted chunk
-    code = graph.python_code("self").src
-    # print(code)
-    assert "chunk_result = None;  chunk_size = None;" in code
-
-    _test_fwd(model, gm, node, pair)
-    gpc.destroy()
-
-
-@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
-                    reason='torch version is lower than 1.12.0')
-@pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
-@pytest.mark.parametrize("msa_len", [32])
-@pytest.mark.parametrize("pair_len", [64])
-def test_simple_evoformer_codegen(msa_len, pair_len, max_memory):
-    run_func = partial(
-        _test_simple_evoformer_codegen,
-        msa_len=msa_len,
-        pair_len=pair_len,
-        max_memory=max_memory,
-    )
-    mp.spawn(run_func, nprocs=1)
-
-
-if __name__ == "__main__":
-    _test_simple_evoformer_codegen(0, 32, 64, 25)
diff --git a/tests/test_autochunk/test_simple_evoformer_search.py b/tests/test_autochunk/test_simple_evoformer_search.py
deleted file mode 100644
index 89f28d625cbe..000000000000
--- a/tests/test_autochunk/test_simple_evoformer_search.py
+++ /dev/null
@@ -1,97 +0,0 @@
-from functools import partial
-
-import pytest
-import torch
-import torch.fx
-import torch.multiprocessing as mp
-
-try:
-    from simple_evoformer import base_evoformer
-    HAS_REPO = True
-except:
-    HAS_REPO = False
-
-import colossalai
-from colossalai.core import global_context as gpc
-from colossalai.fx import symbolic_trace
-from colossalai.fx._compatibility import is_compatible_with_meta
-from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.utils import free_port
-
-if CODEGEN_AVAILABLE and is_compatible_with_meta():
-    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
-    from colossalai.fx.profiler import MetaTensor
-
-
-def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len):
-    found_regions = [i["region"] for i in chunk_infos]
-
-    if msa_len == 32 and pair_len == 64:
-        if max_memory is None:
-            target_regions = [(142, 154), (366, 373), (234, 283), (302, 351), (127, 134), (211, 228), (174, 191),
-                              (161, 166), (198, 203), (7, 57)]
-        elif max_memory == 20:
-            target_regions = [(142, 154), (369, 373), (235, 269), (303, 351), (130, 131)]
-        elif max_memory == 25:
-            target_regions = [(144, 154), (369, 370)]
-        elif max_memory == 30:
-            target_regions = [(144, 154)]
-        else:
-            raise NotImplementedError()
-    else:
-        raise NotImplementedError()
-
-    assert found_regions == target_regions, "found regions %s doesn't equal target regions %s" % (
-        str(found_regions),
-        str(target_regions),
-    )
-
-
-def _test_simple_evoformer_search(rank, msa_len, pair_len, max_memory):
-    # launch colossalai
-    colossalai.launch(
-        config={},
-        rank=rank,
-        world_size=1,
-        host="localhost",
-        port=free_port(),
-        backend="nccl",
-    )
-
-    # build model and input
-    model = base_evoformer().cuda()
-    node = torch.randn(1, msa_len, pair_len, 256).cuda()
-    pair = torch.randn(1, pair_len, pair_len, 128).cuda()
-
-    meta_graph = symbolic_trace(model,
-                                meta_args={
-                                    "node": node.to(torch.device("meta")),
-                                    "pair": pair.to(torch.device("meta")),
-                                })    # must use symbolic_trace
-    interp = MetaInfoProp(meta_graph)
-    interp.propagate(MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0"))
-    codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory)
-    chunk_infos = codegen.chunk_infos
-    assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len)
-
-    gpc.destroy()
-
-
-@pytest.mark.skipif(not (CODEGEN_AVAILABLE and is_compatible_with_meta() and HAS_REPO),
-                    reason="torch version is lower than 1.12.0")
-@pytest.mark.parametrize("max_memory", [None, 20, 25, 30])
-@pytest.mark.parametrize("msa_len", [32])
-@pytest.mark.parametrize("pair_len", [64])
-def test_simple_evoformer_search(msa_len, pair_len, max_memory):
-    run_func = partial(
-        _test_simple_evoformer_search,
-        msa_len=msa_len,
-        pair_len=pair_len,
-        max_memory=max_memory,
-    )
-    mp.spawn(run_func, nprocs=1)
-
-
-if __name__ == "__main__":
-    _test_simple_evoformer_search(0, 32, 64, 20)
diff --git a/tests/test_autochunk/test_transformer/test_autochunk_gpt.py b/tests/test_autochunk/test_transformer/test_autochunk_gpt.py
new file mode 100644
index 000000000000..0ba8f89c2c44
--- /dev/null
+++ b/tests/test_autochunk/test_transformer/test_autochunk_gpt.py
@@ -0,0 +1,65 @@
+from functools import partial
+from typing import List, Tuple
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+
+try:
+    from transformers import GPT2Config, GPT2Model
+    MODELS = [GPT2Model]
+    HAS_REPO = True
+except:
+    MODELS = []
+    HAS_REPO = False
+
+from test_transformer_utils import run_test
+
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+
+BATCH_SIZE = 2
+SEQ_LENGTH = 256
+
+
+def get_data(shape: tuple) -> Tuple[List, List]:
+    input_ids = torch.zeros(shape, dtype=torch.int64)
+    token_type_ids = torch.zeros(shape, dtype=torch.int64)
+    attention_mask = torch.ones(shape, dtype=torch.int64)
+    meta_args = dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+    concrete_args = {"past_key_values": None}
+    sequence = ["input_ids", "past_key_values", "attention_mask", "token_type_ids"]
+    return meta_args, concrete_args, sequence
+
+
+@pytest.mark.skipif(
+    not (AUTOCHUNK_AVAILABLE and HAS_REPO),
+    reason="torch version is lower than 1.12.0",
+)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("shape", [(BATCH_SIZE, SEQ_LENGTH)])
+@pytest.mark.parametrize("max_memory", [None, 4.5, 5])
+def test_gpt(model, shape, max_memory):
+    run_func = partial(
+        run_test,
+        data=get_data(shape),
+        max_memory=max_memory,
+        model=model,
+        config=GPT2Config(n_embd=96, n_position=shape[1], n_layer=2, n_head=4),
+        print_code=False,
+        print_mem=False,
+        print_progress=False,
+    )
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    run_test(
+        rank=0,
+        data=get_data((BATCH_SIZE, SEQ_LENGTH)),
+        max_memory=None,
+        model=GPT2Model,
+        config=GPT2Config(n_embd=96, n_position=SEQ_LENGTH, n_layer=2, n_head=4),
+        print_code=True,
+        print_mem=True,
+        print_progress=False,
+    )
diff --git a/tests/test_autochunk/test_transformer/test_transformer_utils.py b/tests/test_autochunk/test_transformer/test_transformer_utils.py
new file mode 100644
index 000000000000..d33fc04c5b75
--- /dev/null
+++ b/tests/test_autochunk/test_transformer/test_transformer_utils.py
@@ -0,0 +1,123 @@
+from typing import Any, Dict, List
+
+import torch
+import torch.fx
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+from colossalai.core import global_context as gpc
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if AUTOCHUNK_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def assert_codegen_run(
+    model: Any,
+    data: tuple,
+    max_memory: int = None,
+    print_mem: bool = False,
+    print_progress: bool = False,
+    print_code: bool = False,
+) -> List[Dict]:
+    meta_args, concrete_args, sequence = data
+    if concrete_args is None:
+        concrete_args = {}
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()},
+        concrete_args={k: v for k, v in concrete_args.items()},
+    )
+    interp = MetaInfoProp(meta_graph)
+    meta_tensors = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
+    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
+    interp.propagate(*meta_tensors)
+    codegen = AutoChunkCodeGen(
+        meta_graph,
+        max_memory=max_memory,
+        print_mem=print_mem,
+        print_progress=print_progress,
+    )
+    chunks = codegen.chunk_infos
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model.cuda(),
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()},
+        concrete_args={k: v for k, v in concrete_args.items()},
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # assert chunk in code
+    code = graph.python_code("self").src
+    if print_code:
+        print(code)
+    assert "chunk_result = None;  chunk_size = None;" in code
+
+    # assert result
+    inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
+    model.cuda().eval()
+    gm.eval()
+    with torch.no_grad():
+        out_gm = gm(*inputs)
+        out_model = model(*inputs)
+    for k in out_model.keys():
+        if torch.is_tensor(out_gm[k]):
+            assert torch.equal(
+                out_model[k], out_gm[k]
+            ), f'{model.__class__.__name__} has incorrect output {k}, expect {out_model[k]}, but got {out_gm[k]}'
+
+    return chunks
+
+
+def run_test(
+    rank: int,
+    model: Any,
+    config: Any,
+    data: tuple,
+    max_memory: int,
+    print_code: bool,
+    print_mem: bool,
+    print_progress: bool,
+    get_chunk_target: Any = None,
+) -> None:
+    model = model(config=config)
+    # launch colossalai
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+
+    # build model and input
+    chunks = assert_codegen_run(
+        model,
+        data=data,
+        max_memory=max_memory,
+        print_code=print_code,
+        print_mem=print_mem,
+        print_progress=print_progress,
+    )
+
+    if get_chunk_target is not None:
+        chunk_found = [i["region"] for i in chunks]
+        chunk_target = get_chunk_target()[max_memory]
+        assert (chunk_found == chunk_target), "found regions %s doesn't equal target regions %s" % (
+            str(chunk_found),
+            str(chunk_target),
+        )
+
+    gpc.destroy()

From f477a14f4aeb49f7a30ee0f46775040391a96e1c Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:42:45 +0800
Subject: [PATCH 230/503] [hotfix] fix autoparallel demo (#2533)

---
 .../auto_parallel/auto_parallel_with_resnet.py        | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
index 15429f19cbcf..a6a9ad0a312c 100644
--- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
+++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py
@@ -3,8 +3,9 @@
 from tqdm import tqdm
 
 import colossalai
-from colossalai.auto_parallel.tensor_shard.initialize import autoparallelize
+from colossalai.auto_parallel.tensor_shard.initialize import initialize_model
 from colossalai.core import global_context as gpc
+from colossalai.device.device_mesh import DeviceMesh
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingLR
 
@@ -22,9 +23,14 @@ def main():
 
     # trace the model with meta data
     model = resnet50(num_classes=10).cuda()
+
     input_sample = {'x': torch.rand([gpc.config.BATCH_SIZE * torch.distributed.get_world_size(), 3, 32, 32]).to('meta')}
+    device_mesh = DeviceMesh(physical_mesh_id=torch.tensor([0, 1, 2, 3]), mesh_shape=[2, 2], init_process_group=True)
+    model, solution = initialize_model(model, input_sample, device_mesh=device_mesh, return_solution=True)
 
-    model = autoparallelize(model, input_sample)
+    if gpc.get_global_rank() == 0:
+        for node_strategy in solution:
+            print(node_strategy)
     # build criterion
     criterion = torch.nn.CrossEntropyLoss()
 
@@ -52,6 +58,7 @@ def main():
             output = model(img)
             train_loss = criterion(output, label)
             train_loss.backward(train_loss)
+            torch.cuda.synchronize()
             optimizer.step()
         lr_scheduler.step()
 

From 05671fcb42289879c55c4c2c0e6564752bd1c76b Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Wed, 1 Feb 2023 13:18:51 +0800
Subject: [PATCH 231/503] [autochunk] support multi outputs chunk search
 (#2538)

Support multi-output chunk search. Previously, only single-output chunk search was supported. The new strategy is more flexible and improves performance by a large margin: for transformer models, memory usage is reduced by 40% compared with the previous search strategy.

1. rewrite the search strategy to support multi-output chunk search
2. fix many, many bugs
3. update tests
---
 colossalai/autochunk/autochunk_codegen.py     | 103 +++++----
 colossalai/autochunk/estimate_memory.py       |  12 +-
 colossalai/autochunk/reorder_graph.py         |  32 ++-
 colossalai/autochunk/search_chunk.py          |  69 ++++--
 colossalai/autochunk/select_chunk.py          |  10 +-
 colossalai/autochunk/trace_flow.py            | 207 ++++++++++--------
 colossalai/autochunk/trace_indice.py          |  52 ++---
 colossalai/autochunk/utils.py                 |  84 ++++++-
 .../test_alphafold/test_alphafold_utils.py    |  22 +-
 .../test_alphafold/test_evoformer_block.py    |  14 +-
 .../test_alphafold/test_evoformer_stack.py    |   5 +-
 .../test_alphafold/test_extramsa_block.py     |  14 +-
 .../test_transformer/test_autochunk_gpt.py    |  16 +-
 .../test_transformer_utils.py                 |  46 ++--
 14 files changed, 428 insertions(+), 258 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index ddf64dc8ff49..82937db9f6ba 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -25,7 +25,7 @@
 from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
 
 from .search_chunk import SearchChunk
-from .utils import delete_free_var_from_last_use, find_idx_by_name, get_logger, get_node_shape
+from .utils import delete_free_var_from_last_use, get_logger, get_node_name, get_node_shape
 
 
 def _gen_chunk_slice_dim(chunk_dim: int, chunk_indice_name: str, shape: List) -> str:
@@ -51,7 +51,7 @@ def _gen_chunk_slice_dim(chunk_dim: int, chunk_indice_name: str, shape: List) ->
     return new_shape
 
 
-def _gen_loop_start(chunk_input: List[Node], chunk_output: Node, chunk_ouput_dim: int, chunk_size=2) -> str:
+def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_ouput_dim: int, chunk_size=2) -> str:
     """
     Generate chunk loop start
 
@@ -70,22 +70,28 @@ def _gen_loop_start(chunk_input: List[Node], chunk_output: Node, chunk_ouput_dim
         context (str): generated str
     """
     input_node = chunk_input[0]
-    out_shape = get_node_shape(chunk_output)
-    out_str = str(list(out_shape))
-    context = (
-        "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range" %
-        (out_str, input_node.name, input_node.name, chunk_size))
-    context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim])
+
+    context = ""
+    for i in range(len(chunk_output)):
+        shape_str = str(list(get_node_shape(chunk_output[i])))
+        if get_node_name(chunk_output[i]) == "split":
+            tensor_str = "torch.empty(%s, dtype=%s.dtype, device=%s.device), " % (shape_str, input_node.name,
+                                                                                  input_node.name)
+            tensor_str = tensor_str * len(chunk_output[i].meta['tensor_meta'])
+            tensor_str = "[" + tensor_str[:-2] + "]"
+            context += "%s = %s;  " % (chunk_output[i].name, tensor_str)
+        else:
+            context += "%s = torch.empty(%s, dtype=%s.dtype, device=%s.device);  " % (chunk_output[i].name, shape_str,
+                                                                                      input_node.name, input_node.name)
+
+    out_shape = get_node_shape(chunk_output[0])
+    chunk_shape = out_shape[chunk_ouput_dim[0]]
+    context += "chunk_size = %d\nfor chunk_idx in range(0, %d, chunk_size):\n" % (chunk_size, chunk_shape)
     return context
 
 
-def _gen_loop_end(
-    chunk_inputs: List[Node],
-    chunk_non_compute_inputs: List[Node],
-    chunk_outputs: Node,
-    chunk_outputs_dim: int,
-    node_list: List[Node],
-) -> str:
+def _gen_loop_end(chunk_inputs: List[Node], chunk_non_compute_inputs: List[Node], node_list: List[Node],
+                  chunk_outputs_idx: int, chunk_outputs_non_tensor: List[Node], search_chunk: SearchChunk) -> str:
     """
     Generate chunk loop end
 
@@ -102,22 +108,13 @@ def _gen_loop_end(
     Returns:
         context (str): generated str
     """
-    chunk_outputs_name = chunk_outputs.name
-    chunk_outputs_idx = find_idx_by_name(chunk_outputs_name, node_list)
-    chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape
-    chunk_slice = _gen_chunk_slice_dim(chunk_outputs_dim, "chunk_idx", chunk_output_shape)
-    context = "    chunk_result%s = %s;  %s = None\n" % (
-        chunk_slice,
-        chunk_outputs_name,
-        chunk_outputs_name,
-    )
-    context += (chunk_outputs_name + " = chunk_result;  chunk_result = None;  chunk_size = None")
-
+    context = "chunk_size = None"
     # determine if its the last use for chunk input
     for chunk_input in chunk_inputs + chunk_non_compute_inputs:
-        if all([find_idx_by_name(user.name, node_list) <= chunk_outputs_idx for user in chunk_input.users.keys()]):
+        if all([search_chunk.node_mgr.find_node_idx(user) <= chunk_outputs_idx for user in chunk_input.users.keys()]):
             context += ";  %s = None" % chunk_input.name
-
+    for chunk_output_non_tensor, chunk_output_non_tensor_val in chunk_outputs_non_tensor.items():
+        context += ";  %s = %s" % (chunk_output_non_tensor.name, chunk_output_non_tensor_val)
     context += "\n"
     return context
 
@@ -158,7 +155,7 @@ def _replace_ones_like(
     add chunk slice for new tensor op such as ones like
     """
     if "ones_like" in node.name:
-        meta_node = search_chunk.trace_indice.node_list[node_idx]
+        meta_node = search_chunk.node_mgr.get_node_by_idx(node_idx)
         chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"]
         if get_node_shape(meta_node)[chunk_dim] != 1:
             source_node = meta_node.args[0].args[0]
@@ -169,21 +166,37 @@ def _replace_ones_like(
     return body
 
 
-def _replace_input_node(
-    chunk_inputs: List[Node],
+def _add_node_slice(
+    chunk_nodes: List[Node],
     region_idx: int,
-    chunk_inputs_dim: Dict,
+    chunk_nodes_dim: Dict,
     node_idx: int,
     body: List[str],
+    node: Node,
 ) -> List[str]:
     """
     add chunk slice for input nodes
     """
-    for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]):
-        for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items():
-            if idx == node_idx:
-                chunk_slice = _gen_chunk_slice_dim(dim[0], "chunk_idx", get_node_shape(input_node))
-                body[-1] = _replace_name(body[-1], input_node.name, input_node.name + chunk_slice)
+    for chunk_node_idx, chunk_node in enumerate(chunk_nodes[region_idx]):
+        # inputs node
+        if isinstance(chunk_nodes_dim[region_idx][chunk_node_idx], dict):
+            for idx, dim in chunk_nodes_dim[region_idx][chunk_node_idx].items():
+                if idx == node_idx:
+                    chunk_slice = _gen_chunk_slice_dim(dim[0], "chunk_idx", get_node_shape(chunk_node))
+                    body[-1] = _replace_name(body[-1], chunk_node.name, chunk_node.name + chunk_slice)
+        # outputs node
+        else:
+            if chunk_node.name == node.name or (chunk_node.name in [i.name for i in node.all_input_nodes]):
+                chunk_slice = _gen_chunk_slice_dim(chunk_nodes_dim[region_idx][chunk_node_idx], "chunk_idx",
+                                                   get_node_shape(chunk_node))
+                if get_node_name(chunk_node) == "split":
+                    split_chunk_slice = ""
+                    for i in range(len(chunk_node.meta['tensor_meta'])):
+                        split_chunk_slice += "%s[%d]%s, " % (chunk_node.name, i, chunk_slice)
+                    split_chunk_slice = split_chunk_slice[:-2]
+                    body[-1] = _replace_name(body[-1], chunk_node.name, split_chunk_slice)
+                else:
+                    body[-1] = _replace_name(body[-1], chunk_node.name, chunk_node.name + chunk_slice)
     return body
 
 
@@ -222,7 +235,8 @@ def emit_code_with_chunk(
     chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [j.name for i in chunk_inputs_non_chunk for j in i]
 
     # chunk outputs
-    chunk_outputs = [i["outputs"][0] for i in chunk_infos]
+    chunk_outputs = [i["outputs"] for i in chunk_infos]
+    chunk_outputs_non_tensor = [i["outputs_non_tensor"] for i in chunk_infos]
     chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos]
 
     node_list = search_chunk.reorder_graph.reorder_node_list(node_list)
@@ -248,7 +262,9 @@ def emit_code_with_chunk(
         if within_chunk_region:
             emit_node_func(node, body)
             # replace input var with chunk var
-            body = _replace_input_node(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body)
+            body = _add_node_slice(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body, node)
+            # replace output var with chunk var
+            body = _add_node_slice(chunk_outputs, region_idx, chunk_outputs_dim, node_idx, body, node)
             # ones like
             body = _replace_ones_like(search_chunk, chunk_infos, region_idx, node_idx, node, body)
             # reassgin reshape size
@@ -263,13 +279,8 @@ def emit_code_with_chunk(
         # generate chunk region end
         if node_idx in chunk_ends:
             body.append(
-                _gen_loop_end(
-                    chunk_inputs[region_idx],
-                    chunk_inputs_non_chunk[region_idx],
-                    chunk_outputs[region_idx],
-                    chunk_outputs_dim[region_idx],
-                    node_list,
-                ))
+                _gen_loop_end(chunk_inputs[region_idx], chunk_inputs_non_chunk[region_idx], node_list,
+                              chunk_ends[region_idx], chunk_outputs_non_tensor[region_idx], search_chunk))
             within_chunk_region = False
 
         node_idx += 1
diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py
index a03a5413bc34..f457696e6310 100644
--- a/colossalai/autochunk/estimate_memory.py
+++ b/colossalai/autochunk/estimate_memory.py
@@ -6,7 +6,7 @@
 
 from colossalai.fx.profiler import activation_size, parameter_size
 
-from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape, is_non_memory_node
+from .utils import NodeMgr, delete_free_var_from_last_use, get_node_shape, is_non_memory_node
 
 
 class EstimateMemory(object):
@@ -14,8 +14,8 @@ class EstimateMemory(object):
     Estimate memory with chunk
     """
 
-    def __init__(self) -> None:
-        pass
+    def __init__(self, node_mgr: NodeMgr) -> None:
+        self.node_mgr = node_mgr
 
     def _get_meta_node_size(self, x):
         x = x.meta["tensor_meta"]
@@ -78,7 +78,7 @@ def _get_chunk_inputs_size(self, chunk_inputs, chunk_inputs_non_chunk, node_list
         nodes_to_delete = []
         for chunk_input in chunk_inputs + chunk_inputs_non_chunk:
             chunk_input_users = chunk_input.users.keys()
-            chunk_input_users_idx = [find_idx_by_name(i.name, node_list) for i in chunk_input_users]
+            chunk_input_users_idx = [self.node_mgr.find_node_idx(i) for i in chunk_input_users]
             if all(i <= chunk_end_idx for i in chunk_input_users_idx):
                 if chunk_input not in nodes_to_delete:
                     nodes_to_delete.append(chunk_input)
@@ -212,7 +212,7 @@ def estimate_chunk_inference_mem(
             chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
             chunk_inputs_names = [j.name for i in chunk_inputs for j in i
                                  ] + [j.name for i in chunk_inputs_non_chunk for j in i]
-            chunk_outputs = [i["outputs"][0] for i in chunk_infos]
+            chunk_outputs = [i["outputs"] for i in chunk_infos]
             chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos]
             chunk_sizes = [i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos]
 
@@ -221,7 +221,7 @@ def estimate_chunk_inference_mem(
             if use_chunk and idx in chunk_starts:
                 chunk_within = True
                 chunk_region_idx = chunk_starts.index(idx)
-                act_memory += self._get_output_node_size(chunk_outputs[chunk_region_idx]) / (1024**2)
+                act_memory += sum(self._get_output_node_size(i) for i in chunk_outputs[chunk_region_idx]) / (1024**2)
 
             # determine chunk ratio for current node
             if chunk_within:
diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py
index 0343e52eedd6..3b00d47fb955 100644
--- a/colossalai/autochunk/reorder_graph.py
+++ b/colossalai/autochunk/reorder_graph.py
@@ -1,5 +1,5 @@
 from .trace_indice import TraceIndice
-from .utils import find_idx_by_name
+from .utils import NodeMgr
 
 
 class ReorderGraph(object):
@@ -7,31 +7,27 @@ class ReorderGraph(object):
     Reorder node list and indice trace list
     """
 
-    def __init__(self, trace_indice: TraceIndice) -> None:
+    def __init__(self, trace_indice: TraceIndice, node_mgr: NodeMgr) -> None:
         self.trace_indice = trace_indice
-        self.all_reorder_map = {
-            i: i for i in range(len(self.trace_indice.indice_trace_list))
-        }
+        self.node_mgr = node_mgr
+        self.all_reorder_map = {i: i for i in range(len(self.node_mgr.get_node_list()))}
 
     def _get_reorder_map(self, chunk_info):
-        reorder_map = {i: i for i in range(len(self.trace_indice.node_list))}
+        reorder_map = {i: i for i in range(len(self.node_mgr.get_node_list()))}
 
         chunk_region_start = chunk_info["region"][0]
         chunk_region_end = chunk_info["region"][1]
         chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"]
-        chunk_prepose_nodes_idx = [
-            find_idx_by_name(i.name, self.trace_indice.node_list)
-            for i in chunk_prepose_nodes
-        ]
+        chunk_prepose_nodes_idx = [self.node_mgr.find_node_idx(i) for i in chunk_prepose_nodes]
         # put prepose nodes ahead
         for idx, n in enumerate(chunk_prepose_nodes):
             n_idx = chunk_prepose_nodes_idx[idx]
             reorder_map[n_idx] = chunk_region_start + idx
         # put other nodes after prepose nodes
-        for n in self.trace_indice.node_list[chunk_region_start : chunk_region_end + 1]:
+        for n in self.node_mgr.get_node_slice_by_idx(chunk_region_start, chunk_region_end + 1):
             if n in chunk_prepose_nodes:
                 continue
-            n_idx = find_idx_by_name(n.name, self.trace_indice.node_list)
+            n_idx = self.node_mgr.find_node_idx(n)
             pos = sum([n_idx < i for i in chunk_prepose_nodes_idx])
             reorder_map[n_idx] = n_idx + pos
 
@@ -44,7 +40,7 @@ def _reorder_chunk_info(self, chunk_info, reorder_map):
             chunk_info["region"][1],
         )
         new_inputs_dim = []
-        for idx, input_dim in enumerate(chunk_info["inputs_dim"]):
+        for _, input_dim in enumerate(chunk_info["inputs_dim"]):
             new_input_dim = {}
             for k, v in input_dim.items():
                 new_input_dim[reorder_map[k]] = v
@@ -57,16 +53,14 @@ def _update_all_reorder_map(self, reorder_map):
             self.all_reorder_map[origin_idx] = reorder_map[map_idx]
 
     def _reorder_self_node_list(self, reorder_map):
-        new_node_list = [None for _ in range(len(self.trace_indice.node_list))]
+        new_node_list = [None for _ in range(len(self.node_mgr.get_node_list()))]
         for old_idx, new_idx in reorder_map.items():
-            new_node_list[new_idx] = self.trace_indice.node_list[old_idx]
-        self.trace_indice.node_list = new_node_list
+            new_node_list[new_idx] = self.node_mgr.get_node_by_idx(old_idx)
+        self.node_mgr.update_node_list(new_node_list)
 
     def _reorder_idx_trace(self, reorder_map):
         # reorder list
-        new_idx_trace_list = [
-            None for _ in range(len(self.trace_indice.indice_trace_list))
-        ]
+        new_idx_trace_list = [None for _ in range(len(self.trace_indice.indice_trace_list))]
         for old_idx, new_idx in reorder_map.items():
             new_idx_trace_list[new_idx] = self.trace_indice.indice_trace_list[old_idx]
         self.trace_indice.indice_trace_list = new_idx_trace_list
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 720f3d92553a..0278e03f78de 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -9,6 +9,7 @@
 from .trace_flow import TraceFlow
 from .trace_indice import TraceIndice
 from .utils import (
+    NodeMgr,
     find_chunk_compute_input_and_output_nodes,
     get_logger,
     get_node_shape,
@@ -49,15 +50,17 @@ class SearchChunk(object):
     def __init__(self, gm, max_memory=None, print_mem=False, print_progress=False) -> None:
         self.print_mem = print_mem
         self.print_progress = print_progress
-        self.trace_indice = TraceIndice(list(gm.graph.nodes))
-        self.estimate_memory = EstimateMemory()
+        self.node_mgr = NodeMgr(gm)
+        self.trace_indice = TraceIndice(self.node_mgr)
+        self.estimate_memory = EstimateMemory(self.node_mgr)
         self._init_trace()
-        self.trace_flow = TraceFlow(self.trace_indice)
-        self.reorder_graph = ReorderGraph(self.trace_indice)
+        self.trace_flow = TraceFlow(self.trace_indice, self.node_mgr)
+        self.reorder_graph = ReorderGraph(self.trace_indice, self.node_mgr)
         self.select_chunk = SelectChunk(
             self.trace_indice,
             self.estimate_memory,
             self.reorder_graph,
+            self.node_mgr,
             max_memory=max_memory,
         )
 
@@ -67,7 +70,7 @@ def _init_trace(self) -> None:
         reduce the computation complexity of trace_indice
         """
         # find all max ranges
-        active_nodes = self.estimate_memory.get_active_nodes(self.trace_indice.node_list)
+        active_nodes = self.estimate_memory.get_active_nodes(self.node_mgr.get_node_list())
         cur_node_idx = len(self._get_free_var_idx())
         max_chunk_region_list = []
         while True:
@@ -100,7 +103,7 @@ def _get_free_var_idx(self) -> List:
             free_var_idx (List): all indexs of free vars
         """
         free_var_idx = []
-        for idx, n in enumerate(self.trace_indice.node_list):
+        for idx, n in enumerate(self.node_mgr.get_node_list()):
             if n.op == "placeholder" and get_node_shape(n) is not None:
                 free_var_idx.append(idx)
         return free_var_idx
@@ -164,6 +167,44 @@ def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_
                     chunk_region_end = region[0] - 1
         return chunk_region_start, chunk_region_end
 
+    def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> List:
+        """
+        Find chunk info for a region.
+
+        We are given the region start and region end, and need to find out all chunk info for it.
+        We first loop every dim of start node and end node, to see if we can find dim pair,
+        which is linked in a flow and not computed.
+        If found, we then search flow in the whole region to find out all chunk infos.
+
+        Args:
+            input_trace (List): node's input trace in region
+            output_trace (List): node's output trace in region
+            start_idx (int): region start node index
+            end_idx (int): region end node index
+
+        Returns:
+            chunk_infos: possible regions found
+        """
+        start_traces = input_trace[start_idx]
+        if len(start_traces) > 1:    # TODO need to be removed
+            return []
+        end_trace = output_trace[end_idx]
+        end_node = self.node_mgr.get_node_by_idx(end_idx)
+
+        chunk_infos = []
+        for end_dim, _ in enumerate(end_trace["indice"]):
+            for start_node, start_trace in start_traces.items():
+                for start_dim, _ in enumerate(start_trace["indice"]):
+                    if not self.trace_flow.check_region_start_end(start_node, start_dim, start_idx, end_node, end_dim,
+                                                                  end_idx):
+                        continue
+                    # flow search
+                    chunk_info = self.trace_flow.flow_search(start_idx, start_dim, end_idx, end_dim)
+                    if chunk_info is None:
+                        continue
+                    chunk_infos.append(chunk_info)
+        return chunk_infos
+
     def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Node) -> List:
         """
         Search every possible region within the max chunk region.
@@ -178,7 +219,7 @@ def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Nod
         possible_chunk_region = []
         output_trace = copy.deepcopy(self.trace_indice.indice_trace_list)
         input_trace = []    # trace of a node's input nodes
-        for _, n in enumerate(self.trace_indice.node_list):
+        for _, n in enumerate(self.node_mgr.get_node_list()):
             cur_trace = {}
             for arg in n.args:
                 if type(arg) == type(n) and not is_non_compute_node_except_placeholder(arg):
@@ -188,11 +229,11 @@ def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Nod
         for start_idx in range(max_chunk_region[0], peak_node + 1):
             for end_idx in range(peak_node, max_chunk_region[1] + 1):
                 # skip non compute nodes
-                if is_non_compute_node(self.trace_indice.node_list[start_idx]) or is_non_compute_node(
-                        self.trace_indice.node_list[end_idx]):
+                if is_non_compute_node(self.node_mgr.get_node_by_idx(start_idx)) or is_non_compute_node(
+                        self.node_mgr.get_node_by_idx(end_idx)):
                     continue
                 # select free dim
-                chunk_info = self.trace_flow.find_chunk_info(input_trace, output_trace, start_idx, end_idx)
+                chunk_info = self._find_chunk_info(input_trace, output_trace, start_idx, end_idx)
                 if len(chunk_info) > 0:
                     possible_chunk_region.extend(chunk_info)
         return possible_chunk_region
@@ -254,7 +295,7 @@ def search_region(self) -> Dict:
             init_mem_peak,
             _,
             active_node,
-        ) = self.estimate_memory.estimate_chunk_inference_mem(self.trace_indice.node_list)
+        ) = self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list())
         mem_peak = init_mem_peak
 
         while True:
@@ -267,7 +308,7 @@ def search_region(self) -> Dict:
                 mem_peak,
                 _,
                 active_node,
-            ) = self.estimate_memory.estimate_chunk_inference_mem(self.trace_indice.node_list, chunk_infos)
+            ) = self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list(), chunk_infos)
 
             if self.print_progress:
                 get_logger().info("AutoChunk find chunk region %d = (%d, %d)" %
@@ -277,5 +318,7 @@ def search_region(self) -> Dict:
                 break
         if self.print_mem:
             self.print_mem = False
-            self.estimate_memory.estimate_chunk_inference_mem(self.trace_indice.node_list, chunk_infos, print_mem=True)
+            self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list(),
+                                                              chunk_infos,
+                                                              print_mem=True)
         return chunk_infos
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index 1f3a95727054..1bb7d318cacf 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -1,7 +1,7 @@
 from .estimate_memory import EstimateMemory
 from .reorder_graph import ReorderGraph
 from .trace_indice import TraceIndice
-from .utils import is_non_compute_node
+from .utils import NodeMgr, is_non_compute_node
 
 
 class SelectChunk(object):
@@ -11,11 +11,13 @@ def __init__(
         trace_indice: TraceIndice,
         estimate_memory: EstimateMemory,
         reorder_graph: ReorderGraph,
+        node_mgr: NodeMgr,
         max_memory=None,
     ):
         self.trace_indice = trace_indice
         self.estimate_memory = estimate_memory
         self.reorder_graph = reorder_graph
+        self.node_mgr = node_mgr
         if max_memory is not None:
             self.stratge = "fit_memory"
             self.max_memory = max_memory    # MB
@@ -68,7 +70,7 @@ def _select_fit_memory_chunk_region(self, possible_chunk_regions, chunk_infos, p
         regions_dict = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
-            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(self.trace_indice.node_list, cur_region)
+            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(self.node_mgr.get_node_list(), cur_region)
             cur_chunk_infos = chunk_infos + [cur_region]
             cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(cur_node_list, cur_chunk_infos)[0]
             cur_chunk_region_peak = cur_mem_peak[max_possible_chunk_region[0]:max_possible_chunk_region[1] + 1]
@@ -134,7 +136,7 @@ def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos)
 
     def _get_compute_node_num(self, start, end):
         count = 0
-        for i in self.trace_indice.node_list[start:end + 1]:
+        for i in self.node_mgr.get_node_slice_by_idx(start, end + 1):
             if not is_non_compute_node(i):
                 count += 1
         return count
@@ -161,7 +163,7 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos, p
         regions_dict_list = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
-            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(self.trace_indice.node_list, cur_region)
+            cur_node_list, cur_region = self.reorder_graph.tmp_reorder(self.node_mgr.get_node_list(), cur_region)
             cur_chunk_infos = chunk_infos + [cur_region]
             cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(cur_node_list, cur_chunk_infos)[0]
             cur_chunk_region_peak = cur_mem_peak[max_possible_chunk_region[0]:max_possible_chunk_region[1] + 1]
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index df7343764d05..11dbb266d4b4 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -4,9 +4,10 @@
 
 from .trace_indice import TraceIndice
 from .utils import (
+    NodeMgr,
     find_chunk_all_input_nodes,
     find_chunk_compute_input_and_output_nodes,
-    find_idx_by_name,
+    find_tensor_shape_node,
     flat_list,
     get_node_name,
     get_node_shape,
@@ -16,8 +17,9 @@
 
 class TraceFlow(object):
 
-    def __init__(self, trace_indice: TraceIndice) -> None:
+    def __init__(self, trace_indice: TraceIndice, node_mgr: NodeMgr) -> None:
         self.trace_indice = trace_indice
+        self.node_mgr = node_mgr
 
     def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node):
         """
@@ -31,7 +33,8 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node
         Returns:
             bool: True if check pass
         """
-        start_node_idx = find_idx_by_name(start_node.name, self.trace_indice.node_list)
+        # we use start_node_idx instead of real chunk index
+        start_node_idx = self.node_mgr.find_node_idx(start_node)
         end_node_trace = self.trace_indice._find_trace_from_node(end_node)
         end_node_trace_source = end_node_trace["source"][end_dim]
         sorted_source = sorted(end_node_trace_source.items(), key=lambda d: d[0], reverse=True)
@@ -39,7 +42,7 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node
             if node_idx == start_node_idx and start_dim in node_dim:
                 return True
             # it means we meet a node outside the loop, and the node is not input node
-            if node_idx < start_idx:
+            if node_idx < start_node_idx:
                 return False
         return False
 
@@ -61,29 +64,12 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
             return False
         return True
 
-    def get_node_chunk_dim(self, node_from, node_from_dim, node_to):
-        node_from_source = self.trace_indice._find_source_trace_from_node(node_from)
-        dim_source = node_from_source[node_from_dim]
-        node_to_idx = find_idx_by_name(node_to.name, self.trace_indice.node_list)
-        for k, v in dim_source.items():
-            if k == node_to_idx:
-                return v
-        return None
-
-    def _find_inherit_dim(self, input_node, input_dim, node):
-        input_node_idx = find_idx_by_name(input_node.name, self.trace_indice.node_list)
-        node_trace_source = self.trace_indice._find_source_trace_from_node(node)
-        for node_dim in range(len(get_node_shape(node))):
-            if (input_node_idx in node_trace_source[node_dim]
-                    and input_dim[0] in node_trace_source[node_dim][input_node_idx]):
-                return node_dim
-        return None
-
     def _assgin_single_node_flow(
         self,
         arg_node: Node,
         start_idx: int,
         end_idx: int,
+        cur_node: Node,
         cur_node_dim: int,
         cur_node_compute: Dict,
         cur_node_source: Dict,
@@ -109,7 +95,7 @@ def _assgin_single_node_flow(
         Returns:
             bool: True if this node can be added to the flow, vice versa.
         """
-        arg_idx = find_idx_by_name(arg_node.name, self.trace_indice.node_list)
+        arg_idx = self.node_mgr.find_node_idx(arg_node)
         # arg in chunk range or be inputs
         if not (start_idx <= arg_idx < end_idx):
             return True
@@ -126,6 +112,11 @@ def _assgin_single_node_flow(
                 # chunk dim should be None if shape size is 1
                 if get_node_shape(arg_node)[arg_dim] == 1:
                     arg_dim = None
+                # chunk shape should equal cur node
+                elif get_node_shape(arg_node)[arg_dim] != 1:
+                    if cur_node_dim is not None and get_node_shape(cur_node)[cur_node_dim] != 1:
+                        if get_node_shape(arg_node)[arg_dim] != get_node_shape(cur_node)[cur_node_dim]:
+                            return False
         else:
             arg_dim = None
 
@@ -150,7 +141,7 @@ def _assgin_single_node_flow(
         return True
 
     def _get_all_node_info(self, end_dim, start_idx, end_idx):
-        cur_node_list = [self.trace_indice.node_list[end_idx]]    # start from the last node
+        cur_node_list = [self.node_mgr.get_node_by_idx(end_idx)]    # start from the last node
         all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}}
 
         while len(cur_node_list) > 0:
@@ -178,6 +169,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                         arg,
                         start_idx,
                         end_idx,
+                        cur_node,
                         cur_node_chunk_dim,
                         cur_node_compute,
                         cur_node_source,
@@ -194,7 +186,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                         for arg in arg_list:
                             if get_node_shape(arg) is None:
                                 continue
-                            if not (start_idx <= find_idx_by_name(arg.name, self.trace_indice.node_list) < end_idx):
+                            if not (start_idx <= self.node_mgr.find_node_idx(arg) < end_idx):
                                 continue
                             arg_chunk_dim = all_node_info[arg]["chunk_dim"]
                             arg_fix_dim = all_node_info[arg]["fix_dim"]
@@ -232,7 +224,7 @@ def _get_input_nodes_dim(self, inputs: List[Node], start_idx: int, end_idx: int,
         remove_inputs = []
         for input_node in inputs:
             input_dict = {}
-            input_node_idx = find_idx_by_name(input_node.name, self.trace_indice.node_list)
+            input_node_idx = self.node_mgr.find_node_idx(input_node)
             for user in input_node.users.keys():
                 # skip non compute
                 if is_non_compute_node(user):
@@ -240,7 +232,7 @@ def _get_input_nodes_dim(self, inputs: List[Node], start_idx: int, end_idx: int,
                 # untraced node, mostly non compute
                 if user not in all_node_info:
                     continue
-                user_idx = find_idx_by_name(user.name, self.trace_indice.node_list)
+                user_idx = self.node_mgr.find_node_idx(user)
                 if start_idx <= user_idx <= end_idx:
                     chunk_dim = all_node_info[user]["chunk_dim"]
                     if chunk_dim is not None:
@@ -262,7 +254,7 @@ def _get_input_nodes_dim(self, inputs: List[Node], start_idx: int, end_idx: int,
                 inputs.remove(i)
         return inputs, inputs_dim
 
-    def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int) -> List[Node]:
+    def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int, chunk_info) -> List[Node]:
         """
         get all useless nodes in chunk region and prepose them
 
@@ -279,8 +271,11 @@ def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int)
         for node, node_info in all_node_info.items():
             if node_info["chunk_dim"] is None:
                 maybe_prepose_nodes.append(node)
+        for node in self.node_mgr.get_node_slice_by_idx(start_idx, end_idx):
+            if node not in all_node_info and node not in chunk_info["outputs"]:
+                maybe_prepose_nodes.append(node)
         maybe_prepose_nodes.sort(
-            key=lambda x: find_idx_by_name(x.name, self.trace_indice.node_list),
+            key=lambda x: self.node_mgr.find_node_idx(x),
             reverse=True,
         )    # from last node to first node
         prepose_nodes = []
@@ -303,8 +298,7 @@ def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int)
                         if type(cur_prepose_node_arg) != type(cur_prepose_node):
                             continue
                         # out of loop
-                        if not (start_idx <= find_idx_by_name(cur_prepose_node_arg.name, self.trace_indice.node_list) <
-                                end_idx):
+                        if not (start_idx <= self.node_mgr.find_node_idx(cur_prepose_node_arg) < end_idx):
                             continue
                         # compute op in loop
                         elif cur_prepose_node_arg in all_node_info:
@@ -328,13 +322,12 @@ def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int)
                     if n in maybe_prepose_nodes:
                         maybe_prepose_nodes.remove(n)
         # sort by index
-        prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.trace_indice.node_list))
-
-        return prepose_nodes
+        prepose_nodes.sort(key=lambda x: self.node_mgr.find_node_idx(x))
+        chunk_info["args"]["prepose_nodes"] = prepose_nodes
 
     def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
         # we need to log input nodes to avoid deleteing them in the loop
-        chunk_node_list = self.trace_indice.node_list[start_idx:end_idx + 1]
+        chunk_node_list = self.node_mgr.get_node_slice_by_idx(start_idx, end_idx + 1)
         # also need to get some prepose node's arg out of non_chunk_inputs
         for n in chunk_info["args"]["prepose_nodes"]:
             chunk_node_list.remove(n)
@@ -345,34 +338,41 @@ def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
         return chunk_info
 
     def flow_search(self, start_idx, start_dim, end_idx, end_dim):
-        inputs, outputs = find_chunk_compute_input_and_output_nodes(self.trace_indice.node_list[start_idx:end_idx + 1])
-        # only single ouput
-        if len(outputs) > 1:
-            return None
+        inputs, outputs = find_chunk_compute_input_and_output_nodes(
+            self.node_mgr.get_node_slice_by_idx(start_idx, end_idx + 1))
 
         # get every node's chunk dim and fix dim
         all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx)
         if all_node_info is None:
             return None
 
-        # get input nodes' chunk dim
-        inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info)
-        if inputs is None:
-            return None
-
         chunk_info = {
             "region": (start_idx, end_idx),
-            "inputs": inputs,
+            "inputs": [],
             "inputs_non_chunk": [],
-            "inputs_dim": inputs_dim,
-            "outputs": outputs,
-            "outputs_dim": end_dim,
+            "inputs_dim": [],
+            "outputs": [self.node_mgr.get_node_by_idx(end_idx)],
+            "outputs_non_tensor": {},
+            "outputs_dim": [end_dim],
             "node_chunk_dim": all_node_info,
             "args": {},
         }
 
+        # find chunk info for other outputs
+        if len(find_tensor_shape_node(outputs)) > 1:
+            chunk_info = self._get_other_output_info(outputs, start_idx, start_dim, end_idx, end_dim, chunk_info)
+            if chunk_info is None:
+                return None
+
+        # get input nodes' chunk dim
+        inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info)
+        if inputs is None:
+            return None
+        chunk_info["inputs"] = inputs
+        chunk_info["inputs_dim"] = inputs_dim
+
         # move useless nodes ahead of loop
-        chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes(all_node_info, start_idx, end_idx)
+        self._get_prepose_nodes(all_node_info, start_idx, end_idx, chunk_info)
 
         # find non chunk inputs
         chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)
@@ -382,6 +382,63 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
 
         return chunk_info
 
+    def _get_other_output_info(self, outputs: List[Node], start_idx: int, start_dim: int, end_idx: int, end_dim: int,
+                               chunk_info: Dict):
+        start_node = self.node_mgr.get_node_by_idx(start_idx)
+        # loop all outputs
+        for output in outputs:
+            output_legal = False
+            output_idx = self.node_mgr.find_node_idx(output)
+            # skip the origin output
+            if output_idx == end_idx:
+                continue
+            # skip non tensor
+            if get_node_shape(output) is None:
+                # log shape tensor
+                if len(output.meta['fwd_out']) > 0 and isinstance(output.meta['fwd_out'][0], int):
+                    chunk_info["outputs_non_tensor"][output] = str(output.meta['fwd_out'])
+                continue
+            # loop every dim of outputs, try to find a legal one
+            for output_dim in range(len(get_node_shape(output))):
+                if not self.check_region_start_end(start_node, start_dim, start_idx, output, output_dim, output_idx):
+                    continue
+                new_all_node_info = self._get_all_node_info(output_dim, start_idx, output_idx)
+                if new_all_node_info is None:
+                    continue
+                # check node info legal
+                if self._update_chunk_info(chunk_info, new_all_node_info, output, output_dim) == True:
+                    output_legal = True
+                    break
+            # not legal
+            if output_legal == False:
+                return None
+        return chunk_info
+
+    def _update_chunk_info(self, chunk_info: Dict, new_all_node_info: Dict, output: Node, output_dim: int) -> bool:
+        """
+        check if there is conflict between new node info and old chunk info. If not, update old chunk info
+        """
+        # check if conflict
+        overlap_flag = False
+        for k, v in new_all_node_info.items():
+            if k in chunk_info["node_chunk_dim"]:
+                overlap_flag = True
+                if chunk_info["node_chunk_dim"][k]["chunk_dim"] != v["chunk_dim"]:
+                    return False
+        # if no overlap, we just consider them as prepose nodes, instead of new output
+        if overlap_flag == False:
+            return True
+        # update chunk info
+        for k, v in new_all_node_info.items():
+            if k in chunk_info["node_chunk_dim"]:
+                chunk_info["node_chunk_dim"][k]["fix_dim"] = list(
+                    set(chunk_info["node_chunk_dim"][k]["fix_dim"] + v["fix_dim"]))
+            else:
+                chunk_info["node_chunk_dim"][k] = v
+        chunk_info["outputs"].append(output)
+        chunk_info["outputs_dim"].append(output_dim)
+        return True
+
     def _reassgin_reshape_size(self, chunk_info):
         """
         Some shape args in reshape may have changed due to chunk
@@ -389,10 +446,17 @@ def _reassgin_reshape_size(self, chunk_info):
         """
         chunk_region = chunk_info["region"]
         reshape_size = {}
-        chunk_shape = get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]]
-        for node in self.trace_indice.node_list[chunk_region[0]:chunk_region[1] + 1]:
+        chunk_shape = get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"][0]]
+        for node in self.node_mgr.get_node_slice_by_idx(chunk_region[0], chunk_region[1] + 1):
             if any(i == get_node_name(node) for i in ["reshape", "view"]):
+                if node in chunk_info["args"]["prepose_nodes"]:
+                    continue
+                if node.args[0] in chunk_info["inputs_non_chunk"]:
+                    continue
                 reshape_args = flat_list(node.args[1:])
+                if len(reshape_args) == 1 and get_node_shape(reshape_args[0]) is None and len(
+                        reshape_args[0].meta['fwd_out']) > 1:
+                    continue
                 chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"]
                 new_shape = ""
                 for reshape_arg_dim, reshape_arg in enumerate(reshape_args):
@@ -409,45 +473,8 @@ def _reassgin_reshape_size(self, chunk_info):
         chunk_info["reshape_size"] = reshape_size
         return chunk_info
 
-    def find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> List:
-        """
-        Find chunk info for a region.
-
-        We are given the region start and region end, and need to find out all chunk info for it.
-        We first loop every dim of start node and end node, to see if we can find dim pair,
-        which is linked in a flow and not computed.
-        If found, we then search flow in the whole region to find out all chunk infos.
-
-        Args:
-            input_trace (List): node's input trace in region
-            output_trace (List): node's output trace in region
-            start_idx (int): region start node index
-            end_idx (int): region end node index
-
-        Returns:
-            chunk_infos: possible regions found
-        """
-        start_traces = input_trace[start_idx]
-        if len(start_traces) > 1:    # TODO need to be removed
-            return []
-        end_trace = output_trace[end_idx]
-        end_node = self.trace_indice.node_list[end_idx]
-
-        chunk_infos = []
-        for end_dim, _ in enumerate(end_trace["indice"]):
-            for start_node, start_trace in start_traces.items():
-                for start_dim, _ in enumerate(start_trace["indice"]):
-                    if not self._check_region_start_end(start_node, start_dim, start_idx, end_node, end_dim, end_idx):
-                        continue
-                    # flow search
-                    chunk_info = self.flow_search(start_idx, start_dim, end_idx, end_dim)
-                    if chunk_info is None:
-                        continue
-                    chunk_infos.append(chunk_info)
-        return chunk_infos
-
-    def _check_region_start_end(self, start_node: Node, start_dim: int, start_idx: int, end_node: Node, end_dim: int,
-                                end_idx: int) -> bool:
+    def check_region_start_end(self, start_node: Node, start_dim: int, start_idx: int, end_node: Node, end_dim: int,
+                               end_idx: int) -> bool:
         """
         check if region start and end is legal
         """
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 8f517cf2cdeb..b591fa764423 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -3,14 +3,7 @@
 
 from torch.fx.node import Node
 
-from .utils import (
-    find_first_tensor_arg,
-    find_idx_by_name,
-    flat_list,
-    get_module_node_name,
-    get_node_name,
-    get_node_shape,
-)
+from .utils import NodeMgr, find_first_tensor_arg, flat_list, get_module_node_name, get_node_name, get_node_shape
 
 
 class TraceIndice(object):
@@ -35,8 +28,8 @@ class TraceIndice(object):
         node_list (List)
     """
 
-    def __init__(self, node_list: List[Node]) -> None:
-        self.node_list = node_list
+    def __init__(self, node_mgr: NodeMgr) -> None:
+        self.node_mgr = node_mgr
         self.indice_trace_list = self._init_indice_trace_list()
         self.indice_view_list = {}
         self.indice_count = -1
@@ -45,7 +38,7 @@ def __init__(self, node_list: List[Node]) -> None:
 
     def _init_indice_trace_list(self) -> List:
         indice_trace_list = []
-        for n in self.node_list:
+        for n in self.node_mgr.get_node_list():
             if get_node_shape(n) != None:
                 cur_trace = {
                     "indice": [None for _ in range(len(get_node_shape(n)))],
@@ -99,7 +92,7 @@ def _add_source(
         node_from_trace_source = self._find_source_trace_from_node(node_from)
         node_to_dim = self._transform_indice(node_to, node_to_dim)
         node_to_trace_source = self._find_source_trace_from_node(node_to)
-        node_from_idx = find_idx_by_name(node_from.name, self.node_list)
+        node_from_idx = self.node_mgr.find_node_idx(node_from)
         if init:
             node_to_trace_source[node_to_dim] = {}
         # add dim to cur new source
@@ -200,7 +193,7 @@ def _find_trace_from_node(self, node: Node) -> Dict:
             idx (list): idx of the node
             compute (list): computed idx of the node.
         """
-        node_idx = find_idx_by_name(node.name, self.node_list)
+        node_idx = self.node_mgr.find_node_idx(node)
         node_dict = self.indice_trace_list[node_idx]
         return node_dict
 
@@ -214,7 +207,7 @@ def _find_source_trace_from_node(self, node: Node) -> List:
             idx (list): idx of the node
             compute (list): computed idx of the node.
         """
-        node_idx = find_idx_by_name(node.name, self.node_list)
+        node_idx = self.node_mgr.find_node_idx(node)
         node_dict = self.indice_trace_list[node_idx]
         return node_dict["source"]
 
@@ -227,7 +220,7 @@ def _find_indice_trace_from_node(self, node) -> List:
         Returns:
             idx (list): idx of the node
         """
-        node_idx = find_idx_by_name(node.name, self.node_list)
+        node_idx = self.node_mgr.find_node_idx(node)
         return self.indice_trace_list[node_idx]["indice"]
 
     def _find_compute_trace_from_node(self, node: Node) -> List:
@@ -239,7 +232,7 @@ def _find_compute_trace_from_node(self, node: Node) -> List:
         Returns:
             compute (list): computed idx of the node.
         """
-        node_idx = find_idx_by_name(node.name, self.node_list)
+        node_idx = self.node_mgr.find_node_idx(node)
         return self.indice_trace_list[node_idx]["compute"]
 
     def _assign_indice_as_input(self, node: Node, node_idx: int, input_node=None) -> None:
@@ -454,8 +447,6 @@ def _assign_split_indice(self, node: Node, node_idx: int) -> None:
             node (node)
             node_idx (int)
         """
-        for _ in range(len(get_node_shape(node.args[0]))):
-            self._add_dim(node_idx, 0)
         self._assign_indice_as_input(node, node_idx)
         dim_idx = node.kwargs["dim"]
         self._del_dim(node_idx, dim_idx)
@@ -702,21 +693,20 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
             if (view_dict["idx_to"] == idx_from and view_dict["dim_to"] == dim_from
                     and view_dict["dim_from"] == dim_to):
                 # inheirt indice from current node
-                for dim_to_i in dim_to:
-                    for dim_from_i in dim_from:
-                        self._inherit_indice(origin_node, dim_from_i, node, dim_to_i, init=False)
+                if len_diff == 1:
+                    if origin_shape[dim_from[0]] == 1:
+                        self._inherit_indice(origin_node, dim_from[1], node, dim_to[0], init=False)
+                    elif origin_shape[dim_from[1]] == 1:
+                        self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
+                elif len_diff == -1:
+                    if target_shape[dim_to[0]] == 1:
+                        self._inherit_indice(origin_node, dim_from[0], node, dim_to[1], init=False)
+                    elif target_shape[dim_to[1]] == 1:
+                        self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
                 # inherid indice from input node of last view
                 for dim_to_i in dim_to:
                     self._inherit_indice(view_node.args[0], dim_to_i, node, dim_to_i, init=False)
 
-        # inherit computation
-        compute_log = self._find_compute_trace_from_node(origin_node)
-        for i in dim_from:
-            if origin_trace[i] in compute_log:
-                for j in dim_to:
-                    self._mark_computation(node, node_idx, [j])
-                break
-
         # log view, not used now
         view_dict = {
             "idx_from": [origin_trace[i] for i in dim_from],
@@ -742,7 +732,7 @@ def _clear_trace(self, node_idx: int) -> None:
 
         active_nodes = self.active_node_list[trace_range[0]:trace_range[1] + 1]
         active_nodes = set(flat_list(active_nodes))
-        active_nodes = [find_idx_by_name(i, self.node_list) for i in active_nodes]
+        active_nodes = [self.node_mgr.find_node_idx_by_name(i) for i in active_nodes]
         for i in range(trace_range[0], trace_range[1] + 1):
             trace = self.indice_trace_list[i]
             # clear compute
@@ -758,7 +748,7 @@ def _clear_trace(self, node_idx: int) -> None:
                         dim_source.pop(k)
 
     def trace_indice(self) -> None:
-        for idx, node in enumerate(self.node_list):
+        for idx, node in enumerate(self.node_mgr.get_node_list()):
             node_name = get_node_name(node)
             if node.op == "placeholder":
                 self._assign_all_indice(node, idx)
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index de081b41c26e..c6bbc219e41f 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -9,6 +9,59 @@
 logger = get_dist_logger()
 
 
+class NodeMgr(object):
+
+    def __init__(self, gm) -> None:
+        self._node_list = list(gm.graph.nodes)    # list copy of the graph's nodes taken at construction
+        self._node_dict = {}    # node.name -> index of that node in self._node_list
+        self._set_node_dict()
+
+    def _set_node_dict(self) -> None:
+        """
+        rebuild the dict {node_name: node_idx} from the current node list
+        """
+        self._node_dict.clear()
+        for idx, node in enumerate(self._node_list):
+            self._node_dict[node.name] = idx
+
+    def find_node_idx(self, node: Node) -> int:
+        """
+        find a node's index in the node list, looked up by node.name (KeyError if the name is unknown)
+        """
+        return self._node_dict[node.name]
+
+    def find_node_idx_by_name(self, node_name: str) -> int:
+        """
+        find a node's index in the node list from its name (KeyError if the name is unknown)
+        """
+        return self._node_dict[node_name]
+
+    def get_node_by_idx(self, idx: int) -> Node:
+        """
+        get the node stored at position idx of the node list
+        """
+        return self._node_list[idx]
+
+    def get_node_slice_by_idx(self, start: int, end: int) -> List[Node]:
+        """
+        get the nodes in the half-open index range [start, end) as a new list
+        """
+        return self._node_list[start:end]
+
+    def get_node_list(self) -> List:
+        """
+        get the full node list (the internal list object itself, not a copy)
+        """
+        return self._node_list
+
+    def update_node_list(self, node_list: List) -> None:
+        """
+        replace the node list with node_list and rebuild the name -> index dict
+        """
+        self._node_list = node_list
+        self._set_node_dict()
+
 def get_logger() -> Any:
     return logger
 
@@ -42,6 +95,8 @@ def is_non_compute_node(node: Node) -> bool:
     if any(i == node.op for i in NON_COMPUTE_OP) or any(i == get_node_name(node) for i in NON_COMPUTE_NAME):
         return True
     if "getitem" in node.name:
+        if get_node_shape(node) is not None:
+            return False
         node_args = flat_list(node.args[1:])
         for node_arg in node_args:
             if any(i == str(node_arg) for i in ["None", "Ellipsis"]):
@@ -53,6 +108,8 @@ def is_non_compute_node(node: Node) -> bool:
 
 
 def get_node_shape(node: Node) -> List:
+    if get_node_name(node) == "split":    # for "split" nodes tensor_meta is indexed: use entry 0's shape
+        return node.meta["tensor_meta"][0].shape
     if hasattr(node.meta["tensor_meta"], "shape"):
         return node.meta["tensor_meta"].shape
     return None
@@ -78,7 +135,7 @@ def is_non_compute_node_except_placeholder_output(node: Node) -> bool:
     return is_non_compute_node_except_placeholder(node)
 
 
-def find_idx_by_name(name: str, nodes_list: List) -> int:
+def find_node_idx(name: str, nodes_list: List) -> int:
     for idx, node in enumerate(nodes_list):
         if node.name == name:
             return idx
@@ -162,3 +219,28 @@ def get_node_name(node: Node) -> str:
             else:
                 break
     return node_name
+
+
+def find_tensor_node(node_list: List[Node]) -> List[Node]:
+    """
+    return, in input order, the nodes of node_list for which get_node_shape() is not None
+    """
+    out = []
+    for node in node_list:
+        if get_node_shape(node) is not None:
+            out.append(node)
+    return out
+
+
+def find_tensor_shape_node(node_list: List[Node]) -> List[Node]:
+    """
+    find tensor nodes and shape nodes (meta 'fwd_out' is a non-empty list whose first item is an int)
+    """
+    out = []
+    for node in node_list:
+        if get_node_shape(node) is not None:
+            out.append(node)
+        elif isinstance(node.meta['fwd_out'], list) and len(node.meta['fwd_out']) > 0 and isinstance(
+                node.meta['fwd_out'][0], int):    # list check runs first so len() never sees an unsized value
+            out.append(node)
+    return out
diff --git a/tests/test_autochunk/test_alphafold/test_alphafold_utils.py b/tests/test_autochunk/test_alphafold/test_alphafold_utils.py
index b05191d2bde4..cb250d6402e2 100644
--- a/tests/test_autochunk/test_alphafold/test_alphafold_utils.py
+++ b/tests/test_autochunk/test_alphafold/test_alphafold_utils.py
@@ -23,6 +23,7 @@ def assert_codegen_run(
     concrete_args: List = None,
     max_memory: int = None,
     print_mem: bool = False,
+    print_est_mem: bool = False,
     print_progress: bool = False,
     print_code: bool = False,
 ) -> List[Dict]:
@@ -41,7 +42,7 @@ def assert_codegen_run(
     codegen = AutoChunkCodeGen(
         meta_graph,
         max_memory=max_memory,
-        print_mem=print_mem,
+        print_mem=print_est_mem,
         print_progress=print_progress,
     )
     chunks = codegen.chunk_infos
@@ -61,13 +62,20 @@ def assert_codegen_run(
     code = graph.python_code("self").src
     if print_code:
         print(code)
-    assert "chunk_result = None;  chunk_size = None;" in code
+    assert "chunk_size = None;  " in code
 
     # assert result
     inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
     model.cuda()
     with torch.no_grad():
-        out_gm = gm(*inputs)
+        if print_mem:
+            torch.cuda.reset_peak_memory_stats()
+            now_mem = torch.cuda.memory_allocated() / 1024**2
+        out_gm = gm(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        if print_mem:
+            new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+            print("mem: %.2fMB" % (new_max_mem - now_mem))
         out_model = model(*inputs)
     out_gm = flat_list(out_gm)
     out_model = flat_list(out_model)
@@ -85,9 +93,10 @@ def run_test(
     max_memory: int,
     get_model: Any,
     get_data: Any,
-    print_code: bool,
-    print_mem: bool,
-    print_progress: bool,
+    print_code: bool = False,
+    print_mem: bool = False,
+    print_est_mem: bool = False,
+    print_progress: bool = False,
     get_chunk_target: Any = None,
 ) -> None:
     # launch colossalai
@@ -110,6 +119,7 @@ def run_test(
         max_memory=max_memory,
         print_code=print_code,
         print_mem=print_mem,
+        print_est_mem=print_est_mem,
         print_progress=print_progress,
     )
 
diff --git a/tests/test_autochunk/test_alphafold/test_evoformer_block.py b/tests/test_autochunk/test_alphafold/test_evoformer_block.py
index 787067daac8d..99a54fe18e5d 100644
--- a/tests/test_autochunk/test_alphafold/test_evoformer_block.py
+++ b/tests/test_autochunk/test_alphafold/test_evoformer_block.py
@@ -55,9 +55,10 @@ def get_data(msa_len: int, pair_len: int) -> Tuple[List, List]:
 
 def get_chunk_target() -> Dict:
     return {
-        None: [(118, 123), (219, 237), (264, 289), (302, 309), (97, 104), (144, 152), (185, 193), (241, 242), (21, 46)],
-        20: [(118, 123), (230, 237), (275, 282), (305, 306), (100, 101), (32, 39), (73, 79)],
-        24: [(118, 123)],
+        None: [(120, 123), (222, 237), (269, 289), (305, 311), (100, 105), (146, 152), (187, 193), (241, 242),
+               (25, 50)],
+        20: [(120, 123), (232, 237), (277, 282), (305, 306), (100, 101), (34, 39)],
+        24: [(120, 123)],
     }
 
 
@@ -75,9 +76,6 @@ def test_evoformer_block(data_args, max_memory):
         get_model=get_model,
         get_data=get_data,
         get_chunk_target=get_chunk_target,
-        print_code=False,
-        print_mem=False,
-        print_progress=False,
     )
     mp.spawn(run_func, nprocs=1)
 
@@ -86,10 +84,12 @@ def test_evoformer_block(data_args, max_memory):
     run_test(
         rank=0,
         data_args=(32, 64),
-        max_memory=20,
+        max_memory=24,
         get_model=get_model,
         get_data=get_data,
+        get_chunk_target=get_chunk_target,
         print_code=False,
         print_mem=False,
+        print_est_mem=False,
         print_progress=False,
     )
diff --git a/tests/test_autochunk/test_alphafold/test_evoformer_stack.py b/tests/test_autochunk/test_alphafold/test_evoformer_stack.py
index 45d8e7ac8a84..06aba07990e8 100644
--- a/tests/test_autochunk/test_alphafold/test_evoformer_stack.py
+++ b/tests/test_autochunk/test_alphafold/test_evoformer_stack.py
@@ -70,9 +70,6 @@ def test_evoformer_stack(data_args, max_memory):
         max_memory=max_memory,
         get_model=get_model,
         get_data=get_data,
-        print_code=False,
-        print_mem=False,
-        print_progress=False,
     )
     mp.spawn(run_func, nprocs=1)
 
@@ -81,7 +78,7 @@ def test_evoformer_stack(data_args, max_memory):
     run_test(
         rank=0,
         data_args=(32, 64),
-        max_memory=20,
+        max_memory=None,
         get_model=get_model,
         get_data=get_data,
         print_code=False,
diff --git a/tests/test_autochunk/test_alphafold/test_extramsa_block.py b/tests/test_autochunk/test_alphafold/test_extramsa_block.py
index a2b72ed1a803..1b0273a1684f 100644
--- a/tests/test_autochunk/test_alphafold/test_extramsa_block.py
+++ b/tests/test_autochunk/test_alphafold/test_extramsa_block.py
@@ -55,10 +55,10 @@ def get_data(msa_len: int, pair_len: int) -> Tuple[List, List]:
 
 def get_chunk_target() -> Dict:
     return {
-        None: [(126, 131), (227, 245), (272, 297), (310, 317), (105, 112), (152, 160), (193, 201), (249, 250),
-               (33, 46)],
-        20: [(126, 131), (238, 245), (283, 290), (313, 314), (108, 109), (35, 46)],
-        24: [(126, 131)],
+        None: [(128, 131), (230, 245), (277, 297), (313, 319), (108, 113), (154, 160), (195, 201), (249, 250),
+               (36, 46)],
+        20: [(128, 131), (240, 245), (285, 290), (313, 314), (108, 109), (41, 46)],
+        24: [(128, 131)],
     }
 
 
@@ -75,9 +75,7 @@ def test_extramsa_block(data_args, max_memory):
         max_memory=max_memory,
         get_model=get_model,
         get_data=get_data,
-        print_code=False,
-        print_mem=False,
-        print_progress=False,
+        get_chunk_target=get_chunk_target,
     )
     mp.spawn(run_func, nprocs=1)
 
@@ -86,7 +84,7 @@ def test_extramsa_block(data_args, max_memory):
     run_test(
         rank=0,
         data_args=(32, 64),
-        max_memory=20,
+        max_memory=None,
         get_model=get_model,
         get_data=get_data,
         get_chunk_target=get_chunk_target,
diff --git a/tests/test_autochunk/test_transformer/test_autochunk_gpt.py b/tests/test_autochunk/test_transformer/test_autochunk_gpt.py
index 0ba8f89c2c44..256df8bbbae5 100644
--- a/tests/test_autochunk/test_transformer/test_autochunk_gpt.py
+++ b/tests/test_autochunk/test_transformer/test_autochunk_gpt.py
@@ -17,8 +17,8 @@
 
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
 
-BATCH_SIZE = 2
-SEQ_LENGTH = 256
+BATCH_SIZE = 1
+SEQ_LENGTH = 512
 
 
 def get_data(shape: tuple) -> Tuple[List, List]:
@@ -37,17 +37,14 @@ def get_data(shape: tuple) -> Tuple[List, List]:
 )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("shape", [(BATCH_SIZE, SEQ_LENGTH)])
-@pytest.mark.parametrize("max_memory", [None, 4.5, 5])
-def test_gpt(model, shape, max_memory):
+@pytest.mark.parametrize("max_memory", [None, 6, 8])
+def test_autochunk_gpt(model, shape, max_memory):
     run_func = partial(
         run_test,
         data=get_data(shape),
         max_memory=max_memory,
         model=model,
         config=GPT2Config(n_embd=96, n_position=shape[1], n_layer=2, n_head=4),
-        print_code=False,
-        print_mem=False,
-        print_progress=False,
     )
     mp.spawn(run_func, nprocs=1)
 
@@ -59,7 +56,8 @@ def test_gpt(model, shape, max_memory):
         max_memory=None,
         model=GPT2Model,
         config=GPT2Config(n_embd=96, n_position=SEQ_LENGTH, n_layer=2, n_head=4),
-        print_code=True,
-        print_mem=True,
+        print_code=False,
+        print_est_mem=False,
+        print_mem=False,
         print_progress=False,
     )
diff --git a/tests/test_autochunk/test_transformer/test_transformer_utils.py b/tests/test_autochunk/test_transformer/test_transformer_utils.py
index d33fc04c5b75..cc26168c7191 100644
--- a/tests/test_autochunk/test_transformer/test_transformer_utils.py
+++ b/tests/test_autochunk/test_transformer/test_transformer_utils.py
@@ -20,6 +20,7 @@ def assert_codegen_run(
     model: Any,
     data: tuple,
     max_memory: int = None,
+    print_est_mem: bool = False,
     print_mem: bool = False,
     print_progress: bool = False,
     print_code: bool = False,
@@ -41,7 +42,7 @@ def assert_codegen_run(
     codegen = AutoChunkCodeGen(
         meta_graph,
         max_memory=max_memory,
-        print_mem=print_mem,
+        print_mem=print_est_mem,
         print_progress=print_progress,
     )
     chunks = codegen.chunk_infos
@@ -61,7 +62,7 @@ def assert_codegen_run(
     code = graph.python_code("self").src
     if print_code:
         print(code)
-    assert "chunk_result = None;  chunk_size = None;" in code
+    assert "chunk_size = None;  " in code
 
     # assert result
     inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
@@ -69,26 +70,44 @@ def assert_codegen_run(
     model.cuda().eval()
     gm.eval()
     with torch.no_grad():
-        out_gm = gm(*inputs)
+        if print_mem:
+            torch.cuda.reset_peak_memory_stats()
+            now_mem = torch.cuda.memory_allocated() / 1024**2
+        out_gm = gm(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        if print_mem:
+            new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
+            print("mem: %.2fMB" % (new_max_mem - now_mem))
         out_model = model(*inputs)
-    for k in out_model.keys():
-        if torch.is_tensor(out_gm[k]):
-            assert torch.equal(
-                out_model[k], out_gm[k]
-            ), f'{model.__class__.__name__} has incorrect output {k}, expect {out_model[k]}, but got {out_gm[k]}'
-
+    assert_allclose(out_model, out_gm)
     return chunks
 
 
+def assert_allclose(out_model: Any, out_gm: Any) -> None:
+    """
+    recursively assert out_gm matches out_model: tensors via torch.allclose(atol=1e-4); other leaf types are skipped
+    """
+    if isinstance(out_model, torch.Tensor):
+        assert torch.allclose(out_model, out_gm,
+                              atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                                  torch.abs(out_model - out_gm))
+    elif isinstance(out_model, dict):
+        for k in out_model.keys():    # keys are taken from out_model; out_gm must provide each of them
+            assert_allclose(out_model[k], out_gm[k])
+    elif isinstance(out_model, tuple) or isinstance(out_model, list) or isinstance(out_model, set):
+        for i, j in zip(out_model, out_gm):    # paired by iteration order; zip stops at the shorter input
+            assert_allclose(i, j)
+
+
 def run_test(
     rank: int,
     model: Any,
     config: Any,
     data: tuple,
     max_memory: int,
-    print_code: bool,
-    print_mem: bool,
-    print_progress: bool,
+    print_code: bool = False,
+    print_est_mem: bool = False,
+    print_mem: bool = False,
+    print_progress: bool = False,
     get_chunk_target: Any = None,
 ) -> None:
     model = model(config=config)
@@ -108,6 +127,7 @@ def run_test(
         data=data,
         max_memory=max_memory,
         print_code=print_code,
+        print_est_mem=print_est_mem,
         print_mem=print_mem,
         print_progress=print_progress,
     )
@@ -119,5 +139,3 @@ def run_test(
             str(chunk_found),
             str(chunk_target),
         )
-
-    gpc.destroy()

From 9885ec2b2e1fa6b52f80157c09775558baae4254 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Wed, 1 Feb 2023 17:54:03 +0800
Subject: [PATCH 232/503] [git] remove invalid submodule (#2540)

---
 .gitmodules | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 1e7631bd8760..63387570a548 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,12 +1,3 @@
-[submodule "benchmark"]
-	path = benchmark
-	url = https://github.com/hpcaitech/ColossalAI-Benchmark.git
-	branch = main
-[submodule "examples"]
-	path = examples
-	url = https://github.com/hpcaitech/ColossalAI-Examples.git
-	branch = main
-
 [submodule "inference"]
 	path = inference
 	url = https://github.com/hpcaitech/EnergonAI.git

From c4b15661d7894c7b76f59f18b437254d64f149b9 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Thu, 2 Feb 2023 15:06:43 +0800
Subject: [PATCH 233/503] [autochunk] add benchmark for transformer and
 alphafold (#2543)

---
 .../benchmark_autochunk_alphafold.py          | 131 +++++++++++++++
 .../test_autochunk_alphafold_utils.py}        |   0
 .../test_autochunk_evoformer_block.py}        |   2 +-
 .../test_autochunk_evoformer_stack.py}        |   2 +-
 .../test_autochunk_extramsa_block.py}         |   2 +-
 .../test_autochunk_diffuser_utils.py}         |   0
 .../test_autochunk_unet.py}                   |   2 +-
 .../benchmark_autochunk_transformer.py        | 150 ++++++++++++++++++
 .../test_autochunk_gpt.py                     |   2 +-
 .../test_autochunk_transformer_utils.py}      |   0
 10 files changed, 286 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
 rename tests/test_autochunk/{test_alphafold/test_alphafold_utils.py => test_autochunk_alphafold/test_autochunk_alphafold_utils.py} (100%)
 rename tests/test_autochunk/{test_alphafold/test_evoformer_block.py => test_autochunk_alphafold/test_autochunk_evoformer_block.py} (97%)
 rename tests/test_autochunk/{test_alphafold/test_evoformer_stack.py => test_autochunk_alphafold/test_autochunk_evoformer_stack.py} (97%)
 rename tests/test_autochunk/{test_alphafold/test_extramsa_block.py => test_autochunk_alphafold/test_autochunk_extramsa_block.py} (97%)
 rename tests/test_autochunk/{test_diffuser/test_diffuser_utils.py => test_autochunk_diffuser/test_autochunk_diffuser_utils.py} (100%)
 rename tests/test_autochunk/{test_diffuser/test_unet.py => test_autochunk_diffuser/test_autochunk_unet.py} (96%)
 create mode 100644 tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
 rename tests/test_autochunk/{test_transformer => test_autochunk_transformer}/test_autochunk_gpt.py (97%)
 rename tests/test_autochunk/{test_transformer/test_transformer_utils.py => test_autochunk_transformer/test_autochunk_transformer_utils.py} (100%)

diff --git a/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py b/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
new file mode 100644
index 000000000000..2f56f139abaf
--- /dev/null
+++ b/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
@@ -0,0 +1,131 @@
+import time
+from typing import Any, Dict, List
+
+import torch
+import torch.fx
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if AUTOCHUNK_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def _benchmark_evoformer_stack_gm(
+    data_args: tuple,
+    max_memory: int,
+    get_model: Any,
+    get_data: Any,
+) -> None:
+    # benchmark the autochunk graph module: build model and input (both arg sets are iterated as (name, value) pairs)
+    model = get_model()
+    meta_args, concrete_args = get_data(*data_args)
+    if concrete_args is None:
+        concrete_args = []    # treat "no concrete args" as an empty sequence
+
+    # trace the meta graph on meta-device tensors, propagate meta info, and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args={k: v for k, v in concrete_args},
+    )
+    interp = MetaInfoProp(meta_graph)
+    meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args] + [i[1] for i in concrete_args]
+    interp.propagate(*meta_tensors)
+    codegen = AutoChunkCodeGen(
+        meta_graph,
+        max_memory=max_memory,
+    )
+
+    # trace again and recompile with the autochunk codegen attached
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args={k: v for k, v in concrete_args},
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # init inputs: meta_args values then concrete_args values, with tensors moved to CUDA
+    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
+    model.cuda()
+
+    # bench the generated graph module: peak memory increase first, then mean forward time
+    mem = _benchmark_memory(gm, inputs)
+    speed = _benchmark_speed(gm, inputs)
+    print("evoformer stack gm, mem: %.2fMB, time: %.4fs, data_args: %s" % (mem, speed, str(data_args)))
+
+
+def _benchmark_evoformer_stack_origin(
+    data_args: tuple,
+    get_model: Any,
+    get_data: Any,
+) -> None:
+    # benchmark the original (untraced) model: build model and input
+    model = get_model()
+    meta_args, concrete_args = get_data(*data_args)
+    if concrete_args is None:
+        concrete_args = []    # treat "no concrete args" as an empty sequence
+
+    # init inputs: meta_args values then concrete_args values, with tensors moved to CUDA
+    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
+    model.cuda()
+
+    # bench the model itself: peak memory increase first, then mean forward time
+    mem = _benchmark_memory(model, inputs)
+    speed = _benchmark_speed(model, inputs)
+    print("evoformer stack origin, mem: %.2fMB, time: %.4fs, data_args: %s" % (mem, speed, str(data_args)))
+
+
+def _benchmark_memory(model, inputs):    # returns the peak CUDA memory increase of one forward call
+    with torch.no_grad():
+        torch.cuda.reset_peak_memory_stats()    # start a fresh peak measurement
+        now_mem = torch.cuda.memory_allocated() / 1024**2    # baseline before the call, divided by 1024**2
+        model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])    # output is discarded
+        new_max_mem = torch.cuda.max_memory_allocated() / 1024**2    # peak since the reset, same unit
+    return new_max_mem - now_mem    # peak minus baseline
+
+
+def _benchmark_speed(model, inputs, loop=5):    # returns mean seconds per forward call over `loop` timed runs
+    with torch.no_grad():
+        for _ in range(loop // 2 + 1):    # untimed warm-up runs
+            model(*inputs)
+        torch.cuda.synchronize()    # let warm-up GPU work finish before the clock starts
+        time1 = time.time()    # NOTE(review): time.perf_counter() is the usual clock for interval timing
+        for _ in range(loop):
+            model(*inputs)
+        torch.cuda.synchronize()    # wait for the timed GPU work before reading the clock
+        time2 = time.time()
+    return (time2 - time1) / loop    # loop must be non-zero
+
+
+def benchmark_evoformer_stack():    # runs the original model once, then autochunk at three max_memory settings
+    from test_autochunk_evoformer_stack import get_data, get_model
+    data_args = [128, 256]    # unpacked into get_data(*data_args); presumably (msa_len, pair_len) - TODO confirm
+    print("")
+    _benchmark_evoformer_stack_origin(data_args, get_model, get_data)    # baseline without autochunk
+    _benchmark_evoformer_stack_gm(data_args, 600, get_model, get_data)    # max_memory=600
+    _benchmark_evoformer_stack_gm(data_args, 400, get_model, get_data)    # max_memory=400
+    _benchmark_evoformer_stack_gm(data_args, None, get_model, get_data)    # max_memory=None
+
+
+if __name__ == "__main__":
+    # launch colossalai as a single local process (rank 0, world size 1, nccl backend), then run the benchmark
+    colossalai.launch(
+        config={},
+        rank=0,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+    benchmark_evoformer_stack()
diff --git a/tests/test_autochunk/test_alphafold/test_alphafold_utils.py b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py
similarity index 100%
rename from tests/test_autochunk/test_alphafold/test_alphafold_utils.py
rename to tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py
diff --git a/tests/test_autochunk/test_alphafold/test_evoformer_block.py b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py
similarity index 97%
rename from tests/test_autochunk/test_alphafold/test_evoformer_block.py
rename to tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py
index 99a54fe18e5d..be727701c091 100644
--- a/tests/test_autochunk/test_alphafold/test_evoformer_block.py
+++ b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py
@@ -12,7 +12,7 @@
 except:
     HAS_REPO = False
 
-from test_alphafold_utils import run_test
+from test_autochunk_alphafold_utils import run_test
 
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
 
diff --git a/tests/test_autochunk/test_alphafold/test_evoformer_stack.py b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_stack.py
similarity index 97%
rename from tests/test_autochunk/test_alphafold/test_evoformer_stack.py
rename to tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_stack.py
index 06aba07990e8..5210c1c8d48e 100644
--- a/tests/test_autochunk/test_alphafold/test_evoformer_stack.py
+++ b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_stack.py
@@ -12,7 +12,7 @@
 except:
     HAS_REPO = False
 
-from test_alphafold_utils import run_test
+from test_autochunk_alphafold_utils import run_test
 
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
 
diff --git a/tests/test_autochunk/test_alphafold/test_extramsa_block.py b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py
similarity index 97%
rename from tests/test_autochunk/test_alphafold/test_extramsa_block.py
rename to tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py
index 1b0273a1684f..f8102f351982 100644
--- a/tests/test_autochunk/test_alphafold/test_extramsa_block.py
+++ b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py
@@ -11,7 +11,7 @@
     HAS_REPO = True
 except:
     HAS_REPO = False
-from test_alphafold_utils import run_test
+from test_autochunk_alphafold_utils import run_test
 
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
 
diff --git a/tests/test_autochunk/test_diffuser/test_diffuser_utils.py b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py
similarity index 100%
rename from tests/test_autochunk/test_diffuser/test_diffuser_utils.py
rename to tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py
diff --git a/tests/test_autochunk/test_diffuser/test_unet.py b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
similarity index 96%
rename from tests/test_autochunk/test_diffuser/test_unet.py
rename to tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
index db154b4bba60..9ebe6f393b20 100644
--- a/tests/test_autochunk/test_diffuser/test_unet.py
+++ b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
@@ -13,7 +13,7 @@
     MODELS = []
     HAS_REPO = False
 
-from test_diffuser_utils import run_test
+from test_autochunk_diffuser_utils import run_test
 
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
 
diff --git a/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py b/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
new file mode 100644
index 000000000000..43cefcb74988
--- /dev/null
+++ b/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
@@ -0,0 +1,150 @@
+import time
+from typing import Any, Dict, List
+
+import torch
+import torch.fx
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.fx.profiler import parameter_size
+from colossalai.utils import free_port
+
+if AUTOCHUNK_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def _benchmark_autochunk_gpt_gm(
+    model: Any,
+    data: tuple,
+    max_memory: int = None,
+) -> None:
+    model = model.cuda().eval()
+
+    # build model and input
+    meta_args, concrete_args, sequence = data
+    if concrete_args is None:
+        concrete_args = {}
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()},
+        concrete_args={k: v for k, v in concrete_args.items()},
+    )
+    interp = MetaInfoProp(meta_graph)
+    meta_tensors = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
+    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
+    interp.propagate(*meta_tensors)
+    codegen = AutoChunkCodeGen(
+        meta_graph,
+        max_memory=max_memory,
+    )
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model.cuda().eval(),
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()},
+        concrete_args={k: v for k, v in concrete_args.items()},
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # init inputs
+    inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
+    model.cuda().eval()
+
+    # bench
+    para_mem = float(parameter_size(model)) / 1024**2 * 6
+    act_mem = _benchmark_memory(gm, inputs)
+    speed = _benchmark_speed(gm, inputs)
+    print("gpt autochunk, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
+          (speed, act_mem, para_mem, act_mem + para_mem))
+
+
+def _benchmark_autochunk_gpt_origin(
+    model: Any,
+    data: tuple,
+) -> float:
+    # baseline run of the unmodified model; returns the activation memory measured by _benchmark_memory (MB)
+    meta_args, concrete_args, sequence = data
+    if concrete_args is None:
+        concrete_args = {}
+
+    # init inputs
+    inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
+    model.cuda().eval()
+
+    # bench
+    para_mem = float(parameter_size(model)) / 1024**2 * 6
+    act_mem = _benchmark_memory(model, inputs)
+    speed = _benchmark_speed(model, inputs)
+    print("gpt origin, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
+          (speed, act_mem, para_mem, act_mem + para_mem))
+    return act_mem
+
+
+def _benchmark_memory(model, inputs):
+    with torch.no_grad():
+        torch.cuda.reset_peak_memory_stats()
+        now_mem = float(torch.cuda.memory_allocated()) / 1024**2
+        model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        new_max_mem = float(torch.cuda.max_memory_allocated()) / 1024**2
+    return new_max_mem - now_mem
+
+
+def _benchmark_speed(model, inputs, loop=5):
+    with torch.no_grad():
+        for _ in range(loop // 2 + 1):
+            model(*inputs)
+        torch.cuda.synchronize()
+        time1 = time.time()
+        for _ in range(loop):
+            model(*inputs)
+        torch.cuda.synchronize()
+        time2 = time.time()
+    return (time2 - time1) / loop
+
+
+def benchmark_autochunk_gpt(batch=1, seq=512, n_embd=768, n_head=12):
+    from test_autochunk_gpt import GPT2Config, GPT2Model, get_data
+    model = GPT2Model
+    config = GPT2Config(n_embd=n_embd, n_position=seq, n_layer=2, n_head=n_head)
+    config.max_position_embeddings = seq
+    model = model(config=config)
+    shape = [batch, seq]
+    print("\nbatch: %d, seq: %d, n_embd: %d, n_head: %d" % (batch, seq, n_embd, n_head))
+    max_mem = _benchmark_autochunk_gpt_origin(model, get_data(shape))
+    for ratio in [0.5, 0.4, 0.3, 0.2]:
+        try:
+            _benchmark_autochunk_gpt_gm(model, get_data(shape), max_mem * ratio)
+        except RuntimeError as e:
+            if e.args and e.args[0] == 'Search failed. Try a larger memory threshold.':
+                break
+            # any other RuntimeError is a genuine failure: propagate it instead of silently moving on
+            raise
+    _benchmark_autochunk_gpt_gm(model, get_data(shape), None)
+
+
+if __name__ == "__main__":
+    # launch colossalai
+    colossalai.launch(
+        config={},
+        rank=0,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+    benchmark_autochunk_gpt(batch=1, seq=1024, n_embd=768, n_head=12)
+    benchmark_autochunk_gpt(batch=1, seq=2048, n_embd=768, n_head=12)
+    benchmark_autochunk_gpt(batch=1, seq=4096, n_embd=768, n_head=12)
+    benchmark_autochunk_gpt(batch=1, seq=6144, n_embd=768, n_head=12)
+    benchmark_autochunk_gpt(batch=1, seq=8192, n_embd=768, n_head=12)
diff --git a/tests/test_autochunk/test_transformer/test_autochunk_gpt.py b/tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py
similarity index 97%
rename from tests/test_autochunk/test_transformer/test_autochunk_gpt.py
rename to tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py
index 256df8bbbae5..6e1076ec792b 100644
--- a/tests/test_autochunk/test_transformer/test_autochunk_gpt.py
+++ b/tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py
@@ -13,7 +13,7 @@
     MODELS = []
     HAS_REPO = False
 
-from test_transformer_utils import run_test
+from test_autochunk_transformer_utils import run_test
 
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
 
diff --git a/tests/test_autochunk/test_transformer/test_transformer_utils.py b/tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py
similarity index 100%
rename from tests/test_autochunk/test_transformer/test_transformer_utils.py
rename to tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py

From fa3d66feb9793a0e0003d827066a70fabe924a50 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Thu, 2 Feb 2023 16:19:26 +0800
Subject: [PATCH 234/503] support unet metainfo prop (#2544)

---
 colossalai/fx/_meta_registrations.py | 31 +++++++++++++---------------
 colossalai/fx/profiler/opcount.py    | 21 +++++++++++++++++++
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/colossalai/fx/_meta_registrations.py b/colossalai/fx/_meta_registrations.py
index 8c0201c71e08..153214447223 100644
--- a/colossalai/fx/_meta_registrations.py
+++ b/colossalai/fx/_meta_registrations.py
@@ -164,18 +164,9 @@ def pick_memory_format():
 
 
 @register_meta(aten._convolution.default)
-def meta_conv_1(
-    input_tensor: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    stride: List[int],
-    padding: List[int],
-    dilation: List[int],
-    is_transposed: bool,
-    output_padding: List[int],
-    groups: int,
-    *extra_args
-):
+def meta_conv_1(input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, stride: List[int],
+                padding: List[int], dilation: List[int], is_transposed: bool, output_padding: List[int], groups: int,
+                *extra_args):
     out = meta_conv(input_tensor, weight, bias, stride, padding, dilation, is_transposed, output_padding, groups)
     return out
 
@@ -233,11 +224,8 @@ def meta_cuda_rnn(
     if is_input_packed:
         out_shape = [batch_sizes_sum, out_size * num_directions]
     else:
-        out_shape = (
-            [mini_batch, seq_length, out_size * num_directions]
-            if batch_first
-            else [seq_length, mini_batch, out_size * num_directions]
-        )
+        out_shape = ([mini_batch, seq_length, out_size *
+                      num_directions] if batch_first else [seq_length, mini_batch, out_size * num_directions])
     output = input.new_empty(out_shape)
 
     cell_shape = [num_layers * num_directions, mini_batch, hidden_size]
@@ -372,6 +360,15 @@ def meta_ln_backward(dY: torch.Tensor, input: torch.Tensor, normalized_shape, me
     return dX, dgamma, dbeta
 
 
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/group_norm.cpp
+@register_meta(aten.native_group_norm_backward.default)
+def meta_gn_backward(dY: torch.Tensor, input: torch.Tensor, mean, rstd, gamma, N, C, HxW, group, grad_input_mask):
+    dX = torch.empty_like(input)
+    dgamma = torch.empty_like(gamma)
+    dbeta = torch.empty_like(gamma)
+    return dX, dgamma, dbeta
+
+
 # ================================== Misc ==========================================
 # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
 @register_meta(aten.roll.default)
diff --git a/colossalai/fx/profiler/opcount.py b/colossalai/fx/profiler/opcount.py
index 6bd612ad2fd1..d780ef6d49c9 100644
--- a/colossalai/fx/profiler/opcount.py
+++ b/colossalai/fx/profiler/opcount.py
@@ -70,6 +70,19 @@ def bmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
     return flops
 
 
+def baddbmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for the baddbmm(batch add and batch matmul) operation.
+    """
+    # Inputs = [input, batch1, batch2]
+    # out = input + batch1 x batch2
+    assert len(inputs) == 3, len(inputs)
+    n, c, t = inputs[1].shape
+    d = inputs[2].shape[-1]
+    flops = n * c * t * d
+    return flops
+
+
 def conv_flop_count(
     x_shape: List[int],
     w_shape: List[int],
@@ -196,6 +209,7 @@ def zero_flop_jit(*args):
         aten.matmul.default: matmul_flop_jit,
         aten.addmm.default: addmm_flop_jit,
         aten.bmm.default: bmm_flop_jit,
+        aten.baddbmm.default: baddbmm_flop_jit,
 
     # convolution
         aten.convolution.default: conv_flop_jit,
@@ -209,6 +223,8 @@ def zero_flop_jit(*args):
         aten.cudnn_batch_norm_backward.default: partial(batchnorm_flop_jit, training=True),
         aten.native_layer_norm.default: norm_flop_counter(2, 0),
         aten.native_layer_norm_backward.default: norm_flop_counter(2, 0),
+        aten.native_group_norm.default: norm_flop_counter(2, 0),
+        aten.native_group_norm_backward.default: norm_flop_counter(2, 0),
 
     # pooling
         aten.avg_pool1d.default: elementwise_flop_counter(1, 0),
@@ -230,6 +246,8 @@ def zero_flop_jit(*args):
         aten._adaptive_avg_pool3d_backward.default: elementwise_flop_counter(0, 1),
         aten.embedding_dense_backward.default: elementwise_flop_counter(0, 1),
         aten.embedding.default: elementwise_flop_counter(1, 0),
+        aten.upsample_nearest2d.vec: elementwise_flop_counter(0, 1),
+        aten.upsample_nearest2d_backward.vec: elementwise_flop_counter(0, 1),
     }
 
     elementwise_flop_aten = [
@@ -251,6 +269,9 @@ def zero_flop_jit(*args):
         aten.mean.dim,
         aten.sub.Tensor,
         aten.sub_.Tensor,
+        aten.exp.default,
+        aten.sin.default,
+        aten.cos.default,
 
     # activation op
         aten.hardswish.default,

From 5b1854309a066f058f5a51c8adcbff1e51870c25 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Thu, 2 Feb 2023 16:42:38 +0800
Subject: [PATCH 235/503] [hotfix] fix zero ddp warmup check (#2545)

---
 colossalai/gemini/gemini_mgr.py         | 4 ++++
 colossalai/nn/parallel/data_parallel.py | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/colossalai/gemini/gemini_mgr.py b/colossalai/gemini/gemini_mgr.py
index 08fc0cf922d4..72a5e4a7f19b 100644
--- a/colossalai/gemini/gemini_mgr.py
+++ b/colossalai/gemini/gemini_mgr.py
@@ -58,6 +58,10 @@ def reset_attributes(self):
         self._evict_time = 0
         self._comp_cuda_demand_time = 0
 
+    @property
+    def need_warmup(self) -> bool:
+        return self.policy_name in ('auto', 'const')
+
     def is_warmup(self):
         return self._warmup
 
diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index a30416ab9fdf..a313da59b056 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -269,7 +269,8 @@ def forward(self, *args, **kwargs):
         # check whether we are in a inference mode
         grad_flag = torch.is_grad_enabled()
         if not grad_flag:
-            assert not self.gemini_manager.is_warmup(), "You should run a completed iteration as your warmup iter"
+            assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
+            ), "You should run a completed iteration as your warmup iter"
 
         args, kwargs = _cast_float(args, torch.half), _cast_float(kwargs, torch.half)
         self.module.zero_grad(set_to_none=True)

From 8438c35a5f10993d7622f864e84f68c2302312be Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 2 Feb 2023 18:16:03 +0800
Subject: [PATCH 236/503] [doc] added pull  request template (#2550)

* [doc] added pull  request template

* polish code

* polish code
---
 .github/pull_request_template.md | 36 ++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 .github/pull_request_template.md

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 000000000000..1d3c43ae402a
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,36 @@
+## 📌 Checklist before creating the PR
+
+- [ ] I have created an issue for this PR for traceability
+- [ ] The title follows the standard format: `[doc/gemini/tensor/...]: A concise description`
+- [ ] I have added relevant tags if possible for us to better distinguish differnt PRs
+
+
+## 🚨 Issue number
+
+> Link this PR to your issue with words like fixed to automatically close the linked issue upon merge
+>
+> e.g. fixed #1234, closed #1234, resolved #1234
+
+
+
+## 📝 What does this PR do?
+
+> Summarize your work here.
+> if you have any plots/diagrams/screenshots/tables, please attach them here.
+
+
+
+## 💥 Checklist before requesting a review
+
+- [ ] I have linked my PR to an issue ([instruction](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))
+- [ ] My issue clearly describes the problem/feature/proposal, with diagrams/charts/table/code if possible
+- [ ] I have performed a self-review of my code
+- [ ] I have added thorough tests.
+- [ ] I have added docstrings for all the functions/methods I implemented
+
+## ⭐️ Do you enjoy contributing to Colossal-AI?
+
+- [ ] 🌝 Yes, I do.
+- [ ] 🌚 No, I don't.
+
+Tell us more if you don't enjoy contributing to Colossal-AI.

From dd14783f75c164775ee19eeb5fe89094465e27e7 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 3 Feb 2023 09:47:13 +0800
Subject: [PATCH 237/503] [kernel] fixed repeated loading of kernels (#2549)

* [kernel] fixed repeated loading of kernels

* polish code

* polish code
---
 colossalai/kernel/cuda_native/__init__.py     |  4 +-
 colossalai/kernel/cuda_native/layer_norm.py   | 25 +++++-----
 .../kernel/cuda_native/scaled_softmax.py      | 47 ++++++++-----------
 op_builder/builder.py                         | 29 ++++++++++--
 4 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/colossalai/kernel/cuda_native/__init__.py b/colossalai/kernel/cuda_native/__init__.py
index 8f857ff5d9f1..1d5a6ce495bd 100644
--- a/colossalai/kernel/cuda_native/__init__.py
+++ b/colossalai/kernel/cuda_native/__init__.py
@@ -1,3 +1,5 @@
 from .layer_norm import MixedFusedLayerNorm as LayerNorm
 from .multihead_attention import MultiHeadAttention
-from .scaled_softmax import FusedScaleMaskSoftmax
+from .scaled_softmax import FusedScaleMaskSoftmax, ScaledUpperTriangMaskedSoftmax
+
+__all__ = ['LayerNorm', 'MultiHeadAttention', 'FusedScaleMaskSoftmax', 'ScaledUpperTriangMaskedSoftmax']
diff --git a/colossalai/kernel/cuda_native/layer_norm.py b/colossalai/kernel/cuda_native/layer_norm.py
index 4be3363882ce..40355a41ed0d 100644
--- a/colossalai/kernel/cuda_native/layer_norm.py
+++ b/colossalai/kernel/cuda_native/layer_norm.py
@@ -9,24 +9,31 @@
 from torch.nn import init
 from torch.nn.parameter import Parameter
 
+from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
+
+try:
+    from colossalai._C import layer_norm
+except ImportError:
+    layer_norm = None
+
 
 class FusedLayerNormAffineFunction(torch.autograd.Function):
 
     @staticmethod
     @custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, input, weight, bias, normalized_shape, eps):
-        try:
-            from colossalai._C import layer_norm
-        except ImportError:
-            from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
-            layer_norm = LayerNormBuilder().load()
-
         ctx.normalized_shape = normalized_shape
         ctx.eps = eps
         input_ = input.contiguous()
         weight_ = weight.contiguous()
         bias_ = bias.contiguous()
+
+        global layer_norm
+        if layer_norm is None:
+
+            layer_norm = LayerNormBuilder().load()
         output, mean, invvar = layer_norm.forward_affine(input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
+        ctx.layernorm_op = layer_norm
         ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
 
         return output
@@ -34,12 +41,6 @@ def forward(ctx, input, weight, bias, normalized_shape, eps):
     @staticmethod
     @custom_bwd
     def backward(ctx, grad_output):
-        try:
-            from colossalai._C import layer_norm
-        except ImportError:
-            from colossalai.kernel.op_builder.layernorm import LayerNormBuilder
-            layer_norm = LayerNormBuilder().load()
-
         input_, weight_, bias_, mean, invvar = ctx.saved_tensors
         grad_input = grad_weight = grad_bias = None
         grad_input, grad_weight, grad_bias \
diff --git a/colossalai/kernel/cuda_native/scaled_softmax.py b/colossalai/kernel/cuda_native/scaled_softmax.py
index 44d750c5cbde..580e5c81aabb 100644
--- a/colossalai/kernel/cuda_native/scaled_softmax.py
+++ b/colossalai/kernel/cuda_native/scaled_softmax.py
@@ -1,11 +1,17 @@
-"""This code from NVIDIA Megatron
-   with some changes. """
-
 import enum
 
 import torch
 import torch.nn as nn
 
+from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
+from colossalai.kernel.op_builder.scaled_upper_triangle_masked_softmax import ScaledUpperTrainglemaskedSoftmaxBuilder
+
+try:
+    from colossalai._C import scaled_masked_softmax, scaled_upper_triang_masked_softmax
+except ImportError:
+    scaled_masked_softmax = None
+    scaled_upper_triang_masked_softmax = None
+
 
 class AttnMaskType(enum.Enum):
     padding = 1
@@ -23,7 +29,9 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, inputs, scale):
-        from colossalai.kernel import scaled_upper_triang_masked_softmax
+        global scaled_upper_triang_masked_softmax
+        if scaled_upper_triang_masked_softmax is None:    # build and load the kernel only when no pre-built one was imported
+            scaled_upper_triang_masked_softmax = ScaledUpperTrainglemaskedSoftmaxBuilder().load()
 
         scale_t = torch.tensor([scale])
         softmax_results = scaled_upper_triang_masked_softmax.forward(inputs, scale_t[0])
@@ -33,8 +41,6 @@ def forward(ctx, inputs, scale):
 
     @staticmethod
     def backward(ctx, output_grads):
-        from colossalai.kernel import scaled_upper_triang_masked_softmax
-
         softmax_results, scale_t = ctx.saved_tensors
         input_grads = scaled_upper_triang_masked_softmax.backward(output_grads, softmax_results, scale_t[0])
 
@@ -52,30 +58,23 @@ class ScaledMaskedSoftmax(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, inputs, mask, scale):
-        try:
-            from colossalai._C import scaled_masked_softmax
-        except ImportError:
-            from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
-            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
-
         scale_t = torch.tensor([scale])
 
+        # build and load kernel if not pre-built
+        global scaled_masked_softmax
+        if scaled_masked_softmax is None:
+            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
+
         softmax_results = scaled_masked_softmax.forward(inputs, mask, scale_t[0])
         ctx.save_for_backward(softmax_results, scale_t)
         return softmax_results
 
     @staticmethod
     def backward(ctx, output_grads):
-        try:
-            from colossalai._C import scaled_masked_softmax
-        except ImportError:
-            from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
-            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
-
         softmax_results, scale_t = ctx.saved_tensors
 
         input_grads = scaled_masked_softmax.backward(output_grads, softmax_results, scale_t[0])
-        return input_grads, None, None
+        return input_grads, None, None, None
 
 
 class FusedScaleMaskSoftmax(nn.Module):
@@ -113,14 +112,6 @@ def __init__(
         self.mask_func = mask_func
         self.softmax_in_fp32 = softmax_in_fp32
         self.scale = scale
-
-        try:
-            from colossalai._C import scaled_masked_softmax
-        except ImportError:
-            from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
-            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
-        self.scaled_masked_softmax = scaled_masked_softmax
-
         assert (self.scale is None or softmax_in_fp32), "softmax should be in fp32 when scaled"
 
     def forward(self, input, mask):
@@ -186,4 +177,8 @@ def forward_torch_softmax(self, input, mask):
         return probs
 
     def get_batch_per_block(self, sq, sk, b, np):
-        return self.scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
+        # build and load kernel if not pre-built (the module global stays None until first loaded)
+        global scaled_masked_softmax
+        if scaled_masked_softmax is None:
+            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
+        return scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
diff --git a/op_builder/builder.py b/op_builder/builder.py
index dc9ea8e115d8..e2fdde3affa8 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -6,6 +6,23 @@
 from typing import List
 
 
+def print_rank_0(message):
+    """
+    Print on only one process to avoid spamming.
+    """
+    try:
+        import torch.distributed as dist
+        if not dist.is_initialized():
+            is_main_rank = True
+        else:
+            is_main_rank = dist.get_rank() == 0
+    except ImportError:
+        is_main_rank = True
+
+    if is_main_rank:
+        print(message)
+
+
 class Builder(ABC):
     """
     Builder is the base class to build extensions for PyTorch.
@@ -117,7 +134,7 @@ def load(self, verbose=True):
         try:
             op_module = self.import_op()
             if verbose:
-                print(f"OP {self.prebuilt_import_path} already exists, skip building.")
+                print_rank_0(f"OP {self.prebuilt_import_path} already exists, skip building.")
         except ImportError:
             # construct the build directory
             import torch
@@ -130,9 +147,11 @@ def load(self, verbose=True):
             Path(build_directory).mkdir(parents=True, exist_ok=True)
 
             if verbose:
-                print("=========================================================================================")
-                print(f"No pre-built kernel is found, build and load the {self.name} kernel during runtime now")
-                print("=========================================================================================")
+                print_rank_0(
+                    "=========================================================================================")
+                print_rank_0(f"No pre-built kernel is found, build and load the {self.name} kernel during runtime now")
+                print_rank_0(
+                    "=========================================================================================")
 
             # load the kernel
             op_module = load(name=self.name,
@@ -146,7 +165,7 @@ def load(self, verbose=True):
 
         build_duration = time.time() - start_build
         if verbose:
-            print(f"Time to load {self.name} op: {build_duration} seconds")
+            print_rank_0(f"Time to load {self.name} op: {build_duration} seconds")
 
         return op_module
 

From 578374d0dec613d1d1f6595c01a925168c26d4fc Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 3 Feb 2023 10:47:00 +0800
Subject: [PATCH 238/503] [doc] fixed the typo in pr template (#2556)

---
 .github/pull_request_template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 1d3c43ae402a..f3431226ecc9 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -2,7 +2,7 @@
 
 - [ ] I have created an issue for this PR for traceability
 - [ ] The title follows the standard format: `[doc/gemini/tensor/...]: A concise description`
-- [ ] I have added relevant tags if possible for us to better distinguish differnt PRs
+- [ ] I have added relevant tags if possible for us to better distinguish different PRs
 
 
 ## 🚨 Issue number

From 4af31d263dd12c9238607fa48e5fd0488cd8cf25 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 3 Feb 2023 10:47:27 +0800
Subject: [PATCH 239/503] [doc] updated the CHANGE_LOG.md for github release
 page (#2552)

---
 CHANGE_LOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md
index bbf1d62f908b..385137665906 100644
--- a/CHANGE_LOG.md
+++ b/CHANGE_LOG.md
@@ -2,6 +2,8 @@
 
 All notable changes to this project will be documented in this file.
 
+🚩 **We have moved the change log to the GitHub [release page](https://github.com/hpcaitech/ColossalAI/releases)**
+
 ## v0.0.2 | 2022-02
 
 ### Added

From 51d4d6e718272791265690661520b0c5b00f14ff Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 3 Feb 2023 10:48:15 +0800
Subject: [PATCH 240/503] Automated submodule synchronization (#2492)

Co-authored-by: github-actions <github-actions@github.com>
---
 inference | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference b/inference
index 6dadc2a4f293..5d250f4af628 160000
--- a/inference
+++ b/inference
@@ -1 +1 @@
-Subproject commit 6dadc2a4f293f4314280d6250463d986536e46ea
+Subproject commit 5d250f4af6283f65a701636628ffeef10447e650

From 552183bb745012809a96e30b525a759f8e6b7cd3 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Fri, 3 Feb 2023 11:44:10 +0800
Subject: [PATCH 241/503] [polish] polish ColoTensor and its submodules (#2537)

---
 colossalai/tensor/colo_parameter.py         |  2 +-
 colossalai/tensor/colo_tensor.py            |  7 ++-
 colossalai/tensor/compute_spec.py           |  6 +-
 colossalai/tensor/distspec.py               |  9 +--
 colossalai/tensor/process_group.py          | 66 ++++++++++++---------
 colossalai/utils/model/colo_init_context.py | 50 +++++++---------
 6 files changed, 75 insertions(+), 65 deletions(-)

diff --git a/colossalai/tensor/colo_parameter.py b/colossalai/tensor/colo_parameter.py
index 92220d9e2a38..b384579feb35 100644
--- a/colossalai/tensor/colo_parameter.py
+++ b/colossalai/tensor/colo_parameter.py
@@ -71,7 +71,7 @@ def from_torch_tensor(tensor: torch.Tensor,
         return tensor
 
     def __repr__(self):
-        return f'ColoParameter: {ColoTensor.__repr__(self)}'
+        return super(ColoParameter, self).__repr__()
 
     @classmethod
     def __torch_function__(cls, func, types, args=..., kwargs=None):
diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py
index b27f5dea76a9..474dc7a1e9bf 100644
--- a/colossalai/tensor/colo_tensor.py
+++ b/colossalai/tensor/colo_tensor.py
@@ -189,7 +189,12 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
                 return _convert_output(ret, colo_spec)
 
     def __repr__(self):
-        return f'ColoTensor:\n{super().__repr__()}\n{self.dist_spec}\n{self.process_group}\n{self.compute_spec}'
+        output_list = [super(ColoTensor, self).__repr__()]
+        output_list.append(str(self.process_group))
+        output_list.append(str(self.dist_spec))
+        if self.compute_spec is not None:
+            output_list.append(str(self.compute_spec))
+        return "\n".join(output_list)
 
     def _redistribute(self, dist_spec: _DistSpec) -> None:
         """_redistribute
diff --git a/colossalai/tensor/compute_spec.py b/colossalai/tensor/compute_spec.py
index a9774c34c01b..73328285ee93 100644
--- a/colossalai/tensor/compute_spec.py
+++ b/colossalai/tensor/compute_spec.py
@@ -9,9 +9,9 @@ class ComputePattern(Enum):
 
 
 class ComputeSpec(object):
-    """ComputeSpec 
+    """ComputeSpec
     The Specification for compuattion pattern
-    
+
     Args:
         compute_pattern (ComputePattern): an Enum instance for compute pattern.
     """
@@ -23,7 +23,7 @@ def __init__(self, compute_pattern: ComputePattern) -> None:
         self.output_replicate = True
 
     def __repr__(self):
-        return f'Compute pattern: {self.compute_pattern}'
+        return f'ComputeSpec(pattern={self.compute_pattern}, replicate_output={self.output_replicate})'
 
     def set_output_replicate(self, flag: bool = True):
         self.output_replicate = flag
diff --git a/colossalai/tensor/distspec.py b/colossalai/tensor/distspec.py
index 0b62cbdda2c5..8dd0d8791537 100644
--- a/colossalai/tensor/distspec.py
+++ b/colossalai/tensor/distspec.py
@@ -11,7 +11,7 @@ class DistPlacementPattern(Enum):
 
 class _DistSpec:
     """_DistSpec
-    
+
     A class indicates Distributed Specification.
     The DistSpec is only works for the tensor parallel process groups.
     Because the dist spec of data parallel process group can be automatically deduced.
@@ -39,11 +39,12 @@ def __eq__(self, other: "_DistSpec") -> bool:
         return True
 
     def __repr__(self) -> str:
-        res_list = ["DistSpec:"]
+        attr_list = []
         for attr in dir(self):
             if not attr.startswith('__'):
-                res_list.append(f'\n\t{attr}: {str(getattr(self, attr))}')
-        return ''.join(res_list)
+                attr_list.append(f'{attr}={str(getattr(self, attr))}')
+        attr_str = ", ".join(attr_list)
+        return "DistSpec(" + attr_str + ")"
 
 
 def ReplicaSpec() -> _DistSpec:
diff --git a/colossalai/tensor/process_group.py b/colossalai/tensor/process_group.py
index e7e565071e58..f108bdc247f5 100644
--- a/colossalai/tensor/process_group.py
+++ b/colossalai/tensor/process_group.py
@@ -1,29 +1,36 @@
-import torch
 from typing import List, Optional
-from colossalai.logging import get_dist_logger
+
+import torch
+
 from colossalai.context.singleton_meta import SingletonMeta
+from colossalai.logging import get_dist_logger
 
 
 class PyTorchProcessGroupDict(metaclass=SingletonMeta):
 
     def __init__(self):
         # distributed settings
+        # use this dict to record all Pytorch ProcessGroups
         self.dict = {}
+        # set a distributed logger
+        self.logger = get_dist_logger('ProcessGroup')
+
+    def log_pg_init(self, rank_list: List[int], backend: str):
+        str_list = ["Pytorch ProcessGroup Init:"]
+        str_list.append(f"backend: {backend}")
+        str_list.append(f"ranks: {rank_list}")
+        self.logger.info("\n\t".join(str_list), ranks=[0])
 
     def get(self, rank_list: List[int], backend: str = 'nccl'):
         """Reuse Pytorch ProcessGroup when such a group is initialized
         """
-        rank_tuple = tuple(rank_list)
         # we need to convert the passed list to a tuple
         # since List is unhashable
-        pg_key = (backend, rank_tuple)
-
-        if pg_key not in self.dict:
-
-            self.logger = get_dist_logger('ProcessGroup')
-            self.logger.info(f'NCCL initialize ProcessGroup on {rank_list}', ranks=[0])
-            self.dict[pg_key] = torch.distributed.new_group(ranks=rank_list, backend=backend)
-        return self.dict[pg_key]
+        processgroup_key = (backend, tuple(rank_list))
+        if processgroup_key not in self.dict:
+            self.log_pg_init(rank_list=rank_list, backend=backend)
+            self.dict[processgroup_key] = torch.distributed.new_group(ranks=rank_list, backend=backend)
+        return self.dict[processgroup_key]
 
 
 PYTORCHPGDICT_ = PyTorchProcessGroupDict()
@@ -40,7 +47,7 @@ class ProcessGroup:
         rank: the global rank of the current process.
         ranks: List[int], a list of rank id belongings to this process group.
         backend: str, the backend of the process group.
-        tp_degree: Optional[int], tensor parallelism degree. How many processes are inside a tp process group. default None means 1. 
+        tp_degree: Optional[int], tensor parallelism degree. How many processes are inside a tp process group. default None means 1.
         dp_degree: Optional[int], data parallelism degree. How many processes are inside a dp process group. . default None means len(ranks).
     """
 
@@ -54,10 +61,10 @@ def __init__(self,
             return
 
         assert torch.distributed.is_initialized(), f"ProcessGroup must be used after distributed initialized"
-        if rank is None:
-            self._rank = torch.distributed.get_rank()
-        else:
-            self._rank = rank
+
+        self._rank = torch.distributed.get_rank()
+        if rank is not None:
+            assert self._rank == rank    # make sure that the global rank is correct
 
         if ranks is None:
             self._rank_list = list(range(torch.distributed.get_world_size()))
@@ -104,7 +111,7 @@ def __init__(self,
         self.is_init = True
 
     def set_cpu_groups(self):
-        """set_cpu_groups 
+        """set_cpu_groups
         Initialize Pytorch process groups for cpu communications.
         """
         if self.has_cpu_groups:
@@ -122,7 +129,7 @@ def set_cpu_groups(self):
 
     @property
     def has_cpu_groups(self) -> bool:
-        """has_cpu_groups 
+        """has_cpu_groups
         If cpu groups have been initailized.
 
         Returns:
@@ -132,8 +139,9 @@ def has_cpu_groups(self) -> bool:
 
     def __repr__(self):
         if self.is_init:
-            return "ProcessGroup:\n\tRank: {}, World size: {}, DP degree: {}, TP degree: {}\n\tRanks in group: {}".\
-                format(self._rank, self._world_size, self._dp_degree, self._tp_degree, self._rank_list)
+            ranks_str = f"ProcessGroup(ranks={self._rank_list},\n"
+            personal_str = f"             rank={self._rank}, dp={self._dp_degree}, tp={self._tp_degree})"
+            return ranks_str + personal_str
         else:
             return "ProcessGroup not initialized"
 
@@ -155,7 +163,7 @@ def __eq__(self, obj: 'ProcessGroup') -> bool:
         return True
 
     def rank(self) -> int:
-        """rank 
+        """rank
 
         The current rank in the global process group.
 
@@ -165,9 +173,9 @@ def rank(self) -> int:
         return self._rank
 
     def ranks_in_group(self) -> List[int]:
-        """ranks_in_group 
+        """ranks_in_group
 
-        a list of rank number in in the global process group. 
+        a list of rank number in in the global process group.
 
         Returns:
             List[int]: a list of rank number.
@@ -177,7 +185,7 @@ def ranks_in_group(self) -> List[int]:
     def world_size(self) -> int:
         """world_size
 
-        The world size of the global process group. 
+        The world size of the global process group.
 
         Returns:
             int: world size
@@ -185,7 +193,7 @@ def world_size(self) -> int:
         return self._world_size
 
     def tp_rank_list(self) -> List[int]:
-        """tp_rank_list 
+        """tp_rank_list
 
         the rank list in the TP process group containing the current rank.
 
@@ -195,7 +203,7 @@ def tp_rank_list(self) -> List[int]:
         return self._tp_rank_list
 
     def dp_rank_list(self) -> List[int]:
-        """dp_rank_list 
+        """dp_rank_list
 
         the rank list in the DP process group containing the current rank.
 
@@ -205,7 +213,7 @@ def dp_rank_list(self) -> List[int]:
         return self._dp_rank_list
 
     def tp_local_rank(self) -> int:
-        """tp_local_rank 
+        """tp_local_rank
 
         The local rank number in the current TP process group.
 
@@ -268,7 +276,7 @@ def cpu_dp_process_group(self):
         """cpu_dp_process_group
 
         the pytorch CPU DP process group containing the current rank.
-        
+
         assert failed if cpu process group is not initialized.
 
         Returns:
@@ -281,7 +289,7 @@ def cpu_tp_process_group(self):
         """cpu_tp_process_group
 
         the pytorch CPU TP process group containing the current rank.
-        
+
         assert failed if cpu process group is not initialized.
 
         Returns:
diff --git a/colossalai/utils/model/colo_init_context.py b/colossalai/utils/model/colo_init_context.py
index 93c91e0995ea..ab354ea70320 100644
--- a/colossalai/utils/model/colo_init_context.py
+++ b/colossalai/utils/model/colo_init_context.py
@@ -37,12 +37,11 @@ def _convert_to_coloparam(param: torch.nn.Parameter,
     # detaching tensor is necessary for optimizers.
     requires_grad = param.requires_grad
     # param is the global tensor.
-    
+
     if param.device.type == "meta":
         colo_param = ColoParameter(param, requires_grad=requires_grad)
-    else:    
+    else:
         colo_param = ColoParameter(param.to(device=device, dtype=dtype), requires_grad=requires_grad)
-      
 
     # if default_shard_plan exists, shard the param during initialization.
     # This can reduce the model size after initialization.
@@ -129,32 +128,29 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
             delattr(submodule, param_name)
             setattr(submodule, param_name, colo_param)
             colo_param.shared_param_modules.append(submodule)
-        
-        meta_param_flag = 0
-        meta_buffer_flag = 0
+
+        param_number = 0
+        meta_param_number = 0
+        buffer_number = 0
+        meta_buffer_number = 0
+
         for param in module.parameters():
-            if param.device.type=="meta":
-                meta_param_flag = 1
-            if meta_param_flag == 1 and param.device.type!="meta":
-                raise ValueError("Meta parameters and valued parameters can not  be in the same model")
-            
+            param_number += 1
+            meta_param_number += (param.device.type == 'meta')
+
         for buffer in module.buffers():
-            if buffer.device.type=="meta":
-                meta_buffer_flag = 1
-            if meta_buffer_flag == 1 and buffer.device.type!="meta":
-                raise ValueError("Meta buffers and valued buffers can not be in the same model")
-        
-        if meta_param_flag==1 and meta_buffer_flag==1:
-            pass
-        elif meta_buffer_flag==0 and meta_param_flag==1:
-             for name, buf in module.named_buffers():
-                module._buffers[name] = module._buffers[name].to(device=self._device)
-        elif meta_param_flag==0 and meta_buffer_flag==1:
-            for name, param in module.named_parameters():
-                module._parameters[name] = module._parameters[name].to(device=self._device)
-        else:
-            module.to(self._device)
- 
+            buffer_number += 1
+            meta_buffer_number += (buffer.device.type == 'meta')
+
+        if meta_param_number > 0 and meta_param_number != param_number:
+            raise ValueError("Meta parameters and valued parameters can not  be in the same model")
+        if meta_buffer_number > 0 and meta_buffer_number != buffer_number:
+            raise ValueError("Meta buffers and valued buffers can not be in the same model")
+
+        if meta_buffer_number == 0:
+            for buffer in module.buffers():
+                buffer.data = buffer.data.to(device=self._device)
+
 
 def post_process_colo_init_ctx(model: torch.nn.Module,
                                device: torch.device = torch.device('cpu'),

From cad1f505125ab3d74f0390a3e3dc5796e5cd790f Mon Sep 17 00:00:00 2001
From: Fazzie <1240419984@qq.com>
Date: Fri, 3 Feb 2023 15:34:54 +0800
Subject: [PATCH 242/503] fix ckpt

---
 examples/images/diffusion/README.md           |  35 +-
 .../Teyvat/train_colossalai_teyvat.yaml       |   3 +-
 .../diffusion/ldm/models/diffusion/ddpm.py    | 818 +++++++++++-------
 .../ldm/modules/diffusionmodules/model.py     | 547 ++++++------
 examples/images/diffusion/main.py             |  78 +-
 examples/images/diffusion/scripts/txt2img.sh  |   6 +-
 examples/images/diffusion/train_colossalai.sh |   2 +-
 7 files changed, 831 insertions(+), 658 deletions(-)

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index b68347c00b6e..bec1c7503b4e 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -53,27 +53,33 @@ You can also update an existing [latent diffusion](https://github.com/CompVis/la
 ```
 conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
 pip install transformers==4.19.2 diffusers invisible-watermark
-pip install -e .
 ```
 
 #### Step 2: install lightning
 
 Install Lightning version later than 2022.01.04. We suggest you install lightning from source.
 
+##### From Source
 ```
 git clone https://github.com/Lightning-AI/lightning.git
 pip install -r requirements.txt
 python setup.py install
 ```
 
+##### From pip
+
+```
+pip install pytorch-lightning
+```
+
 #### Step 3:Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
 
 ##### From pip
 
-For example, you can install  v0.1.12 from our official website.
+For example, you can install v0.2.0 from our official website.
 
 ```
-pip install colossalai==0.1.12+torch1.12cu11.3 -f https://release.colossalai.org
+pip install colossalai==0.2.0+torch1.12cu11.3 -f https://release.colossalai.org
 ```
 
 ##### From source
@@ -133,10 +139,9 @@ It is important for you to configure your volume mapping in order to get the bes
 3. **Optional**, if you encounter any problem stating that shared memory is insufficient inside container, please add `-v /dev/shm:/dev/shm` to your `docker run` command.
 
 
-
 ## Download the model checkpoint from pretrained
 
-### stable-diffusion-v2-base
+### stable-diffusion-v2-base (Recommended)
 
 ```
 wget https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt
@@ -144,8 +149,6 @@ wget https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512
 
 ### stable-diffusion-v1-4
 
-Our default model config use the weight from [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4?text=A+mecha+robot+in+a+favela+in+expressionist+style)
-
 ```
 git lfs install
 git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
@@ -153,8 +156,6 @@ git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
 
 ### stable-diffusion-v1-5 from runway
 
-If you want to useed the Last [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) weight from runwayml
-
 ```
 git lfs install
 git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
@@ -171,11 +172,16 @@ We provide the script `train_colossalai.sh` to run the training task with coloss
 and can also use `train_ddp.sh` to run the training task with ddp to compare.
 
 In `train_colossalai.sh` the main command is:
+
 ```
-python main.py --logdir /tmp/ -t -b configs/train_colossalai.yaml
+python main.py --logdir /tmp/ --train --base configs/train_colossalai.yaml --ckpt 512-base-ema.ckpt
 ```
 
-- you can change the `--logdir` to decide where to save the log information and the last checkpoint.
+- You can change the `--logdir` to decide where to save the log information and the last checkpoint.
+  - You will find your ckpt in `logdir/checkpoints` or `logdir/diff_tb/version_0/checkpoints`
+  - You will find your train config yaml in `logdir/configs`
+- You can add `--ckpt` if you want to load a pretrained model, for example `512-base-ema.ckpt`
+- You can change `--base` to specify the path of the config YAML file
 
 ### Training config
 
@@ -186,7 +192,8 @@ You can change the trainging config in the yaml file
 - precision: the precision type used in training, default 16 (fp16), you must use fp16 if you want to apply colossalai
 - more information about the configuration of ColossalAIStrategy can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/advanced/model_parallel.html#colossal-ai)
 
-## Finetune Example (Work In Progress)
+
+## Finetune Example
 ### Training on Teyvat Datasets
 
 We provide the finetuning example on [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset, which is create by BLIP generated captions.
@@ -201,8 +208,8 @@ you can get yout training last.ckpt and train config.yaml in your `--logdir`, an
 ```
 python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
     --outdir ./output \
-    --config path/to/logdir/checkpoints/last.ckpt \
-    --ckpt /path/to/logdir/configs/project.yaml  \
+    --ckpt /path/to/logdir/checkpoints/last.ckpt \
+    --config /path/to/logdir/configs/project.yaml
 ```
 
 ```commandline
diff --git a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml
index 8a8250c5d300..ff0f4c5a0463 100644
--- a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml
+++ b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml
@@ -6,6 +6,7 @@ model:
     linear_start: 0.00085
     linear_end: 0.0120
     num_timesteps_cond: 1
+    ckpt: None # use ckpt path
     log_every_t: 200
     timesteps: 1000
     first_stage_key: image
@@ -16,7 +17,7 @@ model:
     conditioning_key: crossattn
     monitor: val/loss_simple_ema
     scale_factor: 0.18215
-    use_ema: False # we set this to false because this is an inference only config
+    use_ema: False
 
     scheduler_config: # 10000 warmup steps
       target: ldm.lr_scheduler.LambdaLinearScheduler
diff --git a/examples/images/diffusion/ldm/models/diffusion/ddpm.py b/examples/images/diffusion/ldm/models/diffusion/ddpm.py
index f7ac0a735f10..b7315b048c66 100644
--- a/examples/images/diffusion/ldm/models/diffusion/ddpm.py
+++ b/examples/images/diffusion/ldm/models/diffusion/ddpm.py
@@ -6,56 +6,41 @@
 -- merci
 """
 
+import numpy as np
 import torch
 import torch.nn as nn
-import numpy as np
+
 try:
     import lightning.pytorch as pl
-    from lightning.pytorch.utilities import rank_zero_only, rank_zero_info
+    from lightning.pytorch.utilities import rank_zero_info, rank_zero_only
 except:
     import pytorch_lightning as pl
     from pytorch_lightning.utilities import rank_zero_only, rank_zero_info
-from torch.optim.lr_scheduler import LambdaLR
-from einops import rearrange, repeat
+
+import itertools
 from contextlib import contextmanager, nullcontext
 from functools import partial
-import itertools
-from tqdm import tqdm
-from torchvision.utils import make_grid
 
-from omegaconf import ListConfig
-
-from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
-from ldm.modules.ema import LitEma
-from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
-from ldm.models.autoencoder import IdentityFirstStage, AutoencoderKL
-
-
-from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
+from einops import rearrange, repeat
+from ldm.models.autoencoder import *
+from ldm.models.autoencoder import AutoencoderKL, IdentityFirstStage
+from ldm.models.diffusion.ddim import *
 from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.modules.diffusionmodules.model import *
+from ldm.modules.diffusionmodules.model import Decoder, Encoder, Model
 from ldm.modules.diffusionmodules.openaimodel import *
-
-from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
-from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d
-from ldm.modules.encoders.modules import *
-
+from ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule, noise_like
+from ldm.modules.distributions.distributions import DiagonalGaussianDistribution, normal_kl
 from ldm.modules.ema import LitEma
-from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
-from ldm.models.autoencoder import *
-from ldm.models.diffusion.ddim import *
-from ldm.modules.diffusionmodules.openaimodel import *
-from ldm.modules.diffusionmodules.model import *
-
-
-from ldm.modules.diffusionmodules.model import Model, Encoder, Decoder
-
-from ldm.util import instantiate_from_config
-
+from ldm.modules.encoders.modules import *
+from ldm.util import count_params, default, exists, instantiate_from_config, isimage, ismap, log_txt_as_img, mean_flat
+from omegaconf import ListConfig
+from torch.optim.lr_scheduler import LambdaLR
+from torchvision.utils import make_grid
+from tqdm import tqdm
 
-__conditioning_keys__ = {'concat': 'c_concat',
-                         'crossattn': 'c_crossattn',
-                         'adm': 'y'}
+__conditioning_keys__ = {'concat': 'c_concat', 'crossattn': 'c_crossattn', 'adm': 'y'}
 
 
 def disabled_train(self, mode=True):
@@ -70,40 +55,41 @@ def uniform_on_device(r1, r2, shape, device):
 
 class DDPM(pl.LightningModule):
     # classic DDPM with Gaussian diffusion, in image space
-    def __init__(self,
-                 unet_config,
-                 timesteps=1000,
-                 beta_schedule="linear",
-                 loss_type="l2",
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 load_only_unet=False,
-                 monitor="val/loss",
-                 use_ema=True,
-                 first_stage_key="image",
-                 image_size=256,
-                 channels=3,
-                 log_every_t=100,
-                 clip_denoised=True,
-                 linear_start=1e-4,
-                 linear_end=2e-2,
-                 cosine_s=8e-3,
-                 given_betas=None,
-                 original_elbo_weight=0.,
-                 v_posterior=0.,  # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
-                 l_simple_weight=1.,
-                 conditioning_key=None,
-                 parameterization="eps",  # all assuming fixed variance schedules
-                 scheduler_config=None,
-                 use_positional_encodings=False,
-                 learn_logvar=False,
-                 logvar_init=0.,
-                 use_fp16 = True,
-                 make_it_fit=False,
-                 ucg_training=None,
-                 reset_ema=False,
-                 reset_num_ema_updates=False,
-                 ):
+    def __init__(
+        self,
+        unet_config,
+        timesteps=1000,
+        beta_schedule="linear",
+        loss_type="l2",
+        ckpt=None,
+        ignore_keys=[],
+        load_only_unet=False,
+        monitor="val/loss",
+        use_ema=True,
+        first_stage_key="image",
+        image_size=256,
+        channels=3,
+        log_every_t=100,
+        clip_denoised=True,
+        linear_start=1e-4,
+        linear_end=2e-2,
+        cosine_s=8e-3,
+        given_betas=None,
+        original_elbo_weight=0.,
+        v_posterior=0.,    # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
+        l_simple_weight=1.,
+        conditioning_key=None,
+        parameterization="eps",    # all assuming fixed variance schedules
+        scheduler_config=None,
+        use_positional_encodings=False,
+        learn_logvar=False,
+        logvar_init=0.,
+        use_fp16=True,
+        make_it_fit=False,
+        ucg_training=None,
+        reset_ema=False,
+        reset_num_ema_updates=False,
+    ):
         super().__init__()
         assert parameterization in ["eps", "x0", "v"], 'currently only supporting "eps" and "x0" and "v"'
         self.parameterization = parameterization
@@ -112,18 +98,18 @@ def __init__(self,
         self.clip_denoised = clip_denoised
         self.log_every_t = log_every_t
         self.first_stage_key = first_stage_key
-        self.image_size = image_size  # try conv?
+        self.image_size = image_size
         self.channels = channels
         self.use_positional_encodings = use_positional_encodings
 
         self.unet_config = unet_config
         self.conditioning_key = conditioning_key
         self.model = DiffusionWrapper(unet_config, conditioning_key)
-        count_params(self.model, verbose=True)
+        # count_params(self.model, verbose=True)
         self.use_ema = use_ema
         if self.use_ema:
             self.model_ema = LitEma(self.model)
-            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+            rank_zero_info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
 
         self.use_scheduler = scheduler_config is not None
         if self.use_scheduler:
@@ -136,21 +122,26 @@ def __init__(self,
         if monitor is not None:
             self.monitor = monitor
         self.make_it_fit = make_it_fit
-        self.ckpt_path = ckpt_path
+        self.ckpt = ckpt
         self.ignore_keys = ignore_keys
         self.load_only_unet = load_only_unet
         self.reset_ema = reset_ema
         self.reset_num_ema_updates = reset_num_ema_updates
 
-        if reset_ema: assert exists(ckpt_path)
-        if ckpt_path is not None:
-            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
-            if reset_ema:
-                assert self.use_ema
-                print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
-                self.model_ema = LitEma(self.model)
+        if reset_ema:
+            assert exists(ckpt)
+        '''
+        Uncomment if you Use DDP Strategy
+        '''
+        # if ckpt is not None:
+        #     self.init_from_ckpt(ckpt, ignore_keys=ignore_keys, only_model=load_only_unet)
+        #     if reset_ema:
+        #         assert self.use_ema
+        #         rank_zero_info(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+        #         self.model_ema = LitEma(self.model)
+
         if reset_num_ema_updates:
-            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
+            rank_zero_info(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
             assert self.use_ema
             self.model_ema.reset_num_updates()
 
@@ -160,9 +151,13 @@ def __init__(self,
         self.linear_start = linear_start
         self.linear_end = linear_end
         self.cosine_s = cosine_s
-        
-        self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
-                               linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
+
+        self.register_schedule(given_betas=given_betas,
+                               beta_schedule=beta_schedule,
+                               timesteps=timesteps,
+                               linear_start=linear_start,
+                               linear_end=linear_end,
+                               cosine_s=cosine_s)
 
         self.loss_type = loss_type
 
@@ -176,12 +171,20 @@ def __init__(self,
         if self.ucg_training:
             self.ucg_prng = np.random.RandomState()
 
-    def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
-                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+    def register_schedule(self,
+                          given_betas=None,
+                          beta_schedule="linear",
+                          timesteps=1000,
+                          linear_start=1e-4,
+                          linear_end=2e-2,
+                          cosine_s=8e-3):
         if exists(given_betas):
             betas = given_betas
         else:
-            betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
+            betas = make_beta_schedule(beta_schedule,
+                                       timesteps,
+                                       linear_start=linear_start,
+                                       linear_end=linear_end,
                                        cosine_s=cosine_s)
         alphas = 1. - betas
         alphas_cumprod = np.cumprod(alphas, axis=0)
@@ -208,24 +211,23 @@ def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=
 
         # calculations for posterior q(x_{t-1} | x_t, x_0)
         posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
-                1. - alphas_cumprod) + self.v_posterior * betas
+            1. - alphas_cumprod) + self.v_posterior * betas
         # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
         self.register_buffer('posterior_variance', to_torch(posterior_variance))
         # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
         self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
-        self.register_buffer('posterior_mean_coef1', to_torch(
-            betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
-        self.register_buffer('posterior_mean_coef2', to_torch(
-            (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
+        self.register_buffer('posterior_mean_coef1',
+                             to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
+        self.register_buffer('posterior_mean_coef2',
+                             to_torch((1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
 
         if self.parameterization == "eps":
-            lvlb_weights = self.betas ** 2 / (
-                    2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
+            lvlb_weights = self.betas**2 / (2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
         elif self.parameterization == "x0":
             lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
         elif self.parameterization == "v":
-            lvlb_weights = torch.ones_like(self.betas ** 2 / (
-                    2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod)))
+            lvlb_weights = torch.ones_like(self.betas**2 / (2 * self.posterior_variance * to_torch(alphas) *
+                                                            (1 - self.alphas_cumprod)))
         else:
             raise NotImplementedError("mu not supported")
         lvlb_weights[0] = lvlb_weights[1]
@@ -238,14 +240,14 @@ def ema_scope(self, context=None):
             self.model_ema.store(self.model.parameters())
             self.model_ema.copy_to(self.model)
             if context is not None:
-                print(f"{context}: Switched to EMA weights")
+                rank_zero_info(f"{context}: Switched to EMA weights")
         try:
             yield None
         finally:
             if self.use_ema:
                 self.model_ema.restore(self.model.parameters())
                 if context is not None:
-                    print(f"{context}: Restored training weights")
+                    rank_zero_info(f"{context}: Restored training weights")
 
     @torch.no_grad()
     def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
@@ -256,18 +258,13 @@ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
         for k in keys:
             for ik in ignore_keys:
                 if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
+                    rank_zero_info("Deleting key {} from state_dict.".format(k))
                     del sd[k]
         if self.make_it_fit:
-            n_params = len([name for name, _ in
-                            itertools.chain(self.named_parameters(),
-                                            self.named_buffers())])
-            for name, param in tqdm(
-                    itertools.chain(self.named_parameters(),
-                                    self.named_buffers()),
-                    desc="Fitting old weights to new weights",
-                    total=n_params
-            ):
+            n_params = len([name for name, _ in itertools.chain(self.named_parameters(), self.named_buffers())])
+            for name, param in tqdm(itertools.chain(self.named_parameters(), self.named_buffers()),
+                                    desc="Fitting old weights to new weights",
+                                    total=n_params):
                 if not name in sd:
                     continue
                 old_shape = sd[name].shape
@@ -304,11 +301,11 @@ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
 
         missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
             sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        rank_zero_info(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
         if len(missing) > 0:
-            print(f"Missing Keys:\n {missing}")
+            rank_zero_info(f"Missing Keys:\n {missing}")
         if len(unexpected) > 0:
-            print(f"\nUnexpected Keys:\n {unexpected}")
+            rank_zero_info(f"\nUnexpected Keys:\n {unexpected}")
 
     def q_mean_variance(self, x_start, t):
         """
@@ -323,30 +320,22 @@ def q_mean_variance(self, x_start, t):
         return mean, variance, log_variance
 
     def predict_start_from_noise(self, x_t, t, noise):
-        return (
-                extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
-                extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
-        )
+        return (extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
+                extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise)
 
     def predict_start_from_z_and_v(self, x_t, t, v):
         # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
         # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
-        return (
-                extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
-                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
-        )
+        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v)
 
     def predict_eps_from_z_and_v(self, x_t, t, v):
-        return (
-                extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v +
-                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * x_t
-        )
+        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * x_t)
 
     def q_posterior(self, x_start, x_t, t):
-        posterior_mean = (
-                extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
-                extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
-        )
+        posterior_mean = (extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
+                          extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t)
         posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape)
         posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
         return posterior_mean, posterior_variance, posterior_log_variance_clipped
@@ -379,7 +368,8 @@ def p_sample_loop(self, shape, return_intermediates=False):
         img = torch.randn(shape, device=device)
         intermediates = [img]
         for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps):
-            img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long),
+            img = self.p_sample(img,
+                                torch.full((b,), i, device=device, dtype=torch.long),
                                 clip_denoised=self.clip_denoised)
             if i % self.log_every_t == 0 or i == self.num_timesteps - 1:
                 intermediates.append(img)
@@ -400,10 +390,8 @@ def q_sample(self, x_start, t, noise=None):
                 extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
 
     def get_v(self, x, noise, t):
-        return (
-                extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise -
-                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x
-        )
+        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise -
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x)
 
     def get_loss(self, pred, target, mean=True):
         if self.loss_type == 'l1':
@@ -485,11 +473,9 @@ def training_step(self, batch, batch_idx):
 
         loss, loss_dict = self.shared_step(batch)
 
-        self.log_dict(loss_dict, prog_bar=True,
-                      logger=True, on_step=True, on_epoch=True)
+        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True)
 
-        self.log("global_step", self.global_step,
-                 prog_bar=True, logger=True, on_step=True, on_epoch=False)
+        self.log("global_step", self.global_step, prog_bar=True, logger=True, on_step=True, on_epoch=False)
 
         if self.use_scheduler:
             lr = self.optimizers().param_groups[0]['lr']
@@ -580,7 +566,8 @@ def __init__(self,
                  scale_by_std=False,
                  use_fp16=True,
                  force_null_conditioning=False,
-                 *args, **kwargs):
+                 *args,
+                 **kwargs):
         self.force_null_conditioning = force_null_conditioning
         self.num_timesteps_cond = default(num_timesteps_cond, 1)
         self.scale_by_std = scale_by_std
@@ -590,7 +577,7 @@ def __init__(self,
             conditioning_key = 'concat' if concat_mode else 'crossattn'
         if cond_stage_config == '__is_unconditional__' and not self.force_null_conditioning:
             conditioning_key = None
-        
+
         super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
         self.concat_mode = concat_mode
         self.cond_stage_trainable = cond_stage_trainable
@@ -599,7 +586,7 @@ def __init__(self,
             self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
         except:
             self.num_downs = 0
-            
+
         if not scale_by_std:
             self.scale_factor = scale_factor
         else:
@@ -611,40 +598,44 @@ def __init__(self,
         self.cond_stage_forward = cond_stage_forward
         self.clip_denoised = False
         self.bbox_tokenizer = None
-
-        self.restarted_from_ckpt = False
-        if self.ckpt_path is not None:
-            self.init_from_ckpt(self.ckpt_path, self.ignore_keys)
-            self.restarted_from_ckpt = True
-            if self.reset_ema:
-                assert self.use_ema
-                print(
-                    f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
-                self.model_ema = LitEma(self.model)
+        '''
+        Uncomment if you Use DDP Strategy
+        '''
+        # self.restarted_from_ckpt = False
+        # if self.ckpt is not None:
+        #     self.init_from_ckpt(self.ckpt, self.ignore_keys)
+        #     self.restarted_from_ckpt = True
+        #     if self.reset_ema:
+        #         assert self.use_ema
+        #         rank_zero_info(
+        #             f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+        #         self.model_ema = LitEma(self.model)
         if self.reset_num_ema_updates:
-            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
+            rank_zero_info(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
             assert self.use_ema
             self.model_ema.reset_num_updates()
 
     def configure_sharded_model(self) -> None:
         rank_zero_info("Configure sharded model for LatentDiffusion")
         self.model = DiffusionWrapper(self.unet_config, self.conditioning_key)
+        count_params(self.model, verbose=True)
         if self.use_ema:
             self.model_ema = LitEma(self.model)
 
-        if self.ckpt_path is not None:
-            self.init_from_ckpt(self.ckpt_path, ignore_keys=self.ignore_keys, only_model=self.load_only_unet)
+        if self.ckpt is not None:
+            self.init_from_ckpt(self.ckpt, ignore_keys=self.ignore_keys, only_model=self.load_only_unet)
             if self.reset_ema:
                 assert self.use_ema
-                print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+                rank_zero_info(
+                    f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
                 self.model_ema = LitEma(self.model)
-        if self.reset_num_ema_updates:
-            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
-            assert self.use_ema
-            self.model_ema.reset_num_updates()
 
-        self.register_schedule(given_betas=self.given_betas, beta_schedule=self.beta_schedule, timesteps=self.timesteps,
-                               linear_start=self.linear_start, linear_end=self.linear_end, cosine_s=self.cosine_s)
+        self.register_schedule(given_betas=self.given_betas,
+                               beta_schedule=self.beta_schedule,
+                               timesteps=self.timesteps,
+                               linear_start=self.linear_start,
+                               linear_end=self.linear_end,
+                               cosine_s=self.cosine_s)
 
         self.logvar = torch.full(fill_value=self.logvar_init, size=(self.num_timesteps,))
         if self.learn_logvar:
@@ -654,20 +645,16 @@ def configure_sharded_model(self) -> None:
 
         self.instantiate_first_stage(self.first_stage_config)
         self.instantiate_cond_stage(self.cond_stage_config)
-        if self.ckpt_path is not None:
-            self.init_from_ckpt(self.ckpt_path, self.ignore_keys)
+        if self.ckpt is not None:
+            self.init_from_ckpt(self.ckpt, self.ignore_keys)
             self.restarted_from_ckpt = True
             if self.reset_ema:
                 assert self.use_ema
-                print(
+                rank_zero_info(
                     f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
                 self.model_ema = LitEma(self.model)
-        if self.reset_num_ema_updates:
-            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
-            assert self.use_ema
-            self.model_ema.reset_num_updates()
 
-    def make_cond_schedule(self, ):
+    def make_cond_schedule(self,):
         self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
         ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
         self.cond_ids[:self.num_timesteps_cond] = ids
@@ -679,19 +666,23 @@ def on_train_batch_start(self, batch, batch_idx):
         if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt:
             assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
             # set rescale weight to 1./std of encodings
-            print("### USING STD-RESCALING ###")
+            rank_zero_info("### USING STD-RESCALING ###")
             x = super().get_input(batch, self.first_stage_key)
             x = x.to(self.device)
             encoder_posterior = self.encode_first_stage(x)
             z = self.get_first_stage_encoding(encoder_posterior).detach()
             del self.scale_factor
             self.register_buffer('scale_factor', 1. / z.flatten().std())
-            print(f"setting self.scale_factor to {self.scale_factor}")
-            print("### USING STD-RESCALING ###")
+            rank_zero_info(f"setting self.scale_factor to {self.scale_factor}")
+            rank_zero_info("### USING STD-RESCALING ###")
 
     def register_schedule(self,
-                          given_betas=None, beta_schedule="linear", timesteps=1000,
-                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+                          given_betas=None,
+                          beta_schedule="linear",
+                          timesteps=1000,
+                          linear_start=1e-4,
+                          linear_end=2e-2,
+                          cosine_s=8e-3):
         super().register_schedule(given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s)
 
         self.shorten_cond_schedule = self.num_timesteps_cond > 1
@@ -708,10 +699,10 @@ def instantiate_first_stage(self, config):
     def instantiate_cond_stage(self, config):
         if not self.cond_stage_trainable:
             if config == "__is_first_stage__":
-                print("Using first stage also as cond stage.")
+                rank_zero_info("Using first stage also as cond stage.")
                 self.cond_stage_model = self.first_stage_model
             elif config == "__is_unconditional__":
-                print(f"Training {self.__class__.__name__} as an unconditional model.")
+                rank_zero_info(f"Training {self.__class__.__name__} as an unconditional model.")
                 self.cond_stage_model = None
                 # self.be_unconditional = True
             else:
@@ -729,10 +720,10 @@ def instantiate_cond_stage(self, config):
     def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
         denoise_row = []
         for zd in tqdm(samples, desc=desc):
-            denoise_row.append(self.decode_first_stage(zd.to(self.device),
-                                                       force_not_quantize=force_no_decoder_quantization))
+            denoise_row.append(
+                self.decode_first_stage(zd.to(self.device), force_not_quantize=force_no_decoder_quantization))
         n_imgs_per_row = len(denoise_row)
-        denoise_row = torch.stack(denoise_row)  # n_log_step, n_row, C, H, W
+        denoise_row = torch.stack(denoise_row)    # n_log_step, n_row, C, H, W
         denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
         denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
         denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
@@ -783,21 +774,23 @@ def delta_border(self, h, w):
 
     def get_weighting(self, h, w, Ly, Lx, device):
         weighting = self.delta_border(h, w)
-        weighting = torch.clip(weighting, self.split_input_params["clip_min_weight"],
-                               self.split_input_params["clip_max_weight"], )
+        weighting = torch.clip(
+            weighting,
+            self.split_input_params["clip_min_weight"],
+            self.split_input_params["clip_max_weight"],
+        )
         weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device)
 
         if self.split_input_params["tie_braker"]:
             L_weighting = self.delta_border(Ly, Lx)
-            L_weighting = torch.clip(L_weighting,
-                                     self.split_input_params["clip_min_tie_weight"],
+            L_weighting = torch.clip(L_weighting, self.split_input_params["clip_min_tie_weight"],
                                      self.split_input_params["clip_max_tie_weight"])
 
             L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device)
             weighting = weighting * L_weighting
         return weighting
 
-    def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once not every time, shorten code
+    def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):    # todo load once not every time, shorten code
         """
         :param x: img of size (bs, c, h, w)
         :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1])
@@ -815,7 +808,7 @@ def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once
             fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params)
 
             weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, Lx, x.device).to(x.dtype)
-            normalization = fold(weighting).view(1, 1, h, w)  # normalizes the overlap
+            normalization = fold(weighting).view(1, 1, h, w)    # normalizes the overlap
             weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx))
 
         elif uf > 1 and df == 1:
@@ -823,12 +816,13 @@ def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once
             unfold = torch.nn.Unfold(**fold_params)
 
             fold_params2 = dict(kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf),
-                                dilation=1, padding=0,
+                                dilation=1,
+                                padding=0,
                                 stride=(stride[0] * uf, stride[1] * uf))
             fold = torch.nn.Fold(output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2)
 
             weighting = self.get_weighting(kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device).to(x.dtype)
-            normalization = fold(weighting).view(1, 1, h * uf, w * uf)  # normalizes the overlap
+            normalization = fold(weighting).view(1, 1, h * uf, w * uf)    # normalizes the overlap
             weighting = weighting.view((1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx))
 
         elif df > 1 and uf == 1:
@@ -836,12 +830,13 @@ def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once
             unfold = torch.nn.Unfold(**fold_params)
 
             fold_params2 = dict(kernel_size=(kernel_size[0] // df, kernel_size[0] // df),
-                                dilation=1, padding=0,
+                                dilation=1,
+                                padding=0,
                                 stride=(stride[0] // df, stride[1] // df))
             fold = torch.nn.Fold(output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2)
 
             weighting = self.get_weighting(kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device).to(x.dtype)
-            normalization = fold(weighting).view(1, 1, h // df, w // df)  # normalizes the overlap
+            normalization = fold(weighting).view(1, 1, h // df, w // df)    # normalizes the overlap
             weighting = weighting.view((1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx))
 
         else:
@@ -850,8 +845,15 @@ def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once
         return fold, unfold, normalization, weighting
 
     @torch.no_grad()
-    def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
-                  cond_key=None, return_original_cond=False, bs=None, return_x=False):
+    def get_input(self,
+                  batch,
+                  k,
+                  return_first_stage_outputs=False,
+                  force_c_encode=False,
+                  cond_key=None,
+                  return_original_cond=False,
+                  bs=None,
+                  return_x=False):
         x = super().get_input(batch, k)
         if bs is not None:
             x = x[:bs]
@@ -900,7 +902,7 @@ def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=F
             out.extend([x])
         if return_original_cond:
             out.append(xc)
-        
+
         return out
 
     @torch.no_grad()
@@ -929,7 +931,7 @@ def forward(self, x, c, *args, **kwargs):
             assert c is not None
             if self.cond_stage_trainable:
                 c = self.get_learned_conditioning(c)
-            if self.shorten_cond_schedule:  # TODO: drop this option
+            if self.shorten_cond_schedule:    # TODO: drop this option
                 tc = self.cond_ids[t].to(self.device)
                 c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float()))
         return self.p_losses(x, c, t, *args, **kwargs)
@@ -1007,8 +1009,16 @@ def p_losses(self, x_start, cond, t, noise=None):
 
         return loss, loss_dict
 
-    def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
-                        return_x0=False, score_corrector=None, corrector_kwargs=None):
+    def p_mean_variance(self,
+                        x,
+                        c,
+                        t,
+                        clip_denoised: bool,
+                        return_codebook_ids=False,
+                        quantize_denoised=False,
+                        return_x0=False,
+                        score_corrector=None,
+                        corrector_kwargs=None):
         t_in = t
         model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids)
 
@@ -1039,15 +1049,29 @@ def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=Fals
             return model_mean, posterior_variance, posterior_log_variance
 
     @torch.no_grad()
-    def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
-                 return_codebook_ids=False, quantize_denoised=False, return_x0=False,
-                 temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None):
+    def p_sample(self,
+                 x,
+                 c,
+                 t,
+                 clip_denoised=False,
+                 repeat_noise=False,
+                 return_codebook_ids=False,
+                 quantize_denoised=False,
+                 return_x0=False,
+                 temperature=1.,
+                 noise_dropout=0.,
+                 score_corrector=None,
+                 corrector_kwargs=None):
         b, *_, device = *x.shape, x.device
-        outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised,
+        outputs = self.p_mean_variance(x=x,
+                                       c=c,
+                                       t=t,
+                                       clip_denoised=clip_denoised,
                                        return_codebook_ids=return_codebook_ids,
                                        quantize_denoised=quantize_denoised,
                                        return_x0=return_x0,
-                                       score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
+                                       score_corrector=score_corrector,
+                                       corrector_kwargs=corrector_kwargs)
         if return_codebook_ids:
             raise DeprecationWarning("Support dropped.")
             model_mean, _, model_log_variance, logits = outputs
@@ -1070,9 +1094,22 @@ def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
             return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
 
     @torch.no_grad()
-    def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False,
-                              img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0.,
-                              score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None,
+    def progressive_denoising(self,
+                              cond,
+                              shape,
+                              verbose=True,
+                              callback=None,
+                              quantize_denoised=False,
+                              img_callback=None,
+                              mask=None,
+                              x0=None,
+                              temperature=1.,
+                              noise_dropout=0.,
+                              score_corrector=None,
+                              corrector_kwargs=None,
+                              batch_size=None,
+                              x_T=None,
+                              start_T=None,
                               log_every_t=None):
         if not log_every_t:
             log_every_t = self.log_every_t
@@ -1089,16 +1126,17 @@ def progressive_denoising(self, cond, shape, verbose=True, callback=None, quanti
         intermediates = []
         if cond is not None:
             if isinstance(cond, dict):
-                cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
-                list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
+                cond = {
+                    key: cond[key][:batch_size] if not isinstance(cond[key], list) else list(
+                        map(lambda x: x[:batch_size], cond[key])) for key in cond
+                }
             else:
                 cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
 
         if start_T is not None:
             timesteps = min(timesteps, start_T)
         iterator = tqdm(reversed(range(0, timesteps)), desc='Progressive Generation',
-                        total=timesteps) if verbose else reversed(
-            range(0, timesteps))
+                        total=timesteps) if verbose else reversed(range(0, timesteps))
         if type(temperature) == float:
             temperature = [temperature] * timesteps
 
@@ -1109,11 +1147,16 @@ def progressive_denoising(self, cond, shape, verbose=True, callback=None, quanti
                 tc = self.cond_ids[ts].to(cond.device)
                 cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
 
-            img, x0_partial = self.p_sample(img, cond, ts,
+            img, x0_partial = self.p_sample(img,
+                                            cond,
+                                            ts,
                                             clip_denoised=self.clip_denoised,
-                                            quantize_denoised=quantize_denoised, return_x0=True,
-                                            temperature=temperature[i], noise_dropout=noise_dropout,
-                                            score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
+                                            quantize_denoised=quantize_denoised,
+                                            return_x0=True,
+                                            temperature=temperature[i],
+                                            noise_dropout=noise_dropout,
+                                            score_corrector=score_corrector,
+                                            corrector_kwargs=corrector_kwargs)
             if mask is not None:
                 assert x0 is not None
                 img_orig = self.q_sample(x0, ts)
@@ -1121,14 +1164,26 @@ def progressive_denoising(self, cond, shape, verbose=True, callback=None, quanti
 
             if i % log_every_t == 0 or i == timesteps - 1:
                 intermediates.append(x0_partial)
-            if callback: callback(i)
-            if img_callback: img_callback(img, i)
+            if callback:
+                callback(i)
+            if img_callback:
+                img_callback(img, i)
         return img, intermediates
 
     @torch.no_grad()
-    def p_sample_loop(self, cond, shape, return_intermediates=False,
-                      x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False,
-                      mask=None, x0=None, img_callback=None, start_T=None,
+    def p_sample_loop(self,
+                      cond,
+                      shape,
+                      return_intermediates=False,
+                      x_T=None,
+                      verbose=True,
+                      callback=None,
+                      timesteps=None,
+                      quantize_denoised=False,
+                      mask=None,
+                      x0=None,
+                      img_callback=None,
+                      start_T=None,
                       log_every_t=None):
 
         if not log_every_t:
@@ -1151,7 +1206,7 @@ def p_sample_loop(self, cond, shape, return_intermediates=False,
 
         if mask is not None:
             assert x0 is not None
-            assert x0.shape[2:3] == mask.shape[2:3]  # spatial size has to match
+            assert x0.shape[2:3] == mask.shape[2:3]    # spatial size has to match
 
         for i in iterator:
             ts = torch.full((b,), i, device=device, dtype=torch.long)
@@ -1160,51 +1215,64 @@ def p_sample_loop(self, cond, shape, return_intermediates=False,
                 tc = self.cond_ids[ts].to(cond.device)
                 cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
 
-            img = self.p_sample(img, cond, ts,
-                                clip_denoised=self.clip_denoised,
-                                quantize_denoised=quantize_denoised)
+            img = self.p_sample(img, cond, ts, clip_denoised=self.clip_denoised, quantize_denoised=quantize_denoised)
             if mask is not None:
                 img_orig = self.q_sample(x0, ts)
                 img = img_orig * mask + (1. - mask) * img
 
             if i % log_every_t == 0 or i == timesteps - 1:
                 intermediates.append(img)
-            if callback: callback(i)
-            if img_callback: img_callback(img, i)
+            if callback:
+                callback(i)
+            if img_callback:
+                img_callback(img, i)
 
         if return_intermediates:
             return img, intermediates
         return img
 
     @torch.no_grad()
-    def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
-               verbose=True, timesteps=None, quantize_denoised=False,
-               mask=None, x0=None, shape=None, **kwargs):
+    def sample(self,
+               cond,
+               batch_size=16,
+               return_intermediates=False,
+               x_T=None,
+               verbose=True,
+               timesteps=None,
+               quantize_denoised=False,
+               mask=None,
+               x0=None,
+               shape=None,
+               **kwargs):
         if shape is None:
             shape = (batch_size, self.channels, self.image_size, self.image_size)
         if cond is not None:
             if isinstance(cond, dict):
-                cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
-                list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
+                cond = {
+                    key: cond[key][:batch_size] if not isinstance(cond[key], list) else list(
+                        map(lambda x: x[:batch_size], cond[key])) for key in cond
+                }
             else:
                 cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
         return self.p_sample_loop(cond,
                                   shape,
-                                  return_intermediates=return_intermediates, x_T=x_T,
-                                  verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised,
-                                  mask=mask, x0=x0)
+                                  return_intermediates=return_intermediates,
+                                  x_T=x_T,
+                                  verbose=verbose,
+                                  timesteps=timesteps,
+                                  quantize_denoised=quantize_denoised,
+                                  mask=mask,
+                                  x0=x0)
 
     @torch.no_grad()
     def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
         if ddim:
             ddim_sampler = DDIMSampler(self)
             shape = (self.channels, self.image_size, self.image_size)
-            samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size,
-                                                         shape, cond, verbose=False, **kwargs)
+            samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
 
         else:
-            samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
-                                                 return_intermediates=True, **kwargs)
+            samples, intermediates = self.sample(cond=cond, batch_size=batch_size, return_intermediates=True, **kwargs)
 
         return samples, intermediates
 
@@ -1226,7 +1294,7 @@ def get_unconditional_conditioning(self, batch_size, null_label=None):
                 return self.get_learned_conditioning(xc)
             else:
                 raise NotImplementedError("todo")
-        if isinstance(c, list):  # in case the encoder gives us a list
+        if isinstance(c, list):    # in case the encoder gives us a list
             for i in range(len(c)):
                 c[i] = repeat(c[i], '1 ... -> b ...', b=batch_size).to(self.device)
         else:
@@ -1234,16 +1302,29 @@ def get_unconditional_conditioning(self, batch_size, null_label=None):
         return c
 
     @torch.no_grad()
-    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0., return_keys=None,
-                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
-                   plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
+    def log_images(self,
+                   batch,
+                   N=8,
+                   n_row=4,
+                   sample=True,
+                   ddim_steps=50,
+                   ddim_eta=0.,
+                   return_keys=None,
+                   quantize_denoised=True,
+                   inpaint=True,
+                   plot_denoise_rows=False,
+                   plot_progressive_rows=True,
+                   plot_diffusion_rows=True,
+                   unconditional_guidance_scale=1.,
+                   unconditional_guidance_label=None,
                    use_ema_scope=True,
                    **kwargs):
         ema_scope = self.ema_scope if use_ema_scope else nullcontext
         use_ddim = ddim_steps is not None
 
         log = dict()
-        z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key,
+        z, c, x, xrec, xc = self.get_input(batch,
+                                           self.first_stage_key,
                                            return_first_stage_outputs=True,
                                            force_c_encode=True,
                                            return_original_cond=True,
@@ -1283,7 +1364,7 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0
                     z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                     diffusion_row.append(self.decode_first_stage(z_noisy))
 
-            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
+            diffusion_row = torch.stack(diffusion_row)    # n_log_step, n_row, C, H, W
             diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
             diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
             diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
@@ -1292,8 +1373,11 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0
         if sample:
             # get denoise row
             with ema_scope("Sampling"):
-                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
-                                                         ddim_steps=ddim_steps, eta=ddim_eta)
+                samples, z_denoise_row = self.sample_log(cond=c,
+                                                         batch_size=N,
+                                                         ddim=use_ddim,
+                                                         ddim_steps=ddim_steps,
+                                                         eta=ddim_eta)
                 # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
             x_samples = self.decode_first_stage(samples)
             log["samples"] = x_samples
@@ -1305,8 +1389,11 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0
                     self.first_stage_model, IdentityFirstStage):
                 # also display when quantizing x0 while sampling
                 with ema_scope("Plotting Quantized Denoised"):
-                    samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
-                                                             ddim_steps=ddim_steps, eta=ddim_eta,
+                    samples, z_denoise_row = self.sample_log(cond=c,
+                                                             batch_size=N,
+                                                             ddim=use_ddim,
+                                                             ddim_steps=ddim_steps,
+                                                             eta=ddim_eta,
                                                              quantize_denoised=True)
                     # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
                     #                                      quantize_denoised=True)
@@ -1318,11 +1405,15 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0
             if self.model.conditioning_key == "crossattn-adm":
                 uc = {"c_crossattn": [uc], "c_adm": c["c_adm"]}
             with ema_scope("Sampling with classifier-free guidance"):
-                samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
-                                                 ddim_steps=ddim_steps, eta=ddim_eta,
-                                                 unconditional_guidance_scale=unconditional_guidance_scale,
-                                                 unconditional_conditioning=uc,
-                                                 )
+                samples_cfg, _ = self.sample_log(
+                    cond=c,
+                    batch_size=N,
+                    ddim=use_ddim,
+                    ddim_steps=ddim_steps,
+                    eta=ddim_eta,
+                    unconditional_guidance_scale=unconditional_guidance_scale,
+                    unconditional_conditioning=uc,
+                )
                 x_samples_cfg = self.decode_first_stage(samples_cfg)
                 log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
 
@@ -1334,8 +1425,13 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0
             mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
             mask = mask[:, None, ...]
             with ema_scope("Plotting Inpaint"):
-                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
-                                             ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+                samples, _ = self.sample_log(cond=c,
+                                             batch_size=N,
+                                             ddim=use_ddim,
+                                             eta=ddim_eta,
+                                             ddim_steps=ddim_steps,
+                                             x0=z[:N],
+                                             mask=mask)
             x_samples = self.decode_first_stage(samples.to(self.device))
             log["samples_inpainting"] = x_samples
             log["mask"] = mask
@@ -1343,8 +1439,13 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0
             # outpaint
             mask = 1. - mask
             with ema_scope("Plotting Outpaint"):
-                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
-                                             ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+                samples, _ = self.sample_log(cond=c,
+                                             batch_size=N,
+                                             ddim=use_ddim,
+                                             eta=ddim_eta,
+                                             ddim_steps=ddim_steps,
+                                             x0=z[:N],
+                                             mask=mask)
             x_samples = self.decode_first_stage(samples.to(self.device))
             log["samples_outpainting"] = x_samples
 
@@ -1367,10 +1468,10 @@ def configure_optimizers(self):
         lr = self.learning_rate
         params = list(self.model.parameters())
         if self.cond_stage_trainable:
-            print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
+            rank_zero_info(f"{self.__class__.__name__}: Also optimizing conditioner params!")
             params = params + list(self.cond_stage_model.parameters())
         if self.learn_logvar:
-            print('Diffusion model optimizing logvar')
+            rank_zero_info('Diffusion model optimizing logvar')
             params.append(self.logvar)
 
         from colossalai.nn.optimizer import HybridAdam
@@ -1381,13 +1482,8 @@ def configure_optimizers(self):
             assert 'target' in self.scheduler_config
             scheduler = instantiate_from_config(self.scheduler_config)
 
-            print("Setting up LambdaLR scheduler...")
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                }]
+            rank_zero_info("Setting up LambdaLR scheduler...")
+            scheduler = [{'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule), 'interval': 'step', 'frequency': 1}]
             return [opt], scheduler
         return opt
 
@@ -1402,6 +1498,7 @@ def to_rgb(self, x):
 
 
 class DiffusionWrapper(pl.LightningModule):
+
     def __init__(self, diff_model_config, conditioning_key):
         super().__init__()
         self.sequential_cross_attn = diff_model_config.pop("sequential_crossattn", False)
@@ -1444,6 +1541,7 @@ def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=N
 
 
 class LatentUpscaleDiffusion(LatentDiffusion):
+
     def __init__(self, *args, low_scale_config, low_scale_key="LR", noise_level_key=None, **kwargs):
         super().__init__(*args, **kwargs)
         # assumes that neither the cond_stage nor the low_scale_model contain trainable params
@@ -1464,8 +1562,12 @@ def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False):
         if not log_mode:
             z, c = super().get_input(batch, k, force_c_encode=True, bs=bs)
         else:
-            z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
-                                                  force_c_encode=True, return_original_cond=True, bs=bs)
+            z, c, x, xrec, xc = super().get_input(batch,
+                                                  self.first_stage_key,
+                                                  return_first_stage_outputs=True,
+                                                  force_c_encode=True,
+                                                  return_original_cond=True,
+                                                  bs=bs)
         x_low = batch[self.low_scale_key][:bs]
         x_low = rearrange(x_low, 'b h w c -> b c h w')
         if self.use_fp16:
@@ -1485,15 +1587,28 @@ def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False):
         return z, all_conds
 
     @torch.no_grad()
-    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
-                   plot_denoise_rows=False, plot_progressive_rows=True, plot_diffusion_rows=True,
-                   unconditional_guidance_scale=1., unconditional_guidance_label=None, use_ema_scope=True,
+    def log_images(self,
+                   batch,
+                   N=8,
+                   n_row=4,
+                   sample=True,
+                   ddim_steps=200,
+                   ddim_eta=1.,
+                   return_keys=None,
+                   plot_denoise_rows=False,
+                   plot_progressive_rows=True,
+                   plot_diffusion_rows=True,
+                   unconditional_guidance_scale=1.,
+                   unconditional_guidance_label=None,
+                   use_ema_scope=True,
                    **kwargs):
         ema_scope = self.ema_scope if use_ema_scope else nullcontext
         use_ddim = ddim_steps is not None
 
         log = dict()
-        z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input(batch, self.first_stage_key, bs=N,
+        z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input(batch,
+                                                                          self.first_stage_key,
+                                                                          bs=N,
                                                                           log_mode=True)
         N = min(x.shape[0], N)
         n_row = min(x.shape[0], n_row)
@@ -1528,7 +1643,7 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=
                     z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                     diffusion_row.append(self.decode_first_stage(z_noisy))
 
-            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
+            diffusion_row = torch.stack(diffusion_row)    # n_log_step, n_row, C, H, W
             diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
             diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
             diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
@@ -1537,8 +1652,11 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=
         if sample:
             # get denoise row
             with ema_scope("Sampling"):
-                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
-                                                         ddim_steps=ddim_steps, eta=ddim_eta)
+                samples, z_denoise_row = self.sample_log(cond=c,
+                                                         batch_size=N,
+                                                         ddim=use_ddim,
+                                                         ddim_steps=ddim_steps,
+                                                         eta=ddim_eta)
                 # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
             x_samples = self.decode_first_stage(samples)
             log["samples"] = x_samples
@@ -1555,7 +1673,7 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=
                 if k == "c_crossattn":
                     assert isinstance(c[k], list) and len(c[k]) == 1
                     uc[k] = [uc_tmp]
-                elif k == "c_adm":  # todo: only run with text-based guidance?
+                elif k == "c_adm":    # todo: only run with text-based guidance?
                     assert isinstance(c[k], torch.Tensor)
                     #uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level
                     uc[k] = c[k]
@@ -1565,11 +1683,15 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=
                     uc[k] = c[k]
 
             with ema_scope("Sampling with classifier-free guidance"):
-                samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
-                                                 ddim_steps=ddim_steps, eta=ddim_eta,
-                                                 unconditional_guidance_scale=unconditional_guidance_scale,
-                                                 unconditional_conditioning=uc,
-                                                 )
+                samples_cfg, _ = self.sample_log(
+                    cond=c,
+                    batch_size=N,
+                    ddim=use_ddim,
+                    ddim_steps=ddim_steps,
+                    eta=ddim_eta,
+                    unconditional_guidance_scale=unconditional_guidance_scale,
+                    unconditional_conditioning=uc,
+                )
                 x_samples_cfg = self.decode_first_stage(samples_cfg)
                 log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
 
@@ -1590,18 +1712,18 @@ class LatentFinetuneDiffusion(LatentDiffusion):
          To disable finetuning mode, set finetune_keys to None
     """
 
-    def __init__(self,
-                 concat_keys: tuple,
-                 finetune_keys=("model.diffusion_model.input_blocks.0.0.weight",
-                                "model_ema.diffusion_modelinput_blocks00weight"
-                                ),
-                 keep_finetune_dims=4,
-                 # if model was trained without concat mode before and we would like to keep these channels
-                 c_concat_log_start=None,  # to log reconstruction of c_concat codes
-                 c_concat_log_end=None,
-                 *args, **kwargs
-                 ):
-        ckpt_path = kwargs.pop("ckpt_path", None)
+    def __init__(
+            self,
+            concat_keys: tuple,
+            finetune_keys=("model.diffusion_model.input_blocks.0.0.weight",
+                           "model_ema.diffusion_modelinput_blocks00weight"),
+            keep_finetune_dims=4,
+    # if model was trained without concat mode before and we would like to keep these channels
+            c_concat_log_start=None,    # to log reconstruction of c_concat codes
+            c_concat_log_end=None,
+            *args,
+            **kwargs):
+        ckpt = kwargs.pop("ckpt", None)
         ignore_keys = kwargs.pop("ignore_keys", list())
         super().__init__(*args, **kwargs)
         self.finetune_keys = finetune_keys
@@ -1609,9 +1731,10 @@ def __init__(self,
         self.keep_dims = keep_finetune_dims
         self.c_concat_log_start = c_concat_log_start
         self.c_concat_log_end = c_concat_log_end
-        if exists(self.finetune_keys): assert exists(ckpt_path), 'can only finetune from a given checkpoint'
-        if exists(ckpt_path):
-            self.init_from_ckpt(ckpt_path, ignore_keys)
+        if exists(self.finetune_keys):
+            assert exists(ckpt), 'can only finetune from a given checkpoint'
+        if exists(ckpt):
+            self.init_from_ckpt(ckpt, ignore_keys)
 
     def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
         sd = torch.load(path, map_location="cpu")
@@ -1621,7 +1744,7 @@ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
         for k in keys:
             for ik in ignore_keys:
                 if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
+                    rank_zero_info("Deleting key {} from state_dict.".format(k))
                     del sd[k]
 
             # make it explicit, finetune by including extra input channels
@@ -1629,25 +1752,38 @@ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
                 new_entry = None
                 for name, param in self.named_parameters():
                     if name in self.finetune_keys:
-                        print(
-                            f"modifying key '{name}' and keeping its original {self.keep_dims} (channels) dimensions only")
-                        new_entry = torch.zeros_like(param)  # zero init
+                        rank_zero_info(
+                            f"modifying key '{name}' and keeping its original {self.keep_dims} (channels) dimensions only"
+                        )
+                        new_entry = torch.zeros_like(param)    # zero init
                 assert exists(new_entry), 'did not find matching parameter to modify'
                 new_entry[:, :self.keep_dims, ...] = sd[k]
                 sd[k] = new_entry
 
         missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
             sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        rank_zero_info(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
         if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
+            rank_zero_info(f"Missing Keys: {missing}")
         if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
+            rank_zero_info(f"Unexpected Keys: {unexpected}")
 
     @torch.no_grad()
-    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
-                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
-                   plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
+    def log_images(self,
+                   batch,
+                   N=8,
+                   n_row=4,
+                   sample=True,
+                   ddim_steps=200,
+                   ddim_eta=1.,
+                   return_keys=None,
+                   quantize_denoised=True,
+                   inpaint=True,
+                   plot_denoise_rows=False,
+                   plot_progressive_rows=True,
+                   plot_diffusion_rows=True,
+                   unconditional_guidance_scale=1.,
+                   unconditional_guidance_label=None,
                    use_ema_scope=True,
                    **kwargs):
         ema_scope = self.ema_scope if use_ema_scope else nullcontext
@@ -1690,7 +1826,7 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=
                     z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                     diffusion_row.append(self.decode_first_stage(z_noisy))
 
-            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
+            diffusion_row = torch.stack(diffusion_row)    # n_log_step, n_row, C, H, W
             diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
             diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
             diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
@@ -1699,9 +1835,14 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=
         if sample:
             # get denoise row
             with ema_scope("Sampling"):
-                samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
-                                                         batch_size=N, ddim=use_ddim,
-                                                         ddim_steps=ddim_steps, eta=ddim_eta)
+                samples, z_denoise_row = self.sample_log(cond={
+                    "c_concat": [c_cat],
+                    "c_crossattn": [c]
+                },
+                                                         batch_size=N,
+                                                         ddim=use_ddim,
+                                                         ddim_steps=ddim_steps,
+                                                         eta=ddim_eta)
                 # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
             x_samples = self.decode_first_stage(samples)
             log["samples"] = x_samples
@@ -1714,12 +1855,18 @@ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=
             uc_cat = c_cat
             uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
             with ema_scope("Sampling with classifier-free guidance"):
-                samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
-                                                 batch_size=N, ddim=use_ddim,
-                                                 ddim_steps=ddim_steps, eta=ddim_eta,
-                                                 unconditional_guidance_scale=unconditional_guidance_scale,
-                                                 unconditional_conditioning=uc_full,
-                                                 )
+                samples_cfg, _ = self.sample_log(
+                    cond={
+                        "c_concat": [c_cat],
+                        "c_crossattn": [c]
+                    },
+                    batch_size=N,
+                    ddim=use_ddim,
+                    ddim_steps=ddim_steps,
+                    eta=ddim_eta,
+                    unconditional_guidance_scale=unconditional_guidance_scale,
+                    unconditional_conditioning=uc_full,
+                )
                 x_samples_cfg = self.decode_first_stage(samples_cfg)
                 log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
 
@@ -1733,11 +1880,7 @@ class LatentInpaintDiffusion(LatentFinetuneDiffusion):
     To disable finetuning mode, set finetune_keys to None
      """
 
-    def __init__(self,
-                 concat_keys=("mask", "masked_image"),
-                 masked_image_key="masked_image",
-                 *args, **kwargs
-                 ):
+    def __init__(self, concat_keys=("mask", "masked_image"), masked_image_key="masked_image", *args, **kwargs):
         super().__init__(concat_keys, *args, **kwargs)
         self.masked_image_key = masked_image_key
         assert self.masked_image_key in concat_keys
@@ -1746,8 +1889,12 @@ def __init__(self,
     def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
         # note: restricted to non-trainable encoders currently
         assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for inpainting'
-        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
-                                              force_c_encode=True, return_original_cond=True, bs=bs)
+        z, c, x, xrec, xc = super().get_input(batch,
+                                              self.first_stage_key,
+                                              return_first_stage_outputs=True,
+                                              force_c_encode=True,
+                                              return_original_cond=True,
+                                              bs=bs)
 
         assert exists(self.concat_keys)
         c_cat = list()
@@ -1793,8 +1940,12 @@ def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwarg
     def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
         # note: restricted to non-trainable encoders currently
         assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for depth2img'
-        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
-                                              force_c_encode=True, return_original_cond=True, bs=bs)
+        z, c, x, xrec, xc = super().get_input(batch,
+                                              self.first_stage_key,
+                                              return_first_stage_outputs=True,
+                                              force_c_encode=True,
+                                              return_original_cond=True,
+                                              bs=bs)
 
         assert exists(self.concat_keys)
         assert len(self.concat_keys) == 1
@@ -1812,7 +1963,8 @@ def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs
                 align_corners=False,
             )
 
-            depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc, dim=[1, 2, 3],
+            depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc,
+                                                                                           dim=[1, 2, 3],
                                                                                            keepdim=True)
             cc = 2. * (cc - depth_min) / (depth_max - depth_min + 0.001) - 1.
             c_cat.append(cc)
@@ -1836,13 +1988,19 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
     """
         condition on low-res image (and optionally on some spatial noise augmentation)
     """
-    def __init__(self, concat_keys=("lr",), reshuffle_patch_size=None,
-                 low_scale_config=None, low_scale_key=None, *args, **kwargs):
+
+    def __init__(self,
+                 concat_keys=("lr",),
+                 reshuffle_patch_size=None,
+                 low_scale_config=None,
+                 low_scale_key=None,
+                 *args,
+                 **kwargs):
         super().__init__(concat_keys=concat_keys, *args, **kwargs)
         self.reshuffle_patch_size = reshuffle_patch_size
         self.low_scale_model = None
         if low_scale_config is not None:
-            print("Initializing a low-scale model")
+            rank_zero_info("Initializing a low-scale model")
             assert exists(low_scale_key)
             self.instantiate_low_stage(low_scale_config)
             self.low_scale_key = low_scale_key
@@ -1858,8 +2016,12 @@ def instantiate_low_stage(self, config):
     def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
         # note: restricted to non-trainable encoders currently
         assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for upscaling-ft'
-        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
-                                              force_c_encode=True, return_original_cond=True, bs=bs)
+        z, c, x, xrec, xc = super().get_input(batch,
+                                              self.first_stage_key,
+                                              return_first_stage_outputs=True,
+                                              force_c_encode=True,
+                                              return_original_cond=True,
+                                              bs=bs)
 
         assert exists(self.concat_keys)
         assert len(self.concat_keys) == 1
@@ -1871,8 +2033,10 @@ def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs
             cc = rearrange(cc, 'b h w c -> b c h w')
             if exists(self.reshuffle_patch_size):
                 assert isinstance(self.reshuffle_patch_size, int)
-                cc = rearrange(cc, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
-                               p1=self.reshuffle_patch_size, p2=self.reshuffle_patch_size)
+                cc = rearrange(cc,
+                               'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
+                               p1=self.reshuffle_patch_size,
+                               p2=self.reshuffle_patch_size)
             if bs is not None:
                 cc = cc[:bs]
                 cc = cc.to(self.device)
diff --git a/examples/images/diffusion/ldm/modules/diffusionmodules/model.py b/examples/images/diffusion/ldm/modules/diffusionmodules/model.py
index 57b9a4b80f4b..fb088db58919 100644
--- a/examples/images/diffusion/ldm/modules/diffusionmodules/model.py
+++ b/examples/images/diffusion/ldm/modules/diffusionmodules/model.py
@@ -1,10 +1,11 @@
 # pytorch_diffusion + derived encoder decoder
 import math
+from typing import Any, Optional
+
+import numpy as np
 import torch
 import torch.nn as nn
-import numpy as np
 from einops import rearrange
-from typing import Optional, Any
 
 try:
     from lightning.pytorch.utilities import rank_zero_info
@@ -38,14 +39,14 @@ def get_timestep_embedding(timesteps, embedding_dim):
     emb = emb.to(device=timesteps.device)
     emb = timesteps.float()[:, None] * emb[None, :]
     emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
-    if embedding_dim % 2 == 1:  # zero pad
-        emb = torch.nn.functional.pad(emb, (0,1,0,0))
+    if embedding_dim % 2 == 1:    # zero pad
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
     return emb
 
 
 def nonlinearity(x):
     # swish
-    return x*torch.sigmoid(x)
+    return x * torch.sigmoid(x)
 
 
 def Normalize(in_channels, num_groups=32):
@@ -53,15 +54,12 @@ def Normalize(in_channels, num_groups=32):
 
 
 class Upsample(nn.Module):
+
     def __init__(self, in_channels, with_conv):
         super().__init__()
         self.with_conv = with_conv
         if self.with_conv:
-            self.conv = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
+            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
 
     def forward(self, x):
         x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
@@ -71,20 +69,17 @@ def forward(self, x):
 
 
 class Downsample(nn.Module):
+
     def __init__(self, in_channels, with_conv):
         super().__init__()
         self.with_conv = with_conv
         if self.with_conv:
             # no asymmetric padding in torch conv, must do it ourselves
-            self.conv = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=3,
-                                        stride=2,
-                                        padding=0)
+            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
 
     def forward(self, x):
         if self.with_conv:
-            pad = (0,1,0,1)
+            pad = (0, 1, 0, 1)
             x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
             x = self.conv(x)
         else:
@@ -93,8 +88,8 @@ def forward(self, x):
 
 
 class ResnetBlock(nn.Module):
-    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
-                 dropout, temb_channels=512):
+
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, dropout, temb_channels=512):
         super().__init__()
         self.in_channels = in_channels
         out_channels = in_channels if out_channels is None else out_channels
@@ -102,34 +97,17 @@ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
         self.use_conv_shortcut = conv_shortcut
 
         self.norm1 = Normalize(in_channels)
-        self.conv1 = torch.nn.Conv2d(in_channels,
-                                     out_channels,
-                                     kernel_size=3,
-                                     stride=1,
-                                     padding=1)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
         if temb_channels > 0:
-            self.temb_proj = torch.nn.Linear(temb_channels,
-                                             out_channels)
+            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
         self.norm2 = Normalize(out_channels)
         self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = torch.nn.Conv2d(out_channels,
-                                     out_channels,
-                                     kernel_size=3,
-                                     stride=1,
-                                     padding=1)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
         if self.in_channels != self.out_channels:
             if self.use_conv_shortcut:
-                self.conv_shortcut = torch.nn.Conv2d(in_channels,
-                                                     out_channels,
-                                                     kernel_size=3,
-                                                     stride=1,
-                                                     padding=1)
+                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
             else:
-                self.nin_shortcut = torch.nn.Conv2d(in_channels,
-                                                    out_channels,
-                                                    kernel_size=1,
-                                                    stride=1,
-                                                    padding=0)
+                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
 
     def forward(self, x, temb):
         h = x
@@ -138,7 +116,7 @@ def forward(self, x, temb):
         h = self.conv1(h)
 
         if temb is not None:
-            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
 
         h = self.norm2(h)
         h = nonlinearity(h)
@@ -151,35 +129,20 @@ def forward(self, x, temb):
             else:
                 x = self.nin_shortcut(x)
 
-        return x+h
+        return x + h
 
 
 class AttnBlock(nn.Module):
+
     def __init__(self, in_channels):
         super().__init__()
         self.in_channels = in_channels
 
         self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.k = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.v = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.proj_out = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=1,
-                                        stride=1,
-                                        padding=0)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
 
     def forward(self, x):
         h_ = x
@@ -189,23 +152,24 @@ def forward(self, x):
         v = self.v(h_)
 
         # compute attention
-        b,c,h,w = q.shape
-        q = q.reshape(b,c,h*w)
-        q = q.permute(0,2,1)   # b,hw,c
-        k = k.reshape(b,c,h*w) # b,c,hw
-        w_ = torch.bmm(q,k)     # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)    # b,hw,c
+        k = k.reshape(b, c, h * w)    # b,c,hw
+        w_ = torch.bmm(q, k)    # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
         w_ = w_ * (int(c)**(-0.5))
         w_ = torch.nn.functional.softmax(w_, dim=2)
 
         # attend to values
-        v = v.reshape(b,c,h*w)
-        w_ = w_.permute(0,2,1)   # b,hw,hw (first hw of k, second of q)
-        h_ = torch.bmm(v,w_)     # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
-        h_ = h_.reshape(b,c,h,w)
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)    # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v, w_)    # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b, c, h, w)
 
         h_ = self.proj_out(h_)
 
-        return x+h_
+        return x + h_
+
 
 class MemoryEfficientAttnBlock(nn.Module):
     """
@@ -213,32 +177,17 @@ class MemoryEfficientAttnBlock(nn.Module):
         see https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
         Note: this is a single-head self-attention operation
     """
+
     #
     def __init__(self, in_channels):
         super().__init__()
         self.in_channels = in_channels
 
         self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.k = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.v = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.proj_out = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=1,
-                                        stride=1,
-                                        padding=0)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
         self.attention_op: Optional[Any] = None
 
     def forward(self, x):
@@ -253,27 +202,20 @@ def forward(self, x):
         q, k, v = map(lambda x: rearrange(x, 'b c h w -> b (h w) c'), (q, k, v))
 
         q, k, v = map(
-            lambda t: t.unsqueeze(3)
-            .reshape(B, t.shape[1], 1, C)
-            .permute(0, 2, 1, 3)
-            .reshape(B * 1, t.shape[1], C)
-            .contiguous(),
+            lambda t: t.unsqueeze(3).reshape(B, t.shape[1], 1, C).permute(0, 2, 1, 3).reshape(B * 1, t.shape[1], C).
+            contiguous(),
             (q, k, v),
         )
         out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)
 
-        out = (
-            out.unsqueeze(0)
-            .reshape(B, 1, out.shape[1], C)
-            .permute(0, 2, 1, 3)
-            .reshape(B, out.shape[1], C)
-        )
+        out = (out.unsqueeze(0).reshape(B, 1, out.shape[1], C).permute(0, 2, 1, 3).reshape(B, out.shape[1], C))
         out = rearrange(out, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C)
         out = self.proj_out(out)
-        return x+out
+        return x + out
 
 
 class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention):
+
     def forward(self, x, context=None, mask=None):
         b, c, h, w = x.shape
         x = rearrange(x, 'b c h w -> b (h w) c')
@@ -283,10 +225,10 @@ def forward(self, x, context=None, mask=None):
 
 
 def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
-    assert attn_type in ["vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear", "none"], f'attn_type {attn_type} unknown'
+    assert attn_type in ["vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear",
+                         "none"], f'attn_type {attn_type} unknown'
     if XFORMERS_IS_AVAILBLE and attn_type == "vanilla":
         attn_type = "vanilla-xformers"
-    rank_zero_info(f"making attention of type '{attn_type}' with {in_channels} in_channels")
     if attn_type == "vanilla":
         assert attn_kwargs is None
         return AttnBlock(in_channels)
@@ -303,13 +245,26 @@ def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
 
 
 class Model(nn.Module):
-    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
-                 resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
+
+    def __init__(self,
+                 *,
+                 ch,
+                 out_ch,
+                 ch_mult=(1, 2, 4, 8),
+                 num_res_blocks,
+                 attn_resolutions,
+                 dropout=0.0,
+                 resamp_with_conv=True,
+                 in_channels,
+                 resolution,
+                 use_timestep=True,
+                 use_linear_attn=False,
+                 attn_type="vanilla"):
         super().__init__()
-        if use_linear_attn: attn_type = "linear"
+        if use_linear_attn:
+            attn_type = "linear"
         self.ch = ch
-        self.temb_ch = self.ch*4
+        self.temb_ch = self.ch * 4
         self.num_resolutions = len(ch_mult)
         self.num_res_blocks = num_res_blocks
         self.resolution = resolution
@@ -320,39 +275,34 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
             # timestep embedding
             self.temb = nn.Module()
             self.temb.dense = nn.ModuleList([
-                torch.nn.Linear(self.ch,
-                                self.temb_ch),
-                torch.nn.Linear(self.temb_ch,
-                                self.temb_ch),
+                torch.nn.Linear(self.ch, self.temb_ch),
+                torch.nn.Linear(self.temb_ch, self.temb_ch),
             ])
 
         # downsampling
-        self.conv_in = torch.nn.Conv2d(in_channels,
-                                       self.ch,
-                                       kernel_size=3,
-                                       stride=1,
-                                       padding=1)
+        self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
 
         curr_res = resolution
-        in_ch_mult = (1,)+tuple(ch_mult)
+        in_ch_mult = (1,) + tuple(ch_mult)
         self.down = nn.ModuleList()
         for i_level in range(self.num_resolutions):
             block = nn.ModuleList()
             attn = nn.ModuleList()
-            block_in = ch*in_ch_mult[i_level]
-            block_out = ch*ch_mult[i_level]
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
             for i_block in range(self.num_res_blocks):
-                block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
+                block.append(
+                    ResnetBlock(in_channels=block_in,
+                                out_channels=block_out,
+                                temb_channels=self.temb_ch,
+                                dropout=dropout))
                 block_in = block_out
                 if curr_res in attn_resolutions:
                     attn.append(make_attn(block_in, attn_type=attn_type))
             down = nn.Module()
             down.block = block
             down.attn = attn
-            if i_level != self.num_resolutions-1:
+            if i_level != self.num_resolutions - 1:
                 down.downsample = Downsample(block_in, resamp_with_conv)
                 curr_res = curr_res // 2
             self.down.append(down)
@@ -374,15 +324,16 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
         for i_level in reversed(range(self.num_resolutions)):
             block = nn.ModuleList()
             attn = nn.ModuleList()
-            block_out = ch*ch_mult[i_level]
-            skip_in = ch*ch_mult[i_level]
-            for i_block in range(self.num_res_blocks+1):
+            block_out = ch * ch_mult[i_level]
+            skip_in = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
                 if i_block == self.num_res_blocks:
-                    skip_in = ch*in_ch_mult[i_level]
-                block.append(ResnetBlock(in_channels=block_in+skip_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
+                    skip_in = ch * in_ch_mult[i_level]
+                block.append(
+                    ResnetBlock(in_channels=block_in + skip_in,
+                                out_channels=block_out,
+                                temb_channels=self.temb_ch,
+                                dropout=dropout))
                 block_in = block_out
                 if curr_res in attn_resolutions:
                     attn.append(make_attn(block_in, attn_type=attn_type))
@@ -392,15 +343,11 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
             if i_level != 0:
                 up.upsample = Upsample(block_in, resamp_with_conv)
                 curr_res = curr_res * 2
-            self.up.insert(0, up) # prepend to get consistent order
+            self.up.insert(0, up)    # prepend to get consistent order
 
         # end
         self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(block_in,
-                                        out_ch,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
+        self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
 
     def forward(self, x, t=None, context=None):
         #assert x.shape[2] == x.shape[3] == self.resolution
@@ -425,7 +372,7 @@ def forward(self, x, t=None, context=None):
                 if len(self.down[i_level].attn) > 0:
                     h = self.down[i_level].attn[i_block](h)
                 hs.append(h)
-            if i_level != self.num_resolutions-1:
+            if i_level != self.num_resolutions - 1:
                 hs.append(self.down[i_level].downsample(hs[-1]))
 
         # middle
@@ -436,9 +383,8 @@ def forward(self, x, t=None, context=None):
 
         # upsampling
         for i_level in reversed(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks+1):
-                h = self.up[i_level].block[i_block](
-                    torch.cat([h, hs.pop()], dim=1), temb)
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](torch.cat([h, hs.pop()], dim=1), temb)
                 if len(self.up[i_level].attn) > 0:
                     h = self.up[i_level].attn[i_block](h)
             if i_level != 0:
@@ -455,12 +401,26 @@ def get_last_layer(self):
 
 
 class Encoder(nn.Module):
-    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
-                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
+
+    def __init__(self,
+                 *,
+                 ch,
+                 out_ch,
+                 ch_mult=(1, 2, 4, 8),
+                 num_res_blocks,
+                 attn_resolutions,
+                 dropout=0.0,
+                 resamp_with_conv=True,
+                 in_channels,
+                 resolution,
+                 z_channels,
+                 double_z=True,
+                 use_linear_attn=False,
+                 attn_type="vanilla",
                  **ignore_kwargs):
         super().__init__()
-        if use_linear_attn: attn_type = "linear"
+        if use_linear_attn:
+            attn_type = "linear"
         self.ch = ch
         self.temb_ch = 0
         self.num_resolutions = len(ch_mult)
@@ -469,33 +429,30 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
         self.in_channels = in_channels
 
         # downsampling
-        self.conv_in = torch.nn.Conv2d(in_channels,
-                                       self.ch,
-                                       kernel_size=3,
-                                       stride=1,
-                                       padding=1)
+        self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
 
         curr_res = resolution
-        in_ch_mult = (1,)+tuple(ch_mult)
+        in_ch_mult = (1,) + tuple(ch_mult)
         self.in_ch_mult = in_ch_mult
         self.down = nn.ModuleList()
         for i_level in range(self.num_resolutions):
             block = nn.ModuleList()
             attn = nn.ModuleList()
-            block_in = ch*in_ch_mult[i_level]
-            block_out = ch*ch_mult[i_level]
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
             for i_block in range(self.num_res_blocks):
-                block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
+                block.append(
+                    ResnetBlock(in_channels=block_in,
+                                out_channels=block_out,
+                                temb_channels=self.temb_ch,
+                                dropout=dropout))
                 block_in = block_out
                 if curr_res in attn_resolutions:
                     attn.append(make_attn(block_in, attn_type=attn_type))
             down = nn.Module()
             down.block = block
             down.attn = attn
-            if i_level != self.num_resolutions-1:
+            if i_level != self.num_resolutions - 1:
                 down.downsample = Downsample(block_in, resamp_with_conv)
                 curr_res = curr_res // 2
             self.down.append(down)
@@ -515,7 +472,7 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
         # end
         self.norm_out = Normalize(block_in)
         self.conv_out = torch.nn.Conv2d(block_in,
-                                        2*z_channels if double_z else z_channels,
+                                        2 * z_channels if double_z else z_channels,
                                         kernel_size=3,
                                         stride=1,
                                         padding=1)
@@ -532,7 +489,7 @@ def forward(self, x):
                 if len(self.down[i_level].attn) > 0:
                     h = self.down[i_level].attn[i_block](h)
                 hs.append(h)
-            if i_level != self.num_resolutions-1:
+            if i_level != self.num_resolutions - 1:
                 hs.append(self.down[i_level].downsample(hs[-1]))
 
         # middle
@@ -549,12 +506,27 @@ def forward(self, x):
 
 
 class Decoder(nn.Module):
-    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
-                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
-                 attn_type="vanilla", **ignorekwargs):
+
+    def __init__(self,
+                 *,
+                 ch,
+                 out_ch,
+                 ch_mult=(1, 2, 4, 8),
+                 num_res_blocks,
+                 attn_resolutions,
+                 dropout=0.0,
+                 resamp_with_conv=True,
+                 in_channels,
+                 resolution,
+                 z_channels,
+                 give_pre_end=False,
+                 tanh_out=False,
+                 use_linear_attn=False,
+                 attn_type="vanilla",
+                 **ignorekwargs):
         super().__init__()
-        if use_linear_attn: attn_type = "linear"
+        if use_linear_attn:
+            attn_type = "linear"
         self.ch = ch
         self.temb_ch = 0
         self.num_resolutions = len(ch_mult)
@@ -565,19 +537,14 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
         self.tanh_out = tanh_out
 
         # compute in_ch_mult, block_in and curr_res at lowest res
-        in_ch_mult = (1,)+tuple(ch_mult)
-        block_in = ch*ch_mult[self.num_resolutions-1]
-        curr_res = resolution // 2**(self.num_resolutions-1)
-        self.z_shape = (1,z_channels,curr_res,curr_res)
-        rank_zero_info("Working with z of shape {} = {} dimensions.".format(
-            self.z_shape, np.prod(self.z_shape)))
+        in_ch_mult = (1,) + tuple(ch_mult)
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2**(self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        rank_zero_info("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
 
         # z to block_in
-        self.conv_in = torch.nn.Conv2d(z_channels,
-                                       block_in,
-                                       kernel_size=3,
-                                       stride=1,
-                                       padding=1)
+        self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
 
         # middle
         self.mid = nn.Module()
@@ -596,12 +563,13 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
         for i_level in reversed(range(self.num_resolutions)):
             block = nn.ModuleList()
             attn = nn.ModuleList()
-            block_out = ch*ch_mult[i_level]
-            for i_block in range(self.num_res_blocks+1):
-                block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                block.append(
+                    ResnetBlock(in_channels=block_in,
+                                out_channels=block_out,
+                                temb_channels=self.temb_ch,
+                                dropout=dropout))
                 block_in = block_out
                 if curr_res in attn_resolutions:
                     attn.append(make_attn(block_in, attn_type=attn_type))
@@ -611,15 +579,11 @@ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
             if i_level != 0:
                 up.upsample = Upsample(block_in, resamp_with_conv)
                 curr_res = curr_res * 2
-            self.up.insert(0, up) # prepend to get consistent order
+            self.up.insert(0, up)    # prepend to get consistent order
 
         # end
         self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(block_in,
-                                        out_ch,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
+        self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
 
     def forward(self, z):
         #assert z.shape[1:] == self.z_shape[1:]
@@ -638,7 +602,7 @@ def forward(self, z):
 
         # upsampling
         for i_level in reversed(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks+1):
+            for i_block in range(self.num_res_blocks + 1):
                 h = self.up[i_level].block[i_block](h, temb)
                 if len(self.up[i_level].attn) > 0:
                     h = self.up[i_level].attn[i_block](h)
@@ -658,31 +622,24 @@ def forward(self, z):
 
 
 class SimpleDecoder(nn.Module):
+
     def __init__(self, in_channels, out_channels, *args, **kwargs):
         super().__init__()
-        self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
-                                     ResnetBlock(in_channels=in_channels,
-                                                 out_channels=2 * in_channels,
-                                                 temb_channels=0, dropout=0.0),
-                                     ResnetBlock(in_channels=2 * in_channels,
-                                                out_channels=4 * in_channels,
-                                                temb_channels=0, dropout=0.0),
-                                     ResnetBlock(in_channels=4 * in_channels,
-                                                out_channels=2 * in_channels,
-                                                temb_channels=0, dropout=0.0),
-                                     nn.Conv2d(2*in_channels, in_channels, 1),
-                                     Upsample(in_channels, with_conv=True)])
+        self.model = nn.ModuleList([
+            nn.Conv2d(in_channels, in_channels, 1),
+            ResnetBlock(in_channels=in_channels, out_channels=2 * in_channels, temb_channels=0, dropout=0.0),
+            ResnetBlock(in_channels=2 * in_channels, out_channels=4 * in_channels, temb_channels=0, dropout=0.0),
+            ResnetBlock(in_channels=4 * in_channels, out_channels=2 * in_channels, temb_channels=0, dropout=0.0),
+            nn.Conv2d(2 * in_channels, in_channels, 1),
+            Upsample(in_channels, with_conv=True)
+        ])
         # end
         self.norm_out = Normalize(in_channels)
-        self.conv_out = torch.nn.Conv2d(in_channels,
-                                        out_channels,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
+        self.conv_out = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
 
     def forward(self, x):
         for i, layer in enumerate(self.model):
-            if i in [1,2,3]:
+            if i in [1, 2, 3]:
                 x = layer(x, None)
             else:
                 x = layer(x)
@@ -694,25 +651,26 @@ def forward(self, x):
 
 
 class UpsampleDecoder(nn.Module):
-    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
-                 ch_mult=(2,2), dropout=0.0):
+
+    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution, ch_mult=(2, 2), dropout=0.0):
         super().__init__()
         # upsampling
         self.temb_ch = 0
         self.num_resolutions = len(ch_mult)
         self.num_res_blocks = num_res_blocks
         block_in = in_channels
-        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        curr_res = resolution // 2**(self.num_resolutions - 1)
         self.res_blocks = nn.ModuleList()
         self.upsample_blocks = nn.ModuleList()
         for i_level in range(self.num_resolutions):
             res_block = []
             block_out = ch * ch_mult[i_level]
             for i_block in range(self.num_res_blocks + 1):
-                res_block.append(ResnetBlock(in_channels=block_in,
-                                         out_channels=block_out,
-                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
+                res_block.append(
+                    ResnetBlock(in_channels=block_in,
+                                out_channels=block_out,
+                                temb_channels=self.temb_ch,
+                                dropout=dropout))
                 block_in = block_out
             self.res_blocks.append(nn.ModuleList(res_block))
             if i_level != self.num_resolutions - 1:
@@ -721,11 +679,7 @@ def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
 
         # end
         self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(block_in,
-                                        out_channels,
-                                        kernel_size=3,
-                                        stride=1,
-                                        padding=1)
+        self.conv_out = torch.nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
 
     def forward(self, x):
         # upsampling
@@ -742,35 +696,35 @@ def forward(self, x):
 
 
 class LatentRescaler(nn.Module):
+
     def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
         super().__init__()
         # residual block, interpolate, residual block
         self.factor = factor
-        self.conv_in = nn.Conv2d(in_channels,
-                                 mid_channels,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1)
-        self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
-                                                     out_channels=mid_channels,
-                                                     temb_channels=0,
-                                                     dropout=0.0) for _ in range(depth)])
+        self.conv_in = nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1)
+        self.res_block1 = nn.ModuleList([
+            ResnetBlock(in_channels=mid_channels, out_channels=mid_channels, temb_channels=0, dropout=0.0)
+            for _ in range(depth)
+        ])
         self.attn = AttnBlock(mid_channels)
-        self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
-                                                     out_channels=mid_channels,
-                                                     temb_channels=0,
-                                                     dropout=0.0) for _ in range(depth)])
-
-        self.conv_out = nn.Conv2d(mid_channels,
-                                  out_channels,
-                                  kernel_size=1,
-                                  )
+        self.res_block2 = nn.ModuleList([
+            ResnetBlock(in_channels=mid_channels, out_channels=mid_channels, temb_channels=0, dropout=0.0)
+            for _ in range(depth)
+        ])
+
+        self.conv_out = nn.Conv2d(
+            mid_channels,
+            out_channels,
+            kernel_size=1,
+        )
 
     def forward(self, x):
         x = self.conv_in(x)
         for block in self.res_block1:
             x = block(x, None)
-        x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor))))
+        x = torch.nn.functional.interpolate(x,
+                                            size=(int(round(x.shape[2] * self.factor)),
+                                                  int(round(x.shape[3] * self.factor))))
         x = self.attn(x)
         for block in self.res_block2:
             x = block(x, None)
@@ -779,17 +733,37 @@ def forward(self, x):
 
 
 class MergedRescaleEncoder(nn.Module):
-    def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
-                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
-                 ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
+
+    def __init__(self,
+                 in_channels,
+                 ch,
+                 resolution,
+                 out_ch,
+                 num_res_blocks,
+                 attn_resolutions,
+                 dropout=0.0,
+                 resamp_with_conv=True,
+                 ch_mult=(1, 2, 4, 8),
+                 rescale_factor=1.0,
+                 rescale_module_depth=1):
         super().__init__()
         intermediate_chn = ch * ch_mult[-1]
-        self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
-                               z_channels=intermediate_chn, double_z=False, resolution=resolution,
-                               attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
+        self.encoder = Encoder(in_channels=in_channels,
+                               num_res_blocks=num_res_blocks,
+                               ch=ch,
+                               ch_mult=ch_mult,
+                               z_channels=intermediate_chn,
+                               double_z=False,
+                               resolution=resolution,
+                               attn_resolutions=attn_resolutions,
+                               dropout=dropout,
+                               resamp_with_conv=resamp_with_conv,
                                out_ch=None)
-        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn,
-                                       mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)
+        self.rescaler = LatentRescaler(factor=rescale_factor,
+                                       in_channels=intermediate_chn,
+                                       mid_channels=intermediate_chn,
+                                       out_channels=out_ch,
+                                       depth=rescale_module_depth)
 
     def forward(self, x):
         x = self.encoder(x)
@@ -798,15 +772,36 @@ def forward(self, x):
 
 
 class MergedRescaleDecoder(nn.Module):
-    def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
-                 dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
+
+    def __init__(self,
+                 z_channels,
+                 out_ch,
+                 resolution,
+                 num_res_blocks,
+                 attn_resolutions,
+                 ch,
+                 ch_mult=(1, 2, 4, 8),
+                 dropout=0.0,
+                 resamp_with_conv=True,
+                 rescale_factor=1.0,
+                 rescale_module_depth=1):
         super().__init__()
-        tmp_chn = z_channels*ch_mult[-1]
-        self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
-                               resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
-                               ch_mult=ch_mult, resolution=resolution, ch=ch)
-        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
-                                       out_channels=tmp_chn, depth=rescale_module_depth)
+        tmp_chn = z_channels * ch_mult[-1]
+        self.decoder = Decoder(out_ch=out_ch,
+                               z_channels=tmp_chn,
+                               attn_resolutions=attn_resolutions,
+                               dropout=dropout,
+                               resamp_with_conv=resamp_with_conv,
+                               in_channels=None,
+                               num_res_blocks=num_res_blocks,
+                               ch_mult=ch_mult,
+                               resolution=resolution,
+                               ch=ch)
+        self.rescaler = LatentRescaler(factor=rescale_factor,
+                                       in_channels=z_channels,
+                                       mid_channels=tmp_chn,
+                                       out_channels=tmp_chn,
+                                       depth=rescale_module_depth)
 
     def forward(self, x):
         x = self.rescaler(x)
@@ -815,16 +810,26 @@ def forward(self, x):
 
 
 class Upsampler(nn.Module):
+
     def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
         super().__init__()
         assert out_size >= in_size
-        num_blocks = int(np.log2(out_size//in_size))+1
-        factor_up = 1.+ (out_size % in_size)
-        rank_zero_info(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
-        self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels,
+        num_blocks = int(np.log2(out_size // in_size)) + 1
+        factor_up = 1. + (out_size % in_size)
+        rank_zero_info(
+            f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}"
+        )
+        self.rescaler = LatentRescaler(factor=factor_up,
+                                       in_channels=in_channels,
+                                       mid_channels=2 * in_channels,
                                        out_channels=in_channels)
-        self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2,
-                               attn_resolutions=[], in_channels=None, ch=in_channels,
+        self.decoder = Decoder(out_ch=out_channels,
+                               resolution=out_size,
+                               z_channels=in_channels,
+                               num_res_blocks=2,
+                               attn_resolutions=[],
+                               in_channels=None,
+                               ch=in_channels,
                                ch_mult=[ch_mult for _ in range(num_blocks)])
 
     def forward(self, x):
@@ -834,23 +839,21 @@ def forward(self, x):
 
 
 class Resize(nn.Module):
+
     def __init__(self, in_channels=None, learned=False, mode="bilinear"):
         super().__init__()
         self.with_conv = learned
         self.mode = mode
         if self.with_conv:
-            rank_zero_info(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
+            rank_zero_info(
+                f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
             raise NotImplementedError()
             assert in_channels is not None
             # no asymmetric padding in torch conv, must do it ourselves
-            self.conv = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=4,
-                                        stride=2,
-                                        padding=1)
+            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=4, stride=2, padding=1)
 
     def forward(self, x, scale_factor=1.0):
-        if scale_factor==1.0:
+        if scale_factor == 1.0:
             return x
         else:
             x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
diff --git a/examples/images/diffusion/main.py b/examples/images/diffusion/main.py
index 87d495123714..5f166aa1f71f 100644
--- a/examples/images/diffusion/main.py
+++ b/examples/images/diffusion/main.py
@@ -106,7 +106,20 @@ def str2bool(v):
         nargs="?",
         help="disable test",
     )
-    parser.add_argument("-p", "--project", help="name of new or path to existing project")
+    parser.add_argument(
+        "-p",
+        "--project",
+        help="name of new or path to existing project",
+    )
+    parser.add_argument(
+        "-c",
+        "--ckpt",
+        type=str,
+        const=True,
+        default="",
+        nargs="?",
+        help="load pretrained checkpoint from stable AI",
+    )
     parser.add_argument(
         "-d",
         "--debug",
@@ -145,22 +158,7 @@ def str2bool(v):
         default=True,
         help="scale base-lr by ngpu * batch_size * n_accumulate",
     )
-    parser.add_argument(
-        "--use_fp16",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=True,
-        help="whether to use fp16",
-    )
-    parser.add_argument(
-        "--flash",
-        type=str2bool,
-        const=True,
-        default=False,
-        nargs="?",
-        help="whether to use flash attention",
-    )
+
     return parser
 
 
@@ -341,6 +339,12 @@ def on_fit_start(self, trainer, pl_module):
                 except FileNotFoundError:
                     pass
 
+    # def on_fit_end(self, trainer, pl_module):
+    #     if trainer.global_rank == 0:
+    #         ckpt_path = os.path.join(self.ckptdir, "last.ckpt")
+    #         rank_zero_info(f"Saving final checkpoint in {ckpt_path}.")
+    #         trainer.save_checkpoint(ckpt_path)
+
 
 class ImageLogger(Callback):
 
@@ -536,6 +540,7 @@ def on_train_epoch_end(self, trainer, pl_module):
                          "If you want to resume training in a new log folder, "
                          "use -n/--name in combination with --resume_from_checkpoint")
     if opt.resume:
+        rank_zero_info("Resuming from {}".format(opt.resume))
         if not os.path.exists(opt.resume):
             raise ValueError("Cannot find {}".format(opt.resume))
         if os.path.isfile(opt.resume):
@@ -543,13 +548,13 @@ def on_train_epoch_end(self, trainer, pl_module):
             # idx = len(paths)-paths[::-1].index("logs")+1
             # logdir = "/".join(paths[:idx])
             logdir = "/".join(paths[:-2])
+            rank_zero_info("logdir: {}".format(logdir))
             ckpt = opt.resume
         else:
             assert os.path.isdir(opt.resume), opt.resume
             logdir = opt.resume.rstrip("/")
             ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
 
-        opt.resume_from_checkpoint = ckpt
         base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml")))
         opt.base = base_configs + opt.base
         _tmp = logdir.split("/")
@@ -558,6 +563,7 @@ def on_train_epoch_end(self, trainer, pl_module):
         if opt.name:
             name = "_" + opt.name
         elif opt.base:
+            rank_zero_info("Using base config {}".format(opt.base))
             cfg_fname = os.path.split(opt.base[0])[-1]
             cfg_name = os.path.splitext(cfg_fname)[0]
             name = "_" + cfg_name
@@ -566,6 +572,9 @@ def on_train_epoch_end(self, trainer, pl_module):
         nowname = now + name + opt.postfix
         logdir = os.path.join(opt.logdir, nowname)
 
+        if opt.ckpt:
+            ckpt = opt.ckpt
+
     ckptdir = os.path.join(logdir, "checkpoints")
     cfgdir = os.path.join(logdir, "configs")
     seed_everything(opt.seed)
@@ -582,14 +591,11 @@ def on_train_epoch_end(self, trainer, pl_module):
         for k in nondefault_trainer_args(opt):
             trainer_config[k] = getattr(opt, k)
 
-        print(trainer_config)
         if not trainer_config["accelerator"] == "gpu":
             del trainer_config["accelerator"]
             cpu = True
-            print("Running on CPU")
         else:
             cpu = False
-            print("Running on GPU")
         trainer_opt = argparse.Namespace(**trainer_config)
         lightning_config.trainer = trainer_config
 
@@ -597,10 +603,12 @@ def on_train_epoch_end(self, trainer, pl_module):
         use_fp16 = trainer_config.get("precision", 32) == 16
         if use_fp16:
             config.model["params"].update({"use_fp16": True})
-            print("Using FP16 = {}".format(config.model["params"]["use_fp16"]))
         else:
             config.model["params"].update({"use_fp16": False})
-            print("Using FP16 = {}".format(config.model["params"]["use_fp16"]))
+
+        if ckpt is not None:
+            config.model["params"].update({"ckpt": ckpt})
+            rank_zero_info("Using ckpt_path = {}".format(config.model["params"]["ckpt"]))
 
         model = instantiate_from_config(config.model)
         # trainer and callbacks
@@ -639,7 +647,6 @@ def on_train_epoch_end(self, trainer, pl_module):
         # config the strategy, defualt is ddp
         if "strategy" in trainer_config:
             strategy_cfg = trainer_config["strategy"]
-            print("Using strategy: {}".format(strategy_cfg["target"]))
             strategy_cfg["target"] = LIGHTNING_PACK_NAME + strategy_cfg["target"]
         else:
             strategy_cfg = {
@@ -648,7 +655,6 @@ def on_train_epoch_end(self, trainer, pl_module):
                     "find_unused_parameters": False
                 }
             }
-            print("Using strategy: DDPStrategy")
 
         trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg)
 
@@ -664,7 +670,6 @@ def on_train_epoch_end(self, trainer, pl_module):
             }
         }
         if hasattr(model, "monitor"):
-            print(f"Monitoring {model.monitor} as checkpoint metric.")
             default_modelckpt_cfg["params"]["monitor"] = model.monitor
             default_modelckpt_cfg["params"]["save_top_k"] = 3
 
@@ -673,7 +678,6 @@ def on_train_epoch_end(self, trainer, pl_module):
         else:
             modelckpt_cfg = OmegaConf.create()
         modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
-        print(f"Merged modelckpt-cfg: \n{modelckpt_cfg}")
         if version.parse(pl.__version__) < version.parse('1.4.0'):
             trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg)
 
@@ -710,8 +714,6 @@ def on_train_epoch_end(self, trainer, pl_module):
                 "target": "main.CUDACallback"
             },
         }
-        if version.parse(pl.__version__) >= version.parse('1.4.0'):
-            default_callbacks_cfg.update({'checkpoint_callback': modelckpt_cfg})
 
         if "callbacks" in lightning_config:
             callbacks_cfg = lightning_config.callbacks
@@ -737,15 +739,11 @@ def on_train_epoch_end(self, trainer, pl_module):
             default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict)
 
         callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
-        if 'ignore_keys_callback' in callbacks_cfg and hasattr(trainer_opt, 'resume_from_checkpoint'):
-            callbacks_cfg.ignore_keys_callback.params['ckpt_path'] = trainer_opt.resume_from_checkpoint
-        elif 'ignore_keys_callback' in callbacks_cfg:
-            del callbacks_cfg['ignore_keys_callback']
 
         trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]
 
         trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
-        trainer.logdir = logdir    ###
+        trainer.logdir = logdir
 
         # data
         data = instantiate_from_config(config.data)
@@ -754,9 +752,9 @@ def on_train_epoch_end(self, trainer, pl_module):
         # lightning still takes care of proper multiprocessing though
         data.prepare_data()
         data.setup()
-        print("#### Data #####")
+
         for k in data.datasets:
-            print(f"{k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}")
+            rank_zero_info(f"{k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}")
 
         # configure learning rate
         bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate
@@ -768,17 +766,17 @@ def on_train_epoch_end(self, trainer, pl_module):
             accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches
         else:
             accumulate_grad_batches = 1
-        print(f"accumulate_grad_batches = {accumulate_grad_batches}")
+        rank_zero_info(f"accumulate_grad_batches = {accumulate_grad_batches}")
         lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches
         if opt.scale_lr:
             model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr
-            print(
+            rank_zero_info(
                 "Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)"
                 .format(model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr))
         else:
             model.learning_rate = base_lr
-            print("++++ NOT USING LR SCALING ++++")
-            print(f"Setting learning rate to {model.learning_rate:.2e}")
+            rank_zero_info("++++ NOT USING LR SCALING ++++")
+            rank_zero_info(f"Setting learning rate to {model.learning_rate:.2e}")
 
         # allow checkpointing via USR1
         def melk(*args, **kwargs):
diff --git a/examples/images/diffusion/scripts/txt2img.sh b/examples/images/diffusion/scripts/txt2img.sh
index 53041cb8df6d..bc6480b6bdaa 100755
--- a/examples/images/diffusion/scripts/txt2img.sh
+++ b/examples/images/diffusion/scripts/txt2img.sh
@@ -1,5 +1,5 @@
-python scripts/txt2img.py --prompt "Teyvat, Name:Layla, Element: Cryo, Weapon:Sword, Region:Sumeru, Model type:Medium Female, Description:a woman in a blue outfit holding a sword" --plms \
+python scripts/txt2img.py --prompt "Teyvat, Medium Female, a woman in a blue outfit holding a sword" --plms \
     --outdir ./output \
-    --ckpt /tmp/2022-11-18T16-38-46_train_colossalai/checkpoints/last.ckpt \
-    --config /tmp/2022-11-18T16-38-46_train_colossalai/configs/2022-11-18T16-38-46-project.yaml  \
+    --ckpt checkpoints/last.ckpt \
+    --config configs/2023-02-02T18-06-14-project.yaml \
     --n_samples 4
diff --git a/examples/images/diffusion/train_colossalai.sh b/examples/images/diffusion/train_colossalai.sh
index dcaeeb0c6595..c56ed7876e5a 100755
--- a/examples/images/diffusion/train_colossalai.sh
+++ b/examples/images/diffusion/train_colossalai.sh
@@ -2,4 +2,4 @@ HF_DATASETS_OFFLINE=1
 TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
-python main.py --logdir /tmp -t -b configs/train_colossalai.yaml
+python main.py --logdir /tmp --train --base configs/Teyvat/train_colossalai_teyvat.yaml --ckpt diffuser_root_dir/512-base-ema.ckpt

From 4f5ef73a43db6fbce125bb29ea9fe8791a9182a5 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Fri, 3 Feb 2023 16:54:28 +0800
Subject: [PATCH 243/503] [tutorial] update fastfold tutorial (#2565)

* update readme

* update

* update
---
 .gitmodules                             |   3 +
 examples/tutorial/fastfold/FastFold     |   1 +
 examples/tutorial/fastfold/README.md    |  34 +++---
 examples/tutorial/fastfold/inference.py | 153 ------------------------
 examples/tutorial/fastfold/test_ci.sh   |  10 --
 5 files changed, 22 insertions(+), 179 deletions(-)
 create mode 160000 examples/tutorial/fastfold/FastFold
 delete mode 100644 examples/tutorial/fastfold/inference.py
 delete mode 100644 examples/tutorial/fastfold/test_ci.sh

diff --git a/.gitmodules b/.gitmodules
index 63387570a548..2f1c34298a50 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,3 +2,6 @@
 	path = inference
 	url = https://github.com/hpcaitech/EnergonAI.git
 	branch = main
+[submodule "examples/tutorial/fastfold/FastFold"]
+	path = examples/tutorial/fastfold/FastFold
+	url = https://github.com/hpcaitech/FastFold
diff --git a/examples/tutorial/fastfold/FastFold b/examples/tutorial/fastfold/FastFold
new file mode 160000
index 000000000000..19ce840650fd
--- /dev/null
+++ b/examples/tutorial/fastfold/FastFold
@@ -0,0 +1 @@
+Subproject commit 19ce840650fd865bd3684684dac051ec3a7bc762
diff --git a/examples/tutorial/fastfold/README.md b/examples/tutorial/fastfold/README.md
index 5c74c737d4b0..0c3df7a07401 100644
--- a/examples/tutorial/fastfold/README.md
+++ b/examples/tutorial/fastfold/README.md
@@ -2,23 +2,21 @@
 
 ## Table of contents
 
-- [Overview](#📚-overview)
-- [Quick Start](#🚀-quick-start)
-- [Dive into FastFold](#🔍-dive-into-fastfold)
+- [FastFold Inference](#fastfold-inference)
+  - [Table of contents](#table-of-contents)
+  - [📚 Overview](#-overview)
+  - [🚀 Quick Start](#-quick-start)
+  - [🔍 Dive into FastFold](#-dive-into-fastfold)
 
 ## 📚 Overview
 
-This example lets you to quickly try out the inference of FastFold.
-
-**NOTE: We use random data and random parameters in this example.**
-
+This example lets you try out the inference of FastFold.
 
 ## 🚀 Quick Start
 
 1. Install FastFold
 
-We highly recommend installing an Anaconda or Miniconda environment and install PyTorch with conda.
-
+We highly recommend installing FastFold with conda.
 ```
 git clone https://github.com/hpcaitech/FastFold
 cd FastFold
@@ -27,15 +25,19 @@ conda activate fastfold
 python setup.py install
 ```
 
-2. Run the inference scripts.
+2. Download datasets.
+
+The datasets may take up ~900GB of disk space.
+```
+./scripts/download_all_data.sh data/
+```
+
+3. Run the inference scripts.
 
-```bash
-python inference.py --gpus=1 --n_res=256 --chunk_size=None --inplace
 ```
-+ `gpus` means the DAP size
-+ `n_res` means the length of residue sequence
-+ `chunk_size` introduces a memory-saving technology at the cost of speed, None means not using, 16 may be a good trade off for long sequences.
-+ `inplace` introduces another memory-saving technology with zero cost, drop `--inplace` if you do not want it.
+bash inference.sh
+```
+You can find predictions under the `outputs` dir.
 
 ## 🔍 Dive into FastFold
 
diff --git a/examples/tutorial/fastfold/inference.py b/examples/tutorial/fastfold/inference.py
deleted file mode 100644
index ccfa78256b19..000000000000
--- a/examples/tutorial/fastfold/inference.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright 2023 HPC-AI Tech Inc.
-# Copyright 2021 AlQuraishi Laboratory
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-import time
-
-import fastfold
-import numpy as np
-import torch
-import torch.multiprocessing as mp
-from fastfold.config import model_config
-from fastfold.data import data_transforms
-from fastfold.model.fastnn import set_chunk_size
-from fastfold.model.hub import AlphaFold
-from fastfold.utils.inject_fastnn import inject_fastnn
-from fastfold.utils.tensor_utils import tensor_tree_map
-
-if int(torch.__version__.split(".")[0]) >= 1 and int(torch.__version__.split(".")[1]) > 11:
-    torch.backends.cuda.matmul.allow_tf32 = True
-
-
-def random_template_feats(n_templ, n):
-    b = []
-    batch = {
-        "template_mask": np.random.randint(0, 2, (*b, n_templ)),
-        "template_pseudo_beta_mask": np.random.randint(0, 2, (*b, n_templ, n)),
-        "template_pseudo_beta": np.random.rand(*b, n_templ, n, 3),
-        "template_aatype": np.random.randint(0, 22, (*b, n_templ, n)),
-        "template_all_atom_mask": np.random.randint(0, 2, (*b, n_templ, n, 37)),
-        "template_all_atom_positions": np.random.rand(*b, n_templ, n, 37, 3) * 10,
-        "template_torsion_angles_sin_cos": np.random.rand(*b, n_templ, n, 7, 2),
-        "template_alt_torsion_angles_sin_cos": np.random.rand(*b, n_templ, n, 7, 2),
-        "template_torsion_angles_mask": np.random.rand(*b, n_templ, n, 7),
-    }
-    batch = {k: v.astype(np.float32) for k, v in batch.items()}
-    batch["template_aatype"] = batch["template_aatype"].astype(np.int64)
-    return batch
-
-
-def random_extra_msa_feats(n_extra, n):
-    b = []
-    batch = {
-        "extra_msa": np.random.randint(0, 22, (*b, n_extra, n)).astype(np.int64),
-        "extra_has_deletion": np.random.randint(0, 2, (*b, n_extra, n)).astype(np.float32),
-        "extra_deletion_value": np.random.rand(*b, n_extra, n).astype(np.float32),
-        "extra_msa_mask": np.random.randint(0, 2, (*b, n_extra, n)).astype(np.float32),
-    }
-    return batch
-
-
-def generate_batch(n_res):
-    batch = {}
-    tf = torch.randint(21, size=(n_res,))
-    batch["target_feat"] = torch.nn.functional.one_hot(tf, 22).float()
-    batch["aatype"] = torch.argmax(batch["target_feat"], dim=-1)
-    batch["residue_index"] = torch.arange(n_res)
-    batch["msa_feat"] = torch.rand((128, n_res, 49))
-    t_feats = random_template_feats(4, n_res)
-    batch.update({k: torch.tensor(v) for k, v in t_feats.items()})
-    extra_feats = random_extra_msa_feats(5120, n_res)
-    batch.update({k: torch.tensor(v) for k, v in extra_feats.items()})
-    batch["msa_mask"] = torch.randint(low=0, high=2, size=(128, n_res)).float()
-    batch["seq_mask"] = torch.randint(low=0, high=2, size=(n_res,)).float()
-    batch.update(data_transforms.make_atom14_masks(batch))
-    batch["no_recycling_iters"] = torch.tensor(2.)
-
-    add_recycling_dims = lambda t: (t.unsqueeze(-1).expand(*t.shape, 3))
-    batch = tensor_tree_map(add_recycling_dims, batch)
-
-    return batch
-
-
-def inference_model(rank, world_size, result_q, batch, args):
-    os.environ['RANK'] = str(rank)
-    os.environ['LOCAL_RANK'] = str(rank)
-    os.environ['WORLD_SIZE'] = str(world_size)
-    # init distributed for Dynamic Axial Parallelism
-    fastfold.distributed.init_dap()
-    torch.cuda.set_device(rank)
-    config = model_config(args.model_name)
-    if args.chunk_size:
-        config.globals.chunk_size = args.chunk_size
-
-    config.globals.inplace = args.inplace
-    config.globals.is_multimer = False
-    model = AlphaFold(config)
-
-    model = inject_fastnn(model)
-    model = model.eval()
-    model = model.cuda()
-
-    set_chunk_size(model.globals.chunk_size)
-
-    with torch.no_grad():
-        batch = {k: torch.as_tensor(v).cuda() for k, v in batch.items()}
-        t = time.perf_counter()
-        out = model(batch)
-        print(f"Inference time: {time.perf_counter() - t}")
-    out = tensor_tree_map(lambda x: np.array(x.cpu()), out)
-
-    result_q.put(out)
-
-    torch.distributed.barrier()
-    torch.cuda.synchronize()
-
-
-def inference_monomer_model(args):
-    batch = generate_batch(args.n_res)
-    manager = mp.Manager()
-    result_q = manager.Queue()
-    torch.multiprocessing.spawn(inference_model, nprocs=args.gpus, args=(args.gpus, result_q, batch, args))
-    out = result_q.get()
-
-    # get unrelexed pdb and save
-    # batch = tensor_tree_map(lambda x: np.array(x[..., -1].cpu()), batch)
-    # plddt = out["plddt"]
-    # plddt_b_factors = np.repeat(plddt[..., None], residue_constants.atom_type_num, axis=-1)
-    # unrelaxed_protein = protein.from_prediction(features=batch,
-    #                                             result=out,
-    #                                             b_factors=plddt_b_factors)
-    # with open('demo_unrelex.pdb', 'w+') as fp:
-    #     fp.write(unrelaxed_protein)
-
-
-def main(args):
-    inference_monomer_model(args)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--gpus", type=int, default=1, help="""Number of GPUs with which to run inference""")
-    parser.add_argument("--n_res", type=int, default=50, help="virtual residue number of random data")
-    parser.add_argument("--model_name", type=str, default="model_1", help="model name of alphafold")
-    parser.add_argument('--chunk_size', type=int, default=None)
-    parser.add_argument('--inplace', default=False, action='store_true')
-
-    args = parser.parse_args()
-
-    main(args)
diff --git a/examples/tutorial/fastfold/test_ci.sh b/examples/tutorial/fastfold/test_ci.sh
deleted file mode 100644
index ef9ab919e307..000000000000
--- a/examples/tutorial/fastfold/test_ci.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-set -euxo pipefail
-
-git clone https://github.com/hpcaitech/FastFold
-cd FastFold
-pip install -r requirements/requirements.txt
-python setup.py install
-pip install -r requirements/test_requirements.txt
-cd ..
-
-python inference.py

From 7b4ad6e0fcadb88d342c32c93c8a577361731469 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 3 Feb 2023 17:12:35 +0800
Subject: [PATCH 244/503] [workflow] added contributor and user-engagement
 report (#2564)

* [workflow] added contributor and user-engagement report

* polish code

* polish code
---
 .../workflows/report_leaderboard_to_lark.yml  |  29 +++
 .../generate_leaderboard_and_send_to_lark.py  | 176 ++++++++++++++++++
 2 files changed, 205 insertions(+)
 create mode 100644 .github/workflows/report_leaderboard_to_lark.yml
 create mode 100644 .github/workflows/scripts/generate_leaderboard_and_send_to_lark.py

diff --git a/.github/workflows/report_leaderboard_to_lark.yml b/.github/workflows/report_leaderboard_to_lark.yml
new file mode 100644
index 000000000000..f51847a39521
--- /dev/null
+++ b/.github/workflows/report_leaderboard_to_lark.yml
@@ -0,0 +1,29 @@
+name: Publish Nightly Version to PyPI
+
+on:
+  workflow_dispatch:
+  schedule:
+    # release on every Friday 09:00 UTC time, 17:00 Beijing/Singapore time
+    - cron:  '0 9 * * 5'
+
+jobs:
+  generate-and-publish:
+    if: github.repository == 'hpcaitech/ColossalAI'
+    name: Generate leaderboard report and publish to Lark
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+    - uses: actions/checkout@v2
+
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.8.14'
+
+    - run: pip install requests matplotlib seaborn requests_toolbelt pytz
+
+    - run: python .github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
+      env:
+        LARK_APP_ID: ${{ secrets.LARK_LEADERBOARD_APP_ID }}
+        APP_SECRET: ${{ secrets.LARK_LEADERBOARD_APP_SECRET }}
+        LARK_WEBHOOK_URL: ${{ secrets.LARK_LEADERBOARD_WEBHOOK_URL }}
+        GITHUB_TOKEN: ${{ github.token }}
diff --git a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
new file mode 100644
index 000000000000..3dee161036ff
--- /dev/null
+++ b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
@@ -0,0 +1,176 @@
+import os
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+
+import matplotlib.pyplot as plt
+import pytz
+import requests
+import seaborn
+from requests_toolbelt import MultipartEncoder
+
+
+@dataclass
+class Contributor:
+    name: str
+    num_commits_this_week: int
+
+
+def generate_user_engagement_leaderboard_image(github_token, output_path):
+    # request to the Github API to get the users who have replied the most in the last 7 days
+    now = datetime.utcnow()
+    start_datetime = now - timedelta(days=7)
+    start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # prepare header
+    headers = {
+        'Authorization': f'Bearer {github_token}',
+        'Accept': 'application/vnd.github+json',
+        'X-GitHub-Api-Version': '2022-11-28'
+    }
+
+    user_engagement_count = {}
+
+    # do pagination to the API
+    page = 1
+    while True:
+        comment_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/comments?since={start_datetime_str}&page={page}'
+        comment_response = requests.get(comment_api, headers=headers).json()
+
+        if len(comment_response) == 0:
+            break
+        else:
+            for item in comment_response:
+                comment_author_relationship = item['author_association']
+                if comment_author_relationship != 'MEMBER':
+                    # if the comment is not made by our member
+                    # we don't count this comment towards user engagement
+                    continue
+
+                issue_id = item['issue_url'].split('/')[-1]
+                issue_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/{issue_id}'
+                issue_response = requests.get(issue_api, headers=headers).json()
+                issue_author_relationship = issue_response['author_association']
+
+                if issue_author_relationship != 'MEMBER':
+                    # this means that the issue/PR is not created by our own people
+                    # any comments in this issue/PR by our member will be counted towards the leaderboard
+                    member_name = item['user']['login']
+
+                    if member_name in user_engagement_count:
+                        user_engagement_count[member_name] += 1
+                    else:
+                        user_engagement_count[member_name] = 1
+            page += 1
+
+    # plot the leaderboard
+    x = []
+    y = []
+
+    for name, count in user_engagement_count.items():
+        x.append(count)
+        y.append(name)
+    xticks = [str(v) for v in range(1, max(x) + 1)]
+    seaborn.color_palette()
+    fig = seaborn.barplot(x=x, y=y)
+    fig.set(xlabel=f"Number of Comments made (since {start_datetime})",
+            ylabel="Member",
+            title='Active User Engagement Leaderboard')
+    seaborn.despine()
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=1200)
+
+
+def generate_contributor_leaderboard_image(github_token, output_path):
+    URL = 'https://api.github.com/repos/hpcaitech/ColossalAI/stats/contributors'
+    headers = {
+        'Authorization': f'Bearer {github_token}',
+        'Accept': 'application/vnd.github+json',
+        'X-GitHub-Api-Version': '2022-11-28'
+    }
+    response = requests.get(URL, headers=headers).json()
+
+    contributor_list = []
+
+    # convert unix timestamp to Beijing datetime
+    start_timestamp = response[0]['weeks'][-1]['w']
+    start_datetime = datetime.fromtimestamp(start_timestamp, tz=pytz.timezone('Asia/Shanghai'))
+
+    # get number of commits for each contributor
+    for item in response:
+        num_commits_this_week = item['weeks'][-1]['c']
+        name = item['author']['login']
+        contributor = Contributor(name=name, num_commits_this_week=num_commits_this_week)
+        contributor_list.append(contributor)
+
+    # sort by number of commits
+    contributor_list.sort(key=lambda x: x.num_commits_this_week, reverse=True)
+
+    # remove contributors who has zero commits
+    contributor_list = [x for x in contributor_list if x.num_commits_this_week > 0]
+
+    # plot
+    seaborn.color_palette()
+    x = [x.num_commits_this_week for x in contributor_list]
+    y = [x.name for x in contributor_list]
+    fig = seaborn.barplot(x=x, y=y)
+    fig.set(xlabel=f"Number of Commits (since {start_datetime})",
+            ylabel="Contributor",
+            title='Active Contributor Leaderboard')
+    seaborn.despine()
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=1200)
+
+
+def upload_image_to_lark(lark_tenant_token, image_path):
+    url = "https://open.feishu.cn/open-apis/im/v1/images"
+    form = {'image_type': 'message', 'image': (open(image_path, 'rb'))}    # 需要替换具体的path
+    multi_form = MultipartEncoder(form)
+    headers = {
+        'Authorization': f'Bearer {lark_tenant_token}',    ## 获取tenant_access_token, 需要替换为实际的token
+    }
+    headers['Content-Type'] = multi_form.content_type
+    response = requests.request("POST", url, headers=headers, data=multi_form).json()
+    return response['data']['image_key']
+
+
+def generate_lark_tenant_access_token(app_id, app_secret):
+    url = 'https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal'
+    data = {'app_id': app_id, 'app_secret': app_secret}
+    response = requests.post(url, json=data).json()
+    return response['tenant_access_token']
+
+
+def send_image_to_lark(image_key, webhook_url):
+    data = {"msg_type": "image", "content": {"image_key": image_key}}
+    requests.post(webhook_url, json=data)
+
+
+def send_message_to_lark(message, webhook_url):
+    data = {"msg_type": "text", "content": {"text": message}}
+    requests.post(webhook_url, json=data)
+
+
+if __name__ == '__main__':
+    GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
+    CONTRIBUTOR_IMAGE_PATH = 'contributor_leaderboard.png'
+    USER_ENGAGEMENT_IMAGE_PATH = 'engagement_leaderboard.png'
+
+    # generate images
+    # generate_contributor_leaderboard_image(GITHUB_TOKEN, CONTRIBUTOR_IMAGE_PATH)
+    generate_user_engagement_leaderboard_image(GITHUB_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
+
+    # upload images
+    APP_ID = os.environ['LARK_APP_ID']
+    APP_SECRET = os.environ['LARK_APP_SECRET']
+    LARK_TENANT_TOKEN = generate_lark_tenant_access_token(app_id=APP_ID, app_secret=APP_SECRET)
+    contributor_image_key = upload_image_to_lark(LARK_TENANT_TOKEN, CONTRIBUTOR_IMAGE_PATH)
+    user_engagement_image_key = upload_image_to_lark(LARK_TENANT_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
+
+    # send contributor image to lark
+    LARK_WEBHOOK_URL = os.environ['LARK_WEBHOOK_URL']
+    send_message_to_lark("本周的开发者贡献榜单出炉啦！", LARK_WEBHOOK_URL)
+    send_image_to_lark(contributor_image_key, LARK_WEBHOOK_URL)
+
+    # send user engagement image to lark
+    send_message_to_lark("本周的开源社区互动榜单出炉啦！", LARK_WEBHOOK_URL)
+    send_image_to_lark(user_engagement_image_key, LARK_WEBHOOK_URL)

From 2eb4268b4722b16defd78554960b1676255ec55f Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 3 Feb 2023 17:25:56 +0800
Subject: [PATCH 245/503] [workflow] fixed typos in the leaderboard workflow
 (#2567)

---
 .github/workflows/report_leaderboard_to_lark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/report_leaderboard_to_lark.yml b/.github/workflows/report_leaderboard_to_lark.yml
index f51847a39521..60c3ad2a6781 100644
--- a/.github/workflows/report_leaderboard_to_lark.yml
+++ b/.github/workflows/report_leaderboard_to_lark.yml
@@ -1,4 +1,4 @@
-name: Publish Nightly Version to PyPI
+name: Generate Community Report and Send to Lark
 
 on:
   workflow_dispatch:
@@ -24,6 +24,6 @@ jobs:
     - run: python .github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
       env:
         LARK_APP_ID: ${{ secrets.LARK_LEADERBOARD_APP_ID }}
-        APP_SECRET: ${{ secrets.LARK_LEADERBOARD_APP_SECRET }}
+        LARK_APP_SECRET: ${{ secrets.LARK_LEADERBOARD_APP_SECRET }}
         LARK_WEBHOOK_URL: ${{ secrets.LARK_LEADERBOARD_WEBHOOK_URL }}
         GITHUB_TOKEN: ${{ github.token }}

From 039b0c487bb7d380201d5d6aa6aa63e5d58c329b Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Sat, 4 Feb 2023 17:49:52 +0800
Subject: [PATCH 246/503] [tutorial] polish README (#2568)

---
 README-zh-Hans.md                    |   3 +-
 README.md                            |   4 +-
 examples/tutorial/README.md          | 122 ++-------------------------
 examples/tutorial/fastfold/README.md |   4 +-
 4 files changed, 12 insertions(+), 121 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 9931d434f50c..b4a73e639889 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -109,8 +109,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
   - 基于参数文件的并行化
 - 推理
   - [Energon-AI](https://github.com/hpcaitech/EnergonAI)
-- Colossal-AI 成功案例
-  - 生物医药: [FastFold](https://github.com/hpcaitech/FastFold) 加速蛋白质结构预测 AlphaFold 训练与推理
+
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 
 ## 并行训练样例展示
diff --git a/README.md b/README.md
index 5f230e627efe..d10184a6e15b 100644
--- a/README.md
+++ b/README.md
@@ -114,9 +114,7 @@ distributed training and inference in a few lines.
 
 - Inference
   - [Energon-AI](https://github.com/hpcaitech/EnergonAI)
-
-- Colossal-AI in the Real World
-  - Biomedicine: [FastFold](https://github.com/hpcaitech/FastFold) accelerates training and inference of AlphaFold protein structure
+  
 <p align="right">(<a href="#top">back to top</a>)</p>
 
 ## Parallel Training Demo
diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
index 9c61e41cd146..633e2f5a7c96 100644
--- a/examples/tutorial/README.md
+++ b/examples/tutorial/README.md
@@ -15,30 +15,18 @@ quickly deploy large AI model training and inference, reducing large AI model tr
 [**Colossal-AI**](https://github.com/hpcaitech/ColossalAI) |
 [**Paper**](https://arxiv.org/abs/2110.14883) |
 [**Documentation**](https://www.colossalai.org/) |
-[**Forum**](https://github.com/hpcaitech/ColossalAI/discussions) |
+[**Issue**](https://github.com/hpcaitech/ColossalAI/issues/new/choose) |
 [**Slack**](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w)
 
 ## Table of Content
 
- - Multi-dimensional Parallelism
-   - Know the components and sketch of Colossal-AI
-   - Step-by-step from PyTorch to Colossal-AI
-   - Try data/pipeline parallelism and 1D/2D/2.5D/3D tensor parallelism using a unified model
- - Sequence Parallelism
-   - Try sequence parallelism with BERT
-   - Combination of data/pipeline/sequence parallelism
-   - Faster training and longer sequence length
- - Large Batch Training Optimization
-   - Comparison of small/large batch size with SGD/LARS optimizer
-   - Acceleration from a larger batch size
- - Auto-Parallelism
-   - Parallelism with normal non-distributed training code
-   - Model tracing + solution solving + runtime communication inserting all in one auto-parallelism system
-   - Try single program, multiple data (SPMD) parallel with auto-parallelism SPMD solver on ResNet50
- - Fine-tuning and Serving for OPT
-   - Try pre-trained OPT model weights with Colossal-AI
-   - Fine-tuning OPT with limited hardware using ZeRO, Gemini and parallelism
-   - Deploy the fine-tuned model to inference service
+ - Multi-dimensional Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/hybrid_parallel)
+ - Sequence Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/sequence_parallel)
+ - Large Batch Training Optimization [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/large_batch_optimizer)
+ - Automatic Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/auto_parallel)
+ - Fine-tuning and Inference for OPT [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/opt)
+ - Optimized AlphaFold [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/fastfold)
+ - Optimized Stable Diffusion [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion)
 
 
 ## Discussion
@@ -71,97 +59,3 @@ Then clone the Colossal-AI repository from GitHub.
 git clone https://github.com/hpcaitech/ColossalAI.git
 cd ColossalAI/examples/tutorial
 ```
-
-## 🔥 Multi-dimensional Hybrid Parallel with Vision Transformer
-1. Go to **hybrid_parallel** folder in the **tutorial** directory.
-2. Install our model zoo.
-```bash
-pip install titans
-```
-3. Run with synthetic data which is of similar shape to CIFAR10 with the `-s` flag.
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py -s
-```
-
-4. Modify the config file to play with different types of tensor parallelism, for example, change tensor parallel size to be 4 and mode to be 2d and run on 8 GPUs.
-
-## ☀️ Sequence Parallel with BERT
-1. Go to the **sequence_parallel** folder in the **tutorial** directory.
-2. Run with the following command
-```bash
-export PYTHONPATH=$PWD
-colossalai run --nproc_per_node 4 train.py -s
-```
-3. The default config is sequence parallel size = 2, pipeline size = 1, let’s change pipeline size to be 2 and try it again.
-
-## 📕 Large batch optimization with LARS and LAMB
-1. Go to the **large_batch_optimizer**  folder in the **tutorial** directory.
-2. Run with synthetic data
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py -s
-```
-
-## 😀 Auto-Parallel Tutorial
-1. Go to the  **auto_parallel** folder in the **tutorial** directory.
-2. Install `pulp` and `coin-or-cbc` for the solver.
-```bash
-pip install pulp
-conda install -c conda-forge coin-or-cbc
-```
-2. Run the auto parallel resnet example with 4 GPUs with synthetic dataset.
-```bash
-colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py -s
-```
-
-You should expect to the log like this. This log shows the edge cost on the computation graph as well as the sharding strategy for an operation. For example, `layer1_0_conv1 S01R = S01R X RR` means that the first dimension (batch) of the input and output is sharded while the weight is not sharded (S means sharded, R means replicated), simply equivalent to data parallel training.
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-parallel%20demo.png)
-
-## 🎆 Auto-Checkpoint Tutorial
-1. Stay in the `auto_parallel` folder.
-2. Install the dependencies.
-```bash
-pip install matplotlib transformers
-```
-3. Run a simple resnet50 benchmark to automatically checkpoint the model.
-```bash
-python auto_ckpt_solver_test.py --model resnet50
-```
-
-You should expect the log to be like this
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-ckpt%20demo.png)
-
-This shows that given different memory budgets, the model is automatically injected with activation checkpoint and its time taken per iteration. You can run this benchmark for GPT as well but it can much longer since the model is larger.
-```bash
-python auto_ckpt_solver_test.py --model gpt2
-```
-
-4. Run a simple benchmark to find the optimal batch size for checkpointed model.
-```bash
-python auto_ckpt_batchsize_test.py
-```
-
-You can expect the log to be like
-![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-ckpt%20batchsize.png)
-
-## 🚀 Run OPT finetuning and inference
-1. Install the dependency
-```bash
-pip install datasets accelerate
-```
-2. Run finetuning with synthetic datasets with one GPU
-```bash
-bash ./run_clm_synthetic.sh
-```
-3. Run finetuning with 4 GPUs
-```bash
-bash ./run_clm_synthetic.sh 16 0 125m 4
-```
-4. Run inference with OPT 125M
-```bash
-docker hpcaitech/tutorial:opt-inference
-docker run -it --rm --gpus all --ipc host -p 7070:7070 hpcaitech/tutorial:opt-inference
-```
-5. Start the http server inside the docker container with tensor parallel size 2
-```bash
-python opt_fastapi.py opt-125m --tp 2 --checkpoint /data/opt-125m
-```
diff --git a/examples/tutorial/fastfold/README.md b/examples/tutorial/fastfold/README.md
index 0c3df7a07401..434d033b9792 100644
--- a/examples/tutorial/fastfold/README.md
+++ b/examples/tutorial/fastfold/README.md
@@ -10,7 +10,7 @@
 
 ## 📚 Overview
 
-This example lets you to try out the inference of FastFold.
+This example lets you to try out the inference of [FastFold](https://github.com/hpcaitech/FastFold).
 
 ## 🚀 Quick Start
 
@@ -41,7 +41,7 @@ You can find predictions under the `outputs` dir.
 
 ## 🔍 Dive into FastFold
 
-There are another features of FastFold, such as:
+There are another features of [FastFold](https://github.com/hpcaitech/FastFold), such as:
 + more excellent kernel based on triton
 + much faster data processing based on ray
 + training supported

From fb1a4c0d96a150e47876e5f2ef6fc649f4589f9b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 10:29:24 +0800
Subject: [PATCH 247/503] [doc] fixed issue link in pr template (#2577)

---
 .github/pull_request_template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index f3431226ecc9..0dc1262b008c 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -9,7 +9,7 @@
 
 > Link this PR to your issue with words like fixed to automatically close the linked issue upon merge
 >
-> e.g. fixed #1234, closed #1234, resolved #1234
+> e.g. `fixed #1234`, `closed #1234`, `resolved #1234`
 
 
From ba47517342ca8ae98986878ead3decb00c28a37a Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 13:46:52 +0800
Subject: [PATCH 248/503] [workflow] fixed example check workflow (#2554)

* [workflow] fixed example check workflow

* polish yaml
---
 .github/workflows/README.md                   | 107 ++++++++++--------
 .../workflows/example_check_on_dispatch.yml   |  64 +++++++++++
 .github/workflows/example_check_on_pr.yml     |  91 +++++++++++++++
 .../workflows/example_check_on_schedule.yml   |  57 ++++++++++
 4 files changed, 269 insertions(+), 50 deletions(-)
 create mode 100644 .github/workflows/example_check_on_dispatch.yml
 create mode 100644 .github/workflows/example_check_on_pr.yml
 create mode 100644 .github/workflows/example_check_on_schedule.yml

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index cda6a3139a1b..980f7b5701ce 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -6,13 +6,14 @@
   - [Table of Contents](#table-of-contents)
   - [Overview](#overview)
   - [Workflows](#workflows)
-    - [Checks on Pull Requests](#checks-on-pull-requests)
-    - [Regular Checks](#regular-checks)
+    - [Code Style Check](#code-style-check)
+    - [Unit Test](#unit-test)
+    - [Example Test](#example-test)
+      - [Dispatch Example Test](#dispatch-example-test)
+    - [Compatibility Test](#compatibility-test)
+      - [Compatibility Test](#compatibility-test-1)
     - [Release](#release)
-    - [Manual Dispatch](#manual-dispatch)
       - [Release bdist wheel](#release-bdist-wheel)
-      - [Dispatch Example Test](#dispatch-example-test)
-      - [Compatibility Test](#compatibility-test)
     - [User Friendliness](#user-friendliness)
   - [Configuration](#configuration)
   - [Progress Log](#progress-log)
@@ -26,25 +27,54 @@ In the section below, we will dive into the details of different workflows avail
 
 ## Workflows
 
-### Checks on Pull Requests
+Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow.
+I will provide the details of each workflow below.
+
+### Code Style Check
+
+| Workflow Name               | File name                      | Description                                                                                                |
+| --------------------------- | ------------------------------ | ---------------------------------------------------------------------------------------------------------- |
+| `Pre-commit`                | `pre_commit.yml`               | This workflow runs pre-commit checks for code style consistency for PRs.                                   |
+| `Report pre-commit failure` | `report_precommit_failure.yml` | This PR will put up a comment in the PR to explain the precommit failure and remedy if `Pre-commit` fails. |
+
+### Unit Test
+
+| Workflow Name          | File name                  | Description                                                                                                                                       |
+| ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Build`                | `build.yml`                | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
+| `Build on 8 GPUs`      | `build_gpu_8.yml`          | This workflow will run the unit tests everyday with 8 GPUs.                                                                                       |
+| `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results when `Build` is done.                                                           |
+
+### Example Test
+
+| Workflow Name              | File name                       | Description                                                                 |
+| -------------------------- | ------------------------------- | --------------------------------------------------------------------------- |
+| `Test example on PR`       | `example_check_on_pr.yml`       | The example will be automatically tested if its files are changed in the PR |
+| `Test example on Schedule` | `example_check_on_schedule.yml` | This workflow will test all examples every Sunday                           |
+| `Example Test on Dispatch` | `example_check_on_dispatch.yml` | Manually test a specified example.                                          |
+
+#### Dispatch Example Test
+
+parameters:
+- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
+
+### Compatibility Test
 
-| Workflow Name               | File name                      | Description                                                                                                                                       |
-| --------------------------- | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Build`                     | `build.yml`                    | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
-| `Pre-commit`                | `pre_commit.yml`               | This workflow runs pre-commit checks for code style consistency.                                                                                  |
-| `Report pre-commit failure` | `report_precommit_failure.yml` | This PR will put up a comment in the PR to explain the precommit failure and remedy. This is executed when `Pre-commit` is done                   |
-| `Report test coverage`      | `report_test_coverage.yml`     | This PR will put up a comment to report the test coverage results. This is executed when `Build` is completed.                                    |
-| `Test example`              | `auto_example_check.yml`       | The example will be automatically tested if its files are changed in the PR                                                                       |
+| Workflow Name                | File name                        | Description                                                                                                                   |
+| ---------------------------- | -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `Compatibility Test`         | `auto_compatibility_test.yml`    | This workflow will check the compatiblity of Colossal-AI against PyTorch and CUDA specified in `.compatibility` every Sunday. |
+| `Auto Compatibility Test`    | `auto_compatibility_test.yml`    | Check Colossal-AI's compatiblity when `version.txt` is changed in a PR.                                                       |
+| `Dispatch Compatiblity Test` | `dispatch_compatiblity_test.yml` | Test PyTorch and Python Compatibility.                                                                                        |
 
-### Regular Checks
 
-| Workflow Name           | File name                     | Description                                                                                                                                                      |
-| ----------------------- | ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Test example`          | `auto_example_check.yml`      | This workflow will test all examples every Sunday                                                                                                                |
-| `Compatibility Test`    | `auto_compatibility_test.yml` | This workflow will check the compatiblity of Colossal-AI against PyTorch and CUDA every Sunday. The PyTorch and CUDA versions are specified in `.compatibility`. |
-| `Build on 8 GPUs`       | `build_gpu_8.yml`             | This workflow will run the unit tests everyday with 8 GPUs.                                                                                                      |
-| `Synchronize submodule` | `submodule.yml`               | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers.                                           |
-| `Close inactive issues` | `close_inactive.yml`          | This workflow will close issues which are stale for 14 days.                                                                                                     |
+#### Compatibility Test
+
+Parameters:
+- `torch version`:torch version to test against, multiple versions are supported but must be separated by comma. The default is value is all, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels).
+- `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
+
+> It only test the compatiblity of the main branch
+
 
 ### Release
 
@@ -56,18 +86,8 @@ In the section below, we will dive into the details of different workflows avail
 | `Release Docker`            | `release_docker.yml`            | Build and release the Docker image to DockerHub. Triggered when the change of `version.txt` is merged.                                                      |
 | `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section.                                           |
 | `Auto Release bdist wheel`  | `auto_release_bdist.yml`        | Build binary wheels with pre-built PyTorch extensions.Triggered when the change of `version.txt` is merged. Build specificatons are stored in `.bdist.json` |
-| `Auto Compatibility Test`   | `auto_compatibility_test.yml`   | Check Colossal-AI's compatiblity against the PyTorch and CUDA version specified in `.compatibility`. Triggered when `version.txt` is changed in a PR.       |
-
-### Manual Dispatch
+| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions.                                                                                                      |
 
-| Workflow Name                | File name                        | Description                                            |
-| ---------------------------- | -------------------------------- | ------------------------------------------------------ |
-| `Release bdist wheel`        | `release_bdist.yml`              | Build binary wheels with pre-built PyTorch extensions. |
-| `Dispatch Example Test`      | `dispatch_example_check.yml`     | Manually test a specified example.                     |
-| `Dispatch Compatiblity Test` | `dispatch_compatiblity_test.yml` | Test PyTorch and Python Compatibility.                 |
-
-Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow.
-I will provide the details of each workflow below.
 
 #### Release bdist wheel
 
@@ -76,26 +96,13 @@ Parameters:
 - `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
 - `ref`: input the branch or tag name to build the wheel for this ref.
 
-#### Dispatch Example Test
-
-parameters:
-- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
-
-
-#### Compatibility Test
-
-Parameters:
-- `torch version`:torch version to test against, multiple versions are supported but must be separated by comma. The default is value is all, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels).
-- `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
-
-> It only test the compatiblity of the main branch
-
-
 ### User Friendliness
 
-| Workflow Name     | File name               | Description                                                                                                                            |
-| ----------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
-| `issue-translate` | `translate_comment.yml` | This workflow is triggered when a new issue comment is created. The comment will be translated into English if not written in English. |
+| Workflow Name           | File name               | Description                                                                                                                            |
+| ----------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
+| `issue-translate`       | `translate_comment.yml` | This workflow is triggered when a new issue comment is created. The comment will be translated into English if not written in English. |
+| `Synchronize submodule` | `submodule.yml`         | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers.                 |
+| `Close inactive issues` | `close_inactive.yml`    | This workflow will close issues which are stale for 14 days.                                                                           |
 
 
 ## Configuration
diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml
new file mode 100644
index 000000000000..620d4771af55
--- /dev/null
+++ b/.github/workflows/example_check_on_dispatch.yml
@@ -0,0 +1,64 @@
+name: Test Example on Dispatch
+on:
+  workflow_dispatch:
+    inputs:
+      example_directory:
+        type: string
+        description: example directory, separated by space. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
+        required: true
+
+jobs:
+  matrix_preparation:
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
+    name: Check the examples user want
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+    - name: 📚 Checkout
+      uses: actions/checkout@v3
+    - name: Set up matrix
+      id: set-matrix
+      env:
+        check_dir: ${{ inputs.example_directory }}
+      run: |
+        res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir`
+        if [ res == "failure" ];then
+          exit -1
+        fi
+        dirs="[${check_dir}]"
+        echo "Testing examples in $dirs"
+        echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
+
+  test_example:
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
+    name: Manually check example files
+    needs: manual_check_matrix_preparation
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/examples-data:/data/
+    timeout-minutes: 10
+    steps:
+      - name: 📚 Checkout
+        uses: actions/checkout@v3
+      - name: Install Colossal-AI
+        run: |
+          pip install -v .
+      - name: Test the example
+        run: |
+          dir=${{ matrix.directory }}
+          echo "Testing ${dir} now"
+          cd "${PWD}/examples/${dir}"
+          bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
new file mode 100644
index 000000000000..ebc2a277c1de
--- /dev/null
+++ b/.github/workflows/example_check_on_pr.yml
@@ -0,0 +1,91 @@
+name: Test Example on PR
+on:
+  pull_request:
+    # any change in the examples folder will trigger check for the corresponding example.
+    paths:
+      - 'examples/**'
+
+jobs:
+  # This job detects the changed example files and outputs a matrix containing all the corresponding directory names.
+  detect-changed-example:
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.setup-matrix.outputs.matrix }}
+      anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
+    name: Detect changed example files
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Locate base commit
+        id: locate-base-sha
+        run: |
+            curBranch=$(git rev-parse --abbrev-ref HEAD)
+            commonCommit=$(git merge-base origin/main $curBranch)
+            echo $commonCommit
+            echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
+
+      - name: Get all changed example files
+        id: changed-files
+        uses: tj-actions/changed-files@v35
+        with:
+          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
+
+      - name: setup matrix
+        id: setup-matrix
+        run: |
+          changedFileName=""
+          for file in ${{ steps.changed-files.outputs.all_changed_files  }}; do
+            changedFileName="${file}:${changedFileName}"
+          done
+          echo "$changedFileName was changed"
+          res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
+          echo "All changed examples are $res"
+
+          if [ "$res" = "[]" ]; then
+            echo "anyChanged=false" >> $GITHUB_OUTPUT
+            echo "matrix=null" >> $GITHUB_OUTPUT
+          else
+            dirs=$( IFS=',' ; echo "${res[*]}" )
+            echo "anyChanged=true" >> $GITHUB_OUTPUT
+            echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
+          fi
+
+  # If no file is changed, this job would prompt an error showing that the matrix does not have a value.
+  check-changed-example:
+    # Add this condition to avoid executing this job if the trigger event is workflow_dispatch.
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' &&
+        needs.detect-changed-example.outputs.anyChanged == 'true'
+    name: Test the changed example
+    needs: detect-changed-example
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/examples-data:/data/
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install Colossal-AI
+        run: |
+          pip install -v .
+
+      - name: Test the example
+        run: |
+          example_dir=${{ matrix.directory }}
+          cd "${PWD}/examples/${example_dir}"
+          bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml
new file mode 100644
index 000000000000..07424ecbede2
--- /dev/null
+++ b/.github/workflows/example_check_on_schedule.yml
@@ -0,0 +1,57 @@
+name: Test Example on Schedule
+on:
+  # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
+  schedule:
+    - cron:  '0 16 * * 6'
+
+jobs:
+  # This is for all files' weekly check. Specifically, this job is to find all the directories.
+  matrix_preparation:
+    if: |
+        github.repository == 'hpcaitech/ColossalAI' &&
+        github.event_name == 'schedule'
+    name: Prepare matrix for weekly check
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.setup-matrix.outputs.matrix }}
+    steps:
+    - name: 📚 Checkout
+      uses: actions/checkout@v3
+
+    - name: setup matrix
+      id: setup-matrix
+      run: |
+        res=`python .github/workflows/scripts/example_checks/check_example_weekly.py`
+        all_loc=$( IFS=',' ; echo "${res[*]}" )
+        echo "Found the examples: $all_loc"
+        echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT
+
+  weekly_check:
+    if: |
+        github.repository == 'hpcaitech/ColossalAI' &&
+        github.event_name == 'schedule'
+    name: Weekly check all examples
+    needs: matrix_preparation
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+    timeout-minutes: 10
+    steps:
+      - name: 📚 Checkout
+        uses: actions/checkout@v3
+
+      - name: Install Colossal-AI
+        run: |
+          pip install -v .
+
+      - name: Traverse all files
+        run: |
+          example_dir=${{ matrix.diretory }}
+          echo "Testing ${example_dir} now"
+          cd "${PWD}/examples/${example_dir}"
+          bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1

From 76edb04b0d79860358cf844fbcbecc81a84c14ed Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 13:47:25 +0800
Subject: [PATCH 249/503] [workflow] adjust the GPU memory threshold for
 scheduled unit test (#2558)

* [workflow] adjust the GPU memory threshold for scheduled unit test

* polish code
---
 .github/workflows/{build.yml => build_on_pr.yml}             | 2 +-
 .github/workflows/{build_gpu_8.yml => build_on_schedule.yml} | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename .github/workflows/{build.yml => build_on_pr.yml} (99%)
 rename .github/workflows/{build_gpu_8.yml => build_on_schedule.yml} (94%)

diff --git a/.github/workflows/build.yml b/.github/workflows/build_on_pr.yml
similarity index 99%
rename from .github/workflows/build.yml
rename to .github/workflows/build_on_pr.yml
index 3c163e774a5c..82b671acea93 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -1,4 +1,4 @@
-name: Build
+name: Build on PR
 
 on:
   pull_request:
diff --git a/.github/workflows/build_gpu_8.yml b/.github/workflows/build_on_schedule.yml
similarity index 94%
rename from .github/workflows/build_gpu_8.yml
rename to .github/workflows/build_on_schedule.yml
index be8337dd0378..ea1f4879ce51 100644
--- a/.github/workflows/build_gpu_8.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -1,4 +1,4 @@
-name: Build on 8 GPUs
+name: Build on Schedule
 
 on:
   schedule:
@@ -39,7 +39,7 @@ jobs:
       - name: Unit Testing
         run: |
           gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
-          [ "$gpu_used" -le "100" ] && PYTHONPATH=$PWD pytest tests
+          [ "$gpu_used" -le "10000" ] && PYTHONPATH=$PWD pytest tests
         env:
           DATA: /data/scratch/cifar-10
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64

From b0c29d1b4c5f8164b5b8e8c6c65cfd89fa584500 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 13:47:50 +0800
Subject: [PATCH 250/503] [workflow] refactored compatibility test workflow for
 maintainability (#2560)

---
 ....yml => compatiblity_test_on_dispatch.yml} |  2 +-
 .github/workflows/compatiblity_test_on_pr.yml | 71 +++++++++++++++++++
 ....yml => compatiblity_test_on_schedule.yml} |  6 +-
 3 files changed, 73 insertions(+), 6 deletions(-)
 rename .github/workflows/{dispatch_compatibility_test.yml => compatiblity_test_on_dispatch.yml} (98%)
 create mode 100644 .github/workflows/compatiblity_test_on_pr.yml
 rename .github/workflows/{auto_compatibility_test.yml => compatiblity_test_on_schedule.yml} (95%)

diff --git a/.github/workflows/dispatch_compatibility_test.yml b/.github/workflows/compatiblity_test_on_dispatch.yml
similarity index 98%
rename from .github/workflows/dispatch_compatibility_test.yml
rename to .github/workflows/compatiblity_test_on_dispatch.yml
index ac5669c6f7f0..717cf729b3f3 100644
--- a/.github/workflows/dispatch_compatibility_test.yml
+++ b/.github/workflows/compatiblity_test_on_dispatch.yml
@@ -1,4 +1,4 @@
-name: Dispatch Compatibility Test
+name: Compatibility Test on Dispatch
 
 on:
   workflow_dispatch:
diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml
new file mode 100644
index 000000000000..2fca67b820a1
--- /dev/null
+++ b/.github/workflows/compatiblity_test_on_pr.yml
@@ -0,0 +1,71 @@
+name: Compatibility Test on PR
+
+on:
+  pull_request:
+    paths:
+      - 'version.txt'
+      - '.compatibility'
+
+jobs:
+  matrix_preparation:
+    name: Prepare Container List
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v3
+      - id: set-matrix
+        run: |
+          IFS=','
+          DOCKER_IMAGE=()
+
+          while read tag; do
+            DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"")
+          done <.compatibility
+
+          container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
+          container="[${container}]"
+          echo "$container"
+          echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
+
+  build:
+    name: Test for PyTorch Compatibility
+    needs: matrix_preparation
+    if: github.repository == 'hpcaitech/ColossalAI'
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: ${{ matrix.container }}
+      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
+    timeout-minutes: 120
+    steps:
+      - name: Install dependencies
+        run: |
+          pip install -U pip setuptools wheel --user
+      - uses: actions/checkout@v2
+        with:
+          repository: hpcaitech/TensorNVMe
+          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+          path: TensorNVMe
+      - name: Install tensornvme
+        run: |
+          cd TensorNVMe
+          conda install cmake
+          pip install -r requirements.txt
+          pip install -v .
+      - uses: actions/checkout@v2
+        with:
+          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+      - name: Install Colossal-AI
+        run: |
+          pip install -v --no-cache-dir .
+          pip install -r requirements/requirements-test.txt
+      - name: Unit Testing
+        run: |
+          PYTHONPATH=$PWD pytest tests
+        env:
+          DATA: /data/scratch/cifar-10
+          NCCL_SHM_DISABLE: 1
+          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
diff --git a/.github/workflows/auto_compatibility_test.yml b/.github/workflows/compatiblity_test_on_schedule.yml
similarity index 95%
rename from .github/workflows/auto_compatibility_test.yml
rename to .github/workflows/compatiblity_test_on_schedule.yml
index 4b026c63e7f7..399f03cc7b22 100644
--- a/.github/workflows/auto_compatibility_test.yml
+++ b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -1,10 +1,6 @@
-name: Compatibility Test
+name: Compatibility Test on Schedule
 
 on:
-  pull_request:
-    paths:
-      - 'version.txt'
-      - '.compatibility'
   # run at 03:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
   schedule:
     - cron:  '0 19 * * 6'

From 8af5a0799bec63e8fcb158f717e1cbad7f2f9c37 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 13:47:59 +0800
Subject: [PATCH 251/503] [workflow] added discussion stats to community report
 (#2572)

* [workflow] added discussion stats to community report

* polish code
---
 .../workflows/report_leaderboard_to_lark.yml  |   2 +-
 .../generate_leaderboard_and_send_to_lark.py  | 390 +++++++++++++++---
 2 files changed, 343 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/report_leaderboard_to_lark.yml b/.github/workflows/report_leaderboard_to_lark.yml
index 60c3ad2a6781..00d8e9e1f5fd 100644
--- a/.github/workflows/report_leaderboard_to_lark.yml
+++ b/.github/workflows/report_leaderboard_to_lark.yml
@@ -25,5 +25,5 @@ jobs:
       env:
         LARK_APP_ID: ${{ secrets.LARK_LEADERBOARD_APP_ID }}
         LARK_APP_SECRET: ${{ secrets.LARK_LEADERBOARD_APP_SECRET }}
-        LARK_WEBHOOK_URL: ${{ secrets.LARK_LEADERBOARD_WEBHOOK_URL }}
+        LARK_WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
         GITHUB_TOKEN: ${{ github.token }}
diff --git a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
index 3dee161036ff..36cdd9518486 100644
--- a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
+++ b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
@@ -1,6 +1,7 @@
 import os
 from dataclasses import dataclass
 from datetime import datetime, timedelta
+from typing import Any, Dict, List
 
 import matplotlib.pyplot as plt
 import pytz
@@ -11,16 +12,38 @@
 
 @dataclass
 class Contributor:
+    """
+    Dataclass for a github contributor.
+
+    Args:
+        name (str): name of the contributor
+        num_commits_this_week (int): number of commits made within one week
+    """
     name: str
     num_commits_this_week: int
 
 
-def generate_user_engagement_leaderboard_image(github_token, output_path):
-    # request to the Github API to get the users who have replied the most in the last 7 days
-    now = datetime.utcnow()
-    start_datetime = now - timedelta(days=7)
-    start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
+def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title: str, output_path: str) -> None:
+    """
+    This function is a utility to plot the bar charts.
+    """
+    plt.clf()
+    seaborn.color_palette()
+    fig = seaborn.barplot(x=x, y=y)
+    fig.set(xlabel=xlabel, ylabel=ylabel, title=title)
+    seaborn.despine()
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=1200)
+
+
+def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, int]:
+    """
+    Retrieve the issue/PR comments made by our members in the last 7 days.
 
+    Args:
+        github_token (str): GitHub access token for API calls
+        since (str): the `since` query parameter required by the GitHub REST API, in the format of YYYY-MM-DDTHH:MM:SSZ
+    """
     # prepare header
     headers = {
         'Authorization': f'Bearer {github_token}',
@@ -33,7 +56,7 @@ def generate_user_engagement_leaderboard_image(github_token, output_path):
     # do pagination to the API
     page = 1
     while True:
-        comment_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/comments?since={start_datetime_str}&page={page}'
+        comment_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/comments?since={since}&page={page}'
         comment_response = requests.get(comment_api, headers=headers).json()
 
         if len(comment_response) == 0:
@@ -61,67 +84,301 @@ def generate_user_engagement_leaderboard_image(github_token, output_path):
                     else:
                         user_engagement_count[member_name] = 1
             page += 1
+    return user_engagement_count
 
-    # plot the leaderboard
-    x = []
-    y = []
 
-    for name, count in user_engagement_count.items():
-        x.append(count)
-        y.append(name)
-    xticks = [str(v) for v in range(1, max(x) + 1)]
-    seaborn.color_palette()
-    fig = seaborn.barplot(x=x, y=y)
-    fig.set(xlabel=f"Number of Comments made (since {start_datetime})",
-            ylabel="Member",
-            title='Active User Engagement Leaderboard')
-    seaborn.despine()
-    plt.tight_layout()
-    plt.savefig(output_path, dpi=1200)
+def get_discussion_comments(github_token, since) -> Dict[str, int]:
+    """
+    Retrieve the discussion comments made by our members in the last 7 days.
+    This is only available via the GitHub GraphQL API.
+
+    Args:
+        github_token (str): GitHub access token for API calls
+        since (datetime): the cutoff time; only discussions, comments and replies updated after it are counted
+    """
+
+    # use graphql to get the discussions updated in the last 7 days
+    def _generate_discussion_query(num, cursor: str = None):
+        if cursor is None:
+            offset_str = ""
+        else:
+            offset_str = f", after: \"{cursor}\""
+        query = f"""
+        {{
+            repository(owner: "hpcaitech", name: "ColossalAI"){{
+                discussions(first: {num} {offset_str}){{
+                    edges {{
+                        cursor
+                        node{{
+                            title
+                            author{{
+                                login
+                            }}
+                            number
+                            authorAssociation
+                            updatedAt
+                        }}
+                    }}
+                }}
+            }}
+        }}
+        """
+        return query
+
+    def _generate_comment_reply_count_for_discussion(discussion_number, num, cursor: str = None):
+        # here we assume that each comment will not have more than 100 replies for simplicity
+        # otherwise, we have to go through pagination for both comment and reply
+        if cursor is None:
+            offset_str = ""
+        else:
+            offset_str = f", before: \"{cursor}\""
+        query = f"""
+        {{
+            repository(owner: "hpcaitech", name: "ColossalAI"){{
+                discussion(number: {discussion_number}){{
+                    title
+                    comments(last: {num} {offset_str}){{
+                        edges{{
+                            cursor
+                            node {{
+                                author{{
+                                    login
+                                }}
+                                updatedAt
+                                authorAssociation
+                                replies (last: 100) {{
+                                edges {{
+                                    node {{
+                                        author {{
+                                            login
+                                        }}
+                                        updatedAt
+                                        authorAssociation
+                                        }}
+                                    }}
+                                }}
+                            }}
+                        }}
+                    }}
+                }}
+            }}
+        }}
+        """
+        return query
+
+    # a utility function to make call to Github GraphQL API
+    def _call_graphql_api(query):
+        headers = {"Authorization": f"Bearer {github_token}"}
+        json_data = {'query': query}
+        response = requests.post('https://api.github.com/graphql', json=json_data, headers=headers)
+        data = response.json()
+        return data
+
+    # get the discussion numbers updated in the last 7 days
+    discussion_numbers = []
+    num_per_request = 10
+    cursor = None
+    while True:
+        query = _generate_discussion_query(num_per_request, cursor)
+        data = _call_graphql_api(query)
+        found_discussion_out_of_time_range = False
+
+        edges = data['data']['repository']['discussions']['edges']
+        if len(edges) == 0:
+            break
+        else:
+            # keep the discussion whose author is not a member
+            for edge in edges:
+                # unpack the discussion node from the edge
+                discussion = edge['node']
+
+                discussion_updated_at = datetime.strptime(discussion['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
+                # check if the updatedAt is within the last 7 days
+                # if yes, add it to discussion_numbers
+                if discussion_updated_at > since:
+                    if discussion['authorAssociation'] != 'MEMBER':
+                        discussion_numbers.append(discussion['number'])
+                else:
+                    found_discussion_out_of_time_range = True
+
+        if found_discussion_out_of_time_range:
+            break
+        else:
+            # update cursor
+            cursor = edges[-1]['cursor']
+
+    # get the discussion comments and replies made by our members
+    user_engagement_count = {}
+    for dicussion_number in discussion_numbers:
+        cursor = None
+        num_per_request = 10
+
+        while True:
+            query = _generate_comment_reply_count_for_discussion(dicussion_number, num_per_request, cursor)
+            data = _call_graphql_api(query)
+
+            # get the comments
+            edges = data['data']['repository']['discussion']['comments']['edges']
+
+            # update the cursor
+            if len(edges) == 0:
+                break
+            else:
+                # update cursor for pagination
+                cursor = edges[-1]['cursor']
+
+                for edge in edges:
+                    comment = edge['node']
+                    if comment['authorAssociation'] == 'MEMBER':
+                        # check if the updatedAt is within the last 7 days
+                        # if yes, add it to user_engagement_count
+                        comment_updated_at = datetime.strptime(comment['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
+                        if comment_updated_at > since:
+                            member_name = comment['author']['login']
+                            if member_name in user_engagement_count:
+                                user_engagement_count[member_name] += 1
+                            else:
+                                user_engagement_count[member_name] = 1
+
+                    # get the replies
+                    reply_edges = comment['replies']['edges']
+                    if len(reply_edges) == 0:
+                        continue
+                    else:
+                        for reply_edge in reply_edges:
+                            reply = reply_edge['node']
+                            if reply['authorAssociation'] == 'MEMBER':
+                                # check if the updatedAt is within the last 7 days
+                                # if yes, add it to user_engagement_count
+                                reply_updated_at = datetime.strptime(reply['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
+                                if reply_updated_at > since:
+                                    member_name = reply['author']['login']
+                                    if member_name in user_engagement_count:
+                                        user_engagement_count[member_name] += 1
+                                    else:
+                                        user_engagement_count[member_name] = 1
+    return user_engagement_count
+
+
+def generate_user_engagement_leaderboard_image(github_token: str, output_path: str) -> bool:
+    """
+    Generate the user engagement leaderboard image for stats within the last 7 days
+
+    Args:
+        github_token (str): GitHub access token for API calls
+        output_path (str): the path to save the image
+    """
+
+    # request to the Github API to get the users who have replied the most in the last 7 days
+    now = datetime.utcnow()
+    start_datetime = now - timedelta(days=7)
+    start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
 
+    # get the issue/PR comments and discussion comment count
+    issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, since=start_datetime_str)
+    discussion_engagement_count = get_discussion_comments(github_token=github_token, since=start_datetime)
+    total_engagement_count = {}
 
-def generate_contributor_leaderboard_image(github_token, output_path):
+    # update the total engagement count
+    total_engagement_count.update(issue_pr_engagement_count)
+    for name, count in discussion_engagement_count.items():
+        if name in total_engagement_count:
+            total_engagement_count[name] += count
+        else:
+            total_engagement_count[name] = count
+
+    # prepare the data for plotting
+    x = []
+    y = []
+
+    if len(total_engagement_count) > 0:
+        for name, count in total_engagement_count.items():
+            x.append(count)
+            y.append(name)
+
+        # use Shanghai time to display on the image
+        start_datetime_str = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        # plot the leaderboard
+        xlabel = f"Number of Comments made (since {start_datetime_str})"
+        ylabel = "Member"
+        title = 'Active User Engagement Leaderboard'
+        plot_bar_chart(x, y, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
+        return True
+    else:
+        return False
+
+
+def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
+    """
+    Generate the contributor leaderboard image for stats within the last 7 days
+
+    Args:
+        github_token (str): GitHub access token for API calls
+        output_path (str): the path to save the image
+    """
+    # request to the Github API to get the users who have contributed in the last 7 days
     URL = 'https://api.github.com/repos/hpcaitech/ColossalAI/stats/contributors'
     headers = {
         'Authorization': f'Bearer {github_token}',
         'Accept': 'application/vnd.github+json',
         'X-GitHub-Api-Version': '2022-11-28'
     }
-    response = requests.get(URL, headers=headers).json()
 
-    contributor_list = []
+    while True:
+        response = requests.get(URL, headers=headers).json()
 
-    # convert unix timestamp to Beijing datetime
-    start_timestamp = response[0]['weeks'][-1]['w']
-    start_datetime = datetime.fromtimestamp(start_timestamp, tz=pytz.timezone('Asia/Shanghai'))
+        if len(response) != 0:
+            # sometimes the Github API returns empty response for unknown reason
+            # request again if the response is empty
+            break
+
+    contributor_list = []
 
     # get number of commits for each contributor
+    start_timestamp = None
     for item in response:
         num_commits_this_week = item['weeks'][-1]['c']
         name = item['author']['login']
         contributor = Contributor(name=name, num_commits_this_week=num_commits_this_week)
         contributor_list.append(contributor)
 
+        # update start_timestamp
+        start_timestamp = item['weeks'][-1]['w']
+
+    # convert unix timestamp to Beijing datetime
+    start_datetime = datetime.fromtimestamp(start_timestamp, tz=pytz.timezone('Asia/Shanghai'))
+    start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
+
     # sort by number of commits
     contributor_list.sort(key=lambda x: x.num_commits_this_week, reverse=True)
 
     # remove contributors who has zero commits
     contributor_list = [x for x in contributor_list if x.num_commits_this_week > 0]
 
-    # plot
-    seaborn.color_palette()
+    # prepare the data for plotting
     x = [x.num_commits_this_week for x in contributor_list]
     y = [x.name for x in contributor_list]
-    fig = seaborn.barplot(x=x, y=y)
-    fig.set(xlabel=f"Number of Commits (since {start_datetime})",
-            ylabel="Contributor",
-            title='Active Contributor Leaderboard')
-    seaborn.despine()
-    plt.tight_layout()
-    plt.savefig(output_path, dpi=1200)
 
-
-def upload_image_to_lark(lark_tenant_token, image_path):
+    # plot
+    if len(x) > 0:
+        xlabel = f"Number of Commits (since {start_datetime_str})"
+        ylabel = "Contributor"
+        title = 'Active Contributor Leaderboard'
+        plot_bar_chart(x, y, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
+        return True
+    else:
+        return False
+
+
+def upload_image_to_lark(lark_tenant_token: str, image_path: str) -> str:
+    """
+    Upload image to Lark and return the image key
+
+    Args:
+        lark_tenant_token (str): Lark tenant access token
+        image_path (str): the path to the image to be uploaded
+    """
     url = "https://open.feishu.cn/open-apis/im/v1/images"
     form = {'image_type': 'message', 'image': (open(image_path, 'rb'))}    # 需要替换具体的path
     multi_form = MultipartEncoder(form)
@@ -133,19 +390,40 @@ def upload_image_to_lark(lark_tenant_token, image_path):
     return response['data']['image_key']
 
 
-def generate_lark_tenant_access_token(app_id, app_secret):
+def generate_lark_tenant_access_token(app_id: str, app_secret: str) -> str:
+    """
+    Generate Lark tenant access token.
+
+    Args:
+        app_id (str): Lark app id
+        app_secret (str): Lark app secret
+    """
     url = 'https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal'
     data = {'app_id': app_id, 'app_secret': app_secret}
     response = requests.post(url, json=data).json()
     return response['tenant_access_token']
 
 
-def send_image_to_lark(image_key, webhook_url):
+def send_image_to_lark(image_key: str, webhook_url: str) -> None:
+    """
+    Send image to Lark.
+
+    Args:
+        image_key (str): the image key returned by Lark
+        webhook_url (str): the webhook url to send the image
+    """
     data = {"msg_type": "image", "content": {"image_key": image_key}}
     requests.post(webhook_url, json=data)
 
 
-def send_message_to_lark(message, webhook_url):
+def send_message_to_lark(message: str, webhook_url: str):
+    """
+    Send message to Lark.
+
+    Args:
+        message (str): the message to be sent
+        webhook_url (str): the webhook url to send the message
+    """
     data = {"msg_type": "text", "content": {"text": message}}
     requests.post(webhook_url, json=data)
 
@@ -156,8 +434,8 @@ def send_message_to_lark(message, webhook_url):
     USER_ENGAGEMENT_IMAGE_PATH = 'engagement_leaderboard.png'
 
     # generate images
-    # generate_contributor_leaderboard_image(GITHUB_TOKEN, CONTRIBUTOR_IMAGE_PATH)
-    generate_user_engagement_leaderboard_image(GITHUB_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
+    contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, CONTRIBUTOR_IMAGE_PATH)
+    engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
 
     # upload images
     APP_ID = os.environ['LARK_APP_ID']
@@ -166,11 +444,27 @@ def send_message_to_lark(message, webhook_url):
     contributor_image_key = upload_image_to_lark(LARK_TENANT_TOKEN, CONTRIBUTOR_IMAGE_PATH)
     user_engagement_image_key = upload_image_to_lark(LARK_TENANT_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
 
-    # send contributor image to lark
+    # send message to lark
     LARK_WEBHOOK_URL = os.environ['LARK_WEBHOOK_URL']
-    send_message_to_lark("本周的开发者贡献榜单出炉啦！", LARK_WEBHOOK_URL)
-    send_image_to_lark(contributor_image_key, LARK_WEBHOOK_URL)
+    message = """本周的社区榜单出炉啦！
+1. 开发贡献者榜单
+2. 用户互动榜单
+
+注：
+- 开发贡献者测评标准为：本周由公司成员提交的commit次数
+- 用户互动榜单测评标准为：本周由公司成员在非成员创建的issue/PR/discussion中回复的次数
+"""
+
+    send_message_to_lark(message, LARK_WEBHOOK_URL)
+
+    # send contributor image to lark
+    if contrib_success:
+        send_image_to_lark(contributor_image_key, LARK_WEBHOOK_URL)
+    else:
+        send_message_to_lark("本周没有成员贡献commit，无榜单图片生成。", LARK_WEBHOOK_URL)
 
     # send user engagement image to lark
-    send_message_to_lark("本周的开源社区互动榜单出炉啦！", LARK_WEBHOOK_URL)
-    send_image_to_lark(user_engagement_image_key, LARK_WEBHOOK_URL)
+    if engagement_success:
+        send_image_to_lark(user_engagement_image_key, LARK_WEBHOOK_URL)
+    else:
+        send_message_to_lark("本周没有成员互动，无榜单图片生成。", LARK_WEBHOOK_URL)

From fba08743a845fc1c04176b0661d7bcb534dae7c1 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 13:48:20 +0800
Subject: [PATCH 252/503] [setup] fixed inconsistent version meta (#2578)

---
 setup.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index b9cd9e5e4714..7cfbbe9b19a4 100644
--- a/setup.py
+++ b/setup.py
@@ -117,14 +117,26 @@ def get_version():
 
     with open(version_txt_path) as f:
         version = f.read().strip()
-        if build_cuda_ext:
-            torch_version = '.'.join(torch.__version__.split('.')[:2])
-            cuda_version = '.'.join(get_cuda_bare_metal_version(CUDA_HOME)[1:])
-            version += f'+torch{torch_version}cu{cuda_version}'
 
     # write version into version.py
     with open(version_py_path, 'w') as f:
         f.write(f"__version__ = '{version}'\n")
+        if build_cuda_ext:
+            torch_version = '.'.join(torch.__version__.split('.')[:2])
+            cuda_version = '.'.join(get_cuda_bare_metal_version(CUDA_HOME)[1:])
+        else:
+            torch_version = None
+            cuda_version = None
+
+        if torch_version:
+            f.write(f'torch = "{torch_version}"\n')
+        else:
+            f.write('torch = None\n')
+
+        if cuda_version:
+            f.write(f'cuda = "{cuda_version}"\n')
+        else:
+            f.write('cuda = None\n')
 
     return version
 

From 788e1389601db846d6132ec99777f81987b0357b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 14:03:13 +0800
Subject: [PATCH 253/503] [workflow] added notification if scheduled build
 fails (#2574)

* [workflow] added notification if scheduled build fails

* polish code

* polish code
---
 .github/workflows/build_on_schedule.yml       | 39 ++++++++++++++++++-
 .../workflows/scripts/send_message_to_lark.py | 20 ++++++++++
 test.sh                                       |  6 +++
 3 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/scripts/send_message_to_lark.py
 create mode 100644 test.sh

diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index ea1f4879ce51..32b518ac5394 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -16,30 +16,65 @@ jobs:
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 40
     steps:
+      - name: Check GPU Availability # ensure all GPUs have enough memory
+        id: check-avai
+        run: |
+          avai=true
+          for i in $(seq 0 7);
+          do
+            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
+            [ "$gpu_used" -le "10000" ] && avai=false
+          done
+
+          echo "GPU is available: $avai"
+          echo "avai=$avai" >> $GITHUB_OUTPUT
+
       - uses: actions/checkout@v2
+        if: steps.check-avai.outputs.avai == 'true'
         with:
           repository: hpcaitech/TensorNVMe
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
           path: TensorNVMe
+
       - name: Install tensornvme
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
           cd TensorNVMe
           conda install cmake
           pip install -r requirements.txt
           pip install -v .
+
       - uses: actions/checkout@v2
+        if: steps.check-avai.outputs.avai == 'true'
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+
       - name: Install Colossal-AI
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
           [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
           CUDA_EXT=1 pip install -v -e .
           cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
           pip install -r requirements/requirements-test.txt
+
       - name: Unit Testing
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
-          gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
-          [ "$gpu_used" -le "10000" ] && PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest tests
         env:
           DATA: /data/scratch/cifar-10
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+      - name: Notify Lark
+        id: message-preparation
+        if: ${{ failure() }}
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
diff --git a/.github/workflows/scripts/send_message_to_lark.py b/.github/workflows/scripts/send_message_to_lark.py
new file mode 100644
index 000000000000..a113327a786e
--- /dev/null
+++ b/.github/workflows/scripts/send_message_to_lark.py
@@ -0,0 +1,20 @@
+import argparse
+
+import requests
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m', '--message', type=str)
+    parser.add_argument('-u', '--url', type=str)
+    return parser.parse_args()
+
+
+def send_message_to_lark(message, webhook_url):
+    data = {"msg_type": "text", "content": {"text": message}}
+    requests.post(webhook_url, json=data)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    send_message_to_lark(args.message, args.url)
diff --git a/test.sh b/test.sh
new file mode 100644
index 000000000000..8dcecc6ddc55
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,6 @@
+avai=true
+for i in $(seq 0 7);
+do
+  gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
+  [ "$gpu_used" -le "10000" ] && avai=false
+done

From 186ddce2c4050af7b815d47fc74b4412ac6e0f29 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 14:38:35 +0800
Subject: [PATCH 254/503] [workflow] hook example test alert with lark (#2585)

---
 .github/workflows/dispatch_example_check.yml  | 63 -------------------
 .../workflows/example_check_on_schedule.yml   | 24 +++++--
 2 files changed, 18 insertions(+), 69 deletions(-)
 delete mode 100644 .github/workflows/dispatch_example_check.yml

diff --git a/.github/workflows/dispatch_example_check.yml b/.github/workflows/dispatch_example_check.yml
deleted file mode 100644
index e0333422f50d..000000000000
--- a/.github/workflows/dispatch_example_check.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-name: Manual Test Example
-on:
-  workflow_dispatch:
-    inputs:
-      example_directory:
-        type: string
-        description: example directory, separated by space. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
-        required: true
-
-jobs:
-  matrix_preparation:
-    if: |
-        github.event.pull_request.draft == false &&
-        github.base_ref == 'main' &&
-        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    name: Check the examples user want
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-    steps:
-    - name: 📚 Checkout
-      uses: actions/checkout@v3
-    - name: Set up matrix
-      id: set-matrix
-      env:
-        check_dir: ${{ inputs.example_directory }}
-      run: |
-        res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir`
-        if [ res == "failure" ];then
-          exit -1
-        fi
-        dirs="[${check_dir}]"
-        echo "Testing examples in $dirs"
-        echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
-
-  test_example:
-    if: |
-        github.event.pull_request.draft == false &&
-        github.base_ref == 'main' &&
-        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    name: Manually check example files
-    needs: manual_check_matrix_preparation
-    runs-on: [self-hosted, gpu]
-    strategy:
-      matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
-    container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --gpus all --rm -v /data/scratch/examples-data:/data/
-    timeout-minutes: 10
-    steps:
-      - name: 📚 Checkout
-        uses: actions/checkout@v3
-      - name: Install Colossal-AI
-        run: |
-          pip install -v .
-      - name: Test the example
-        run: |
-          dir=${{ matrix.directory }}
-          echo "Testing ${dir} now"
-          cd "${PWD}/examples/${dir}"
-          bash test_ci.sh
-        env:
-          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml
index 07424ecbede2..9d8dcbbb5c09 100644
--- a/.github/workflows/example_check_on_schedule.yml
+++ b/.github/workflows/example_check_on_schedule.yml
@@ -3,13 +3,12 @@ on:
   # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
   schedule:
     - cron:  '0 16 * * 6'
+  workflow_dispatch:
 
 jobs:
   # This is for all files' weekly check. Specifically, this job is to find all the directories.
   matrix_preparation:
-    if: |
-        github.repository == 'hpcaitech/ColossalAI' &&
-        github.event_name == 'schedule'
+    if: github.repository == 'hpcaitech/ColossalAI'
     name: Prepare matrix for weekly check
     runs-on: ubuntu-latest
     outputs:
@@ -27,9 +26,7 @@ jobs:
         echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT
 
   weekly_check:
-    if: |
-        github.repository == 'hpcaitech/ColossalAI' &&
-        github.event_name == 'schedule'
+    if: github.repository == 'hpcaitech/ColossalAI'
     name: Weekly check all examples
     needs: matrix_preparation
     runs-on: [self-hosted, gpu]
@@ -55,3 +52,18 @@ jobs:
           bash test_ci.sh
         env:
           NCCL_SHM_DISABLE: 1
+
+      - name: Notify Lark
+        id: message-preparation
+        if: ${{ failure() }}
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+          msg="Example tests failed for $EXAMPLE_DIR, please visit $url for details"
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
+          EXAMPLE_DIR: ${{ matrix.diretory }}

From 5767f8e3946b867e0fe7586f781ff249d5ae73b0 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 14:56:31 +0800
Subject: [PATCH 255/503] [workflow] hook compatibility test failure to lark
 (#2586)

---
 .../compatiblity_test_on_schedule.yml         | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml
index 399f03cc7b22..9802795fad24 100644
--- a/.github/workflows/compatiblity_test_on_schedule.yml
+++ b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -4,6 +4,7 @@ on:
   # run at 03:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
   schedule:
     - cron:  '0 19 * * 6'
+  workflow_dispatch:
 
 jobs:
   matrix_preparation:
@@ -43,11 +44,13 @@ jobs:
       - name: Install dependencies
         run: |
           pip install -U pip setuptools wheel --user
+
       - uses: actions/checkout@v2
         with:
           repository: hpcaitech/TensorNVMe
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
           path: TensorNVMe
+
       - name: Install tensornvme
         run: |
           cd TensorNVMe
@@ -57,10 +60,12 @@ jobs:
       - uses: actions/checkout@v2
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+
       - name: Install Colossal-AI
         run: |
           pip install -v --no-cache-dir .
           pip install -r requirements/requirements-test.txt
+
       - name: Unit Testing
         run: |
           PYTHONPATH=$PWD pytest tests
@@ -68,3 +73,18 @@ jobs:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+      - name: Notify Lark
+        id: message-preparation
+        if: ${{ failure() }}
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+          msg="Compatibility test failed with $container, please visit $url for details"
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
+          container: ${{ matrix.container }}

From 2059408edc76228f443a6024b66c2d28e629501e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 15:03:54 +0800
Subject: [PATCH 256/503] [workflow] fixed the typo in the example check
 workflow (#2589)

---
 .github/workflows/example_check_on_schedule.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml
index 9d8dcbbb5c09..bd52ca4321a2 100644
--- a/.github/workflows/example_check_on_schedule.yml
+++ b/.github/workflows/example_check_on_schedule.yml
@@ -46,7 +46,7 @@ jobs:
 
       - name: Traverse all files
         run: |
-          example_dir=${{ matrix.diretory }}
+          example_dir=${{ matrix.directory }}
           echo "Testing ${example_dir} now"
           cd "${PWD}/examples/${example_dir}"
           bash test_ci.sh
@@ -66,4 +66,4 @@ jobs:
           REPO: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
           WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
-          EXAMPLE_DIR: ${{ matrix.diretory }}
+          EXAMPLE_DIR: ${{ matrix.directory }}

From d6cc8f313e0651cac0970e1fd3d0f931c459343a Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 15:42:08 +0800
Subject: [PATCH 257/503] [workflow] added test-pypi check before release
 (#2591)

* [workflow] added test-pypi check before release

* polish code
---
 ...ly.yml => release_nightly_on_schedule.yml} |  0
 ..._pypi.yml => release_pypi_after_merge.yml} |  4 +-
 .../release_test_pypi_before_merge.yml        | 52 +++++++++++++++++++
 3 files changed, 54 insertions(+), 2 deletions(-)
 rename .github/workflows/{release_nightly.yml => release_nightly_on_schedule.yml} (100%)
 rename .github/workflows/{release_pypi.yml => release_pypi_after_merge.yml} (94%)
 create mode 100644 .github/workflows/release_test_pypi_before_merge.yml

diff --git a/.github/workflows/release_nightly.yml b/.github/workflows/release_nightly_on_schedule.yml
similarity index 100%
rename from .github/workflows/release_nightly.yml
rename to .github/workflows/release_nightly_on_schedule.yml
diff --git a/.github/workflows/release_pypi.yml b/.github/workflows/release_pypi_after_merge.yml
similarity index 94%
rename from .github/workflows/release_pypi.yml
rename to .github/workflows/release_pypi_after_merge.yml
index 7f3f63cf31f3..797cd69f373e 100644
--- a/.github/workflows/release_pypi.yml
+++ b/.github/workflows/release_pypi_after_merge.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   build-n-publish:
-    if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' && github.event.pull_request.merged == true && github.base_ref == 'main' 
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' && github.event.pull_request.merged == true && github.base_ref == 'main'
     name: Build and publish Python 🐍 distributions 📦 to PyPI
     runs-on: ubuntu-latest
     timeout-minutes: 20
@@ -22,7 +22,7 @@ jobs:
         python-version: '3.8.14'
 
     - run: python setup.py sdist build
-    
+
     # publish to PyPI if executed on the main branch
     - name: Publish package to PyPI
       uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/release_test_pypi_before_merge.yml b/.github/workflows/release_test_pypi_before_merge.yml
new file mode 100644
index 000000000000..f35a8aad0b62
--- /dev/null
+++ b/.github/workflows/release_test_pypi_before_merge.yml
@@ -0,0 +1,52 @@
+name: Publish to Test-PyPI Before Merge
+
+on:
+  pull_request:
+    paths:
+      - 'version.txt'
+
+jobs:
+  build-n-publish:
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI'
+    name: Build and publish Python 🐍 distributions 📦 to Test PyPI
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+    - uses: actions/checkout@v2
+
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.8.14'
+
+    - name: add timestamp to the version
+      id: prep-version
+      run: |
+        version=$(cat version.txt)
+        timestamp=$(date +%s)
+        new_version="${version}.post${timestamp}"
+        echo $new_version > ./version.txt
+        echo "version=$new_version" >> $GITHUB_OUTPUT
+
+    - run: python setup.py sdist build
+
+    # publish to PyPI if executed on the main branch
+    - name: Publish package to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        user: __token__
+        password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+        repository_url: https://test.pypi.org/legacy/
+        verbose: true
+
+    - name: Wait for Test-PyPI refresh
+      run: sleep 60s
+      shell: bash
+
+    - name: Try installation
+      run: |
+        # we need to install the requirements.txt first
+        # as test-pypi may not contain the distributions for libs listed in the txt file
+        pip install -r requirements/requirements.txt
+        pip install --index-url https://test.pypi.org/simple/ colossalai==$VERSION
+      env:
+        VERSION: ${{ steps.prep-version.outputs.version }}

From fd90245399be12cee179a06f95ad8fc3ee835179 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 16:15:46 +0800
Subject: [PATCH 258/503] [workflow] hooked docker release with lark (#2594)

---
 ...ker.yml => release_docker_after_merge.yml} | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)
 rename .github/workflows/{release_docker.yml => release_docker_after_merge.yml} (50%)

diff --git a/.github/workflows/release_docker.yml b/.github/workflows/release_docker_after_merge.yml
similarity index 50%
rename from .github/workflows/release_docker.yml
rename to .github/workflows/release_docker_after_merge.yml
index 8da6e5f87606..dbb38208e720 100644
--- a/.github/workflows/release_docker.yml
+++ b/.github/workflows/release_docker_after_merge.yml
@@ -1,4 +1,4 @@
-name: Publish Docker Image to DockerHub
+name: Publish Docker Image to DockerHub after Merge
 
 on:
   workflow_dispatch:
@@ -20,6 +20,7 @@ jobs:
       - uses: actions/checkout@v2
         with:
           fetch-depth: 0
+
       - name: Build Docker
         id: build
         run: |
@@ -27,11 +28,49 @@ jobs:
           tag=hpcaitech/colossalai:$version
           docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 -t $tag ./docker
           echo "tag=${tag}" >> $GITHUB_OUTPUT
+
       - name: Log in to Docker Hub
         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
+
       - name: Push Docker image
+        id: docker-push
         run: |
           docker push ${{ steps.build.outputs.tag }}
+
+  notify:
+    name: Notify Lark via webhook
+    needs: release
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.8.14'
+
+      - name: Install requests
+        run: pip install requests
+
+      - name: Notify Lark
+        id: message-preparation
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+
+          if [ $STATUS == 'success' ]
+          then
+            msg="The Docker image for the latest release has been successfully built and pushed to DockerHub."
+          else
+            msg="Failed to build and push the Docker image for the latest release, please visit $url for details."
+          fi
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
+          STATUS: ${{ steps.docker-push.outcome }}

From 0c03802bff8484660f3d16c62a8d03d4dbe96899 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 16:29:04 +0800
Subject: [PATCH 259/503] [workflow] hooked pypi release with lark (#2596)

---
 .github/workflows/auto_example_check.yml      | 143 ------------------
 .github/workflows/auto_release_bdist.yml      |  70 ---------
 ...draft_github_release_post_after_merge.yml} |   0
 .../workflows/release_nightly_on_schedule.yml |  36 +++++
 .../workflows/release_pypi_after_merge.yml    |  36 +++++
 5 files changed, 72 insertions(+), 213 deletions(-)
 delete mode 100644 .github/workflows/auto_example_check.yml
 delete mode 100644 .github/workflows/auto_release_bdist.yml
 rename .github/workflows/{draft_github_release_post.yml => draft_github_release_post_after_merge.yml} (100%)

diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml
deleted file mode 100644
index df413f646c2c..000000000000
--- a/.github/workflows/auto_example_check.yml
+++ /dev/null
@@ -1,143 +0,0 @@
-name: Test Example
-on:
-  pull_request:
-    # any change in the examples folder will trigger check for the corresponding example.
-    paths:
-      - 'examples/**'
-  # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
-  schedule:
-    - cron:  '0 16 * * 6'
-
-jobs:
-  # This is for changed example files detect and output a matrix containing all the corresponding directory name.
-  detect-changed-example:
-    if: |
-        github.event.pull_request.draft == false &&
-        github.base_ref == 'main' &&
-        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.setup-matrix.outputs.matrix }}
-      anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
-    name: Detect changed example files
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.sha }}
-
-      - name: Locate base commit
-        id: locate-base-sha
-        run: |
-            curBranch=$(git rev-parse --abbrev-ref HEAD)
-            commonCommit=$(git merge-base origin/main $curBranch)
-            echo $commonCommit
-            echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
-
-      - name: Get all changed example files
-        id: changed-files
-        uses: tj-actions/changed-files@v35
-        with:
-          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
-
-      - name: setup matrix
-        id: setup-matrix
-        run: |
-          changedFileName=""
-          for file in ${{ steps.changed-files.outputs.all_changed_files  }}; do
-            changedFileName="${file}:${changedFileName}"
-          done
-          echo "$changedFileName was changed"
-          res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
-          echo "All changed examples are $res"
-
-          if [ "$res" = "[]" ]; then
-            echo "anyChanged=false" >> $GITHUB_OUTPUT
-            echo "matrix=null" >> $GITHUB_OUTPUT
-          else
-            dirs=$( IFS=',' ; echo "${res[*]}" )
-            echo "anyChanged=true" >> $GITHUB_OUTPUT
-            echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
-          fi
-
-  # If no file is changed, it will prompt an error and shows the matrix do not have value.
-  check-changed-example:
-    # Add this condition to avoid executing this job if the trigger event is workflow_dispatch.
-    if: |
-        github.event.pull_request.draft == false &&
-        github.base_ref == 'main' &&
-        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' &&
-        needs.detect-changed-example.outputs.anyChanged == 'true'
-    name: Test the changed example
-    needs: detect-changed-example
-    runs-on: [self-hosted, gpu]
-    strategy:
-      matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
-    container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --gpus all --rm -v /data/scratch/examples-data:/data/
-    timeout-minutes: 10
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Install Colossal-AI
-        run: |
-          pip install -v .
-
-      - name: Test the example
-        run: |
-          example_dir=${{ matrix.directory }}
-          cd "${PWD}/examples/${example_dir}"
-          bash test_ci.sh
-        env:
-          NCCL_SHM_DISABLE: 1
-
-  # This is for all files' weekly check. Specifically, this job is to find all the directories.
-  matrix_preparation:
-    if: |
-        github.repository == 'hpcaitech/ColossalAI' &&
-        github.event_name == 'schedule'
-    name: Prepare matrix for weekly check
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.setup-matrix.outputs.matrix }}
-    steps:
-    - name: 📚 Checkout
-      uses: actions/checkout@v3
-
-    - name: setup matrix
-      id: setup-matrix
-      run: |
-        res=`python .github/workflows/scripts/example_checks/check_example_weekly.py`
-        all_loc=$( IFS=',' ; echo "${res[*]}" )
-        echo "Found the examples: $all_loc"
-        echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT
-
-  weekly_check:
-    if: |
-        github.repository == 'hpcaitech/ColossalAI' &&
-        github.event_name == 'schedule'
-    name: Weekly check all examples
-    needs: matrix_preparation
-    runs-on: [self-hosted, gpu]
-    strategy:
-      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
-    container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-    timeout-minutes: 10
-    steps:
-      - name: 📚 Checkout
-        uses: actions/checkout@v3
-
-      - name: Install Colossal-AI
-        run: |
-          pip install -v .
-
-      - name: Traverse all files
-        run: |
-          example_dir=${{ matrix.diretory }}
-          echo "Testing ${example_dir} now"
-          cd "${PWD}/examples/${example_dir}"
-          bash test_ci.sh
-        env:
-          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/auto_release_bdist.yml b/.github/workflows/auto_release_bdist.yml
deleted file mode 100644
index 56a3036f8c94..000000000000
--- a/.github/workflows/auto_release_bdist.yml
+++ /dev/null
@@ -1,70 +0,0 @@
-name: Auto Release bdist wheel
-
-on:
-  workflow_dispatch:
-  pull_request:
-    paths:
-      - 'version.txt'
-    types:
-      - closed
-
-jobs:
-  matrix_preparation:
-    name: Prepare Container List
-    if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI'
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-    steps:
-      - uses: actions/checkout@v3
-      - id: set-matrix
-        run: |
-          bdist=$(cat .bdist.json | tr '\n' ' ')
-          echo "matrix=${bdist}" >> $GITHUB_OUTPUT
-
-  build:
-    name: Release bdist wheels
-    needs: matrix_preparation
-    runs-on: [self-hosted, gpu]
-    strategy:
-      fail-fast: false
-      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
-    container:
-      image: ${{ matrix.build.cuda_image }}
-      options: --gpus all --rm
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-      # cub is for cuda 10.2
-      - name: Copy scripts
-        run: |
-          cp -r ./.github/workflows/scripts/* ./
-
-          # link the cache diretories to current path
-          ln -s /github/home/conda_pkgs ./conda_pkgs
-          ln -s /github/home/pip_wheels ./pip_wheels
-
-          # set the conda package path
-          echo "pkgs_dirs:\n  - $PWD/conda_pkgs" > ~/.condarc
-
-          # set safe directory
-          git config --global --add safe.directory /__w/ColossalAI/ColossalAI
-
-          # get cub package for cuda 10.2
-          wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
-          unzip 1.8.0.zip
-      - name: Build bdist wheel
-        run: |
-          pip install beautifulsoup4 requests packaging
-          python ./build_colossalai_wheel.py --torch_version $TORCH_VERSIONS
-        env:
-          TORCH_VERSIONS: ${{ matrix.build.torch_version }}
-      - name: 🚀 Deploy
-        uses: garygrossgarten/github-action-scp@release
-        with:
-          local: all_dist
-          remote: ${{ secrets.PRIVATE_PYPI_DIR }}
-          host: ${{ secrets.PRIVATE_PYPI_HOST }}
-          username: ${{ secrets.PRIVATE_PYPI_USER }}
-          password: ${{ secrets.PRIVATE_PYPI_PASSWD }}
diff --git a/.github/workflows/draft_github_release_post.yml b/.github/workflows/draft_github_release_post_after_merge.yml
similarity index 100%
rename from .github/workflows/draft_github_release_post.yml
rename to .github/workflows/draft_github_release_post_after_merge.yml
diff --git a/.github/workflows/release_nightly_on_schedule.yml b/.github/workflows/release_nightly_on_schedule.yml
index 8aa48b8ed89e..aab42e1d754d 100644
--- a/.github/workflows/release_nightly_on_schedule.yml
+++ b/.github/workflows/release_nightly_on_schedule.yml
@@ -23,7 +23,43 @@ jobs:
     # publish to PyPI if executed on the main branch
     - name: Publish package to PyPI
       uses: pypa/gh-action-pypi-publish@release/v1
+      id: publish
       with:
         user: __token__
         password: ${{ secrets.PYPI_API_TOKEN }}
         verbose: true
+
+  notify:
+    name: Notify Lark via webhook
+    needs: release
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.8.14'
+
+      - name: Install requests
+        run: pip install requests
+
+      - name: Notify Lark
+        id: message-preparation
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+
+          if [ $STATUS == 'success' ]
+          then
+            msg="The Colossal-AI nightly version has been successfully released to PyPI."
+          else
+            msg="Failed to release Colossal-AI nightly version to PyPI, please visit $url for details."
+          fi
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
+          STATUS: ${{ steps.publish.outcome }}
diff --git a/.github/workflows/release_pypi_after_merge.yml b/.github/workflows/release_pypi_after_merge.yml
index 797cd69f373e..7fa4bdd03cc1 100644
--- a/.github/workflows/release_pypi_after_merge.yml
+++ b/.github/workflows/release_pypi_after_merge.yml
@@ -25,8 +25,44 @@ jobs:
 
     # publish to PyPI if executed on the main branch
     - name: Publish package to PyPI
+      id: publish
       uses: pypa/gh-action-pypi-publish@release/v1
       with:
         user: __token__
         password: ${{ secrets.PYPI_API_TOKEN }}
         verbose: true
+
+  notify:
+    name: Notify Lark via webhook
+    needs: release
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.8.14'
+
+      - name: Install requests
+        run: pip install requests
+
+      - name: Notify Lark
+        id: message-preparation
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+
+          if [ $STATUS == 'success' ]
+          then
+            msg="The Colossal-AI latest version has been successfully released to PyPI."
+          else
+            msg="Failed to release Colossal-AI to PyPI, please visit $url for details."
+          fi
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
+          STATUS: ${{ steps.publish.outcome }}

From 4d582893a79b72a878e8fac52b3282799e404636 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 17:07:41 +0800
Subject: [PATCH 260/503] [workflow] added cuda extension build test before
 release (#2598)

* [workflow] added cuda extension build test before release

* polish code
---
 .bdist.json                                   | 24 -----
 .cuda_ext.json                                | 16 +++
 .../workflows/cuda_ext_check_before_merge.yml | 42 ++++++++
 .github/workflows/release_bdist.yml           | 99 -------------------
 4 files changed, 58 insertions(+), 123 deletions(-)
 delete mode 100644 .bdist.json
 create mode 100644 .cuda_ext.json
 create mode 100644 .github/workflows/cuda_ext_check_before_merge.yml
 delete mode 100644 .github/workflows/release_bdist.yml

diff --git a/.bdist.json b/.bdist.json
deleted file mode 100644
index 8693bca489e8..000000000000
--- a/.bdist.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-  "build": [
-    {
-      "torch_version": "1.11.0",
-      "cuda_image": "hpcaitech/cuda-conda:10.2"
-    },
-    {
-      "torch_version": "1.11.0",
-      "cuda_image": "hpcaitech/cuda-conda:11.3"
-    },
-    {
-      "torch_version": "1.12.1",
-      "cuda_image": "hpcaitech/cuda-conda:10.2"
-    },
-    {
-      "torch_version": "1.12.1",
-      "cuda_image": "hpcaitech/cuda-conda:11.3"
-    },
-    {
-      "torch_version": "1.12.1",
-      "cuda_image": "hpcaitech/cuda-conda:11.6"
-    }
-  ]
-}
diff --git a/.cuda_ext.json b/.cuda_ext.json
new file mode 100644
index 000000000000..eba19cf05e31
--- /dev/null
+++ b/.cuda_ext.json
@@ -0,0 +1,16 @@
+{
+  "build": [
+    {
+      "torch_command": "pip install torch==1.12.1+cu102 torchvision==0.13.1+cu102 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu102",
+      "cuda_image": "hpcaitech/cuda-conda:10.2"
+    },
+    {
+      "torch_command": "pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113",
+      "cuda_image": "hpcaitech/cuda-conda:11.3"
+    },
+    {
+      "torch_command": "pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu116",
+      "cuda_image": "hpcaitech/cuda-conda:11.6"
+    }
+  ]
+}
diff --git a/.github/workflows/cuda_ext_check_before_merge.yml b/.github/workflows/cuda_ext_check_before_merge.yml
new file mode 100644
index 000000000000..eba5bb98ec07
--- /dev/null
+++ b/.github/workflows/cuda_ext_check_before_merge.yml
@@ -0,0 +1,42 @@
+name: Check CUDA Extension Build Before Merge
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - 'version.txt'
+
+jobs:
+  matrix_preparation:
+    name: Prepare Container List
+    if: github.repository == 'hpcaitech/ColossalAI'
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v3
+
+      - id: set-matrix
+        run: |
+          cuda_ext=$(cat .cuda_ext.json | tr '\n' ' ')
+          echo "matrix=${cuda_ext}" >> $GITHUB_OUTPUT
+
+  build:
+    name: Release bdist wheels
+    needs: matrix_preparation
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: ${{ matrix.build.cuda_image }}
+      options: --gpus all --rm
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Install PyTorch
+        run: eval ${{ matrix.build.torch_command }}
+
+      - name: Build
+        run: |
+          CUDA_EXT=1 pip install -v .
diff --git a/.github/workflows/release_bdist.yml b/.github/workflows/release_bdist.yml
deleted file mode 100644
index c9c51df8d074..000000000000
--- a/.github/workflows/release_bdist.yml
+++ /dev/null
@@ -1,99 +0,0 @@
-name: Release bdist wheel
-
-on:
-  workflow_dispatch:
-    inputs:
-      torch_version:
-        type: string
-        description: torch version, separated by comma
-        required: true
-        default: "all"
-      cuda_version:
-        type: string
-        description: cuda version, separated by comma
-        required: true
-      github_ref:
-        type: string
-        description: Branch or Tag
-        default: 'main'
-        required: true
-
-jobs:
-  matrix_preparation:
-    name: Prepare Container List
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-    steps:
-    - id: set-matrix
-      env:
-        TORCH_VERSIONS: ${{ inputs.torch_version }}
-        CUDA_VERSIONS: ${{ inputs.cuda_version }}
-      run: |
-        echo $TORCH_VERSIONS
-        echo $CUDA_VERSIONS
-        IFS=','
-        DOCKER_IMAGE=()
-
-        for cv in $CUDA_VERSIONS
-        do
-            DOCKER_IMAGE+=("\"hpcaitech/cuda-conda:${cv}\"")
-        done
-
-        container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
-        container="[${container}]"
-        echo "$container"
-        echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
-
-  build:
-    name: Release bdist wheels
-    needs: matrix_preparation
-    if: github.repository == 'hpcaitech/ColossalAI' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor)
-    runs-on: [self-hosted, gpu]
-    strategy:
-      fail-fast: false
-      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
-    container:
-      image: ${{ matrix.container }}
-      options: --gpus all --rm
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-      # cub is for cuda 10.2
-      - name: Copy scripts and checkout
-        run: |
-          cp -r ./.github/workflows/scripts/* ./
-
-          # link the cache diretories to current path
-          ln -s /github/home/conda_pkgs ./conda_pkgs
-          ln -s /github/home/pip_wheels ./pip_wheels
-
-          # set the conda package path
-          echo "pkgs_dirs:\n  - $PWD/conda_pkgs" > ~/.condarc
-
-          # set safe directory
-          git config --global --add safe.directory /__w/ColossalAI/ColossalAI
-
-          # check out
-          git checkout $git_ref
-
-          # get cub package for cuda 10.2
-          wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
-          unzip 1.8.0.zip
-        env:
-          git_ref: ${{ github.event.inputs.github_ref }}
-      - name: Build bdist wheel
-        run: |
-          pip install beautifulsoup4 requests packaging
-          python ./build_colossalai_wheel.py --torch_version $TORCH_VERSIONS
-        env:
-          TORCH_VERSIONS: ${{ inputs.torch_version }}
-      - name: 🚀 Deploy
-        uses: garygrossgarten/github-action-scp@release
-        with:
-          local: all_dist
-          remote: ${{ secrets.PRIVATE_PYPI_DIR }}
-          host: ${{ secrets.PRIVATE_PYPI_HOST }}
-          username: ${{ secrets.PRIVATE_PYPI_USER }}
-          password: ${{ secrets.PRIVATE_PYPI_PASSWD }}

From 719c4d5553cf676d4d8d5a4b9c12dfc592a621a5 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 17:42:15 +0800
Subject: [PATCH 261/503] [doc] updated readme for CI/CD (#2600)

---
 .github/workflows/README.md                   |  93 +++++++-------
 .github/workflows/example_check_on_pr.yml     |   2 +-
 .github/workflows/report_test_coverage.yml    |   2 +-
 .../scripts/build_colossalai_wheel.py         | 119 ------------------
 .../scripts/build_colossalai_wheel.sh         |  42 -------
 5 files changed, 50 insertions(+), 208 deletions(-)
 delete mode 100644 .github/workflows/scripts/build_colossalai_wheel.py
 delete mode 100644 .github/workflows/scripts/build_colossalai_wheel.sh

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 980f7b5701ce..3bf535343d6d 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -9,12 +9,12 @@
     - [Code Style Check](#code-style-check)
     - [Unit Test](#unit-test)
     - [Example Test](#example-test)
-      - [Dispatch Example Test](#dispatch-example-test)
+      - [Example Test on Dispatch](#example-test-on-dispatch)
     - [Compatibility Test](#compatibility-test)
-      - [Compatibility Test](#compatibility-test-1)
+      - [Compatibility Test on Dispatch](#compatibility-test-on-dispatch)
     - [Release](#release)
-      - [Release bdist wheel](#release-bdist-wheel)
     - [User Friendliness](#user-friendliness)
+    - [Community](#commmunity)
   - [Configuration](#configuration)
   - [Progress Log](#progress-log)
 
@@ -30,6 +30,9 @@ In the section below, we will dive into the details of different workflows avail
 Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow.
 I will provide the details of each workflow below.
 
+**A PR which changes the `version.txt` is considered as a release PR in the following context.**
+
+
 ### Code Style Check
 
 | Workflow Name               | File name                      | Description                                                                                                |
@@ -41,35 +44,34 @@ I will provide the details of each workflow below.
 
 | Workflow Name          | File name                  | Description                                                                                                                                       |
 | ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Build`                | `build.yml`                | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
-| `Build on 8 GPUs`      | `build_gpu_8.yml`          | This workflow will run the unit tests everyday with 8 GPUs.                                                                                       |
+| `Build on PR`          | `build_on_pr.yml`          | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
+| `Build on Schedule`    | `build_on_schedule.yml`    | This workflow will run the unit tests everyday with 8 GPUs. The result is sent to Lark.                                                           |
 | `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results when `Build` is done.                                                           |
 
 ### Example Test
 
-| Workflow Name              | File name                       | Description                                                                 |
-| -------------------------- | ------------------------------- | --------------------------------------------------------------------------- |
-| `Test example on PR`       | `example_check_on_pr.yml`       | The example will be automatically tested if its files are changed in the PR |
-| `Test example on Schedule` | `example_check_on_schedule.yml` | This workflow will test all examples every Sunday                           |
-| `Example Test on Dispatch` | `example_check_on_dispatch.yml` | Manually test a specified example.                                          |
+| Workflow Name              | File name                       | Description                                                                    |
+| -------------------------- | ------------------------------- | ------------------------------------------------------------------------------ |
+| `Test example on PR`       | `example_check_on_pr.yml`       | The example will be automatically tested if its files are changed in the PR    |
+| `Test example on Schedule` | `example_check_on_schedule.yml` | This workflow will test all examples every Sunday. The result is sent to Lark. |
+| `Example Test on Dispatch` | `example_check_on_dispatch.yml` | Manually test a specified example.                                             |
 
-#### Dispatch Example Test
+#### Example Test on Dispatch
 
-parameters:
-- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
+This workflow is triggered by manually dispatching the workflow. It has the following input parameters:
+- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
 
 ### Compatibility Test
 
-| Workflow Name                | File name                        | Description                                                                                                                   |
-| ---------------------------- | -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| `Compatibility Test`         | `auto_compatibility_test.yml`    | This workflow will check the compatiblity of Colossal-AI against PyTorch and CUDA specified in `.compatibility` every Sunday. |
-| `Auto Compatibility Test`    | `auto_compatibility_test.yml`    | Check Colossal-AI's compatiblity when `version.txt` is changed in a PR.                                                       |
-| `Dispatch Compatiblity Test` | `dispatch_compatiblity_test.yml` | Test PyTorch and Python Compatibility.                                                                                        |
-
+| Workflow Name                    | File name                            | Description                                                                                                          |
+| -------------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------------------- |
+| `Compatibility Test on PR`       | `compatibility_test_on_pr.yml`       | Check Colossal-AI's compatibility when `version.txt` is changed in a PR.                                             |
+| `Compatibility Test on Schedule` | `compatibility_test_on_schedule.yml` | This workflow checks the compatibility of Colossal-AI against PyTorch specified in `.compatibility` every Sunday.    |
+| `Compatiblity Test on Dispatch`  | `compatibility_test_on_dispatch.yml` | Test PyTorch Compatibility manually.                                                                                 |
 
-#### Compatibility Test
 
-Parameters:
+#### Compatibility Test on Dispatch
+This workflow is triggered by manually dispatching the workflow. It has the following input parameters:
 - `torch version`:torch version to test against, multiple versions are supported but must be separated by comma. The default is value is all, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels).
 - `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
 
@@ -78,24 +80,16 @@ Parameters:
 
 ### Release
 
-| Workflow Name               | File name                       | Description                                                                                                                                                 |
-| --------------------------- | ------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Draft GitHub Release Post` | `draft_github_release_post.yml` | Compose a GitHub release post draft based on the commit history.  Triggered when the change of `version.txt` is merged.                                     |
-| `Release to PyPI`           | `release_pypi.yml`              | Build and release the wheel to PyPI.  Triggered when the change of `version.txt` is merged.                                                                 |
-| `Release Nightly to PyPI`   | `release_nightly.yml`           | Build and release the nightly wheel to PyPI as `colossalai-nightly`. Automatically executed every Sunday.                                                   |
-| `Release Docker`            | `release_docker.yml`            | Build and release the Docker image to DockerHub. Triggered when the change of `version.txt` is merged.                                                      |
-| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section.                                           |
-| `Auto Release bdist wheel`  | `auto_release_bdist.yml`        | Build binary wheels with pre-built PyTorch extensions.Triggered when the change of `version.txt` is merged. Build specificatons are stored in `.bdist.json` |
-| `Release bdist wheel`       | `release_bdist.yml`             | Build binary wheels with pre-built PyTorch extensions.                                                                                                      |
+| Workflow Name                                   | File name                                   | Description                                                                                                   |
+| ----------------------------------------------- | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| `Draft GitHub Release Post`                     | `draft_github_release_post_after_merge.yml` | Compose a GitHub release post draft based on the commit history when a release PR is merged.                  |
+| `Publish to PyPI`                               | `release_pypi_after_merge.yml`              | Build and release the wheel to PyPI when a release PR is merged. The result is sent to Lark.                  |
+| `Publish Nightly Version to PyPI`               | `release_nightly_on_schedule.yml`           | Build and release the nightly wheel to PyPI as `colossalai-nightly` every Sunday. The result is sent to Lark. |
+| `Publish Docker Image to DockerHub after Merge` | `release_docker_after_merge.yml`            | Build and release the Docker image to DockerHub when a release PR is merged.  The result is sent to Lark.     |
+| `Check CUDA Extension Build Before Merge`       | `cuda_ext_check_before_merge.yml`           | Build CUDA extensions with different CUDA versions when a release PR is created.                              |
+| `Publish to Test-PyPI Before Merge`             | `release_test_pypi_before_merge.yml`        | Release to test-pypi to simulate user installation when a release PR is created.                              |
 
 
-#### Release bdist wheel
-
-Parameters:
-- `torch version`:torch version to test against, multiple versions are supported but must be separated by comma. The default is value is all, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels) which is regularly updated.
-- `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
-- `ref`: input the branch or tag name to build the wheel for this ref.
-
 ### User Friendliness
 
 | Workflow Name           | File name               | Description                                                                                                                            |
@@ -104,6 +98,11 @@ Parameters:
 | `Synchronize submodule` | `submodule.yml`         | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers.                 |
 | `Close inactive issues` | `close_inactive.yml`    | This workflow will close issues which are stale for 14 days.                                                                           |
 
+### Commmunity
+
+| Workflow Name                                | File name                        | Description                                                                      |
+| -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- |
+| `Generate Community Report and Send to Lark` | `report_leaderboard_to_lark.yml` | Collect contribution and user engagement stats and share with Lark every Friday. |
 
 ## Configuration
 
@@ -113,15 +112,15 @@ This section lists the files used to configure the workflow.
 
 This `.compatibility` file is to tell GitHub Actions which PyTorch and CUDA versions to test against. Each line in the file is in the format `${torch-version}-${cuda-version}`, which is a tag for Docker image. Thus, this tag must be present in the [docker registry](https://hub.docker.com/r/pytorch/conda-cuda) so as to perform the test.
 
-2. `.bdist.json`
+2. `.cuda_ext.json`
 
-This file controls what pytorch/cuda compatible pre-built releases will be built and published. You can add a new entry according to the json schema below if there is a new wheel that needs to be built with AOT compilation of PyTorch extensions.
+This file controls which CUDA versions the CUDA extension build is checked against. You can add a new entry according to the json schema below to check the AOT build of PyTorch extensions before release.
 
 ```json
 {
   "build": [
     {
-      "torch_version": "",
+      "torch_command": "",
       "cuda_image": ""
     },
   ]
@@ -130,26 +129,30 @@ This file controls what pytorch/cuda compatible pre-built releases will be built
 
 ## Progress Log
 
+- [x] Code style check
+  - [x] pre-commit check
+  - [x] pre-commit failure report
 - [x] unit testing
   - [x] test on PR
   - [x] report test coverage
   - [x] regular test
 - [x] release
-  - [x] official release
+  - [x] pypi release
+  - [x] test-pypi simulation
   - [x] nightly build
-  - [x] binary build
   - [x] docker build
   - [x] draft release post
-- [x] pre-commit
-  - [x] check on PR
-  - [x] report failure
 - [x] example check
   - [x] check on PR
   - [x] regular check
   - [x] manual dispatch
 - [x] compatiblity check
+  - [x] check on PR
   - [x] manual dispatch
   - [x] auto test when release
+- [x] community
+  - [x] contribution report
+  - [x] user engagement report
 - [x] helpers
   - [x] comment translation
   - [x] submodule update
diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
index ebc2a277c1de..b22664ee47cc 100644
--- a/.github/workflows/example_check_on_pr.yml
+++ b/.github/workflows/example_check_on_pr.yml
@@ -48,7 +48,7 @@ jobs:
           res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
           echo "All changed examples are $res"
 
-          if [ "$res" = "[]" ]; then
+          if [ "$res" == "[]" ]; then
             echo "anyChanged=false" >> $GITHUB_OUTPUT
             echo "matrix=null" >> $GITHUB_OUTPUT
           else
diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
index c58527361181..d46e130839a5 100644
--- a/.github/workflows/report_test_coverage.yml
+++ b/.github/workflows/report_test_coverage.yml
@@ -42,7 +42,7 @@ jobs:
           fi
 
       - name: Make Coverage Report Collapsable
-        if: steps.unzip.outputs.hasReport == "true"
+        if: steps.unzip.outputs.hasReport == 'true'
         run: |
           covNum=$(cat cov_number)
           title="The code coverage for the changed files is ${covNum}%."
diff --git a/.github/workflows/scripts/build_colossalai_wheel.py b/.github/workflows/scripts/build_colossalai_wheel.py
deleted file mode 100644
index a9ac16fbc94a..000000000000
--- a/.github/workflows/scripts/build_colossalai_wheel.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import argparse
-import os
-import subprocess
-from filecmp import cmp
-from functools import cmp_to_key
-
-import requests
-from bs4 import BeautifulSoup
-from packaging import version
-
-WHEEL_TEXT_ROOT_URL = 'https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels'
-RAW_TEXT_FILE_PREFIX = 'https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/torch_build/torch_wheels'
-CUDA_HOME = os.environ['CUDA_HOME']
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--torch_version', type=str)
-    parser.add_argument(
-        '--nightly',
-        action='store_true',
-        help=
-        'whether this build is for nightly release, if True, will only build on the latest PyTorch version and Python 3.8'
-    )
-    return parser.parse_args()
-
-
-def get_cuda_bare_metal_version():
-    raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
-
-    return bare_metal_major, bare_metal_minor
-
-
-def all_wheel_info():
-    page_text = requests.get(WHEEL_TEXT_ROOT_URL).text
-    soup = BeautifulSoup(page_text)
-
-    all_a_links = soup.find_all('a')
-
-    wheel_info = dict()
-
-    for a_link in all_a_links:
-        if 'cuda' in a_link.text and '.txt' in a_link.text:
-            filename = a_link.text
-            torch_version, cuda_version = filename.rstrip('.txt').split('-')
-            cuda_version = cuda_version.lstrip('cuda')
-
-            if torch_version not in wheel_info:
-                wheel_info[torch_version] = dict()
-            wheel_info[torch_version][cuda_version] = dict()
-
-            file_text = requests.get(f'{RAW_TEXT_FILE_PREFIX}/{filename}').text
-            lines = file_text.strip().split('\n')
-
-            for line in lines:
-                parts = line.split('\t')
-                method, url, python_version = parts[:3]
-
-                if len(parts) > 3:
-                    flags = parts[3]
-                    flags = ' '.join(flags.split('+'))
-                else:
-                    flags = ''
-                wheel_info[torch_version][cuda_version][python_version] = dict(method=method, url=url, flags=flags)
-    return wheel_info
-
-
-def build_colossalai(wheel_info):
-    cuda_version_major, cuda_version_minor = get_cuda_bare_metal_version()
-    cuda_version_on_host = f'{cuda_version_major}.{cuda_version_minor}'
-
-    for torch_version, cuda_versioned_wheel_info in wheel_info.items():
-        for cuda_version, python_versioned_wheel_info in cuda_versioned_wheel_info.items():
-            if cuda_version_on_host == cuda_version:
-                for python_version, wheel_info in python_versioned_wheel_info.items():
-                    url = wheel_info['url']
-                    method = wheel_info['method']
-                    flags = wheel_info['flags']
-                    filename = url.split('/')[-1].replace('%2B', '+')
-                    cmd = f'bash ./build_colossalai_wheel.sh {method} {url} {filename} {cuda_version} {python_version} {torch_version} {flags}'
-                    os.system(cmd)
-
-
-def main():
-    args = parse_args()
-    wheel_info = all_wheel_info()
-
-    # filter wheels on condition
-    all_torch_versions = list(wheel_info.keys())
-
-    def _compare_version(a, b):
-        if version.parse(a) > version.parse(b):
-            return 1
-        else:
-            return -1
-
-    all_torch_versions.sort(key=cmp_to_key(_compare_version))
-
-    if args.nightly:
-        # only keep the latest version
-        for key in all_torch_versions[:-1]:
-            wheel_info.pop(key)
-    elif args.torch_version != 'all':
-        torch_versions = args.torch_version.split(',')
-        # only keep the torch versions specified
-        for key in all_torch_versions:
-            if key not in torch_versions:
-                wheel_info.pop(key)
-
-    build_colossalai(wheel_info)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/.github/workflows/scripts/build_colossalai_wheel.sh b/.github/workflows/scripts/build_colossalai_wheel.sh
deleted file mode 100644
index c0d40fd2cc99..000000000000
--- a/.github/workflows/scripts/build_colossalai_wheel.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-
-method=${1}
-url=${2}
-filename=${3}
-cuda_version=${4}
-python_version=${5}
-torch_version=${6}
-flags=${@:7}
-
-git reset --hard HEAD
-mkdir -p ./all_dist
-source activate base
-conda create -n $python_version -y python=$python_version
-source activate $python_version
-
-if [ $1 == "pip" ]
-then
-    wget -nc -q -O ./pip_wheels/$filename $url
-    pip install ./pip_wheels/$filename
-
-elif [ $1 == 'conda' ]
-then
-    conda install pytorch==$torch_version cudatoolkit=$cuda_version $flags
-else
-    echo Invalid installation method
-    exit
-fi
-
-if [ $cuda_version == "10.2" ]
-then
-    cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
-fi
-
-python setup.py bdist_wheel
-mv ./dist/* ./all_dist
-# must remove build to enable compilation for
-# cuda extension in the next build
-rm -rf ./build
-python setup.py clean
-conda deactivate
-conda env remove -n $python_version

From f7458d3ec7fa216b94dad7b9f10c6670a3252d46 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 20:46:18 +0800
Subject: [PATCH 262/503] [release] v0.2.1 (#2602)

* [release] v0.2.1

* polish code
---
 .github/workflows/release_test_pypi_before_merge.yml | 2 +-
 version.txt                                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release_test_pypi_before_merge.yml b/.github/workflows/release_test_pypi_before_merge.yml
index f35a8aad0b62..49c626265175 100644
--- a/.github/workflows/release_test_pypi_before_merge.yml
+++ b/.github/workflows/release_test_pypi_before_merge.yml
@@ -39,7 +39,7 @@ jobs:
         verbose: true
 
     - name: Wait for Test-PyPI refresh
-      run: sleep 60s
+      run: sleep 300s
       shell: bash
 
     - name: Try installation
diff --git a/version.txt b/version.txt
index 0ea3a944b399..0c62199f16ac 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.0
+0.2.1

From f566b0ce6b96c0045322c2a7623013ec0c23b6dc Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 6 Feb 2023 21:40:19 +0800
Subject: [PATCH 263/503] [workflow] fixed broken release workflows (#2604)

---
 .github/workflows/release_docker_after_merge.yml    |  5 ++---
 .github/workflows/release_pypi_after_merge.yml      |  6 +++---
 .github/workflows/scripts/generate_release_draft.py | 12 ++++++++++--
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/release_docker_after_merge.yml b/.github/workflows/release_docker_after_merge.yml
index dbb38208e720..607c19b05472 100644
--- a/.github/workflows/release_docker_after_merge.yml
+++ b/.github/workflows/release_docker_after_merge.yml
@@ -59,8 +59,7 @@ jobs:
         id: message-preparation
         run: |
           url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
-
-          if [ $STATUS == 'success' ]
+          if [ "$STATUS" == 'success' ]
           then
             msg="The Docker image for the latest release has been successfully built and pushed to DockerHub."
           else
@@ -73,4 +72,4 @@ jobs:
           REPO: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
           WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
-          STATUS: ${{ steps.docker-push.outcome }}
+          STATUS: ${{ needs.release.result }}
diff --git a/.github/workflows/release_pypi_after_merge.yml b/.github/workflows/release_pypi_after_merge.yml
index 7fa4bdd03cc1..b987b4397c17 100644
--- a/.github/workflows/release_pypi_after_merge.yml
+++ b/.github/workflows/release_pypi_after_merge.yml
@@ -34,7 +34,7 @@ jobs:
 
   notify:
     name: Notify Lark via webhook
-    needs: release
+    needs: build-n-publish
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
@@ -52,7 +52,7 @@ jobs:
         run: |
           url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
 
-          if [ $STATUS == 'success' ]
+          if [ "$STATUS" == 'success' ]
           then
             msg="The Colossal-AI latest version has been successfully released to PyPI."
           else
@@ -65,4 +65,4 @@ jobs:
           REPO: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
           WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
-          STATUS: ${{ steps.publish.outcome }}
+          STATUS: ${{ needs.build-n-publish.result }}
diff --git a/.github/workflows/scripts/generate_release_draft.py b/.github/workflows/scripts/generate_release_draft.py
index 1c407cf14554..dc592e4c977b 100644
--- a/.github/workflows/scripts/generate_release_draft.py
+++ b/.github/workflows/scripts/generate_release_draft.py
@@ -57,7 +57,12 @@ def collate_release_info(commit_info_list):
 
     for commit_info in commit_info_list:
         author = commit_info['commit']['author']['name']
-        author_url = commit_info['author']['url']
+
+        try:
+            author_url = commit_info['author']['url']
+        except:
+            # author can be None
+            author_url = None
         msg = commit_info['commit']['message']
         match = re.search(pattern, msg)
 
@@ -86,7 +91,10 @@ def generate_release_post_markdown(current_version, last_version, release_info):
             # only keep the first line
             msg = msg.split('\n')[0]
 
-            item = f'{msg} by [{author}]({author_url})\n'
+            if author_url:
+                item = f'{msg} by [{author}]({author_url})\n'
+            else:
+                item = f'{msg} by {author}\n'
             text.append(f'- {item}')
 
         text.append('\n')

From ae86be1fd27a980ba04d694991af5ff2e1c4e28d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 7 Feb 2023 09:33:27 +0800
Subject: [PATCH 264/503] Automated submodule synchronization (#2607)

Co-authored-by: github-actions <github-actions@github.com>
---
 examples/tutorial/fastfold/FastFold | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tutorial/fastfold/FastFold b/examples/tutorial/fastfold/FastFold
index 19ce840650fd..95150c384b9b 160000
--- a/examples/tutorial/fastfold/FastFold
+++ b/examples/tutorial/fastfold/FastFold
@@ -1 +1 @@
-Subproject commit 19ce840650fd865bd3684684dac051ec3a7bc762
+Subproject commit 95150c384b9b6e776cad38dd91494e74115dc4ac

From b3973b995a20f46af15b7a51b6d5c92427c4e2bc Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Feb 2023 11:02:56 +0800
Subject: [PATCH 265/503] [workflow] fixed test coverage report (#2611)

---
 .github/workflows/report_test_coverage.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
index d46e130839a5..0efb7d85782a 100644
--- a/.github/workflows/report_test_coverage.yml
+++ b/.github/workflows/report_test_coverage.yml
@@ -2,7 +2,7 @@ name: Report Test Coverage
 
 on:
   workflow_run:
-    workflows: [Build]
+    workflows: [Build on PR]
     types:
       - completed
 
@@ -56,7 +56,7 @@ jobs:
           echo "</details>" >> coverage.txt
 
       - name: 'Comment on PR'
-        if: steps.unzip.outputs.hasReport == "true"
+        if: steps.unzip.outputs.hasReport == 'true'
         uses: actions/github-script@v6
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}

From aa7e9e4794397082a89149e8aa6d0689f971b94e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Feb 2023 11:50:53 +0800
Subject: [PATCH 266/503] [workflow] fixed the test coverage report (#2614)

* [workflow] fixed the test coverage report

* polish code
---
 .github/workflows/report_test_coverage.yml | 19 +++++++++++--------
 test.sh                                    |  6 ------
 2 files changed, 11 insertions(+), 14 deletions(-)
 delete mode 100644 test.sh

diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
index 0efb7d85782a..bbada74e6850 100644
--- a/.github/workflows/report_test_coverage.yml
+++ b/.github/workflows/report_test_coverage.yml
@@ -46,14 +46,17 @@ jobs:
         run: |
           covNum=$(cat cov_number)
           title="The code coverage for the changed files is ${covNum}%."
-          (echo $title; cat coverage.txt) > coverage_tmp.txt
-          mv coverage_tmp.txt coverage.txt
-          sed -i '2 i <details>' coverage.txt
-          sed -i '3 i <summary>Click me to view the complete report</summary>' coverage.txt
-          sed -i '4 i \n' coverage.txt
-          sed -i '5 i \`\`\`text' coverage.txt
-          echo "\`\`\`" >> coverage.txt
-          echo "</details>" >> coverage.txt
+          touch coverage_report.txt
+          echo $title >> coverage_report.txt
+          echo " " >> coverage_report.txt
+          echo "<details>" >> coverage_report.txt
+          echo "<summary>Click me to view the complete report</summary>" >> coverage_report.txt
+          echo " " >> coverage_report.txt
+          echo "\`\`\`" >> coverage_report.txt
+          cat coverage.txt >> coverage_report.txt
+          echo "\`\`\`" >> coverage_report.txt
+          echo "</details>" >> coverage_report.txt
+          mv coverage_report.txt coverage.txt
 
       - name: 'Comment on PR'
         if: steps.unzip.outputs.hasReport == 'true'
diff --git a/test.sh b/test.sh
deleted file mode 100644
index 8dcecc6ddc55..000000000000
--- a/test.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-avai=true
-for i in $(seq 0 7);
-do
-  gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
-  [ "$gpu_used" -le "10000" ] && avai=false
-done

From 8518263b80d2c512f475e11a2ba88091861a6c46 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Feb 2023 13:49:38 +0800
Subject: [PATCH 267/503] [test] fixed the triton version for testing (#2608)

---
 .github/workflows/build_on_pr.yml   | 1 +
 colossalai/amp/apex_amp/apex_amp.py | 1 +
 requirements/requirements-test.txt  | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 82b671acea93..c7882db6ec61 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -52,6 +52,7 @@ jobs:
             **/*.h
             **/*.cpp
             **/*.cu
+            **/*.txt
 
       - name: List changed files
         run: |
diff --git a/colossalai/amp/apex_amp/apex_amp.py b/colossalai/amp/apex_amp/apex_amp.py
index 69a4e348e5a7..e6bdbe4520f9 100644
--- a/colossalai/amp/apex_amp/apex_amp.py
+++ b/colossalai/amp/apex_amp/apex_amp.py
@@ -2,6 +2,7 @@
 # -*- encoding: utf-8 -*-
 
 import torch.nn as nn
+
 try:
     import apex.amp as apex_amp
 except ImportError:
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 9ef0a682b6b8..93055cd12109 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -9,5 +9,5 @@ torchaudio
 torchrec==0.2.0
 contexttimer
 einops
-triton==2.0.0.dev20221011
+triton==2.0.0.dev20221202
 git+https://github.com/HazyResearch/flash-attention.git@c422fee3776eb3ea24e011ef641fd5fbeb212623#egg=flash_attn

From 93fdd35b5efc94ba129d7d22c0e6243668c4d00e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Feb 2023 14:36:34 +0800
Subject: [PATCH 268/503] [build] fixed the doc build process (#2618)

---
 requirements/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index cc99257a93e5..8e619ac24477 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -8,3 +8,4 @@ click
 fabric
 contexttimer
 ninja
+torch

From 0556f5d468c1b32a8238692c397252a85e029a75 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Tue, 7 Feb 2023 15:14:51 +0800
Subject: [PATCH 269/503] [tutorial] add video link (#2619)

---
 examples/tutorial/README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
index 633e2f5a7c96..9de1cdfdc31d 100644
--- a/examples/tutorial/README.md
+++ b/examples/tutorial/README.md
@@ -20,13 +20,13 @@ quickly deploy large AI model training and inference, reducing large AI model tr
 
 ## Table of Content
 
- - Multi-dimensional Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/hybrid_parallel)
- - Sequence Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/sequence_parallel)
- - Large Batch Training Optimization [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/large_batch_optimizer)
- - Automatic Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/auto_parallel)
- - Fine-tuning and Inference for OPT [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/opt)
- - Optimized AlphaFold [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/fastfold)
- - Optimized Stable Diffusion [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion)
+ - Multi-dimensional Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/hybrid_parallel) [[video]](https://www.youtube.com/watch?v=OwUQKdA2Icc)
+ - Sequence Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/sequence_parallel) [[video]](https://www.youtube.com/watch?v=HLLVKb7Cszs)
+ - Large Batch Training Optimization [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/large_batch_optimizer) [[video]](https://www.youtube.com/watch?v=9Un0ktxJZbI)
+ - Automatic Parallelism [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/auto_parallel) [[video]](https://www.youtube.com/watch?v=_-2jlyidxqE)
+ - Fine-tuning and Inference for OPT [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/opt) [[video]](https://www.youtube.com/watch?v=jbEFNVzl67Y)
+ - Optimized AlphaFold [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/fastfold) [[video]](https://www.youtube.com/watch?v=-zP13LfJP7w)
+ - Optimized Stable Diffusion [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion) [[video]](https://www.youtube.com/watch?v=8KHeUjjc-XQ)
 
 
 ## Discussion
@@ -37,7 +37,7 @@ If you think there is a need to discuss anything, you may jump to our [Slack](ht
 If you encounter any problem while running these tutorials, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 ## 🛠️ Setup environment
-You should use `conda` to create a virtual environment, we recommend **python 3.8**, e.g. `conda create -n colossal python=3.8`. This installation commands are for CUDA 11.3, if you have a different version of CUDA, please download PyTorch and Colossal-AI accordingly.
+[[video]](https://www.youtube.com/watch?v=dpMYj974ZIc) You should use `conda` to create a virtual environment, we recommend **python 3.8**, e.g. `conda create -n colossal python=3.8`. This installation commands are for CUDA 11.3, if you have a different version of CUDA, please download PyTorch and Colossal-AI accordingly.
 
 ```
 # install torch

From 291b05117116bc0fa327e44b9bd284482ac1b703 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Feb 2023 16:15:17 +0800
Subject: [PATCH 270/503] [doc] fixed broken badge (#2623)

---
 README-zh-Hans.md | 2 +-
 README.md         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index b4a73e639889..1af9b0af1115 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -11,7 +11,7 @@
    <a href="https://github.com/hpcaitech/ColossalAI/discussions"> 论坛 </a> |
    <a href="https://medium.com/@hpcaitech"> 博客 </a></h3>
 
-   [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml)
+   [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
    [![Documentation](https://readthedocs.org/projects/colossalai/badge/?version=latest)](https://colossalai.readthedocs.io/en/latest/?badge=latest)
    [![CodeFactor](https://www.codefactor.io/repository/github/hpcaitech/colossalai/badge)](https://www.codefactor.io/repository/github/hpcaitech/colossalai)
    [![HuggingFace badge](https://img.shields.io/badge/%F0%9F%A4%97HuggingFace-Join-yellow)](https://huggingface.co/hpcai-tech)
diff --git a/README.md b/README.md
index d10184a6e15b..96debaf5c7e9 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
    <a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
    <a href="https://medium.com/@hpcaitech"> Blog </a></h3>
 
-   [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml)
+   [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
    [![Documentation](https://readthedocs.org/projects/colossalai/badge/?version=latest)](https://colossalai.readthedocs.io/en/latest/?badge=latest)
    [![CodeFactor](https://www.codefactor.io/repository/github/hpcaitech/colossalai/badge)](https://www.codefactor.io/repository/github/hpcaitech/colossalai)
    [![HuggingFace badge](https://img.shields.io/badge/%F0%9F%A4%97HuggingFace-Join-yellow)](https://huggingface.co/hpcai-tech)
@@ -114,7 +114,7 @@ distributed training and inference in a few lines.
 
 - Inference
   - [Energon-AI](https://github.com/hpcaitech/EnergonAI)
-  
+
 <p align="right">(<a href="#top">back to top</a>)</p>
 
 ## Parallel Training Demo

From 6ba83648817a117ab8e5b1e54b41cc7c8b749333 Mon Sep 17 00:00:00 2001
From: oahzxl <43881818+oahzxl@users.noreply.github.com>
Date: Tue, 7 Feb 2023 16:32:45 +0800
Subject: [PATCH 271/503] [autochunk] support diffusion for autochunk (#2621)

* add alphafold benchmark

* rename alphafold test

* rename tests

* rename diffuser

* rename

* rename

* update transformer

* update benchmark

* update benchmark

* update bench memory

* update transformer benchmark

* rename

* support diffuser

* support unet metainfo prop

* fix bug and simplify code

* update linear and support some op

* optimize max region search, support conv

* update unet test

* support some op

* support groupnorm and interpolate

* update flow search

* add fix dim in node flow

* fix utils

* rename

* support diffusion

* update diffuser

* update chunk search

* optimize imports

* import

* finish autochunk
---
 colossalai/autochunk/autochunk_codegen.py     |  50 +++--
 colossalai/autochunk/search_chunk.py          |  52 +++--
 colossalai/autochunk/trace_flow.py            |  51 ++---
 colossalai/autochunk/trace_indice.py          | 178 +++++++++++-------
 .../test_autochunk_diffuser_utils.py          |  32 +++-
 .../test_autochunk_unet.py                    |  19 +-
 6 files changed, 216 insertions(+), 166 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 82937db9f6ba..90bde8730052 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -9,18 +9,7 @@
 AUTOCHUNK_AVAILABLE = CODEGEN_AVAILABLE and is_compatible_with_meta()
 
 if AUTOCHUNK_AVAILABLE:
-    from torch.fx.graph import (
-        CodeGen,
-        PythonCode,
-        _custom_builtins,
-        _CustomBuiltin,
-        _format_target,
-        _is_from_torch,
-        _Namespace,
-        _origin_type_map,
-        inplace_methods,
-        magic_methods,
-    )
+    from torch.fx.graph import CodeGen, PythonCode, _custom_builtins, _CustomBuiltin, _format_target, _is_from_torch, _Namespace, _origin_type_map, inplace_methods, magic_methods
 
 from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
 
@@ -143,7 +132,7 @@ def _replace_reshape_size(context: str, node_name: str, reshape_size_dict: Dict)
     return context
 
 
-def _replace_ones_like(
+def _replace_new_tensor_like_shape(
     search_chunk: SearchChunk,
     chunk_infos: List[Dict],
     region_idx: int,
@@ -154,7 +143,7 @@ def _replace_ones_like(
     """
     add chunk slice for new tensor op such as ones like
     """
-    if "ones_like" in node.name:
+    if get_node_name(node) in ["ones_like", "zeros_like", "empty_like"]:
         meta_node = search_chunk.node_mgr.get_node_by_idx(node_idx)
         chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"]
         if get_node_shape(meta_node)[chunk_dim] != 1:
@@ -166,6 +155,33 @@ def _replace_ones_like(
     return body
 
 
+def _replace_new_tensor_shape(
+    search_chunk: SearchChunk,
+    chunk_infos: List[Dict],
+    region_idx: int,
+    node_idx: int,
+    node: Node,
+    body: List[str],
+) -> List[str]:
+    """
+    add chunk slice for new tensor op such as ones
+    """
+    if get_node_name(node) in ["ones", "zeros", "empty"]:
+        meta_node = search_chunk.node_mgr.get_node_by_idx(node_idx)
+        chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"]
+        if chunk_dim is None:
+            return
+        if get_node_shape(meta_node)[chunk_dim] == 1:
+            return
+        origin_shape = str(node.args)
+        new_shape = list(node.args)
+        new_shape[chunk_dim] = "min(chunk_size, %d - chunk_idx)" % get_node_shape(meta_node)[chunk_dim]
+        new_shape = str(new_shape)
+        new_shape = new_shape.replace("'", "")
+        body[-1] = _replace_name(body[-1], origin_shape[1:-1], new_shape[1:-1])
+    return body
+
+
 def _add_node_slice(
     chunk_nodes: List[Node],
     region_idx: int,
@@ -265,8 +281,10 @@ def emit_code_with_chunk(
             body = _add_node_slice(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body, node)
             # replace output var with chunk var
             body = _add_node_slice(chunk_outputs, region_idx, chunk_outputs_dim, node_idx, body, node)
-            # ones like
-            body = _replace_ones_like(search_chunk, chunk_infos, region_idx, node_idx, node, body)
+            # new tensor like
+            body = _replace_new_tensor_like_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
+            # new tensor
+            body = _replace_new_tensor_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
             # reassgin reshape size
             body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"])
             body[-1] = "    " + body[-1]
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index 0278e03f78de..eb99490957aa 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -8,14 +8,7 @@
 from .select_chunk import SelectChunk
 from .trace_flow import TraceFlow
 from .trace_indice import TraceIndice
-from .utils import (
-    NodeMgr,
-    find_chunk_compute_input_and_output_nodes,
-    get_logger,
-    get_node_shape,
-    is_non_compute_node,
-    is_non_compute_node_except_placeholder,
-)
+from .utils import NodeMgr, get_logger, get_node_shape, is_non_compute_node, is_non_compute_node_except_placeholder
 
 
 class SearchChunk(object):
@@ -75,8 +68,8 @@ def _init_trace(self) -> None:
         max_chunk_region_list = []
         while True:
             max_chunk_region = self._search_max_chunk_region(active_nodes, cur_node_idx)
-            cur_node_idx = max_chunk_region[1]
-            if cur_node_idx == len(active_nodes) - 1:
+            cur_node_idx = max_chunk_region[1] + 1
+            if cur_node_idx >= len(active_nodes) - 1:
                 break
             max_chunk_region_list.append(max_chunk_region)
 
@@ -135,6 +128,7 @@ def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_
         min_active_node_num = min(active_node_num[free_var_num:])
         threshold = max(free_var_num, min_active_node_num)
 
+        # normal search
         # from peak_node to free_var
         inside_flag = False
         chunk_region_start = free_var_num
@@ -144,7 +138,6 @@ def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_
             if inside_flag and active_node_num[i] > threshold:
                 chunk_region_start = i + 1
                 break
-
         # from peak_node to len-2
         inside_flag = False
         chunk_region_end = len(active_node) - 1
@@ -155,6 +148,22 @@ def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_
                 chunk_region_end = i
                 break
 
+        # if normal search fails, use approximate search
+        if (chunk_region_end - chunk_region_start) > 250:
+            window_size = 100
+            # search min for start
+            min_num = 1e3
+            for i in range(max(peak_node_idx - window_size, 0), peak_node_idx + 1):
+                if active_node_num[i] < min_num:
+                    min_num = active_node_num[i]
+                    chunk_region_start = i
+            # search min for end
+            min_num = 1e3
+            for i in range(min(peak_node_idx + window_size, len(active_node_num) - 1), peak_node_idx - 1, -1):
+                if active_node_num[i] < min_num:
+                    min_num = active_node_num[i]
+                    chunk_region_end = i
+
         # avoid chunk regions overlap
         if chunk_regions is not None:
             for i in chunk_regions:
@@ -271,12 +280,6 @@ def _step_search(
         best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region)
         return best_chunk_region
 
-    def _stop_search(self, init_mem_peak, mem_peak):
-        sorted_init_mem_peak = sorted(init_mem_peak)
-        if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]:
-            return True
-        return False
-
     def search_region(self) -> Dict:
         """
         Search all chunk regions:
@@ -291,11 +294,7 @@ def search_region(self) -> Dict:
             get_logger().info("AutoChunk start searching chunk regions")
 
         chunk_infos = []
-        (
-            init_mem_peak,
-            _,
-            active_node,
-        ) = self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list())
+        init_mem_peak, _, active_node = self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list())
         mem_peak = init_mem_peak
 
         while True:
@@ -304,18 +303,13 @@ def search_region(self) -> Dict:
                 break
             chunk_infos.append(chunk_info)
 
-            (
-                mem_peak,
-                _,
-                active_node,
-            ) = self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list(), chunk_infos)
+            mem_peak, _, active_node = self.estimate_memory.estimate_chunk_inference_mem(
+                self.node_mgr.get_node_list(), chunk_infos)
 
             if self.print_progress:
                 get_logger().info("AutoChunk find chunk region %d = (%d, %d)" %
                                   (len(chunk_infos), chunk_info["region"][0], chunk_info["region"][1]))
 
-            if self._stop_search(init_mem_peak, mem_peak):
-                break
         if self.print_mem:
             self.print_mem = False
             self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list(),
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 11dbb266d4b4..16815215f52b 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -100,6 +100,16 @@ def _assgin_single_node_flow(
         if not (start_idx <= arg_idx < end_idx):
             return True
 
+        # get fix dim
+        arg_fix_dim = []
+        if cur_node_dim is not None:
+            for i in cur_node_fix_dim:
+                fix_dim_source = cur_node_source[i]
+                if arg_idx in fix_dim_source:
+                    arg_fix_dim.append(fix_dim_source[arg_idx][0])
+        if arg_node in all_node_info:
+            arg_fix_dim = list(set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim))
+
         # find arg dim
         if cur_node_dim is not None:
             # dim is computed
@@ -109,6 +119,9 @@ def _assgin_single_node_flow(
                 arg_dim = None
             else:
                 arg_dim = cur_node_source[cur_node_dim][arg_idx][0]
+                # chunk dim cannot be in fix dims
+                if arg_dim in arg_fix_dim:
+                    return False
                 # chunk dim should be None if shape size is 1
                 if get_node_shape(arg_node)[arg_dim] == 1:
                     arg_dim = None
@@ -120,19 +133,16 @@ def _assgin_single_node_flow(
         else:
             arg_dim = None
 
-        # get fix dim
-        arg_fix_dim = []
-        if cur_node_dim is not None:
-            for i in cur_node_fix_dim:
-                fix_dim_source = cur_node_source[i]
-                if arg_idx in fix_dim_source:
-                    arg_fix_dim.append(fix_dim_source[arg_idx][0])
+        # mark every arg dim except the chunk dim as a fix dim
+        arg_fix_dim = list(range(len(get_node_shape(arg_node))))
+        if arg_dim is not None:
+            arg_fix_dim.remove(arg_dim)
 
         # if already in node_info, arg dim must be same
         if arg_node in all_node_info:
             if all_node_info[arg_node]["chunk_dim"] != arg_dim:
                 return False
-            all_node_info[arg_node]["fix_dim"] = list(set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim))
+            all_node_info[arg_node]["fix_dim"] = arg_fix_dim
         # else add it to list
         else:
             all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim}
@@ -164,6 +174,8 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                         continue
                     if is_non_compute_node(arg):
                         continue
+                    if get_node_shape(arg) is None:
+                        continue
                     arg_list.append(arg)
                     flow_flag = self._assgin_single_node_flow(
                         arg,
@@ -180,29 +192,6 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                     if flow_flag == False:
                         return None
 
-                if len(arg_list) >= 2:
-                    # need to mark fix dim
-                    if any(i == get_node_name(cur_node) for i in ["add", "mul", "truediv", "sub", "where"]):
-                        for arg in arg_list:
-                            if get_node_shape(arg) is None:
-                                continue
-                            if not (start_idx <= self.node_mgr.find_node_idx(arg) < end_idx):
-                                continue
-                            arg_chunk_dim = all_node_info[arg]["chunk_dim"]
-                            arg_fix_dim = all_node_info[arg]["fix_dim"]
-                            arg_shape = get_node_shape(arg)
-                            # add all dim as fix dim except chunk dim
-                            for i, shape in enumerate(arg_shape):
-                                if shape != 1 and i != cur_node_chunk_dim:
-                                    if i == arg_chunk_dim:
-                                        return None
-                                    if i not in arg_fix_dim:
-                                        arg_fix_dim.append(i)
-                    elif any(i == get_node_name(cur_node)
-                             for i in ["einsum", "matmul", "view", "to", "getitem", "tensor", "type"]):
-                        pass
-                    else:
-                        raise NotImplementedError()
             cur_node_list = next_node_list
         return all_node_info
 
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index b591fa764423..1e41073d7da6 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -150,7 +150,7 @@ def _inherit_all_indice(self, node_from: Node, node_to: Node) -> None:
         for i in range(len(node_from_indice)):
             self._inherit_indice(node_from, i, node_to, i, init=True)
 
-    def _inherit_more_indice_from_node(self, node_from: Node, node_to: Node, exclude: List = None) -> None:
+    def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None:
         """
         inheirt indice from node without init
         """
@@ -308,14 +308,14 @@ def _assign_linear_indice(self, node: Node, node_idx: int) -> None:
             node (node)
             node_idx (int)
         """
-        if len(node.args) == 2:
-            _, weight = node.args
-        else:
-            _, weight, _ = node.args
-
         self._assign_indice_as_input(node, node_idx)
-        self._inherit_indice(weight, 1, node, -1)
 
+        if len(node.args) >= 2:
+            weight = node.args[1]
+            self._inherit_indice(weight, 1, node, -1)
+        else:
+            self._del_dim(node_idx, -1)
+            self._add_dim(node_idx, -1)
         self._mark_computation(node, node_idx, [-1])
 
     def _assign_addmm_indice(self, node: Node, node_idx: int) -> None:
@@ -327,13 +327,35 @@ def _assign_addmm_indice(self, node: Node, node_idx: int) -> None:
             node_idx (int)
         """
         bias, input_node, weight = node.args
-
+        assert len(get_node_shape(bias)) == 1 and len(get_node_shape(weight)) == 2
         self._assign_indice_as_input(node, node_idx, input_node)
         self._inherit_indice(weight, 1, node, -1)
-        self._inherit_indice(bias, -1, node, -1)
+        self._inherit_more_indice_from_node_with_exclude(bias, node)
 
         self._mark_computation(node, node_idx, [-1])
 
+    def _assign_baddbmm_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for baddbmm(batch add and batch matmul) op.
+        add, matmul_left, matmul_right = args
+        out = add + (matmul_left x matmul_right)
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        add, matmul_left, matmul_right = node.args
+
+        assert get_node_shape(add) == get_node_shape(node)
+        assert len(get_node_shape(matmul_left)) == len(get_node_shape(matmul_right))
+        self._assign_indice_as_input(node, node_idx, matmul_left)
+        # matmul
+        self._inherit_indice(matmul_right, -1, node, -1)
+        self._inherit_more_indice_from_node_with_exclude(matmul_right, node, [-2, -1])
+        self._mark_computation(node, node_idx, [-1])
+        # add
+        self._inherit_more_indice_from_node_with_exclude(add, node)
+
     def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for matmul op.
@@ -349,11 +371,53 @@ def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
 
         assert len(get_node_shape(matmul_left)) == len(get_node_shape(matmul_right))
         self._assign_indice_as_input(node, node_idx, matmul_left)
-        self._inherit_indice(matmul_right, -1, node, -1)
 
-        self._inherit_more_indice_from_node(matmul_right, node, [-1, -2])
+        self._inherit_indice(matmul_right, -1, node, -1)
+        self._inherit_more_indice_from_node_with_exclude(matmul_right, node, [-1, -2])
         self._mark_computation(node, node_idx, [-1])
 
+    def _assign_conv2d_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for conv2d op.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        # get conv module
+        node_targets = node.target.split(".")
+        conv_module = node.graph.owning_module
+        for i in node_targets:
+            conv_module = getattr(conv_module, i)
+        assert conv_module.dilation == (1, 1), "dilation for conv2d not implemented"
+
+        # get conv input
+        assert len(node.args) == 1
+        input_node = node.args[0]
+        assert len(get_node_shape(input_node)) == 4
+
+        # assign index
+        self._assign_indice_as_input(node, node_idx, input_node)
+        self._del_dim(node_idx, 1)
+        self._add_dim(node_idx, 1)
+        self._mark_computation(node, node_idx, [1, 2, 3])
+
+    def _assign_interpolate_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for interpolate op.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        # check interpolate arguments
+        assert node.kwargs['size'] is None
+        assert len(get_node_shape(node)) == 4
+
+        # assign index
+        self._assign_indice_as_input(node, node_idx)
+        self._mark_computation(node, node_idx, [-1, -2])
+
     def _assign_layernorm_indice(self, node, idx):
         """
         Assign indice for layernorm op.
@@ -367,6 +431,18 @@ def _assign_layernorm_indice(self, node, idx):
         self._assign_indice_as_input(node, idx)
         self._mark_computation(node, idx, [-1])
 
+    def _assign_groupnorm_indice(self, node, idx):
+        """
+        Assign indice for groupnorm op.
+
+        Args:
+            node (node)
+            idx (int)
+        """
+        assert len(get_node_shape(node)) == 4
+        self._assign_indice_as_input(node, idx)
+        self._mark_computation(node, idx, [-1, -2, -3])
+
     def _assign_elementwise_indice(self, node, idx):
         """
         Assign indice for element-wise op (eg. relu sigmoid add mul).
@@ -382,13 +458,13 @@ def _assign_elementwise_indice(self, node, idx):
         for node_in in node.args:
             if type(node_in) == type(node):
                 nodes_in.append(node_in)
-                self._inherit_more_indice_from_node(node_in, node)
+                self._inherit_more_indice_from_node_with_exclude(node_in, node)
 
     def _assgin_no_change_indice(self, node, idx):
         self._assign_indice_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) == type(node):
-                self._inherit_more_indice_from_node(node_in, node)
+                self._inherit_more_indice_from_node_with_exclude(node_in, node)
 
     def _assign_einsum_indice(self, node, idx):
         """
@@ -469,17 +545,6 @@ def _assign_unsqueeze_indice(self, node: Node, node_idx: int) -> None:
             dim_idx = list(range(len(get_node_shape(node))))[dim_idx]
         self._add_dim(node_idx, dim_idx)
 
-    def _assign_ones_like_indice(self, node: Node, node_idx: int) -> None:
-        """
-        Assign indice for oneslike op.
-        1. assign new indice for all dim
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._assign_all_indice(node, node_idx)
-
     def _assign_cat_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for cat op.
@@ -491,7 +556,7 @@ def _assign_cat_indice(self, node: Node, node_idx: int) -> None:
         nodes_in = flat_list(node.args[0])
         self._assign_indice_as_input(node, node_idx, input_node=nodes_in[0])
         for n in nodes_in[1:]:
-            self._inherit_more_indice_from_node(n, node)
+            self._inherit_more_indice_from_node_with_exclude(n, node)
         cat_dim = node.kwargs["dim"]
         self._del_dim(node_idx, cat_dim)
         self._add_dim(node_idx, cat_dim)
@@ -508,33 +573,10 @@ def _assign_sum_indice(self, node: Node, node_idx: int) -> None:
         self._add_dim(node_idx, 0)
         self._assign_indice_as_input(node, node_idx, input_node=nodes_in[0])
         for n in nodes_in[1:]:
-            self._inherit_more_indice_from_node(n, node)
+            self._inherit_more_indice_from_node_with_exclude(n, node)
         cat_dim = node.kwargs["dim"]
         self._del_dim(node_idx, cat_dim)
 
-    def _assign_arange_indice(self, node: Node, node_idx: int) -> None:
-        """
-        Assign indice for arange op.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        self._assign_all_indice(node, node_idx)
-
-    def _assign_tensor_indice(self, node: Node, node_idx: int) -> None:
-        """
-        Assign indice for tensor op.
-
-        Args:
-            node (node)
-            node_idx (int)
-        """
-        if len(get_node_shape(node)) == 0:
-            return
-        else:
-            raise NotImplementedError()
-
     def _assign_embedding_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for embedding op.
@@ -763,10 +805,10 @@ def trace_indice(self) -> None:
                     self._assign_unsqueeze_indice(node, idx)
                 elif "split" == node_name:
                     self._assign_split_indice(node, idx)
-                elif any(i == node_name for i in ["to", "contiguous", "clone", "type"]):
+                elif any(i == node_name for i in ["to", "contiguous", "clone", "type", "float"]):
                     self._assgin_no_change_indice(node, idx)
                 elif "new_ones" == node_name:
-                    self._assign_ones_like_indice(node, idx)
+                    self._assign_all_indice(node, idx)
                 elif any(i == node_name for i in ["size"]):
                     continue
                 else:
@@ -776,25 +818,15 @@ def trace_indice(self) -> None:
                     self._assign_linear_indice(node, idx)
                 elif "cat" == node_name:
                     self._assign_cat_indice(node, idx)
-                elif "matmul" == node_name:
+                elif any(n == node_name for n in ["matmul", "bmm"]):
                     self._assign_matmul_indice(node, idx)
                 elif "softmax" == node_name:
                     self._assign_softmax_indice(node, idx)
                 elif any(n == node_name for n in [
-                        "mul",
-                        "add",
-                        "sigmoid",
-                        "relu",
-                        "sub",
-                        "truediv",
-                        "pow",
-                        "dropout",
-                        "where",
-                        "tanh",
+                        "mul", "add", "sigmoid", "relu", "sub", "truediv", "pow", "dropout", "where", "tanh", "exp",
+                        "sin", "cos"
                 ]):
                     self._assign_elementwise_indice(node, idx)
-                elif "ones_like" == node_name:
-                    self._assign_ones_like_indice(node, idx)
                 elif "einsum" == node_name:
                     self._assign_einsum_indice(node, idx)
                 elif "sum" == node_name:
@@ -805,10 +837,12 @@ def trace_indice(self) -> None:
                     self._assign_getitem_indice(node, idx)
                 elif "addmm" == node_name:
                     self._assign_addmm_indice(node, idx)
-                elif "arange" == node_name:
-                    self._assign_arange_indice(node, idx)
-                elif "tensor" == node_name:
-                    self._assign_arange_indice(node, idx)
+                elif "baddbmm" == node_name:
+                    self._assign_baddbmm_indice(node, idx)
+                elif "interpolate" == node_name:
+                    self._assign_interpolate_indice(node, idx)
+                elif any(i == node_name for i in ["arange", "ones", "ones_like", "tensor", "empty"]):
+                    self._assign_all_indice(node, idx)
                 elif any(i == node_name for i in ["getattr", "eq", "_assert_is_none", "_assert", "finfo"]):
                     continue
                 else:
@@ -817,9 +851,15 @@ def trace_indice(self) -> None:
                 node_name = get_module_node_name(node)
                 if "layernorm" == node_name:
                     self._assign_layernorm_indice(node, idx)
+                elif "groupnorm" == node_name:
+                    self._assign_groupnorm_indice(node, idx)
                 elif "embedding" == node_name:
                     self._assign_embedding_indice(node, idx)
-                elif any(n == node_name for n in ["sigmoid", "dropout", "relu"]):
+                elif "linear" == node_name:
+                    self._assign_linear_indice(node, idx)
+                elif "conv2d" == node_name:
+                    self._assign_conv2d_indice(node, idx)
+                elif any(n == node_name for n in ["sigmoid", "dropout", "relu", "silu"]):
                     self._assign_elementwise_indice(node, idx)
                 else:
                     raise NotImplementedError(node_name, "module not implemented yet!")
diff --git a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py
index 0f3d22dc51e2..529250fe8f51 100644
--- a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py
+++ b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py
@@ -22,6 +22,7 @@ def assert_codegen_run(
     concrete_args: List = None,
     max_memory: int = None,
     print_mem: bool = False,
+    print_est_mem: bool = False,
     print_progress: bool = False,
     print_code: bool = False,
 ) -> List[Dict]:
@@ -35,13 +36,14 @@ def assert_codegen_run(
         meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
         concrete_args={k: v for k, v in concrete_args},
     )
+    model = model.cuda().eval()
     interp = MetaInfoProp(meta_graph)
     meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args] + [i[1] for i in concrete_args]
     interp.propagate(*meta_tensors)
     codegen = AutoChunkCodeGen(
         meta_graph,
         max_memory=max_memory,
-        print_mem=print_mem,
+        print_mem=print_est_mem,
         print_progress=print_progress,
     )
     chunks = codegen.chunk_infos
@@ -61,17 +63,29 @@ def assert_codegen_run(
     code = graph.python_code("self").src
     if print_code:
         print(code)
-    assert "chunk_result = None;  chunk_size = None;" in code
+    assert "chunk_size = None;  " in code
 
     # assert result
     inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
     model.cuda().eval()
     gm.eval()
     with torch.no_grad():
-        out_gm = gm(*inputs)
-        out_model = model(*inputs)
+        if print_mem:
+            torch.cuda.reset_peak_memory_stats()
+            now_mem_gm = torch.cuda.memory_allocated() / 1024**2
+        out_gm = gm(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        if print_mem:
+            max_mem_gm = torch.cuda.max_memory_allocated() / 1024**2
+            torch.cuda.reset_peak_memory_stats()
+            now_mem_ori = torch.cuda.memory_allocated() / 1024**2
+        out_model = model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        if print_mem:
+            max_mem_ori = torch.cuda.max_memory_allocated() / 1024**2
+            print("origin mem: %.2fMB, autochunk mem: %.2fMB" % (max_mem_ori - now_mem_ori, max_mem_gm - now_mem_gm))
+
     assert torch.allclose(out_gm["sample"], out_model["sample"],
-                          atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                          atol=1e-3), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
                               torch.abs(out_gm["sample"] - out_model["sample"]))
 
     return chunks
@@ -82,9 +96,10 @@ def run_test(
     model: Any,
     data: tuple,
     max_memory: int,
-    print_code: bool,
-    print_mem: bool,
-    print_progress: bool,
+    print_code: bool = False,
+    print_mem: bool = False,
+    print_est_mem: bool = False,
+    print_progress: bool = False,
     get_chunk_target: Any = None,
 ) -> None:
     # launch colossalai
@@ -106,6 +121,7 @@ def run_test(
         max_memory=max_memory,
         print_code=print_code,
         print_mem=print_mem,
+        print_est_mem=print_est_mem,
         print_progress=print_progress,
     )
 
diff --git a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
index 9ebe6f393b20..518c7f45124d 100644
--- a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
+++ b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
@@ -17,10 +17,9 @@
 
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
 
-BATCH_SIZE = 2
-SEQ_LENGTH = 5
-HEIGHT = 224
-WIDTH = 224
+BATCH_SIZE = 1
+HEIGHT = 448
+WIDTH = 448
 IN_CHANNELS = 3
 LATENTS_SHAPE = (BATCH_SIZE, IN_CHANNELS, HEIGHT // 7, WIDTH // 7)
 
@@ -34,26 +33,19 @@ def get_data(shape: tuple) -> Tuple[List, List]:
     return meta_args, concrete_args
 
 
-@pytest.mark.skipif(
-    True,
-    reason="not implemented",
-)
 @pytest.mark.skipif(
     not (AUTOCHUNK_AVAILABLE and HAS_REPO),
     reason="torch version is lower than 1.12.0",
 )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("shape", [LATENTS_SHAPE])
-@pytest.mark.parametrize("max_memory", [64])
+@pytest.mark.parametrize("max_memory", [None])
 def test_evoformer_block(model, shape, max_memory):
     run_func = partial(
         run_test,
         max_memory=max_memory,
         model=model,
         data=get_data(shape),
-        print_code=False,
-        print_mem=False,
-        print_progress=False,
     )
     mp.spawn(run_func, nprocs=1)
 
@@ -62,9 +54,10 @@ def test_evoformer_block(model, shape, max_memory):
     run_test(
         rank=0,
         data=get_data(LATENTS_SHAPE),
-        max_memory=64,
+        max_memory=None,
         model=UNet2DModel,
         print_code=False,
         print_mem=False,
+        print_est_mem=False,
         print_progress=False,
     )

From 4ae02c4b1c9a9ac710b11a08cc166aaa90944d5b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Feb 2023 16:58:06 +0800
Subject: [PATCH 272/503] [tutorial] added energonai to opt inference
 requirements (#2625)

---
 examples/tutorial/opt/inference/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/tutorial/opt/inference/requirements.txt b/examples/tutorial/opt/inference/requirements.txt
index e6e8511e3178..966dff4746f2 100644
--- a/examples/tutorial/opt/inference/requirements.txt
+++ b/examples/tutorial/opt/inference/requirements.txt
@@ -7,3 +7,4 @@ torch>=1.10.0
 transformers==4.23.1
 uvicorn==0.19.0
 colossalai
+git+https://github.com/hpcaitech/EnergonAI@main

From 90a9fdd91d12cdfb03d4eaf88ff67a47cbe65f33 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Wed, 8 Feb 2023 11:05:31 +0800
Subject: [PATCH 273/503] [autoparallel] Patch meta information of
 `torch.matmul` (#2584)

* [autoparallel] matmul metainfo

* [auto_parallel] remove unused print

* [tests] skip test_matmul_handler when torch version is lower than 1.12.0
---
 .../meta_profiler/meta_registry/linear.py     | 235 +++++++++++++++++-
 .../node_handler/matmul_handler.py            |   4 +-
 .../tensor_shard/node_handler/node_handler.py |   9 +
 colossalai/fx/profiler/opcount.py             |  27 +-
 .../test_metainfo/test_matmul_metainfo.py     | 145 +++++++++++
 .../test_node_handler/test_matmul_handler.py  |   2 +
 6 files changed, 417 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
index 61f8fdff33a1..617375721222 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
@@ -1,3 +1,4 @@
+from functools import reduce
 from typing import Callable, Dict, List, Tuple, Union
 
 import torch
@@ -16,7 +17,7 @@
 
 from ..registry import meta_register
 
-__all__ = ['linear_meta_info']
+__all__ = ['linear_meta_info', 'matmul_meta_info']
 
 
 @meta_register.register(torch.nn.functional.linear)
@@ -170,3 +171,235 @@ def linear_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
     fwd_out = [torch.zeros_like(output_tensor, device='meta')]
 
     return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+
+
+@meta_register.register(torch.matmul)
+def matmul_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+    """torch.matmul meta info generator
+    There are several cases for torch.matmul:
+    1. Vector-vector multiplication => no temp memory, forward memory cost is 1 element (could be neglected), backward memory cost is the same
+    as two input vectors.
+    2. Matrix-vector multiplication => if the first input is matrix, no temp memory is needed, otherwise, there is a temp memory in the backward
+    phase for the transpose of the matrix. The forward memory cost is the size of output tensor, backward memory cost is the size of the two inputs; if
+    the first input is vector, the forward memory cost is the size of the output tensor, and during the backward phase, it will allocate a temp memory
+    the same size as the input matrix, and allocate memory for the gradient of two inputs.
+    3. Batched Matrix-vector multiplication => if the first input is the batched matrix, no temp memory, the forward memory cost is the size of
+    output tensor, backward memory cost is the size of the two inputs; if the second input is the batched matrix, the matmul will allocate memory for
+    the gradient of the batched matrix in the forward phase (as they create a new tensor without the former batches), so the forward memory cost is
+    the output tensor and the newly created matrix (take the same amount of memory of the input batched matrix). During the backward phase, it will
+    allocate a temp memory the same size as input batched matrix, and allocate a tensor for the gradient of the input vector. The gradient of the batched
+    matrix will be stored in the memory allocated during the forward phase.
+    4. Matrix-matrix multiplication => no temp memory, forward memory is the size of output tensor, backward memory is the size of the two inputs
+    5. Batched matrix-matrix multiplication => if the first input is the batched matrix, no temp memory, the forward memory cost is the size of the
+    output tensor and backward memory cost is the size of the two inputs; if the second input is the batched matrix, during the forward phase it will allocate
+    memory for the output and gradient of the second input, and has a temp memory the same size as the output, during the backward phase, it
+    will allocate memory for the gradient of the first input and has a temp memory which is as big as output and the second input.
+    6. Batched matrix-batched matrix multiplication => if the two inputs have the same batch dimensions, no temp memory, the forward memory cost is the size
+    of output, backward memory cost is the size of the two inputs; if the two inputs have different batch dimensions, during the forward phase it will allocate
+    memory of the expanded inputs (so that the batch dimensions could match) and the output, and during the backward phase, it has a temp memory of the size of
+    two expanded inputs, and it will allocate memory for the gradient of the two inputs and discard the expanded inputs allocated during the forward phase.
+
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: compute cost, memory cost, forward inputs, forward buffers and forward outputs
+
+    """
+    # Get input and output tensors
+    input_tensors = [args[0].data, args[1].data]
+    output_tensors = [args[-1].data]
+
+    # Check dimension
+    if all(len(tensor.shape) == 1 for tensor in input_tensors):
+        # Dot
+        fwd_compute_cost = flop_mapping[torch.ops.aten.dot.default](input_tensors, output_tensors)
+        bwd_compute_cost = flop_mapping[torch.ops.aten.mul.Tensor](input_tensors[0], output_tensors) * 2
+
+        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
+        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors), parameter=0, temp=0, buffer=0)
+
+    elif len(input_tensors[0].shape) >= 2 and len(input_tensors[1].shape) == 1:
+        # gemv case 1: matrix-vector multiplication
+        # &
+        # batched gemv case 1: batched matrix-vector multiplication
+
+        fwd_compute_cost = flop_mapping[torch.ops.aten.mv.default](
+            [input_tensors[0].reshape(-1, input_tensors[0].shape[-1]), input_tensors[1]], output_tensors)
+
+        # combine the dimensions of output
+        bwd_compute_cost = flop_mapping[torch.ops.aten.mul.Tensor](
+                           [output_tensors[0].reshape(-1), input_tensors[1]],
+                           output_tensors) + \
+                           flop_mapping[torch.ops.aten.mv.default](
+                           [input_tensors[0].reshape(-1, input_tensors[0].shape[-1]).transpose(0, 1), output_tensors[0].reshape(-1)],
+                           output_tensors)
+
+        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
+        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors), parameter=0, temp=0, buffer=0)
+
+    elif len(input_tensors[0].shape) == 1 and len(input_tensors[1].shape) == 2:
+        # gemv case 2: vector-matrix multiplication
+        fwd_compute_cost = flop_mapping[torch.ops.aten.mv.default](input_tensors, output_tensors)
+
+        bwd_compute_cost = flop_mapping[torch.ops.aten.mul.Tensor]([output_tensors[0], input_tensors[0]], output_tensors) + \
+                           flop_mapping[torch.ops.aten.mv.default]([input_tensors[1], output_tensors[0]], output_tensors)
+
+        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
+        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors),
+                                  parameter=0,
+                                  temp=activation_size(input_tensors[1]),
+                                  buffer=0)
+
+    elif len(input_tensors[0].shape) == 1 and len(input_tensors[1].shape) >= 3:
+        # batched gemv case 2: vector-batched matrix multiplication
+
+        fwd_compute_cost = flop_mapping[torch.ops.aten.mv.default](
+            [input_tensors[1].transpose(-2, -1).reshape(-1, input_tensors[1].shape[-2]), input_tensors[0]],
+            [output_tensors[0].reshape(-1)])
+
+        # combine the dimensions of output
+        bwd_compute_cost = flop_mapping[torch.ops.aten.mul.Tensor](
+                           [output_tensors[0].reshape(-1), input_tensors[0]],
+                           output_tensors
+                           ) + \
+                           flop_mapping[torch.ops.aten.mv.default](
+                           [input_tensors[1].transpose(-2, -1).reshape(-1, input_tensors[1].shape[-2]).transpose(0, 1), output_tensors[0].reshape(-1)],
+                           output_tensors
+                           )
+
+        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors + [input_tensors[1]]))
+        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors[0]),
+                                  parameter=0,
+                                  temp=activation_size(input_tensors[1]),
+                                  buffer=0)
+
+    elif len(input_tensors[0].shape) >= 2 and len(input_tensors[1].shape) == 2:
+        # gemm & batched gemm case 1: batched matrix-matrix multiplication
+
+        fwd_compute_cost = flop_mapping[torch.ops.aten.mm.default](
+            [input_tensors[0].reshape(-1, input_tensors[0].shape[-1]), input_tensors[1]],
+            [output_tensors[0].reshape(-1, output_tensors[0].shape[-1])])
+
+        bwd_compute_cost = flop_mapping[torch.ops.aten.mm.default](
+                           [input_tensors[0].reshape(-1, input_tensors[0].shape[-1]).transpose(0, 1), output_tensors[0].reshape(-1, output_tensors[0].shape[-1])],
+                           [input_tensors[1]]
+                           ) + \
+                           flop_mapping[torch.ops.aten.mm.default](
+                           [output_tensors[0].reshape(-1, output_tensors[0].shape[-1]), input_tensors[1].transpose(0, 1)],
+                           [input_tensors[0].reshape(-1, input_tensors[0].shape[-1])]
+                           )
+
+        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
+        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors), parameter=0, temp=0, buffer=0)
+
+    elif len(input_tensors[0].shape) == 2 and len(input_tensors[1].shape) >= 3:
+        # batched gemm case 2: matrix-batched matrix multiplication
+        fwd_compute_cost = flop_mapping[torch.ops.aten.mm.default]([
+            input_tensors[1].transpose(-2, -1).reshape(-1, input_tensors[1].shape[-2]), input_tensors[0].transpose(
+                0, 1)
+        ], [output_tensors[0].transpose(-2, -1)])
+
+        bwd_compute_cost = flop_mapping[torch.ops.aten.mm.default](
+                           [output_tensors[0].transpose(-2, -1).reshape(-1, output_tensors[0].shape[-2]).transpose(0, 1), input_tensors[1].transpose(-2, -1).reshape(-1, input_tensors[1].shape[-2])],
+                           [input_tensors[0]]
+                           ) + \
+                           flop_mapping[torch.ops.aten.mm.default](
+                           [output_tensors[0].transpose(-2, -1).reshape(-1, output_tensors[0].shape[-2]), input_tensors[0]],
+                           [input_tensors[1].transpose(-2, -1).reshape(-1, input_tensors[1].shape[-2])]
+                           )
+
+        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors) + activation_size(input_tensors[1]),
+                                  temp=activation_size(output_tensors))
+        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors[0]),
+                                  parameter=0,
+                                  temp=activation_size(input_tensors[1]) + activation_size(output_tensors))
+
+    elif all(len(tensor.shape) >= 3 for tensor in input_tensors):
+        # Batched matrix-batched matrix multiplication
+        # Fetch shape of the two inputs and see if the batch dimensions are the same
+        _is_batch_dims_same = True
+        if len(input_tensors[0].shape) == len(input_tensors[1].shape):
+            for (shape_0, shape_1) in zip(input_tensors[0].shape[:-2], input_tensors[1].shape[:-2]):
+                if shape_0 != shape_1:
+                    _is_batch_dims_same = False
+                    break
+        else:
+            _is_batch_dims_same = False
+
+        # retrieve dimensions
+        input_dim_00 = input_tensors[0].shape[-2]
+        input_dim_01 = input_tensors[0].shape[-1]
+        input_dim_10 = input_tensors[1].shape[-2]
+        input_dim_11 = input_tensors[1].shape[-1]
+        output_dim_0 = output_tensors[0].shape[-2]
+        output_dim_1 = output_tensors[0].shape[-1]
+
+        if _is_batch_dims_same:
+            # Case 1: batch dimensions are the same
+
+            # Forward compute cost: C = A * B
+            fwd_compute_cost = flop_mapping[torch.ops.aten.bmm.default]([
+                input_tensors[0].reshape(-1, input_dim_00, input_dim_01), input_tensors[1].reshape(
+                    -1, input_dim_10, input_dim_11)
+            ], [output_tensors[0].reshape(-1, output_dim_0, output_dim_1)])
+
+            # Backward compute cost: dB = A^T * dC, dA = dC * B^T
+            bwd_compute_cost = flop_mapping[torch.ops.aten.bmm.default](
+                               [input_tensors[0].transpose(-2, -1).reshape(-1, input_dim_01, input_dim_00), output_tensors[0].reshape(-1, output_dim_0, output_dim_1)],
+                               [input_tensors[1].reshape(-1, input_dim_11, input_dim_10)]
+                               ) + \
+                               flop_mapping[torch.ops.aten.bmm.default](
+                               [output_tensors[0].reshape(-1, output_dim_0, output_dim_1), input_tensors[1].transpose(-2, -1).reshape(-1, input_dim_11, input_dim_10)],
+                               [input_tensors[0].reshape(-1, input_dim_00, input_dim_01)]
+                               )
+
+            fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors))
+            bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors))
+
+        else:
+            # Case 2: batch dimensions are different
+            batch_dims = output_tensors[0].shape[:-2]
+            extended_input_0 = torch.rand(reduce(lambda x, y: x * y, batch_dims),
+                                          input_dim_00,
+                                          input_dim_01,
+                                          device="meta")
+            extended_input_1 = torch.rand(reduce(lambda x, y: x * y, batch_dims),
+                                          input_dim_10,
+                                          input_dim_11,
+                                          device="meta")
+
+            # Forward compute cost: C = A * B
+            fwd_compute_cost = flop_mapping[torch.ops.aten.bmm.default](
+                [extended_input_0, extended_input_1], [output_tensors[0].reshape(-1, output_dim_0, output_dim_1)])
+
+            # Backward compute cost: dB = A^T * dC, dA = dC * B^T
+            bwd_compute_cost = flop_mapping[torch.ops.aten.bmm.default](
+                               [extended_input_0.transpose(-2, -1), output_tensors[0].reshape(-1, output_dim_0, output_dim_1)],
+                               [extended_input_1]
+                               ) + \
+                               flop_mapping[torch.ops.aten.bmm.default](
+                               [output_tensors[0].reshape(-1, output_dim_0, output_dim_1), extended_input_1.transpose(-2, -1)],
+                               [extended_input_0]
+                               )
+
+            fwd_mem_cost = MemoryCost(
+                activation=activation_size([output_tensors[0], extended_input_0, extended_input_1]))
+            bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors) -
+                                      activation_size([extended_input_0, extended_input_1]),
+                                      temp=activation_size([extended_input_0, extended_input_1]))
+
+    # compute cost
+    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
+
+    # memory cost
+    total_cost = MemoryCost(activation=fwd_mem_cost.activation + bwd_mem_cost.activation,
+                            parameter=fwd_mem_cost.parameter + bwd_mem_cost.parameter,
+                            temp=fwd_mem_cost.temp + bwd_mem_cost.temp,
+                            buffer=fwd_mem_cost.buffer + bwd_mem_cost.buffer)
+
+    memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_cost)
+
+    # store fwd_in, fwd_buffer, fwd_out
+    fwd_in = input_tensors
+    fwd_buffer = []
+    fwd_out = output_tensors
+
+    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
index 131c35156dcd..f3c9d0cbf826 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
@@ -16,7 +16,7 @@
 
 from ..sharding_strategy import OperationData, OperationDataType, ShardingStrategy
 from ..utils import recover_sharding_spec_for_broadcast_shape
-from .node_handler import NodeHandler
+from .node_handler import MetaInfoNodeHandler, NodeHandler
 from .registry import operator_registry
 from .strategy import (
     BatchedMatMulStrategyGenerator,
@@ -326,7 +326,7 @@ def _get_bmm_logical_shape(input_shape, other_shape, transforms):
 
 @operator_registry.register(torch.matmul)
 @operator_registry.register(torch.Tensor.matmul)
-class MatMulHandler(NodeHandler):
+class MatMulHandler(MetaInfoNodeHandler):
     """
     The MatMulHandler is a node handler which handles the sharding strategy generation for the matmul operation.
     According to https://pytorch.org/docs/stable/generated/torch.matmul.html, the operations will vary depending on
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
index fbab2b61e5af..c6f8d035a820 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
@@ -16,6 +16,7 @@
 )
 from colossalai.auto_parallel.tensor_shard.utils import check_sharding_spec_validity
 from colossalai.device.device_mesh import DeviceMesh
+from colossalai.logging import get_dist_logger
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
 
 from .strategy import StrategyGenerator
@@ -266,6 +267,10 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
             # attach metainfos to the handler
             setattr(self, "metainfo_vector", metainfo_vector)
 
+        else:
+            logger = get_dist_logger()
+            logger.warning(f'The target function {target} is not patched yet, ')
+
         return self.strategies_vector
 
 
@@ -317,4 +322,8 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
             # attach metainfos to the handler
             setattr(self, "metainfo_vector", metainfo_vector)
 
+        else:
+            logger = get_dist_logger()
+            logger.warning(f'The target function {target} is not patched yet')
+
         return self.strategies_vector
diff --git a/colossalai/fx/profiler/opcount.py b/colossalai/fx/profiler/opcount.py
index d780ef6d49c9..6bdec865fd84 100644
--- a/colossalai/fx/profiler/opcount.py
+++ b/colossalai/fx/profiler/opcount.py
@@ -20,7 +20,28 @@ def matmul_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
     # Inputs contains the shapes of two matrices.
     input_shapes = [v.shape for v in inputs]
     assert len(input_shapes) == 2, input_shapes
-    assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes
+
+    # There are three cases: 1) gemm, 2) gemv, 3) dot
+    if all(len(shape) == 2 for shape in input_shapes):
+        # gemm
+        assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes
+    elif all(len(shape) == 1 for shape in input_shapes):
+        # dot
+        assert input_shapes[0][0] == input_shapes[1][0], input_shapes
+
+        # expand shape
+        input_shapes[0] = torch.Size([1, input_shapes[0][0]])
+        input_shapes[1] = torch.Size([input_shapes[1][0], 1])
+    else:
+        # gemv
+        if len(input_shapes[0]) == 1:
+            assert input_shapes[0][0] == input_shapes[1][-2], input_shapes
+            input_shapes.reverse()
+        else:
+            assert input_shapes[1][0] == input_shapes[0][-1], input_shapes
+
+        # expand the shape of the vector to [batch size, 1]
+        input_shapes[-1] = torch.Size([input_shapes[-1][-1], 1])
     flops = reduce(operator.mul, input_shapes[0]) * input_shapes[-1][-1]
     return flops
 
@@ -204,8 +225,10 @@ def zero_flop_jit(*args):
 
 if version.parse(torch.__version__) >= version.parse('1.12.0'):
     flop_mapping = {
-    # gemm
+    # gemm, gemv and dot
         aten.mm.default: matmul_flop_jit,
+        aten.mv.default: matmul_flop_jit,
+        aten.dot.default: matmul_flop_jit,
         aten.matmul.default: matmul_flop_jit,
         aten.addmm.default: addmm_flop_jit,
         aten.bmm.default: bmm_flop_jit,
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py
new file mode 100644
index 000000000000..3fb9c3d85d64
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py
@@ -0,0 +1,145 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+from colossalai.auto_parallel.tensor_shard.node_handler import LinearModuleHandler
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
+    MemoryCost,
+    OperationData,
+    OperationDataType,
+    ShardingStrategy,
+    StrategiesVector,
+    TrainCycleItem,
+)
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx import ColoGraphModule, ColoTracer
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy
+
+if torch.__version__ >= '1.12.0':
+    from colossalai.auto_parallel.meta_profiler import MetaInfo, meta_register
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
+@parameterize(
+    'tensor_shapes',
+    [
+        [[128], [128]],    # dot product
+        [[64, 128], [128]],    # mat-vec
+        [[128], [128, 64]],    # vec-mat
+        [[64, 64, 128], [128]],    # batched mat-vec
+        [[128], [64, 128, 64]],    # vec-batched mat
+        [[64, 128], [128, 192]],    # mat-mat
+        [[64, 64, 128], [128, 192]],    # batched mat-mat
+        [[64, 128], [64, 128, 192]],    # mat-batched mat
+        [[64, 64, 128], [64, 128, 192]],    # batched mat-batched mat (matched batch dims)
+        [[64, 1, 64, 128], [64, 128, 192]],    # batched mat-batched mat (unmatched batch dims)
+    ])
+def test_matmul_function_meta_info(tensor_shapes):
+    meta_func = meta_register.get(torch.matmul)
+
+    # construct meta tensors
+    input_tensor = torch.rand(*tensor_shapes[0], device="meta")
+    other_tensor = torch.rand(*tensor_shapes[1], device="meta")
+    output_tensor = torch.matmul(input_tensor, other_tensor)
+
+    # construct operation data
+    input_data = OperationData(
+        name="input",
+        data=input_tensor,
+        type=OperationDataType.ARG,
+        logical_shape=input_tensor.shape,
+    )
+    other_data = OperationData(
+        name="other",
+        data=other_tensor,
+        type=OperationDataType.ARG,
+        logical_shape=other_tensor.shape,
+    )
+    output_data = OperationData(
+        name="output",
+        data=output_tensor,
+        type=OperationDataType.OUTPUT,
+        logical_shape=output_tensor.shape,
+    )
+
+    # construct args and kwargs
+    args = [input_data, other_data, output_data]
+    kwargs = {'inplace': False}
+
+    # estimated results
+    compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out = meta_func(*args, **kwargs)
+
+    # actual results
+    input_real_tensor = torch.rand(*tensor_shapes[0], device="cuda:0")
+    other_real_tensor = torch.rand(*tensor_shapes[1], device="cuda:0")
+
+    input_real_tensor.requires_grad = True
+    other_real_tensor.requires_grad = True
+
+    # fwd
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    output_real_tensor = torch.matmul(input_real_tensor, other_real_tensor)
+    fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    # bwd
+    upstream_grad = torch.rand_like(output_real_tensor)
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    torch.autograd.backward(output_real_tensor, upstream_grad)
+    bwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    bwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    compute_cost: TrainCycleItem
+    memory_cost: TrainCycleItem
+
+    print("=====================")
+    print(f"input shapes: {tensor_shapes[0]}, {tensor_shapes[1]}")
+    print(f"output shapes: {output_tensor.shape}")
+
+    # estimated results
+    print("Estimated Results")
+
+    # compute cost
+    print("compute_cost:")
+    print(f"    fwd: {compute_cost.fwd}")
+    print(f"    bwd: {compute_cost.bwd}")
+
+    # memory cost
+    print("memory_cost:")
+    # fwd
+    print(f"    fwd activation: {memory_cost.fwd.activation / 1024} KB")
+    print(f"    fwd buffer: {memory_cost.fwd.buffer / 1024} KB")
+    print(f"    fwd temp: {memory_cost.fwd.temp / 1024} KB")
+    print(f"    fwd parameter: {memory_cost.fwd.parameter / 1024} KB")
+
+    # bwd
+    print(f"    bwd activation: {memory_cost.bwd.activation / 1024} KB")
+    print(f"    bwd buffer: {memory_cost.bwd.buffer / 1024} KB")
+    print(f"    bwd temp: {memory_cost.bwd.temp / 1024} KB")
+    print(f"    bwd parameter: {memory_cost.bwd.parameter / 1024} KB")
+
+    # actual results
+    print("Actual Results")
+
+    print("memory_cost:")
+    # fwd
+    print(f"    fwd allocated: {fwd_allocated / 1024} KB")
+    print(f"    fwd peak: {fwd_peak / 1024} KB")
+
+    # bwd
+    print(f"    bwd allocated: {bwd_allocated / 1024} KB")
+    print(f"    bwd peak: {bwd_peak / 1024} KB")
+
+
+if __name__ == '__main__':
+    test_matmul_function_meta_info()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_matmul_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_matmul_handler.py
index 306c45f56dbf..91b3ae27d599 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_matmul_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_matmul_handler.py
@@ -1,3 +1,4 @@
+import pytest
 import torch
 import torch.nn as nn
 
@@ -24,6 +25,7 @@ def forward(self, x1, x2):
         return torch.matmul(x1, x2)
 
 
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
 @parameterize(
     'tensor_shapes',
     [

From d3480396f81a76cc51ba7ae5a89bee9930152f1b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 8 Feb 2023 13:48:08 +0800
Subject: [PATCH 274/503] [doc] updated the sphinx theme (#2635)

---
 docs/conf.py          | 10 ++++++----
 docs/requirements.txt |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 893644f709d4..52e999f3b938 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,8 +23,7 @@
 author = 'HPC-AI Technology Inc.'
 
 # The full version, including alpha/beta/rc tags
-release = '0.0.1'
-
+# release = '0.0.1'
 
 # -- General configuration ---------------------------------------------------
 
@@ -64,14 +63,14 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = 'sphinx_book_theme'
 html_show_sourcelink = False
 html_theme_options = {
     'navigation_depth': 3,
 }
 
 html_context = {
-    'display_github': False,
+    'display_github': True,
     'github_user': 'hpcaitech',
     'github_repo': 'ColossalAI',
     #   'github_version': 'master/docs/',
@@ -90,7 +89,10 @@
 source_suffix = ['.rst', '.md', '.MD']
 
 import inspect
+
 import colossalai
+
+
 def linkcode_resolve(domain, info):
     """
     Determine the URL corresponding to Python object
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 2b3b1a25bca4..c93221495e2c 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,5 +1,5 @@
-tensorboard 
-apex 
-sphinx 
-sphinx-rtd-theme 
+tensorboard
+apex
+sphinx
 myst-parser
+sphinx-book-theme

From 292c81ed7c369a17b7bcf572c6d347a698ec818d Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Wed, 8 Feb 2023 13:50:27 +0800
Subject: [PATCH 275/503] fix/transformer-verison (#2581)

---
 examples/images/diffusion/README.md           |  2 +-
 examples/images/diffusion/environment.yaml    |  2 +-
 examples/images/diffusion/requirements.txt    |  2 +-
 .../dreambooth/train_dreambooth_colossalai.py | 37 ++++++++-----------
 4 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index bec1c7503b4e..952da5d1c3b0 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -52,7 +52,7 @@ You can also update an existing [latent diffusion](https://github.com/CompVis/la
 
 ```
 conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
-pip install transformers==4.19.2 diffusers invisible-watermark
+pip install transformers diffusers invisible-watermark
 ```
 
 #### Step 2: install lightning
diff --git a/examples/images/diffusion/environment.yaml b/examples/images/diffusion/environment.yaml
index 69904c72ea73..5164be72e556 100644
--- a/examples/images/diffusion/environment.yaml
+++ b/examples/images/diffusion/environment.yaml
@@ -18,7 +18,7 @@ dependencies:
     - test-tube>=0.7.5
     - streamlit==1.12.1
     - einops==0.3.0
-    - transformers==4.19.2
+    - transformers
     - webdataset==0.2.5
     - kornia==0.6
     - open_clip_torch==2.0.2
diff --git a/examples/images/diffusion/requirements.txt b/examples/images/diffusion/requirements.txt
index 1a9233d578ef..d0af35353b66 100644
--- a/examples/images/diffusion/requirements.txt
+++ b/examples/images/diffusion/requirements.txt
@@ -9,7 +9,7 @@ omegaconf==2.1.1
 test-tube>=0.7.5
 streamlit>=0.73.1
 einops==0.3.0
-transformers==4.19.2
+transformers
 webdataset==0.2.5
 open-clip-torch==2.7.0
 gradio==3.11
diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index 9c72c06e79fe..5c4c86bc7073 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -10,7 +10,7 @@
 import torch.utils.checkpoint
 from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
-from huggingface_hub import HfFolder, Repository, whoami
+from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from PIL import Image
 from torch.utils.data import Dataset
 from torchvision import transforms
@@ -133,9 +133,13 @@ def parse_args(input_args=None):
         default="cpu",
         help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
     )
-    parser.add_argument("--center_crop",
-                        action="store_true",
-                        help="Whether to center crop images before resizing to resolution")
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=("Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+              " cropped. The images will be resized to the resolution first before cropping."),
+    )
     parser.add_argument("--train_batch_size",
                         type=int,
                         default=4,
@@ -149,13 +153,6 @@ def parse_args(input_args=None):
         help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
     )
     parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help=
-        "Number of updates steps to accumulate before performing a backward/update pass. If using Gemini, it must be 1",
-    )
     parser.add_argument(
         "--gradient_checkpointing",
         action="store_true",
@@ -356,7 +353,6 @@ def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"):
 
 
 def main(args):
-
     if args.seed is None:
         colossalai.launch_from_torch(config={})
     else:
@@ -410,7 +406,8 @@ def main(args):
                 repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
             else:
                 repo_name = args.hub_model_id
-            repo = Repository(args.output_dir, clone_from=repo_name)
+            create_repo(repo_name, exist_ok=True, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
 
             with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                 if "step_*" not in gitignore:
@@ -469,9 +466,8 @@ def main(args):
     if args.gradient_checkpointing:
         unet.enable_gradient_checkpointing()
 
-    assert args.gradient_accumulation_steps == 1, "if using ColossalAI gradient_accumulation_steps must be set to 1."
     if args.scale_lr:
-        args.learning_rate = args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * world_size
+        args.learning_rate = args.learning_rate * args.train_batch_size * world_size
 
     unet = gemini_zero_dpp(unet, args.placement)
 
@@ -529,7 +525,7 @@ def collate_fn(examples):
 
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
         overrode_max_train_steps = True
@@ -537,8 +533,8 @@ def collate_fn(examples):
     lr_scheduler = get_scheduler(
         args.lr_scheduler,
         optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
-        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
     )
     weight_dtype = torch.float32
     if args.mixed_precision == "fp16":
@@ -553,14 +549,14 @@ def collate_fn(examples):
     text_encoder.to(get_current_device(), dtype=weight_dtype)
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
     if overrode_max_train_steps:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
     # Afterwards we recalculate our number of training epochs
     args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
     # Train!
-    total_batch_size = args.train_batch_size * world_size * args.gradient_accumulation_steps
+    total_batch_size = args.train_batch_size * world_size
 
     logger.info("***** Running training *****", ranks=[0])
     logger.info(f"  Num examples = {len(train_dataset)}", ranks=[0])
@@ -568,7 +564,6 @@ def collate_fn(examples):
     logger.info(f"  Num Epochs = {args.num_train_epochs}", ranks=[0])
     logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}", ranks=[0])
     logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0])
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}", ranks=[0])
     logger.info(f"  Total optimization steps = {args.max_train_steps}", ranks=[0])
 
     # Only show the progress bar once on each machine.

From c37556365304b425cad5c039ed4acdf9c69666ba Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 8 Feb 2023 14:39:36 +0800
Subject: [PATCH 276/503] [doc] removed pre-built wheel installation from
 readme (#2637)

---
 README-zh-Hans.md | 6 +-----
 README.md         | 5 -----
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 1af9b0af1115..1e1d475c906a 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -276,11 +276,7 @@ CUDA_EXT=1 pip install colossalai
 pip install colossalai-nightly
 ```
 
-### 从官方安装
-
-您可以访问我们[下载](https://www.colossalai.org/download)页面来安装Colossal-AI，在这个页面上发布的版本都预编译了CUDA扩展。
-
-### 从源安装
+### 从源码安装
 
 > 此文档将与版本库的主分支保持一致。如果您遇到任何问题，欢迎给我们提 issue :)
 
diff --git a/README.md b/README.md
index 96debaf5c7e9..dc1e80113517 100644
--- a/README.md
+++ b/README.md
@@ -278,11 +278,6 @@ Installation can be made via
 pip install colossalai-nightly
 ```
 
-### Download From Official Releases
-
-You can visit the [Download](https://www.colossalai.org/download) page to download Colossal-AI with pre-built PyTorch extensions.
-
-
 ### Download From Source
 
 > The version of Colossal-AI will be in line with the main branch of the repository. Feel free to raise an issue if you encounter any problem. :)

From cb3d1bef62b63eac96d976379a4930a0807e8da3 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 8 Feb 2023 15:02:12 +0800
Subject: [PATCH 277/503] [autoparallel] adapt autoparallel tests with latest
 api (#2626)

---
 .../strategy/matmul_strategy_generator.py     |  14 +-
 .../tensor_shard/solver/cost_graph.py         |   3 -
 .../test_bias_addition_forward.py             |  97 +------
 .../test_gpt/test_gpt2_performance.py         | 131 ---------
 .../test_gpt/test_runtime_with_gpt_modules.py |  56 ++--
 .../test_tensor_shard/test_metainfo/utils.py  |   2 +-
 .../test_resnet_block_runtime.py              | 270 ------------------
 .../test_shape_consistency_pass.py            |  75 ++---
 8 files changed, 62 insertions(+), 586 deletions(-)
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_gpt/test_gpt2_performance.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_resnet_block_runtime.py

diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
index 9aa95b43a966..fa2246f952a9 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
@@ -247,12 +247,12 @@ def collate_strategies(self) -> List[ShardingStrategy]:
         strategies.append(self.split_rhs_space_both_contract(1, 0))
 
         # RR= RS x SR
-        # strategies.append(self.recompute_split_both_contract(0))
-        # strategies.append(self.recompute_split_both_contract(1))
+        strategies.append(self.recompute_split_both_contract(0))
+        strategies.append(self.recompute_split_both_contract(1))
 
-        # # RS = RR x RS
-        # strategies.append(self.split_rhs_space_only(0))
-        # strategies.append(self.split_rhs_space_only(1))
+        # RS = RR x RS
+        strategies.append(self.split_rhs_space_only(0))
+        strategies.append(self.split_rhs_space_only(1))
 
         # S01R = S01R x RR
         strategies.append(self.split_lhs_1st_dim_1d(0, 1))
@@ -263,8 +263,8 @@ def collate_strategies(self) -> List[ShardingStrategy]:
         # RS01 = RR x RS01
         strategies.append(self.split_rhs_2nd_dim_1d(0, 1))
 
-        # # RR = RR x RR
-        # strategies.append(self.non_split())
+        # RR = RR x RR
+        strategies.append(self.non_split())
 
         return strategies
 
diff --git a/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py b/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
index 038e56547b96..74290453ca0c 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
@@ -62,9 +62,6 @@ def _build_cost_graph(self):
                         else:
                             edge_cost[(j, i)] = resharding_cost_item.total
                 self.edge_costs[node_pair] = edge_cost
-            # add parents and children attribute to node
-            # parent_nodes = [node for node in strategies_vector.predecessor_nodes]
-            # children_nodes = [node for node in strategies_vector.successor_nodes]
             parent_nodes = []
             children_nodes = []
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_bias_addition_forward.py b/tests/test_auto_parallel/test_tensor_shard/test_bias_addition_forward.py
index e666cb1753a7..f43885a6ac44 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_bias_addition_forward.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_bias_addition_forward.py
@@ -4,21 +4,11 @@
 import torch
 import torch.multiprocessing as mp
 
-from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
-from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationDataType
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
-)
+from colossalai.auto_parallel.tensor_shard.initialize import initialize_model
 from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx import ColoGraphModule, ColoTracer
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
-from colossalai.testing import assert_close, assert_close_loose, rerun_if_address_is_in_use
+from colossalai.testing import assert_close, rerun_if_address_is_in_use
 from colossalai.testing.pytest_wrapper import run_on_environment_flag
 from colossalai.utils import free_port
 
@@ -63,42 +53,9 @@ def check_linear_module(rank, world_size, port):
     # [[0, 1]
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-    tracer = ColoTracer()
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %linear_weight : [#users=1] = get_attr[target=linear.weight]
-    #     %linear_bias : [#users=1] = get_attr[target=linear.bias]
-    #     %linear : [#users=1] = call_function[target=torch._C._nn.linear](args = (%x, %linear_weight), kwargs = {})
-    #     %add : [#users=1] = call_function[target=operator.add](args = (%linear, %linear_bias), kwargs = {})
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%add, 2), kwargs = {})
-    #     return mul
-    graph = tracer.trace(root=model, meta_args={'x': torch.rand(4, 4).to('meta')})
-    # def forward(self, x : torch.Tensor):
-    #     linear_weight = self.linear.weight
-    #     linear_bias = self.linear.bias
-    #     linear = torch._C._nn.linear(x, linear_weight);  x = linear_weight = None
-    #     add = linear + linear_bias;  linear = linear_bias = None
-    #     mul = add * 2;  add = None
-    #     return mul
-    gm = ColoGraphModule(model, graph)
-    gm.recompile()
-    node_list = list(graph.nodes)
-
-    solver_options = SolverOptions()
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-    linear_node = node_list[3]
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    graph_analyser = GraphAnalyser(gm)
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    ret = solver.call_solver_serialized_args()
-    solution = list(ret[0])
-    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(gm, solution, device_mesh)
-
-    gm = runtime_apply_pass(gm)
-    gm.recompile()
-    output = gm(input, sharding_spec_dict, origin_spec_dict, comm_actions_dict)
+    meta_args = {'x': torch.rand(4, 4).to('meta')}
+    gm = initialize_model(model, meta_args=meta_args, device_mesh=device_mesh)
+    output = gm(input)
     assert_close(output, output_compare)
 
 
@@ -113,47 +70,9 @@ def check_conv_module(rank, world_size, port):
     # [[0, 1]
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-    tracer = ColoTracer()
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %conv_weight : [#users=1] = get_attr[target=conv.weight]
-    #     %conv_bias : [#users=1] = get_attr[target=conv.bias]
-    #     %conv2d : [#users=1] = call_function[target=torch.conv2d](args = (%x, %conv_weight), kwargs = {})
-    #     %view : [#users=1] = call_method[target=view](args = (%conv_bias, [1, -1, 1, 1]), kwargs = {})
-    #     %add : [#users=1] = call_function[target=operator.add](args = (%conv2d, %view), kwargs = {})
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%add, 2), kwargs = {})
-    #     return mul
-    graph = tracer.trace(root=model, meta_args={'x': torch.rand(4, 3, 64, 64).to('meta')})
-    # def forward(self, x : torch.Tensor):
-    #     conv_weight = self.conv.weight
-    #     conv_bias = self.conv.bias
-    #     conv2d = torch.conv2d(x, conv_weight);  x = conv_weight = None
-    #     view = conv_bias.view([1, -1, 1, 1]);  conv_bias = None
-    #     add = conv2d + view;  conv2d = view = None
-    #     mul = add * 2;  add = None
-    #     return mul
-    gm = ColoGraphModule(model, graph)
-
-    gm.recompile()
-
-    node_list = list(graph.nodes)
-    conv_node = node_list[3]
-    solver_options = SolverOptions()
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    graph_analyser = GraphAnalyser(gm)
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    ret = solver.call_solver_serialized_args()
-    solution = list(ret[0])
-
-    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(gm, solution, device_mesh)
-
-    gm = runtime_apply_pass(gm)
-    gm.recompile()
-    output = gm(input, sharding_spec_dict, origin_spec_dict, comm_actions_dict)
+    meta_args = {'x': torch.rand(4, 3, 64, 64).to('meta')}
+    gm = initialize_model(model, meta_args=meta_args, device_mesh=device_mesh)
+    output = gm(input)
     assert_close(output, output_compare)
 
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_gpt2_performance.py b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_gpt2_performance.py
deleted file mode 100644
index 0979d8353ee7..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_gpt2_performance.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import copy
-import random
-from functools import partial
-from time import time
-from typing import Dict, Optional, Tuple, Union
-
-import numpy as np
-import psutil
-import pytest
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-import transformers
-from torch.fx import GraphModule
-from torch.profiler import ProfilerActivity, profile, record_function, schedule, tensorboard_trace_handler
-
-from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
-from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.constants import BATCHNORM_MODULE_OP
-from colossalai.auto_parallel.tensor_shard.initialize import autoparallelize, initialize_model
-from colossalai.auto_parallel.tensor_shard.sharding_strategy import ShardingSpec
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
-)
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.initialize import launch, launch_from_torch
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager, to_global
-from colossalai.testing import assert_close, assert_close_loose, parameterize, rerun_if_address_is_in_use
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-from colossalai.utils import free_port
-from tests.test_auto_parallel.test_tensor_shard.test_gpt.gpt_modules import GPT2LMHeadModel, GPTLMLoss
-
-BATCH_SIZE = 32
-SEQ_LENGTH = 256
-HIDDEN_DIM = 16384
-NUM_HEADS = 128
-NUM_LAYERS = 4
-VOCAB_SIZE = 50257
-NUM_STEPS = 10
-FP16 = True
-
-
-def get_cpu_mem():
-    return psutil.Process().memory_info().rss / 1024**2
-
-
-def get_gpu_mem():
-    return torch.cuda.memory_allocated() / 1024**2
-
-
-def get_mem_info(prefix=''):
-    return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB'
-
-
-def get_tflops(model_numel, batch_size, seq_len, step_time):
-    # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
-    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) / 4
-
-
-# Randomly Generated Data
-def get_data(batch_size, seq_len, vocab_size):
-    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
-    attention_mask = torch.ones_like(input_ids)
-    return input_ids, attention_mask
-
-
-def main():
-    disable_existing_loggers()
-    launch_from_torch(config={})
-    logger = get_dist_logger()
-    config = transformers.GPT2Config(n_position=SEQ_LENGTH, n_layer=NUM_LAYERS, n_head=NUM_HEADS, n_embd=HIDDEN_DIM)
-    if FP16:
-        model = GPT2LMHeadModel(config=config).half().to('cuda')
-    else:
-        model = GPT2LMHeadModel(config=config).to('cuda')
-    global_numel = sum([p.numel() for p in model.parameters()])
-
-    meta_input_sample = {
-        'input_ids': torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64).to('meta'),
-        'attention_mask': torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64).to('meta'),
-    }
-
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    gm = initialize_model(model, meta_input_sample, device_mesh)
-
-    # build criterion
-    criterion = GPTLMLoss()
-
-    optimizer = torch.optim.Adam(gm.parameters(), lr=0.01)
-    logger.info(get_mem_info(prefix='After init model, '), ranks=[0])
-    get_tflops_func = partial(get_tflops, global_numel, BATCH_SIZE, SEQ_LENGTH)
-    torch.cuda.synchronize()
-    model.train()
-    # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-    #              schedule=schedule(wait=1, warmup=2, active=2),
-    #              on_trace_ready=tensorboard_trace_handler(f'log/dummy_data/bs128_seq128_new'),
-    #              record_shapes=True,
-    #              profile_memory=True) as prof:
-    # with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]) as prof:
-    for n in range(10):
-        # we just use randomly generated data here
-        input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LENGTH, VOCAB_SIZE)
-        optimizer.zero_grad()
-        start = time()
-        outputs = gm(input_ids, attn_mask)
-        loss = criterion(outputs, input_ids)
-        loss.backward()
-        optimizer.step()
-        # prof.step()
-        torch.cuda.synchronize()
-        step_time = time() - start
-        logger.info(
-            f'[{n+1}/{NUM_STEPS}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}',
-            ranks=[0])
-    # print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=10))
-    torch.cuda.synchronize()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py
index c7f9988f1824..753ecff5374c 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py
@@ -1,32 +1,27 @@
 import copy
 import random
 from functools import partial
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict
 
 import numpy as np
 import pytest
 import torch
 import torch.multiprocessing as mp
-import torch.nn as nn
 import transformers
 from torch.fx import GraphModule
 
-from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
-from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.constants import BATCHNORM_MODULE_OP
-from colossalai.auto_parallel.tensor_shard.sharding_strategy import ShardingSpec
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
+from colossalai.auto_parallel.tensor_shard.initialize import (
+    ModuleWrapper,
+    build_strategy_constructor,
+    solve_solution,
+    transform_to_sharded_model,
 )
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import ShardingSpec
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager, to_global
+from colossalai.tensor.shape_consistency import to_global
 from colossalai.testing import assert_close, assert_close_loose, parameterize, rerun_if_address_is_in_use
 from colossalai.testing.pytest_wrapper import run_on_environment_flag
 from colossalai.utils import free_port
@@ -49,6 +44,7 @@ def _check_module_grad(module: torch.nn.Module, origin_param_dict: Dict[str, tor
                        best_sharding_spec_dict: Dict[str, ShardingSpec]):
     for name, param in module.named_parameters():
         param_grad = param.grad
+        name = name.replace('module.', '')
         origin_param_grad = origin_param_dict[name].grad
         atoms = name.split('.')
         new_name = '_'.join(atoms)
@@ -115,30 +111,17 @@ def check_attention_layer(rank, model_cls, world_size, port):
     # [[0, 1]
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-    shape_consistency_manager = ShapeConsistencyManager()
-
     tracer = ColoTracer()
 
     graph = tracer.trace(root=model, meta_args=meta_input_sample)
     gm = GraphModule(model, graph, model.__class__.__name__)
     gm.recompile()
 
-    graph_analyser = GraphAnalyser(gm)
-    liveness_list = graph_analyser.liveness_analysis()
-    solver_options = SolverOptions()
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser, memory_budget=-1)
-    ret = solver.call_solver_serialized_args()
-
-    solution = list(ret[0])
-    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(
-        gm, solution, device_mesh, strategies_constructor)
-    gm = runtime_apply_pass(gm)
-    gm.recompile()
+    strategies_constructor = build_strategy_constructor(graph, device_mesh)
+    solution = solve_solution(gm, strategies_constructor, memory_budget=-1)
+    gm, sharding_spec_dicts = transform_to_sharded_model(gm, solution, device_mesh, strategies_constructor)
+    gm = ModuleWrapper(gm, *sharding_spec_dicts)
+
     nodes = [strategies_vector.node for strategies_vector in strategies_constructor.leaf_strategies]
     best_sharding_spec_dict = {}
     for index, node in enumerate(nodes):
@@ -149,7 +132,7 @@ def check_attention_layer(rank, model_cls, world_size, port):
     origin_output = test_model(*test_input_sample)
     torch.cuda.set_rng_state(cuda_rng_state)
     torch.set_rng_state(cpu_rng_state)
-    output = gm(*input_sample, sharding_spec_dict, origin_spec_dict, comm_actions_dict)
+    output = gm(*input_sample)
     assert_close(output, origin_output, rtol=1e-03, atol=1e-03)
 
     #*******************backward starting*******************
@@ -174,16 +157,15 @@ def check_attention_layer(rank, model_cls, world_size, port):
     #*******************strategy selected*******************
     if rank == 0:
         print("*******************strategy selected*******************")
-        strategies_list = solver.last_s_val
         nodes = [strategies_vector.node for strategies_vector in strategies_constructor.leaf_strategies]
         computation_cost = 0
         communication_cost = 0
         memory_cost = 0
         for index, node in enumerate(nodes):
-            print(node.name, node.strategies_vector[strategies_list[index]].name)
-            computation_cost += node.strategies_vector[strategies_list[index]].compute_cost.total
-            communication_cost += node.strategies_vector[strategies_list[index]].communication_cost.total
-            node_memory_cost = node.strategies_vector[strategies_list[index]].memory_cost.total
+            print(node.name, node.strategies_vector[solution[index]].name)
+            computation_cost += node.strategies_vector[solution[index]].compute_cost.total
+            communication_cost += node.strategies_vector[solution[index]].communication_cost.total
+            node_memory_cost = node.strategies_vector[solution[index]].memory_cost.total
             if isinstance(node_memory_cost, tuple):
                 node_memory_cost = node_memory_cost[0]
             memory_cost += node_memory_cost.activation + node_memory_cost.parameter
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
index 7c06f2ee9e20..17eb75fadef0 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
@@ -57,7 +57,7 @@ def mem_test_for_node_strategy(rank: int,
             output_key]
 
         gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(
-            gm, solution, device_mesh)
+            gm, solution, device_mesh, strategies_constructor)
         gm = runtime_apply_pass(gm)
         gm.recompile()
         gm: GraphModule
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_resnet_block_runtime.py b/tests/test_auto_parallel/test_tensor_shard/test_resnet_block_runtime.py
deleted file mode 100644
index 814edd27948c..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_resnet_block_runtime.py
+++ /dev/null
@@ -1,270 +0,0 @@
-import copy
-from copy import deepcopy
-from functools import partial
-
-import pytest
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from torch.fx import GraphModule
-from torchvision.models import resnet34, resnet50
-
-from colossalai import device
-from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
-from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.constants import *
-from colossalai.auto_parallel.tensor_shard.solver.cost_graph import CostGraph
-from colossalai.auto_parallel.tensor_shard.solver.graph_analysis import GraphAnalyser
-from colossalai.auto_parallel.tensor_shard.solver.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.solver.solver import Solver
-from colossalai.auto_parallel.tensor_shard.solver.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.initialize import launch
-from colossalai.logging import disable_existing_loggers
-from colossalai.testing import assert_close, assert_close_loose, rerun_if_address_is_in_use
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-from colossalai.utils import free_port
-
-seed = 128
-cudnn_benchmark = False
-cudnn_deterministic = True
-
-
-def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
-    """3x3 convolution with padding"""
-    return nn.Conv2d(
-        in_planes,
-        out_planes,
-        kernel_size=3,
-        stride=stride,
-        padding=dilation,
-        groups=groups,
-        bias=False,
-        dilation=dilation,
-    )
-
-
-def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
-    """1x1 convolution"""
-    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
-
-
-class Bottleneck(nn.Module):
-    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
-    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
-    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
-    # This variant is also known as ResNet V1.5 and improves accuracy according to
-    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
-
-    expansion: int = 4
-
-    def __init__(
-        self,
-        inplanes: int,
-        planes: int,
-        stride: int = 1,
-        downsample=None,
-        groups: int = 1,
-        base_width: int = 64,
-        dilation: int = 1,
-        norm_layer=None,
-    ) -> None:
-        super().__init__()
-        if norm_layer is None:
-            norm_layer = nn.BatchNorm2d
-        width = int(planes * (base_width / 64.0)) * groups
-        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
-        self.conv1 = conv1x1(inplanes, width)
-        self.bn1 = norm_layer(width)
-        self.conv2 = conv3x3(width, width, stride, groups, dilation)
-        self.bn2 = norm_layer(width)
-        self.conv3 = conv1x1(width, planes * self.expansion)
-        self.bn3 = norm_layer(planes * self.expansion)
-        self.relu = nn.ReLU(inplace=True)
-        self.downsample = downsample
-        self.stride = stride
-
-    def forward(self, x):
-        identity = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.relu(out)
-
-        out = self.conv3(out)
-        out = self.bn3(out)
-
-        if self.downsample is not None:
-            identity = self.downsample(x)
-
-        out = self.relu(out)
-
-        return out
-
-
-def check_apply_bottleneck(rank, world_size, port):
-    disable_existing_loggers()
-    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    input = torch.rand(4, 4, 4, 4).cuda()
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    tracer = ColoTracer()
-    model = Bottleneck(4, 4, 1, norm_layer=torch.nn.modules.batchnorm.BatchNorm2d).cuda()
-    test_model = copy.deepcopy(model)
-    test_input = copy.deepcopy(input)
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %conv1 : [#users=1] = call_module[target=conv1](args = (%x,), kwargs = {})
-    #     %bn1 : [#users=1] = call_module[target=bn1](args = (%conv1,), kwargs = {})
-    #     %relu : [#users=1] = call_module[target=relu](args = (%bn1,), kwargs = {})
-    #     %conv2 : [#users=1] = call_module[target=conv2](args = (%relu,), kwargs = {})
-    #     %bn2 : [#users=1] = call_module[target=bn2](args = (%conv2,), kwargs = {})
-    #     %relu_1 : [#users=1] = call_module[target=relu](args = (%bn2,), kwargs = {})
-    #     %conv3 : [#users=1] = call_module[target=conv3](args = (%relu_1,), kwargs = {})
-    #     %bn3 : [#users=1] = call_module[target=bn3](args = (%conv3,), kwargs = {})
-    #     %relu_2 : [#users=1] = call_module[target=relu](args = (%bn3,), kwargs = {})
-    #     return relu_2
-    input_sample = {'x': torch.rand(4, 4, 4, 4).to('meta')}
-
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    solver_options = SolverOptions()
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    graph_analyser = GraphAnalyser(gm)
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    ret = solver.call_solver_serialized_args()
-    solution = list(ret[0])
-    print(solution)
-    for index, node in enumerate(graph.nodes):
-        print(node.name, node.strategies_vector[solution[index]].name)
-    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(gm, solution, device_mesh)
-    gm = runtime_apply_pass(gm)
-    gm.recompile()
-    nodes = [node for node in gm.graph.nodes]
-    # TODO: wrap the gm to avoid the influence of the user training code
-    cuda_rng_state = torch.cuda.get_rng_state()
-    origin_output = test_model(test_input)
-    torch.cuda.set_rng_state(cuda_rng_state)
-    output = gm(input, sharding_spec_dict, origin_spec_dict, comm_actions_dict)
-
-    assert output.shape == origin_output.shape
-    assert_close(output, origin_output, rtol=1e-03, atol=1e-05)
-    print("*******************backward starting*******************")
-    cuda_rng_state = torch.cuda.get_rng_state()
-    output.sum().backward()
-    torch.cuda.set_rng_state(cuda_rng_state)
-    origin_output.sum().backward()
-    if rank == 0:
-        print(
-            f"bn3 diff sum in rank {rank}: {(gm.bn3.weight.grad - test_model.bn3.weight.grad.narrow(0, 0, 4)).abs().sum()}"
-        )
-        print(
-            f"conv3 diff sum in rank {rank}: {(gm.conv3.weight.grad - test_model.conv3.weight.grad.narrow(0, 0, 8)).abs().sum()}"
-        )
-        print(
-            f"bn2 diff sum in rank {rank}: {(gm.bn2.weight.grad - test_model.bn2.weight.grad.narrow(0, 0, 2)).abs().sum()}"
-        )
-        print(
-            f"conv2 diff sum in rank {rank}: {(gm.conv2.weight.grad - test_model.conv2.weight.grad.narrow(0, 0, 2)).abs().sum()}"
-        )
-        print(
-            f"bn1 diff sum in rank {rank}: {(gm.bn1.weight.grad - test_model.bn1.weight.grad.narrow(0, 0, 1)).abs().sum()}"
-        )
-        print(f"conv1 diff sum in rank {rank}: {(gm.conv1.weight.grad - test_model.conv1.weight.grad).sum()}")
-
-        assert_close_loose(gm.conv3.weight.grad.sum(), test_model.conv3.weight.grad.narrow(0, 0, 8).sum())
-        assert_close_loose(gm.conv2.weight.grad.sum(), test_model.conv2.weight.grad.narrow(0, 0, 2).sum())
-        assert_close_loose(gm.conv1.weight.grad.sum(), test_model.conv1.weight.grad.sum())
-
-    if rank == 1:
-        print(
-            f"bn3 diff sum in rank {rank}: {(gm.bn3.weight.grad - test_model.bn3.weight.grad.narrow(0, 4, 4)).abs().sum()}"
-        )
-        print(
-            f"conv3 diff sum in rank {rank}: {(gm.conv3.weight.grad - test_model.conv3.weight.grad.narrow(0, 0, 8)).abs().sum()}"
-        )
-        print(
-            f"bn2 diff sum in rank {rank}: {(gm.bn2.weight.grad - test_model.bn2.weight.grad.narrow(0, 2, 2)).abs().sum()}"
-        )
-        print(
-            f"conv2 diff sum in rank {rank}: {(gm.conv2.weight.grad - test_model.conv2.weight.grad.narrow(0, 2, 2)).abs().sum()}"
-        )
-        print(
-            f"bn1 diff sum in rank {rank}: {(gm.bn1.weight.grad - test_model.bn1.weight.grad.narrow(0, 1, 1)).abs().sum()}"
-        )
-        print(f"conv1 diff sum in rank {rank}: {(gm.conv1.weight.grad - test_model.conv1.weight.grad).sum()}")
-
-        assert_close_loose(gm.conv3.weight.grad.sum(), test_model.conv3.weight.grad.narrow(0, 0, 8).sum())
-        assert_close_loose(gm.conv2.weight.grad.sum(), test_model.conv2.weight.grad.narrow(0, 2, 2).sum())
-        assert_close_loose(gm.conv1.weight.grad.sum(), test_model.conv1.weight.grad.sum())
-
-    if rank == 2:
-        print(
-            f"bn3 diff sum in rank {rank}: {(gm.bn3.weight.grad - test_model.bn3.weight.grad.narrow(0, 8, 4)).abs().sum()}"
-        )
-        print(
-            f"conv3 diff sum in rank {rank}: {(gm.conv3.weight.grad - test_model.conv3.weight.grad.narrow(0, 8, 8)).abs().sum()}"
-        )
-        print(
-            f"bn2 diff sum in rank {rank}: {(gm.bn2.weight.grad - test_model.bn2.weight.grad.narrow(0, 0, 2)).abs().sum()}"
-        )
-        print(
-            f"conv2 diff sum in rank {rank}: {(gm.conv2.weight.grad - test_model.conv2.weight.grad.narrow(0, 0, 2)).abs().sum()}"
-        )
-        print(
-            f"bn1 diff sum in rank {rank}: {(gm.bn1.weight.grad - test_model.bn1.weight.grad.narrow(0, 2, 1)).abs().sum()}"
-        )
-        print(f"conv1 diff sum in rank {rank}: {(gm.conv1.weight.grad - test_model.conv1.weight.grad).sum()}")
-
-        assert_close_loose(gm.conv3.weight.grad.sum(), test_model.conv3.weight.grad.narrow(0, 8, 8).sum())
-        assert_close_loose(gm.conv2.weight.grad.sum(), test_model.conv2.weight.grad.narrow(0, 0, 2).sum())
-        assert_close_loose(gm.conv1.weight.grad.sum(), test_model.conv1.weight.grad.sum())
-
-    if rank == 3:
-        print(
-            f"bn3 diff sum in rank {rank}: {(gm.bn3.weight.grad - test_model.bn3.weight.grad.narrow(0, 12, 4)).abs().sum()}"
-        )
-        print(
-            f"conv3 diff sum in rank {rank}: {(gm.conv3.weight.grad - test_model.conv3.weight.grad.narrow(0, 8, 8)).abs().sum()}"
-        )
-        print(
-            f"bn2 diff sum in rank {rank}: {(gm.bn2.weight.grad - test_model.bn2.weight.grad.narrow(0, 2, 2)).abs().sum()}"
-        )
-        print(
-            f"conv2 diff sum in rank {rank}: {(gm.conv2.weight.grad - test_model.conv2.weight.grad.narrow(0, 2, 2)).abs().sum()}"
-        )
-        print(
-            f"bn1 diff sum in rank {rank}: {(gm.bn1.weight.grad - test_model.bn1.weight.grad.narrow(0, 3, 1)).abs().sum()}"
-        )
-        print(f"conv1 diff sum in rank {rank}: {(gm.conv1.weight.grad - test_model.conv1.weight.grad).sum()}")
-
-        assert_close_loose(gm.conv3.weight.grad.sum(), test_model.conv3.weight.grad.narrow(0, 8, 8).sum())
-        assert_close_loose(gm.conv2.weight.grad.sum(), test_model.conv2.weight.grad.narrow(0, 2, 2).sum())
-        assert_close_loose(gm.conv1.weight.grad.sum(), test_model.conv1.weight.grad.sum())
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_apply():
-    world_size = 4
-    run_func = partial(check_apply_bottleneck, world_size=world_size, port=free_port())
-    mp.spawn(run_func, nprocs=world_size)
-
-
-if __name__ == '__main__':
-    test_apply()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_shape_consistency_pass.py b/tests/test_auto_parallel/test_tensor_shard/test_shape_consistency_pass.py
index 66cd3f3f7707..24a3ae5b42c3 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_shape_consistency_pass.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_shape_consistency_pass.py
@@ -5,19 +5,9 @@
 import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
-from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
-)
+
+from colossalai.auto_parallel.tensor_shard.initialize import initialize_model
 from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.testing import assert_close, rerun_if_address_is_in_use
@@ -41,41 +31,22 @@ def check_apply(rank, world_size, port):
     disable_existing_loggers()
     launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
     input = torch.rand(4, 4, 4, 4).cuda()
+    test_input = copy.deepcopy(input)
+    # graph():
+    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
+    #     %conv : [#users=1] = call_module[target=conv](args = (%mul,), kwargs = {})
+    #     return conv
+    model = ConvModel(4, 4).cuda()
+    test_model = copy.deepcopy(model)
     physical_mesh_id = torch.arange(0, 4)
     mesh_shape = (2, 2)
     # [[0, 1]
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+    meta_args = {'x': torch.rand(4, 4, 4, 4).to('meta')}
+    gm = initialize_model(model, meta_args, device_mesh)
 
-    tracer = ColoTracer()
-    model = ConvModel(4, 4).cuda()
-    test_model = copy.deepcopy(model)
-    test_input = copy.deepcopy(input)
-
-    input_sample = {'x': torch.rand(4, 4, 4, 4).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %conv : [#users=1] = call_module[target=conv](args = (%mul,), kwargs = {})
-    #     return conv
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    solver_options = SolverOptions()
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    graph_analyser = GraphAnalyser(gm)
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    ret = solver.call_solver_serialized_args()
-    solution = list(ret[0])
-    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(gm, solution, device_mesh)
-    gm = runtime_apply_pass(gm)
-    gm.recompile()
-    nodes = [node for node in gm.graph.nodes]
-    # TODO: wrap the gm to avoid the influence of the user training code
-    output = gm(input, sharding_spec_dict, origin_spec_dict, comm_actions_dict)
+    output = gm(input)
     origin_output = test_model(test_input)
     assert output.equal(origin_output)
     origin_loss = origin_output.sum()
@@ -84,13 +55,21 @@ def check_apply(rank, world_size, port):
     origin_loss.backward()
     loss.backward()
 
-    grad_0 = test_model.conv.weight.grad.narrow(0, 0, 2)
-    grad_1 = test_model.conv.weight.grad.narrow(0, 2, 2)
-
-    if rank in (0, 1):
-        assert_close(gm.conv.weight.grad.data, grad_0.data)
-    elif rank in (2, 3):
-        assert_close(gm.conv.weight.grad.data, grad_1.data)
+    grad_0 = test_model.conv.weight.grad.narrow(0, 0, 1)
+    grad_1 = test_model.conv.weight.grad.narrow(0, 1, 1)
+    grad_2 = test_model.conv.weight.grad.narrow(0, 2, 1)
+    grad_3 = test_model.conv.weight.grad.narrow(0, 3, 1)
+
+    if rank == 0:
+        assert_close(gm.module.conv.weight.grad.data, grad_0.data)
+    elif rank == 1:
+        assert_close(gm.module.conv.weight.grad.data, grad_1.data)
+    elif rank == 2:
+        assert_close(gm.module.conv.weight.grad.data, grad_2.data)
+    elif rank == 3:
+        assert_close(gm.module.conv.weight.grad.data, grad_3.data)
+    else:
+        raise ValueError(f'rank {rank} does not exist.')
 
 
 # skip this test due to pulp not installed in CI environment

From 28398f1c7062460343e765b5d0234936d0cbe218 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 8 Feb 2023 15:02:31 +0800
Subject: [PATCH 278/503] add overlap option (#2613)

---
 .../passes/runtime_preparation_pass.py        | 29 ++++++++++++-------
 .../auto_parallel/tensor_shard/initialize.py  | 19 ++++++++----
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index 98897095753d..897602ce1d24 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -352,7 +352,7 @@ def _process_sharding_spec(sharding_spec):
     return gm
 
 
-def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
+def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, overlap=False):
     """
     Apply the sharding action to the module parameters and buffers following the
     instructions of solver solution.
@@ -387,15 +387,18 @@ def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
                     # register hook to the parameters
                     if operation_data.type == OperationDataType.PARAM and operation_data.name == name and comm_action.comm_type == CommType.HOOK:
 
-                        def wrapper(param, comm_spec, stream):
+                        def wrapper(param, comm_spec, stream, overlap):
 
                             def hook_fn(grad):
-                                with torch.cuda.stream(stream):
-                                    _all_reduce(grad, comm_spec, async_op=True)
+                                if overlap:
+                                    with torch.cuda.stream(stream):
+                                        _all_reduce(grad, comm_spec, async_op=True)
+                                else:
+                                    _all_reduce(grad, comm_spec, async_op=False)
 
                             param.register_hook(hook_fn)
 
-                        wrapper(param, comm_spec_to_use, reduction_stream)
+                        wrapper(param, comm_spec_to_use, reduction_stream, overlap=overlap)
 
             sharded_buffer_dict = {}
             # apply the sharding spec of buffers
@@ -441,15 +444,18 @@ def hook_fn(grad):
                 # register hook to the parameters
                 if isinstance(node._meta_data, torch.nn.parameter.Parameter) and comm_action.comm_type == CommType.HOOK:
 
-                    def wrapper(param, comm_spec, stream):
+                    def wrapper(param, comm_spec, stream, overlap):
 
                         def hook_fn(grad):
-                            with torch.cuda.stream(stream):
-                                _all_reduce(grad, comm_spec, async_op=True)
+                            if overlap:
+                                with torch.cuda.stream(stream):
+                                    _all_reduce(grad, comm_spec, async_op=True)
+                            else:
+                                _all_reduce(grad, comm_spec, async_op=False)
 
                         param.register_hook(hook_fn)
 
-                    wrapper(target, comm_spec_to_use, reduction_stream)
+                    wrapper(target, comm_spec_to_use, reduction_stream, overlap=overlap)
     return gm
 
 
@@ -463,13 +469,14 @@ def implicit_comm_action_apply(gm: torch.fx.GraphModule):
 def runtime_preparation_pass(gm: torch.fx.GraphModule,
                              solution: List[int],
                              device_mesh: DeviceMesh,
-                             strategies_constructor: StrategiesConstructor = None):
+                             strategies_constructor: StrategiesConstructor = None,
+                             overlap=False):
     gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = _solution_annotatation(
         gm, solution, strategies_constructor)
     gm = _size_value_converting(gm, device_mesh)
     gm = _node_args_converting(gm, device_mesh)
     # TODO: the pass below should be uncommented after the implementation of implicit_comm_action_apply_pass completed.
     # gm = implicit_comm_action_apply(gm)
-    gm = _module_params_sharding(gm, device_mesh)
+    gm = _module_params_sharding(gm, device_mesh, overlap=overlap)
 
     return gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict
diff --git a/colossalai/auto_parallel/tensor_shard/initialize.py b/colossalai/auto_parallel/tensor_shard/initialize.py
index 387a682a1ad9..23ed0f433731 100644
--- a/colossalai/auto_parallel/tensor_shard/initialize.py
+++ b/colossalai/auto_parallel/tensor_shard/initialize.py
@@ -98,16 +98,22 @@ def solve_solution(gm: ColoGraphModule, strategy_constructor: StrategiesConstruc
     return solution
 
 
-def transform_to_sharded_model(gm: ColoGraphModule, solution: List[int], device_mesh: DeviceMesh,
-                               strategies_constructor: StrategiesConstructor):
+def transform_to_sharded_model(gm: ColoGraphModule,
+                               solution: List[int],
+                               device_mesh: DeviceMesh,
+                               strategies_constructor: StrategiesConstructor,
+                               overlap: bool = False):
     '''
     This method is used to transform the original graph to the sharded graph.
     The model parameters will be sharded according to the solution and the grad hooks
     will be added to the sharded graph using the runtime_preparation_pass.
     The communication node will be added into the graph using the runtime_apply_pass.
     '''
-    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(
-        gm, solution, device_mesh, strategies_constructor)
+    gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(gm,
+                                                                                           solution,
+                                                                                           device_mesh,
+                                                                                           strategies_constructor,
+                                                                                           overlap=overlap)
     gm = runtime_apply_pass(gm)
     gm.recompile()
     sharding_spec_dicts = (sharding_spec_dict, origin_spec_dict, comm_actions_dict)
@@ -176,6 +182,7 @@ def initialize_model(model: nn.Module,
                      meta_args: Dict[str, torch.Tensor],
                      device_mesh: DeviceMesh,
                      memory_budget: float = -1.0,
+                     overlap: bool = False,
                      save_solver_solution: bool = False,
                      load_solver_solution: bool = False,
                      solution_path: str = None,
@@ -189,6 +196,8 @@ def initialize_model(model: nn.Module,
         device_mesh: the device mesh to execute the model.
         memory_budget(optional): the max cuda memory could be used. If the memory budget is -1.0,
             the memory budget will be infinity.
+        overlap(optional): whether to overlap gradient communication with backward computation.
+            Defaults to False.
         save_solver_solution(optional): if the save_solver_solution is True, the solution will be saved
             to the solution_path.
         load_solver_solution(optional): if the load_solver_solution is True, the solution will be loaded
@@ -211,7 +220,7 @@ def initialize_model(model: nn.Module,
         if save_solver_solution:
             torch.save(solution, solution_path)
 
-    gm, sharding_spec_dicts = transform_to_sharded_model(gm, solution, device_mesh, strategies_constructor)
+    gm, sharding_spec_dicts = transform_to_sharded_model(gm, solution, device_mesh, strategies_constructor, overlap)
     model_to_return = ModuleWrapper(gm, *sharding_spec_dicts)
 
     if return_solution:

From 37df666f38efad28b4cb681e8278c0deadc8679c Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 8 Feb 2023 15:02:49 +0800
Subject: [PATCH 279/503] [autoparallel] refactor handlers which reshape input
 tensors (#2615)

* [autoparallel] refactor handlers which reshape input tensors

* polish
---
 .../tensor_shard/node_handler/__init__.py     |  12 +-
 ..._handler.py => default_reshape_handler.py} |  10 +-
 .../node_handler/experimental/__init__.py     |  10 -
 .../experimental/reshape_generator.py         | 299 ------------------
 .../{experimental => }/permute_handler.py     |   9 +-
 .../{experimental => }/split_handler.py       |   9 +-
 .../node_handler/strategy/__init__.py         |  15 +-
 .../strategy/reshape_generator.py             | 267 +++++++++++++++-
 .../{experimental => }/transpose_handler.py   |   9 +-
 .../{experimental => }/view_handler.py        |   9 +-
 ...ler.py => test_default_reshape_handler.py} |   8 +-
 .../test_node_handler/test_getitem_handler.py |   6 +-
 .../test_permute_and_transpose_handler.py     |   2 +-
 .../test_node_handler/test_split_handler.py   |   5 +-
 .../test_node_handler/test_view_handler.py    |   2 +-
 15 files changed, 307 insertions(+), 365 deletions(-)
 rename colossalai/auto_parallel/tensor_shard/node_handler/{reshape_handler.py => default_reshape_handler.py} (87%)
 delete mode 100644 colossalai/auto_parallel/tensor_shard/node_handler/experimental/__init__.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/node_handler/experimental/reshape_generator.py
 rename colossalai/auto_parallel/tensor_shard/node_handler/{experimental => }/permute_handler.py (92%)
 rename colossalai/auto_parallel/tensor_shard/node_handler/{experimental => }/split_handler.py (89%)
 rename colossalai/auto_parallel/tensor_shard/node_handler/{experimental => }/transpose_handler.py (90%)
 rename colossalai/auto_parallel/tensor_shard/node_handler/{experimental => }/view_handler.py (88%)
 rename tests/test_auto_parallel/test_tensor_shard/test_node_handler/{test_reshape_handler.py => test_default_reshape_handler.py} (91%)

diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py b/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
index 87bd8966bb70..0050358ce093 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
@@ -3,8 +3,8 @@
 from .binary_elementwise_handler import BinaryElementwiseHandler
 from .bmm_handler import AddBMMFunctionHandler, BMMFunctionHandler
 from .conv_handler import ConvFunctionHandler, ConvModuleHandler
+from .default_reshape_handler import DefaultReshapeHandler
 from .embedding_handler import EmbeddingFunctionHandler, EmbeddingModuleHandler
-from .experimental import PermuteHandler, ViewHandler
 from .getattr_handler import GetattrHandler
 from .getitem_handler import GetItemHandler
 from .layer_norm_handler import LayerNormModuleHandler
@@ -13,20 +13,24 @@
 from .normal_pooling_handler import NormPoolingHandler
 from .option import ShardOption
 from .output_handler import OutputHandler
+from .permute_handler import PermuteHandler
 from .placeholder_handler import PlaceholderHandler
 from .registry import operator_registry
-from .reshape_handler import ReshapeHandler
 from .softmax_handler import SoftmaxHandler
+from .split_handler import SplitHandler
 from .sum_handler import SumHandler
 from .tensor_constructor_handler import TensorConstructorHandler
+from .transpose_handler import TransposeHandler
 from .unary_elementwise_handler import UnaryElementwiseHandler
+from .view_handler import ViewHandler
 from .where_handler import WhereHandler
 
 __all__ = [
     'LinearFunctionHandler', 'LinearModuleHandler', 'BMMFunctionHandler', 'AddBMMFunctionHandler',
     'LayerNormModuleHandler', 'BatchNormModuleHandler', 'ConvModuleHandler', 'ConvFunctionHandler',
-    'UnaryElementwiseHandler', 'ReshapeHandler', 'PlaceholderHandler', 'OutputHandler', 'WhereHandler',
+    'UnaryElementwiseHandler', 'DefaultReshapeHandler', 'PlaceholderHandler', 'OutputHandler', 'WhereHandler',
     'NormPoolingHandler', 'BinaryElementwiseHandler', 'MatMulHandler', 'operator_registry', 'ADDMMFunctionHandler',
     'GetItemHandler', 'GetattrHandler', 'ViewHandler', 'PermuteHandler', 'TensorConstructorHandler',
-    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler', 'ShardOption'
+    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler', 'ShardOption',
+    'TransposeHandler', 'SplitHandler'
 ]
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/reshape_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/default_reshape_handler.py
similarity index 87%
rename from colossalai/auto_parallel/tensor_shard/node_handler/reshape_handler.py
rename to colossalai/auto_parallel/tensor_shard/node_handler/default_reshape_handler.py
index 7763b1884025..0c5b9f39e1fb 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/reshape_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/default_reshape_handler.py
@@ -5,23 +5,23 @@
 from ..sharding_strategy import OperationData, OperationDataType
 from .node_handler import MetaInfoNodeHandler, NodeHandler
 from .registry import operator_registry
-from .strategy import ReshapeGenerator, StrategyGenerator
+from .strategy import DefaultReshapeGenerator, StrategyGenerator
 
-__all__ = ['ReshapeHandler']
+__all__ = ['DefaultReshapeHandler']
 
 
 @operator_registry.register(torch.flatten)
 @operator_registry.register(torch.Tensor.unsqueeze)
 @operator_registry.register(torch.nn.AdaptiveAvgPool2d)
-class ReshapeHandler(MetaInfoNodeHandler):
+class DefaultReshapeHandler(MetaInfoNodeHandler):
     """
-    A ReshapeHandler which deals with the sharding strategies for Reshape Op, such as torch.reshape.
+    A DefaultReshapeHandler which deals with the sharding strategies for reshape-like ops, such as torch.flatten.
     """
 
     def get_strategy_generator(self) -> List[StrategyGenerator]:
         op_data_mapping = self.get_operation_data_mapping()
         generators = []
-        generators.append(ReshapeGenerator(op_data_mapping, self.device_mesh, self.node.args[0]))
+        generators.append(DefaultReshapeGenerator(op_data_mapping, self.device_mesh, self.node.args[0]))
         return generators
 
     def infer_logical_shape(self, data):
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/__init__.py b/colossalai/auto_parallel/tensor_shard/node_handler/experimental/__init__.py
deleted file mode 100644
index 15f66104b156..000000000000
--- a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from .permute_handler import PermuteHandler
-from .reshape_generator import PermuteGenerator, SplitGenerator, TransposeGenerator, ViewGenerator
-from .split_handler import SplitHandler
-from .transpose_handler import TransposeHandler
-from .view_handler import ViewHandler
-
-__all__ = [
-    'ViewGenerator', 'ViewHandler', 'PermuteGenerator', 'PermuteHandler', 'TransposeGenerator', 'TransposeGenerator',
-    'SplitHandler', 'SplitGenerator'
-]
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/reshape_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/experimental/reshape_generator.py
deleted file mode 100644
index b7248d011950..000000000000
--- a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/reshape_generator.py
+++ /dev/null
@@ -1,299 +0,0 @@
-import copy
-from typing import List
-
-from colossalai.auto_parallel.tensor_shard.node_handler.strategy.strategy_generator import FollowingStrategyGenerator
-from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
-    CommAction,
-    CommType,
-    MemoryCost,
-    ShardingStrategy,
-    TrainCycleItem,
-)
-from colossalai.auto_parallel.tensor_shard.utils import (
-    check_keep_sharding_status,
-    detect_reshape_mapping,
-    infer_output_dim_partition_dict,
-)
-from colossalai.tensor.shape_consistency import CollectiveCommPattern
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-__all__ = ['ReshapeGenerator', 'ViewGenerator', 'PermuteGenerator', 'TransposeGenerator', 'SplitGenerator']
-
-
-class ReshapeGenerator(FollowingStrategyGenerator):
-    """
-    ReshapeGenerator is the base class for all the reshape operation.
-    """
-
-    def validate(self) -> bool:
-        return super().validate()
-
-    def update_compute_cost(self, strategy: ShardingStrategy):
-        compute_cost = TrainCycleItem(fwd=10, bwd=10, total=20)
-        strategy.compute_cost = compute_cost
-
-    def update_memory_cost(self, strategy: ShardingStrategy):
-        '''
-        Compute the memory cost per device with this specific strategy.
-        '''
-        forward_size_mapping = {
-            'input': self._compute_size_in_bytes(strategy, "input"),
-            'output': self._compute_size_in_bytes(strategy, "output")
-        }
-
-        backward_size_mapping = copy.deepcopy(forward_size_mapping)
-        backward_size_mapping.pop("output")
-        # compute fwd cost incurred
-        # fwd_cost = input + output
-        fwd_activation_cost = sum([v for k, v in forward_size_mapping.items() if not self.is_param(k)])
-        fwd_parameter_cost = sum([v for k, v in forward_size_mapping.items() if self.is_param(k)])
-        fwd_mem_cost = MemoryCost(activation=fwd_activation_cost, parameter=fwd_parameter_cost)
-
-        # compute bwd cost incurred
-        # bwd_cost = input_grad
-        bwd_activation_cost = sum([v for k, v in backward_size_mapping.items() if not self.is_param(k)])
-        bwd_parameter_cost = sum([v for k, v in backward_size_mapping.items() if self.is_param(k)])
-        bwd_mem_cost = MemoryCost(activation=bwd_activation_cost, parameter=bwd_parameter_cost)
-
-        # compute total cost
-        total_mem_cost = MemoryCost(activation=fwd_activation_cost + bwd_activation_cost,
-                                    parameter=fwd_parameter_cost + bwd_parameter_cost)
-        memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
-        strategy.memory_cost = memory_cost
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        return super().collate_strategies()
-
-
-class ViewGenerator(ReshapeGenerator):
-    """
-    ViewGenerator deals with the sharding strategies of view op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-
-            origin_shape = self.op_data['input'].data.shape
-            tgt_shape = self.op_data['tgt_shape'].data
-
-            reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
-
-            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
-            keep_sharding_status = check_keep_sharding_status(dim_partition_dict_for_input, reshape_mapping_dict)
-
-            if keep_sharding_status:
-                dim_partition_dict_for_output = infer_output_dim_partition_dict(dim_partition_dict_for_input,
-                                                                                reshape_mapping_dict)
-            else:
-                dim_partition_dict_for_output = {}
-
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            if keep_sharding_status:
-                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
-            else:
-                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> FULLY REPLICATED_{index}'
-
-                # add comm action for converting input to fully replicated
-                total_mesh_dim_list = []
-                for mesh_dim_list in dim_partition_dict_for_input.values():
-                    total_mesh_dim_list.extend(mesh_dim_list)
-                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
-                if len(total_mesh_dim_list) == 1:
-                    total_mesh_dim_list = total_mesh_dim_list[0]
-                    # the total mesh dim list only has one element, so the shard dim has only one element as well.
-                    shard_dim = list(dim_partition_dict_for_input.keys())[0]
-                    input_comm_action = self.get_communication_action(
-                        sharding_spec=sharding_spec_mapping["input"],
-                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
-                        logical_process_axis=total_mesh_dim_list,
-                        comm_type=CommType.BEFORE,
-                        arg_index=0)
-                    # it will gather the input through gather_dim during forward phase.
-                    input_comm_action.comm_spec.gather_dim = shard_dim
-                    # it will split the input activation grad through shard_dim during backward phase.
-                    input_comm_action.comm_spec.shard_dim = shard_dim
-
-                elif len(total_mesh_dim_list) >= 2:
-                    source_spec = sharding_spec_mapping["input"]
-                    target_spec = ShardingSpec(device_mesh=self.device_mesh,
-                                               entire_shape=source_spec.entire_shape,
-                                               dim_partition_dict={})
-                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
-                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
-
-                else:
-                    input_comm_action = None
-
-                if input_comm_action is not None:
-                    communication_action_mapping["input"] = input_comm_action
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
-
-
-class PermuteGenerator(ReshapeGenerator):
-    """
-    PermuteGenerator deals with the sharding strategies of permute op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-
-            permute_dims = self.op_data['permute_dims'].data
-            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
-            dim_partition_dict_for_output = {}
-            for dim_index, permute_dim in enumerate(permute_dims):
-                if permute_dim in dim_partition_dict_for_input:
-                    dim_partition_dict_for_output[dim_index] = dim_partition_dict_for_input[permute_dim]
-
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
-
-
-class TransposeGenerator(ReshapeGenerator):
-    """
-    TransposeGenerator deals with the sharding strategies of permute op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
-            dim_partition_dict_for_output = {}
-
-            transpose_dims = self.op_data['transpose_dims'].data
-            dim_0 = transpose_dims[0]
-            dim_1 = transpose_dims[1]
-            for dim, sharded_dims in dim_partition_dict_for_input.items():
-                if dim == dim_0:
-                    dim_partition_dict_for_output[dim_1] = dim_partition_dict_for_input[dim_0]
-                elif dim == dim_1:
-                    dim_partition_dict_for_output[dim_0] = dim_partition_dict_for_input[dim_1]
-                else:
-                    dim_partition_dict_for_output[dim] = sharded_dims
-
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
-
-
-class SplitGenerator(ReshapeGenerator):
-    """
-    SplitGenerator deals with the sharding strategies of split op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            recover_dims = None
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-            dim_partition_dict_for_input = copy.deepcopy(input_sharding_spec.dim_partition_dict)
-            split_size, split_dim = self.op_data['split_info'].data
-
-            if split_dim in dim_partition_dict_for_input:
-                recover_dims = dim_partition_dict_for_input.pop(split_dim)
-
-            dim_partition_dict_for_output = [
-                copy.deepcopy(dim_partition_dict_for_input) for _ in range(len(self.op_data["output"].data))
-            ]
-            assert len(dim_partition_dict_for_output) >= 2
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{sharding_spec_mapping["input"].sharding_sequence}_{index}'
-
-            # add comm action if the input need to be recovered to replica in the split dimension.
-            if recover_dims:
-                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
-                if len(recover_dims) == 1:
-                    recover_dims = recover_dims[0]
-                    input_comm_action = self.get_communication_action(
-                        sharding_spec=sharding_spec_mapping["input"],
-                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
-                        logical_process_axis=recover_dims,
-                        comm_type=CommType.BEFORE,
-                        arg_index=0)
-                    # it will gather the input through gather_dim during forward phase.
-                    input_comm_action.comm_spec.gather_dim = split_dim
-                    # it will split the input activation grad through split_dim during backward phase.
-                    input_comm_action.comm_spec.shard_dim = split_dim
-
-                elif len(recover_dims) >= 2:
-                    # original sharding spec
-                    source_spec = input_sharding_spec
-                    # target sharding spec
-                    target_spec = sharding_spec_mapping["input"]
-                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
-                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
-
-                else:
-                    input_comm_action = None
-
-                if input_comm_action is not None:
-                    communication_action_mapping["input"] = input_comm_action
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/permute_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/permute_handler.py
similarity index 92%
rename from colossalai/auto_parallel/tensor_shard/node_handler/experimental/permute_handler.py
rename to colossalai/auto_parallel/tensor_shard/node_handler/permute_handler.py
index 6d625e153f61..91e4a5105a08 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/permute_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/permute_handler.py
@@ -2,11 +2,10 @@
 
 import torch
 
-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import PermuteGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import PermuteGenerator, StrategyGenerator
 
 __all__ = ['PermuteHandler']
 
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/split_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/split_handler.py
similarity index 89%
rename from colossalai/auto_parallel/tensor_shard/node_handler/experimental/split_handler.py
rename to colossalai/auto_parallel/tensor_shard/node_handler/split_handler.py
index 38c5eed7d00e..653d158b7c36 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/split_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/split_handler.py
@@ -2,11 +2,10 @@
 
 import torch
 
-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import SplitGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import SplitGenerator, StrategyGenerator
 
 __all__ = ['SplitHandler']
 
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/__init__.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/__init__.py
index 8d25475f9c57..db1f31521c86 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/__init__.py
@@ -14,7 +14,13 @@
 from .normal_pooling_generator import NormalPoolStrategyGenerator
 from .output_generator import OutputGenerator
 from .placeholder_generator import PlaceholderGenerator
-from .reshape_generator import ReshapeGenerator
+from .reshape_generator import (
+    DefaultReshapeGenerator,
+    PermuteGenerator,
+    SplitGenerator,
+    TransposeGenerator,
+    ViewGenerator,
+)
 from .softmax_generator import SoftmaxGenerator
 from .strategy_generator import StrategyGenerator
 from .sum_generator import SumGenerator
@@ -26,7 +32,8 @@
     'StrategyGenerator', 'DotProductStrategyGenerator', 'MatVecStrategyGenerator', 'LinearProjectionStrategyGenerator',
     'BatchedMatMulStrategyGenerator', 'ConvStrategyGenerator', 'UnaryElementwiseGenerator',
     'BatchNormStrategyGenerator', 'GetItemStrategyGenerator', 'TensorStrategyGenerator', 'TensorTupleStrategyGenerator',
-    'LayerNormGenerator', 'ReshapeGenerator', 'PlaceholderGenerator', 'OutputGenerator', 'WhereGenerator',
-    'ReshapeGenerator', 'NormalPoolStrategyGenerator', 'BinaryElementwiseStrategyGenerator', 'GetattrGenerator',
-    'TensorConstructorGenerator', 'EmbeddingStrategyGenerator', 'SumGenerator', 'SoftmaxGenerator'
+    'LayerNormGenerator', 'PlaceholderGenerator', 'OutputGenerator', 'WhereGenerator', 'NormalPoolStrategyGenerator',
+    'BinaryElementwiseStrategyGenerator', 'GetattrGenerator', 'TensorConstructorGenerator',
+    'EmbeddingStrategyGenerator', 'SumGenerator', 'SoftmaxGenerator', 'ViewGenerator', 'PermuteGenerator',
+    'TransposeGenerator', 'SplitGenerator', 'DefaultReshapeGenerator'
 ]
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py
index 0b3506c27e4c..39983e918a96 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py
@@ -1,6 +1,7 @@
 import copy
 from typing import List
 
+from colossalai.auto_parallel.tensor_shard.node_handler.strategy.strategy_generator import FollowingStrategyGenerator
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
     CommAction,
     CommType,
@@ -8,17 +9,20 @@
     ShardingStrategy,
     TrainCycleItem,
 )
+from colossalai.auto_parallel.tensor_shard.utils import (
+    check_keep_sharding_status,
+    detect_reshape_mapping,
+    infer_output_dim_partition_dict,
+)
 from colossalai.tensor.shape_consistency import CollectiveCommPattern
 from colossalai.tensor.sharding_spec import ShardingSpec
 
-from .strategy_generator import FollowingStrategyGenerator
-
-__all__ = ['ReshapeGenerator']
+__all__ = ['ReshapeGenerator', 'ViewGenerator', 'PermuteGenerator', 'TransposeGenerator', 'SplitGenerator']
 
 
 class ReshapeGenerator(FollowingStrategyGenerator):
     """
-    ReshapeGenerator which deals with the sharding strategies of Reshape Op, such as torch.Tensor.permute.
+    ReshapeGenerator is the base class for all reshape operations.
     """
 
     def validate(self) -> bool:
@@ -57,11 +61,255 @@ def update_memory_cost(self, strategy: ShardingStrategy):
         memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
         strategy.memory_cost = memory_cost
 
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        return super().collate_strategies()
+
+
+class ViewGenerator(ReshapeGenerator):
+    """
+    ViewGenerator deals with the sharding strategies of view op.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+
+            origin_shape = self.op_data['input'].data.shape
+            tgt_shape = self.op_data['tgt_shape'].data
+
+            reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
+
+            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
+            keep_sharding_status = check_keep_sharding_status(dim_partition_dict_for_input, reshape_mapping_dict)
+
+            if keep_sharding_status:
+                dim_partition_dict_for_output = infer_output_dim_partition_dict(dim_partition_dict_for_input,
+                                                                                reshape_mapping_dict)
+            else:
+                dim_partition_dict_for_output = {}
+
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            if keep_sharding_status:
+                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
+            else:
+                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> FULLY REPLICATED_{index}'
+
+                # add comm action for converting input to fully replicated
+                total_mesh_dim_list = []
+                for mesh_dim_list in dim_partition_dict_for_input.values():
+                    total_mesh_dim_list.extend(mesh_dim_list)
+                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
+                if len(total_mesh_dim_list) == 1:
+                    total_mesh_dim_list = total_mesh_dim_list[0]
+                    # the total mesh dim list only has one element, so the shard dim has only one element as well.
+                    shard_dim = list(dim_partition_dict_for_input.keys())[0]
+                    input_comm_action = self.get_communication_action(
+                        sharding_spec=sharding_spec_mapping["input"],
+                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
+                        logical_process_axis=total_mesh_dim_list,
+                        comm_type=CommType.BEFORE,
+                        arg_index=0)
+                    # it will gather the input through gather_dim during forward phase.
+                    input_comm_action.comm_spec.gather_dim = shard_dim
+                    # it will split the input activation grad through shard_dim during backward phase.
+                    input_comm_action.comm_spec.shard_dim = shard_dim
+
+                elif len(total_mesh_dim_list) >= 2:
+                    source_spec = sharding_spec_mapping["input"]
+                    target_spec = ShardingSpec(device_mesh=self.device_mesh,
+                                               entire_shape=source_spec.entire_shape,
+                                               dim_partition_dict={})
+                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
+                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
+
+                else:
+                    input_comm_action = None
+
+                if input_comm_action is not None:
+                    communication_action_mapping["input"] = input_comm_action
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class PermuteGenerator(ReshapeGenerator):
+    """
+    PermuteGenerator deals with the sharding strategies of permute op.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+
+            permute_dims = self.op_data['permute_dims'].data
+            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
+            dim_partition_dict_for_output = {}
+            for dim_index, permute_dim in enumerate(permute_dims):
+                if permute_dim in dim_partition_dict_for_input:
+                    dim_partition_dict_for_output[dim_index] = dim_partition_dict_for_input[permute_dim]
+
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class TransposeGenerator(ReshapeGenerator):
+    """
+    TransposeGenerator deals with the sharding strategies of transpose op.
+    """
+
     def collate_strategies(self) -> List[ShardingStrategy]:
         strategy_list = []
-        # For reshape function, to keep the computing correctness we keep the sharding
-        # spec of input is fully replicated. In addition, we will keep the output in
-        # replica status and let the successor node choose the way to resharding the
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
+            dim_partition_dict_for_output = {}
+
+            transpose_dims = self.op_data['transpose_dims'].data
+            dim_0 = transpose_dims[0]
+            dim_1 = transpose_dims[1]
+            for dim, sharded_dims in dim_partition_dict_for_input.items():
+                if dim == dim_0:
+                    dim_partition_dict_for_output[dim_1] = dim_partition_dict_for_input[dim_0]
+                elif dim == dim_1:
+                    dim_partition_dict_for_output[dim_0] = dim_partition_dict_for_input[dim_1]
+                else:
+                    dim_partition_dict_for_output[dim] = sharded_dims
+
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class SplitGenerator(ReshapeGenerator):
+    """
+    SplitGenerator deals with the sharding strategies of split op.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            recover_dims = None
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+            dim_partition_dict_for_input = copy.deepcopy(input_sharding_spec.dim_partition_dict)
+            split_size, split_dim = self.op_data['split_info'].data
+
+            if split_dim in dim_partition_dict_for_input:
+                recover_dims = dim_partition_dict_for_input.pop(split_dim)
+
+            dim_partition_dict_for_output = [
+                copy.deepcopy(dim_partition_dict_for_input) for _ in range(len(self.op_data["output"].data))
+            ]
+            assert len(dim_partition_dict_for_output) >= 2
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            name = f'{sharding_spec_mapping["input"].sharding_sequence}_{index}'
+
+            # add comm action if the input need to be recovered to replica in the split dimension.
+            if recover_dims:
+                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
+                if len(recover_dims) == 1:
+                    recover_dims = recover_dims[0]
+                    input_comm_action = self.get_communication_action(
+                        sharding_spec=sharding_spec_mapping["input"],
+                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
+                        logical_process_axis=recover_dims,
+                        comm_type=CommType.BEFORE,
+                        arg_index=0)
+                    # it will gather the input through gather_dim during forward phase.
+                    input_comm_action.comm_spec.gather_dim = split_dim
+                    # it will split the input activation grad through split_dim during backward phase.
+                    input_comm_action.comm_spec.shard_dim = split_dim
+
+                elif len(recover_dims) >= 2:
+                    # original sharding spec
+                    source_spec = input_sharding_spec
+                    # target sharding spec
+                    target_spec = sharding_spec_mapping["input"]
+                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
+                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
+
+                else:
+                    input_comm_action = None
+
+                if input_comm_action is not None:
+                    communication_action_mapping["input"] = input_comm_action
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class DefaultReshapeGenerator(ReshapeGenerator):
+    """
+    DefaultReshapeGenerator which deals with the sharding strategies of Reshape Op which have to recover the tensor
+    to Replica status.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        # For default reshape strategy, to keep the computing correctness we keep the
+        # sharding spec of input is fully replicated. In addition, we will keep the output
+        # in replica status and let the successor node choose the way to resharding the
         # output node. Therefore, the different strategies of input node with same
         # output sharding spec will generate same strategy for reshape function.
         for index, strategy in enumerate(self.predecessor_node.strategies_vector):
@@ -114,9 +362,4 @@ def collate_strategies(self) -> List[ShardingStrategy]:
                                                   communication_action_mapping=communication_action_mapping)
             strategy_list.append(strategy)
 
-        for strategy in strategy_list:
-            self.update_communication_cost(strategy)
-            self.update_compute_cost(strategy)
-            self.update_memory_cost(strategy)
-
         return strategy_list
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/transpose_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/transpose_handler.py
similarity index 90%
rename from colossalai/auto_parallel/tensor_shard/node_handler/experimental/transpose_handler.py
rename to colossalai/auto_parallel/tensor_shard/node_handler/transpose_handler.py
index 3c7336a93167..7a9d37726490 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/transpose_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/transpose_handler.py
@@ -2,11 +2,10 @@
 
 import torch
 
-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import TransposeGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import StrategyGenerator, TransposeGenerator
 
 __all__ = ['TransposeHandler']
 
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/view_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/view_handler.py
similarity index 88%
rename from colossalai/auto_parallel/tensor_shard/node_handler/experimental/view_handler.py
rename to colossalai/auto_parallel/tensor_shard/node_handler/view_handler.py
index 6be634593510..7dff89d1d7a3 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/experimental/view_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/view_handler.py
@@ -2,11 +2,10 @@
 
 import torch
 
-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import ViewGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import StrategyGenerator, ViewGenerator
 
 __all__ = ['ViewHandler']
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_reshape_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_default_reshape_handler.py
similarity index 91%
rename from tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_reshape_handler.py
rename to tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_default_reshape_handler.py
index de277002b75d..ea7c2b729635 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_reshape_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_default_reshape_handler.py
@@ -1,8 +1,8 @@
 import torch
 import torch.nn as nn
 
+from colossalai.auto_parallel.tensor_shard.node_handler import DefaultReshapeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.reshape_handler import ReshapeHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx import ColoGraphModule, ColoTracer
@@ -51,9 +51,9 @@ def test_reshape_handler():
                                        strategies_vector=conv_strategies_vector)
     conv_handler.register_strategy(compute_resharding_cost=False)
     setattr(conv_mod_node, 'strategies_vector', conv_strategies_vector)
-    reshape_handler = ReshapeHandler(node=reshape_node,
-                                     device_mesh=device_mesh,
-                                     strategies_vector=reshape_strategies_vector)
+    reshape_handler = DefaultReshapeHandler(node=reshape_node,
+                                            device_mesh=device_mesh,
+                                            strategies_vector=reshape_strategies_vector)
 
     reshape_handler.register_strategy(compute_resharding_cost=False)
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_getitem_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_getitem_handler.py
index 3c35da61b1c3..c72d2a6a80e8 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_getitem_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_getitem_handler.py
@@ -5,10 +5,10 @@
 import torch.multiprocessing as mp
 import torch.nn as nn
 
+from colossalai.auto_parallel.tensor_shard.node_handler.default_reshape_handler import DefaultReshapeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.getitem_handler import GetItemHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.placeholder_handler import PlaceholderHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.reshape_handler import ReshapeHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx import ColoGraphModule, ColoTracer
@@ -153,7 +153,9 @@ def test_getitem_from_tuple_handler():
     )
     input_handler.register_strategy(compute_resharding_cost=False)
     setattr(input_node, 'strategies_vector', input_strategies_vector)
-    split_handler = ReshapeHandler(node=split_node, device_mesh=device_mesh, strategies_vector=split_strategies_vector)
+    split_handler = DefaultReshapeHandler(node=split_node,
+                                          device_mesh=device_mesh,
+                                          strategies_vector=split_strategies_vector)
     split_handler.register_strategy(compute_resharding_cost=False)
     setattr(split_node, 'strategies_vector', split_strategies_vector)
     getitem_handler = GetItemHandler(node=getitem_node,
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py
index c695b8843a3c..b12db13324c0 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py
@@ -5,8 +5,8 @@
 import torch.multiprocessing as mp
 import torch.nn as nn
 
+from colossalai.auto_parallel.tensor_shard.node_handler import PermuteHandler, TransposeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.experimental import PermuteHandler, TransposeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py
index 9e8e905c54a2..813651869454 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py
@@ -5,8 +5,8 @@
 import torch.multiprocessing as mp
 import torch.nn as nn
 
+from colossalai.auto_parallel.tensor_shard.node_handler import SplitHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.experimental import SplitHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
@@ -156,8 +156,7 @@ def check_split_handler(rank, split_size, split_dim, model_cls, world_size, port
     # reshape handler is a following strategy handler, so the number of strategies is equal to the predecessor node.
     assert len(split_strategies_vector) == len(previous_strategies_vector)
     strategy_name_list = [strategy.name for strategy in split_strategies_vector]
-    for name in strategy_name_list:
-        print(name)
+
     if model_cls.__name__ == 'ConvSplitModel':
 
         if split_dim == 0:
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py
index 08a702789f9f..d07d2f76c178 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py
@@ -5,8 +5,8 @@
 import torch.multiprocessing as mp
 import torch.nn as nn
 
+from colossalai.auto_parallel.tensor_shard.node_handler import ViewHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.experimental import ViewHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh

From a020eecc7051083e1dbc4a02bd49a9521b032aad Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Wed, 8 Feb 2023 17:28:29 +0800
Subject: [PATCH 280/503] [doc] fix typo of BLOOM (#2643)

* [doc] fix typo of BLOOM
---
 README-zh-Hans.md | 4 ++--
 README.md         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 1e1d475c906a..34122db65d75 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -58,7 +58,7 @@
    <ul>
      <li><a href="#GPT-3-Inference">GPT-3</a></li>
      <li><a href="#OPT-Serving">1750亿参数OPT在线推理服务</a></li>
-     <li><a href="#BLOOM-Inference">1750亿参数 BLOOM</a></li>
+     <li><a href="#BLOOM-Inference">1760亿参数 BLOOM</a></li>
    </ul>
  </li>
 <li>
@@ -204,7 +204,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
 </p>
 
-- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): 降低1750亿参数BLOOM模型部署推理成本超10倍
+- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): 降低1760亿参数BLOOM模型部署推理成本超10倍
 
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 
diff --git a/README.md b/README.md
index dc1e80113517..7ec864c02cdb 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@
    <ul>
      <li><a href="#GPT-3-Inference">GPT-3</a></li>
      <li><a href="#OPT-Serving">OPT-175B Online Serving for Text Generation</a></li>
-     <li><a href="#BLOOM-Inference">175B BLOOM</a></li>
+     <li><a href="#BLOOM-Inference">176B BLOOM</a></li>
    </ul>
  </li>
    <li>
@@ -206,7 +206,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
 </p>
 
-- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): Reduce hardware deployment costs of 175-billion-parameter BLOOM by more than 10 times.
+- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): Reduce hardware deployment costs of 176-billion-parameter BLOOM by more than 10 times.
 
 <p align="right">(<a href="#top">back to top</a>)</p>
 

From 85b2303b5506f9cef57bed571eedb186015a4b8c Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 9 Feb 2023 14:21:38 +0800
Subject: [PATCH 281/503] [doc] migrate the markdown files (#2652)

---
 .github/workflows/check_doc_on_pr.yml         |  23 +
 .github/workflows/scripts/check_doc_i18n.py   |  67 ++
 .../Colossal-Auto/feature/auto_checkpoint.md  |   0
 .../en/Colossal-Auto/feature/device_mesh.md   |   0
 .../feature/shape_consistency.md              |   0
 .../source/en/Colossal-Auto/feature/tracer.md |   0
 .../Colossal-Auto/get_started/installation.md |  27 +
 .../Colossal-Auto/get_started/introduction.md |  47 ++
 .../en/Colossal-Auto/get_started/run_demo.md  |  17 +
 .../advanced_tutorials/add_your_parallel.md   | 124 ++++
 .../define_your_own_parallel_model.md         |  36 +
 ...rate_mixture_of_experts_into_your_model.md | 139 ++++
 .../en/advanced_tutorials/meet_gemini.md      |  88 +++
 .../en/advanced_tutorials/opt_service.md      |  81 +++
 ...parallelize_your_training_like_Megatron.md | 192 ++++++
 .../train_gpt_using_hybrid_parallelism.md     | 270 ++++++++
 .../train_vit_using_pipeline_parallelism.md   | 247 +++++++
 .../train_vit_with_hybrid_parallelism.md      | 646 ++++++++++++++++++
 docs/source/en/basics/colotensor_concept.md   |  97 +++
 docs/source/en/basics/command_line_tool.md    |  53 ++
 .../en/basics/configure_parallelization.md    | 156 +++++
 docs/source/en/basics/define_your_config.md   |  82 +++
 docs/source/en/basics/engine_trainer.md       | 387 +++++++++++
 docs/source/en/basics/initialize_features.md  |  49 ++
 docs/source/en/basics/launch_colossalai.md    | 232 +++++++
 docs/source/en/basics/model_checkpoint.md     |  61 ++
 .../source/en/concepts/colossalai_overview.md |  36 +
 .../en/concepts/distributed_training.md       | 120 ++++
 .../en/concepts/paradigms_of_parallelism.md   | 123 ++++
 docs/source/en/features/1D_tensor_parallel.md | 111 +++
 docs/source/en/features/2D_tensor_parallel.md | 142 ++++
 .../en/features/2p5D_tensor_parallel.md       | 142 ++++
 docs/source/en/features/3D_tensor_parallel.md | 151 ++++
 .../en/features/gradient_accumulation.md      |  45 ++
 docs/source/en/features/gradient_clipping.md  |  62 ++
 docs/source/en/features/gradient_handler.md   |  63 ++
 .../en/features/mixed_precision_training.md   | 367 ++++++++++
 docs/source/en/features/nvme_offload.md       |  42 ++
 docs/source/en/features/pipeline_parallel.md  | 159 +++++
 docs/source/en/features/zero_with_chunk.md    | 262 +++++++
 docs/source/en/get_started/installation.md    |  37 +
 docs/source/en/get_started/reading_roadmap.md |  19 +
 docs/source/en/get_started/run_demo.md        |  43 ++
 .../Colossal-Auto/feature/auto_checkpoint.md  |   0
 .../zh/Colossal-Auto/feature/device_mesh.md   |   0
 .../feature/shape_consistency.md              |   0
 .../source/zh/Colossal-Auto/feature/tracer.md |   0
 .../Colossal-Auto/get_started/installation.md |  28 +
 .../Colossal-Auto/get_started/introduction.md |  43 ++
 .../zh/Colossal-Auto/get_started/run_demo.md  |  16 +
 .../advanced_tutorials/add_your_parallel.md   | 112 +++
 .../define_your_own_parallel_model.md         |  31 +
 ...rate_mixture_of_experts_into_your_model.md | 140 ++++
 .../zh/advanced_tutorials/meet_gemini.md      |  96 +++
 .../zh/advanced_tutorials/opt_service.md      |  79 +++
 ...parallelize_your_training_like_Megatron.md | 176 +++++
 .../train_gpt_using_hybrid_parallelism.md     | 275 ++++++++
 .../train_vit_using_pipeline_parallelism.md   | 246 +++++++
 .../train_vit_with_hybrid_parallelism.md      | 591 ++++++++++++++++
 docs/source/zh/basics/colotensor_concept.md   |  98 +++
 docs/source/zh/basics/command_line_tool.md    |  47 ++
 .../zh/basics/configure_parallelization.md    | 136 ++++
 docs/source/zh/basics/define_your_config.md   |  71 ++
 docs/source/zh/basics/engine_trainer.md       | 384 +++++++++++
 docs/source/zh/basics/initialize_features.md  |  46 ++
 docs/source/zh/basics/launch_colossalai.md    | 212 ++++++
 docs/source/zh/basics/model_checkpoint.md     |  61 ++
 .../source/zh/concepts/colossalai_overview.md |  36 +
 .../zh/concepts/distributed_training.md       |  88 +++
 .../zh/concepts/paradigms_of_parallelism.md   |  91 +++
 docs/source/zh/features/1D_tensor_parallel.md | 111 +++
 docs/source/zh/features/2D_tensor_parallel.md | 141 ++++
 .../zh/features/2p5D_tensor_parallel.md       | 145 ++++
 docs/source/zh/features/3D_tensor_parallel.md | 154 +++++
 .../zh/features/gradient_accumulation.md      |  40 ++
 docs/source/zh/features/gradient_clipping.md  |  51 ++
 docs/source/zh/features/gradient_handler.md   |  59 ++
 .../zh/features/mixed_precision_training.md   | 344 ++++++++++
 docs/source/zh/features/nvme_offload.md       |  43 ++
 docs/source/zh/features/pipeline_parallel.md  | 158 +++++
 docs/source/zh/features/zero_with_chunk.md    | 261 +++++++
 docs/source/zh/get_started/installation.md    |  36 +
 docs/source/zh/get_started/reading_roadmap.md |  10 +
 docs/source/zh/get_started/run_demo.md        |  28 +
 84 files changed, 9729 insertions(+)
 create mode 100644 .github/workflows/check_doc_on_pr.yml
 create mode 100644 .github/workflows/scripts/check_doc_i18n.py
 create mode 100644 docs/source/en/Colossal-Auto/feature/auto_checkpoint.md
 create mode 100644 docs/source/en/Colossal-Auto/feature/device_mesh.md
 create mode 100644 docs/source/en/Colossal-Auto/feature/shape_consistency.md
 create mode 100644 docs/source/en/Colossal-Auto/feature/tracer.md
 create mode 100644 docs/source/en/Colossal-Auto/get_started/installation.md
 create mode 100644 docs/source/en/Colossal-Auto/get_started/introduction.md
 create mode 100644 docs/source/en/Colossal-Auto/get_started/run_demo.md
 create mode 100644 docs/source/en/advanced_tutorials/add_your_parallel.md
 create mode 100644 docs/source/en/advanced_tutorials/define_your_own_parallel_model.md
 create mode 100644 docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
 create mode 100644 docs/source/en/advanced_tutorials/meet_gemini.md
 create mode 100644 docs/source/en/advanced_tutorials/opt_service.md
 create mode 100644 docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
 create mode 100644 docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
 create mode 100644 docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md
 create mode 100644 docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
 create mode 100644 docs/source/en/basics/colotensor_concept.md
 create mode 100644 docs/source/en/basics/command_line_tool.md
 create mode 100644 docs/source/en/basics/configure_parallelization.md
 create mode 100644 docs/source/en/basics/define_your_config.md
 create mode 100644 docs/source/en/basics/engine_trainer.md
 create mode 100644 docs/source/en/basics/initialize_features.md
 create mode 100644 docs/source/en/basics/launch_colossalai.md
 create mode 100644 docs/source/en/basics/model_checkpoint.md
 create mode 100644 docs/source/en/concepts/colossalai_overview.md
 create mode 100644 docs/source/en/concepts/distributed_training.md
 create mode 100644 docs/source/en/concepts/paradigms_of_parallelism.md
 create mode 100644 docs/source/en/features/1D_tensor_parallel.md
 create mode 100644 docs/source/en/features/2D_tensor_parallel.md
 create mode 100644 docs/source/en/features/2p5D_tensor_parallel.md
 create mode 100644 docs/source/en/features/3D_tensor_parallel.md
 create mode 100644 docs/source/en/features/gradient_accumulation.md
 create mode 100644 docs/source/en/features/gradient_clipping.md
 create mode 100644 docs/source/en/features/gradient_handler.md
 create mode 100644 docs/source/en/features/mixed_precision_training.md
 create mode 100644 docs/source/en/features/nvme_offload.md
 create mode 100644 docs/source/en/features/pipeline_parallel.md
 create mode 100644 docs/source/en/features/zero_with_chunk.md
 create mode 100644 docs/source/en/get_started/installation.md
 create mode 100644 docs/source/en/get_started/reading_roadmap.md
 create mode 100644 docs/source/en/get_started/run_demo.md
 create mode 100644 docs/source/zh/Colossal-Auto/feature/auto_checkpoint.md
 create mode 100644 docs/source/zh/Colossal-Auto/feature/device_mesh.md
 create mode 100644 docs/source/zh/Colossal-Auto/feature/shape_consistency.md
 create mode 100644 docs/source/zh/Colossal-Auto/feature/tracer.md
 create mode 100644 docs/source/zh/Colossal-Auto/get_started/installation.md
 create mode 100644 docs/source/zh/Colossal-Auto/get_started/introduction.md
 create mode 100644 docs/source/zh/Colossal-Auto/get_started/run_demo.md
 create mode 100644 docs/source/zh/advanced_tutorials/add_your_parallel.md
 create mode 100644 docs/source/zh/advanced_tutorials/define_your_own_parallel_model.md
 create mode 100644 docs/source/zh/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
 create mode 100644 docs/source/zh/advanced_tutorials/meet_gemini.md
 create mode 100644 docs/source/zh/advanced_tutorials/opt_service.md
 create mode 100644 docs/source/zh/advanced_tutorials/parallelize_your_training_like_Megatron.md
 create mode 100644 docs/source/zh/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
 create mode 100644 docs/source/zh/advanced_tutorials/train_vit_using_pipeline_parallelism.md
 create mode 100644 docs/source/zh/advanced_tutorials/train_vit_with_hybrid_parallelism.md
 create mode 100644 docs/source/zh/basics/colotensor_concept.md
 create mode 100644 docs/source/zh/basics/command_line_tool.md
 create mode 100644 docs/source/zh/basics/configure_parallelization.md
 create mode 100644 docs/source/zh/basics/define_your_config.md
 create mode 100644 docs/source/zh/basics/engine_trainer.md
 create mode 100644 docs/source/zh/basics/initialize_features.md
 create mode 100644 docs/source/zh/basics/launch_colossalai.md
 create mode 100644 docs/source/zh/basics/model_checkpoint.md
 create mode 100755 docs/source/zh/concepts/colossalai_overview.md
 create mode 100755 docs/source/zh/concepts/distributed_training.md
 create mode 100755 docs/source/zh/concepts/paradigms_of_parallelism.md
 create mode 100644 docs/source/zh/features/1D_tensor_parallel.md
 create mode 100644 docs/source/zh/features/2D_tensor_parallel.md
 create mode 100644 docs/source/zh/features/2p5D_tensor_parallel.md
 create mode 100644 docs/source/zh/features/3D_tensor_parallel.md
 create mode 100644 docs/source/zh/features/gradient_accumulation.md
 create mode 100644 docs/source/zh/features/gradient_clipping.md
 create mode 100644 docs/source/zh/features/gradient_handler.md
 create mode 100644 docs/source/zh/features/mixed_precision_training.md
 create mode 100644 docs/source/zh/features/nvme_offload.md
 create mode 100644 docs/source/zh/features/pipeline_parallel.md
 create mode 100644 docs/source/zh/features/zero_with_chunk.md
 create mode 100755 docs/source/zh/get_started/installation.md
 create mode 100755 docs/source/zh/get_started/reading_roadmap.md
 create mode 100755 docs/source/zh/get_started/run_demo.md

diff --git a/.github/workflows/check_doc_on_pr.yml b/.github/workflows/check_doc_on_pr.yml
new file mode 100644
index 000000000000..5b3c4f6fbc6d
--- /dev/null
+++ b/.github/workflows/check_doc_on_pr.yml
@@ -0,0 +1,23 @@
+name: Check Documentation on PR
+
+on:
+  pull_request:
+    paths:
+      - 'docs/**'
+
+jobs:
+  check-i18n:
+    name: Check docs in diff languages
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.8.14'
+
+      - run: python .github/workflows/scripts/check_doc_i18n.py -d docs/source
diff --git a/.github/workflows/scripts/check_doc_i18n.py b/.github/workflows/scripts/check_doc_i18n.py
new file mode 100644
index 000000000000..1aa7283e9e52
--- /dev/null
+++ b/.github/workflows/scripts/check_doc_i18n.py
@@ -0,0 +1,67 @@
+import argparse
+import os
+
+
+def compare_dirs(dir1, dir2):
+    # First, we need to check if the two directories exist
+    if not os.path.exists(dir1) or not os.path.exists(dir2):
+        return False
+
+    # Now, we compare the list of items in each directory
+    items1 = os.listdir(dir1)
+    items2 = os.listdir(dir2)
+
+    # If the number of items in each directory is different, the directories are different
+    if len(items1) != len(items2):
+        return False
+
+    # For each item in the first directory, we check if there is a corresponding item in the second directory
+    for item in items1:
+        item_path1 = os.path.join(dir1, item)
+        item_path2 = os.path.join(dir2, item)
+
+        # If the corresponding item doesn't exist in the second directory, the directories are different
+        if not os.path.exists(item_path2):
+            print(f'Found mismatch: {item_path1}, {item_path2}')
+            return False
+
+        # If the corresponding item is a directory, we compare the two directories recursively
+        if os.path.isdir(item_path1) and os.path.isdir(item_path2):
+            if not compare_dirs(item_path1, item_path2):
+                print(f'Found mismatch: {item_path1}, {item_path2}')
+                return False
+
+        # both are files
+        elif os.path.isfile(item_path1) and os.path.isfile(item_path2):
+            continue
+
+        # If the corresponding item is not a file or a directory, the directories are different
+        else:
+            print(f'Found mismatch: {item_path1}, {item_path2}')
+            return False
+
+    # If all items are the same, the directories are the same
+    return True
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--directory', help="The directory where the multi-language source files are kept.")
+    args = parser.parse_args()
+
+    i18n_folders = os.listdir(args.directory)
+    i18n_folders = [os.path.join(args.directory, val) for val in i18n_folders]
+
+    if len(i18n_folders) > 1:
+        for i in range(1, len(i18n_folders)):
+            dir1 = i18n_folders[0]
+            dir2 = i18n_folders[i]
+            print(f'comparing {dir1} vs {dir2}')
+            match = compare_dirs(i18n_folders[0], i18n_folders[i])
+
+            if not match:
+                print(
+                    f"{dir1} and {dir2} don't match, please ensure that your documentation is available in different languages"
+                )
+            else:
+                print(f"{dir1} and {dir2} match")
diff --git a/docs/source/en/Colossal-Auto/feature/auto_checkpoint.md b/docs/source/en/Colossal-Auto/feature/auto_checkpoint.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/en/Colossal-Auto/feature/device_mesh.md b/docs/source/en/Colossal-Auto/feature/device_mesh.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/en/Colossal-Auto/feature/shape_consistency.md b/docs/source/en/Colossal-Auto/feature/shape_consistency.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/en/Colossal-Auto/feature/tracer.md b/docs/source/en/Colossal-Auto/feature/tracer.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/en/Colossal-Auto/get_started/installation.md b/docs/source/en/Colossal-Auto/get_started/installation.md
new file mode 100644
index 000000000000..d2a532bfa7b0
--- /dev/null
+++ b/docs/source/en/Colossal-Auto/get_started/installation.md
@@ -0,0 +1,27 @@
+# Setup
+
+## Announcement
+
+Our auto-parallel feature is an alpha version. It is still under development. We will keep updating it and make it more stable. If you encounter any problem, please feel free to raise an issue.
+
+## Requirements
+
+We need some extra dependencies to support auto-parallel. Please install them before using auto-parallel.
+
+### Install PyTorch
+
+We only support PyTorch 1.12 now, other versions are not tested. We will support more versions in the future.
+
+```bash
+#conda
+conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch
+#pip
+pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113
+```
+
+### Install pulp and coin-or-cbc
+
+```bash
+pip install pulp
+conda install -c conda-forge coin-or-cbc
+```
diff --git a/docs/source/en/Colossal-Auto/get_started/introduction.md b/docs/source/en/Colossal-Auto/get_started/introduction.md
new file mode 100644
index 000000000000..3d504d9c9cf8
--- /dev/null
+++ b/docs/source/en/Colossal-Auto/get_started/introduction.md
@@ -0,0 +1,47 @@
+# Introduction
+
+In recent years, the deployment of large-scale machine learning models has become increasingly important. However, distributed training systems often require **manual parallelization plans**, which can be complex and require expert knowledge in system engineering and configuration. This can be a challenge for most AI developers without the necessary skills. The need for manual parallelization can make deploying large-scale machine learning models difficult and expensive.
+
+**Colossal-Auto** simplifies the process of deploying large-scale machine learning models for AI developers. Compared to other solutions that require manual configuration of complex parallel policies and model modification, Colossal-Auto only requires one line of code from the user, along with cluster information and model configurations, to enable distributed training. Technically, it seamlessly **integrates with popular AI model frameworks like Hugging Face and Timm.**
+
+
+
+## Overview
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/auto_parallel.png"/>
+</figure>
+
+
+## Usage
+
+```python
+# wrap the model using auto_engine
+model = autoparallelize(model, meta_input_samples)
+# normal training loop
+...
+```
+
+
+## Graph Tracing
+
+Colossal-Auto is **the first auto-parallelism system** that uses static graph analysis based on the PyTorch framework. Obtaining a static execution plan for PyTorch, a dynamic graph framework, has long been an area of research in the field of machine learning systems. Colossal-Auto uses ColoTracer, a forked version of the torch.FX Tracer, to guide the search for an optimal parallelization strategy. The meta-information of each tensor, such as tensor shape, dims, dtype, etc., is computed and recorded during the tracing process. This approach has the advantage of better generalization, as it is not tied to specific models or configurations.
+
+
+
+## Fine-grained Parallelism Search
+Colossal-AI’s auto-parallelism searches for strategies in regard to each operand with the goal of achieving the fastest runtime while meeting memory budget constraints. It ultimately determines the actual training time strategy, including the tensor split strategy for each tensor, the type of communication operators to be inserted between different computing nodes, whether to replace operators, etc. The tensor, data, and hybrid parallelism such as column and row split used by NVIDIA in Megatron-LM and other parallelism systems are all subsets of strategies that can be searched by Colossal-AI. In addition to these parallelisms that can be manually specified, Colossal-AI can specify a unique parallelism method for each operation, potentially finding a better parallelism strategy than what human experts could provide.
+
+
+
+## Distributed Tensor and Shape-Consistency System
+
+The Colossal-AI system uses a device-mesh, similar to PyTorch's latest DTensor release, to manage its cluster. Colossal-AI uses a sharding-spec to annotate the storage status of each tensor and facilitate their distribution across the cluster. The system also employs a shape-consistency manager to automatically transform tensors between different sharding-specs, allowing for seamless slicing and dicing of tensors, while the shape-consistency manager ensures that the output of upstream operands is consistently stored in the cluster, regardless of how the input of downstream operands is stored. This makes Colossal-AI highly versatile and easy to use without users worrying about the storage status of tensors when performing operations on them.
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/shape_consistency.png"/>
+</figure>
+
+Here are some key advantages of Colossal-AI compared to PyTorch DTensor:
+- Colossal-AI's device-mesh uses cluster performance metrics and profiling results to estimate the time consumption of different communication operators. This helps Colossal-AI optimize communication between nodes and improve overall system efficiency.
+- Colossal-AI's shape-consistency manager uses a greedy search algorithm to find relatively efficient ways to transform tensors between different sharding-specs, rather than simply transforming dimensions one by one. This can lead to more efficient and effective transformations.
+- The integration of all-to-all operations in Colossal-AI increases the scalability of the system by enabling more efficient communication between nodes. This is especially useful for large-scale machine learning tasks that require the transfer of large amounts of data between nodes.
diff --git a/docs/source/en/Colossal-Auto/get_started/run_demo.md b/docs/source/en/Colossal-Auto/get_started/run_demo.md
new file mode 100644
index 000000000000..6918ef497d19
--- /dev/null
+++ b/docs/source/en/Colossal-Auto/get_started/run_demo.md
@@ -0,0 +1,17 @@
+# Quick Demo
+
+Colossal-Auto simplifies the process of deploying large-scale machine learning models for AI developers. Compared to other solutions that require manual configuration of complex parallel policies and model modification, Colossal-Auto only requires one line of code from the user, along with cluster information and model configurations, to enable distributed training. Quick demos showing how to use Colossal-Auto are given below.
+
+### 1. Basic usage
+
+Colossal-Auto can be used to find a hybrid SPMD parallel strategy that includes data and tensor (i.e., 1D, 2D, sequential) parallelism for each operation. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/experiments/auto_parallel).
+Detailed instructions can be found in its `README.md`.
+
+### 2. Integration with activation checkpoint
+
+Colossal-Auto's automatic search function for activation checkpointing finds the most efficient checkpoint within a given memory budget, rather than just aiming for maximum memory compression. To avoid a lengthy search process for an optimal activation checkpoint, Colossal-Auto has implemented a two-stage search process. This allows the system to find a feasible distributed training solution in a reasonable amount of time while still benefiting from activation checkpointing for memory management. The integration of activation checkpointing in Colossal-AI improves the efficiency and effectiveness of large model training. You can follow the [Resnet example](TBA).
+Detailed instructions can be found in its `README.md`.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/auto_ckpt.jpg"/>
+</figure>
diff --git a/docs/source/en/advanced_tutorials/add_your_parallel.md b/docs/source/en/advanced_tutorials/add_your_parallel.md
new file mode 100644
index 000000000000..be7284a7ab64
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/add_your_parallel.md
@@ -0,0 +1,124 @@
+# Add Your Own Parallel Mode
+
+Author: Shenggui Li, Yongbin Li
+
+**Prerequisite:**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Configure Parallelization](../basics/configure_parallelization.md)
+
+## Introduction
+
+To enable researchers and engineers to extend our system to other novel large-scale distributed training algorithms
+with less effort, we have decoupled various components in the training lifecycle. You can implement your own
+parallelism by simply inheriting from the base class.
+
+The main components are:
+
+1. `ProcessGroupInitializer`
+2. `GradientHandler`
+3. `Schedule`
+
+**This currently requires some changes to the source code, thus we recommend that you install from source with the `-e` flag.
+The `-e` flag makes the installation editable, so your code changes will be reflected in your Python runtime.
+We will work on this to avoid changes to the source code in future releases.**
+
+
+## Process Group Initializer
+
+Parallelism is often managed by process groups where processes involved in the same parallel algorithm are placed in the same
+process group. For different parallel algorithms, different process groups need to be created. Colossal-AI provides a
+global context for users to easily manage their process groups. If you wish to add a new process group, you can easily
+define a new class and set it in your configuration file. To define your own way of creating process groups, you can
+follow the steps below to create a new distributed initialization.
+
+1. Add your parallel mode in `colossalai.context.parallel_mode.ParallelMode`.
+    ```python
+    class ParallelMode(Enum):
+        GLOBAL = 'global'
+        DATA = 'data'
+        PIPELINE = 'pipe'
+        ...
+
+        NEW_MODE = 'new_mode'  # define your mode here
+    ```
+
+2. Create a `ProcessGroupInitializer`. You can refer to examples given in `colossalai.context.dist_group_initializer`. The
+   first six arguments are fixed. `ParallelContext` will pass in these arguments for you. If you need to set other
+   arguments, you can add them after these, like `arg1, arg2` in the example below. Lastly, register your initializer to the
+   registry by adding the decorator `@DIST_GROUP_INITIALIZER.register_module`.
+    ```python
+    # sample initializer class
+    @DIST_GROUP_INITIALIZER.register_module
+    class MyParallelInitializer(ProcessGroupInitializer):
+
+        def __init__(self,
+                    rank: int,
+                    world_size: int,
+                    config: Config,
+                    data_parallel_size: int,
+                    pipeline_parallel_size: int,
+                    tensor_parallel_size: int,
+                    arg1,
+                    arg2):
+            super().__init__(rank, world_size, config)
+            self.arg1 = arg1
+            self.arg2 = arg2
+            # ... your variable init
+
+        def init_parallel_groups(self):
+            # initialize your process groups
+            pass
+
+    ```
+
+    Then, you can insert your new initializer to the current mode-to-initialize mapping
+    in `colossalai.constants.INITIALIZER_MAPPING`. You can modify the file or insert new key-value pair dynamically.
+
+    ```python
+    colossalai.constants.INITIALIZER_MAPPING['new_mode'] = 'MyParallelInitializer'
+    ```
+
+3. Set your initializer in your config file. You can pass in your own arguments if there are any. This allows
+   the `ParallelContext` to create your initializer and initialize your desired process groups.
+
+    ```python
+    parallel = dict(
+        pipeline=dict(size=1),
+        tensor=dict(size=x, mode='new_mode')  # this is where you enable your new parallel mode
+    )
+    ```
+
+## Gradient Handler
+
+Gradient handlers are objects which execute the all-reduce operations on parameters' gradients. As different all-reduce
+strategies may be executed for different kinds of parallelism, users can
+inherit `colossalai.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
+uses the normal data parallel gradient handler which all-reduces the gradients across data parallel ranks. The data
+parallel gradient handler is added to the engine automatically if data parallel is detected. You can add your own
+gradient handler like below:
+
+```python
+from colossalai.registry import GRADIENT_HANDLER
+from colossalai.engine import BaseGradientHandler
+
+@GRADIENT_HANDLER.register_module
+class YourGradientHandler(BaseGradientHandler):
+
+    def handle_gradient(self):
+        do_something()
+
+```
+
+Afterwards, you can specify the gradient handler you want to use in your configuration file.
+
+```python
+gradient_handlers = [
+    dict(type='YourGradientHandler'),
+]
+```
+
+## Schedule
+
+Schedule entails how to execute a forward and backward pass. Currently, Colossal-AI provides pipeline and non-pipeline
+schedules. If you want to modify how the forward and backward passes are executed, you can
+inherit `colossalai.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
diff --git a/docs/source/en/advanced_tutorials/define_your_own_parallel_model.md b/docs/source/en/advanced_tutorials/define_your_own_parallel_model.md
new file mode 100644
index 000000000000..8e48737d2f64
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/define_your_own_parallel_model.md
@@ -0,0 +1,36 @@
+# Define your own parallel model
+
+Author: Zhengda Bian, Yongbin Li
+
+> ⚠️ We are working on this documentation to make it more detailed. We will introduce the mechanism of different parallelism
+> and how to use them to write a model.
+
+Let's say that you have a huge MLP model with billions of parameters and its extremely large hidden layer size makes it
+impossible to fit into a single GPU directly. Don't worry, Colossal-AI is here to help you sort things out. With the help of Colossal-AI,
+you can write your model in the familiar way in which you used to write models for a single GPU, while Colossal-AI automatically
+splits your model weights and fits them perfectly into a set of GPUs. We give a simple example showing how to write a simple
+2D parallel model in the Colossal-AI context.
+
+## Write a simple 2D parallel model
+
+```python
+from colossalai.nn import Linear2D
+import torch.nn as nn
+
+class MLP_2D(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.linear_1 = Linear2D(in_features=1024, out_features=16384)
+        self.linear_2 = Linear2D(in_features=16384, out_features=1024)
+
+    def forward(self, x):
+        x = self.linear_1(x)
+        x = self.linear_2(x)
+        return x
+```
+
+## Use pre-defined model
+
+For the sake of your convenience, we kindly provide you in our Model Zoo with some prevalent models such as *BERT*, *ViT*, *MoE*,
+and *GPT*. Feel free to customize them into different sizes to fit into your special needs.
diff --git a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
new file mode 100644
index 000000000000..e01caf76d2b3
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
@@ -0,0 +1,139 @@
+# Integrate Mixture-of-Experts Into Your Model
+
+Author: Haichen Huang
+
+**Example Code**
+- [ColossalAI-Examples WideNet](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet)
+
+**Related Paper**
+- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961)
+- [Go Wider Instead of Deeper](https://arxiv.org/abs/2107.11817)
+
+
+## Introduction
+
+Since the advent of Switch Transformer, the AI community has found Mixture of Experts (MoE) a useful technique to enlarge the capacity of deep learning models.
+
+Colossal-AI provides an early access version of parallelism specifically designed for MoE models.
+The most prominent advantage of MoE in Colossal-AI is convenience.
+We aim to help our users to easily combine MoE with model parallelism and data parallelism.
+
+However, the current implementation has two main drawbacks now.
+The first drawback is its poor efficiency in large batch size and long sequence length training.
+The second drawback is incompatibility with tensor parallelism.
+We are working on system optimization to overcome the training efficiency problem.
+The compatibility problem with tensor parallelism requires more adaptation, and we will tackle this issue in the future.
+
+Here, we will introduce how to use MoE with model parallelism and data parallelism.
+
+## Table of Contents
+In this tutorial we will cover:
+1. Set up MoE running environment
+2. Create MoE layer
+3. Train your model
+
+We provided the [example code](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet) for this tutorial in [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples).
+This example uses [WideNet](https://arxiv.org/abs/2107.11817) as an example of MoE-based model.
+
+
+## Set up MoE running environment
+In your project folder, create a `config.py`.
+
+This file is to specify some features you may want to use to train your model.
+In order to enable MoE, you need to add a dict called parallel and specify the value of key moe.
+You can assign a value for the key size of moe, which represents the model parallel size of experts (i.e. the number of experts in one group to parallelize training).
+
+For example, if the size is 4, 4 processes will be assigned to 4 consecutive GPUs and these 4 processes form a moe model parallel group.
+Each process on the 4 GPUs will only get a portion of experts. Increasing the model parallel size will reduce communication cost, but increase computation cost in each GPU and activation cost in memory.
+The total data parallel size is auto-detected and set as the number of GPUs by default.
+
+```python
+MOE_MODEL_PARALLEL_SIZE = ...
+parallel = dict(
+    moe=dict(size=MOE_MODEL_PARALLEL_SIZE)
+)
+```
+
+If `MOE_MODEL_PARALLEL_SIZE = E` and the number of experts is also set to `E`, where `E` is a constant number, the process flow of the forward pass of a transformer encoder in a model parallel group is shown below.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/oI59QcxdteKUTks.png"/>
+<figcaption>MoE Transformer, image source: <a href="https://arxiv.org/abs/2006.16668">GShard</a></figcaption>
+</figure>
+
+Since all experts are allocated to all GPUs in a model parallel group and a GPU only owns a portion of experts,
+original data parallel groups are no longer correct for the parameters of experts during gradient handling in the backward pass.
+So we create a new kind of parallel group called moe data parallel group.
+The difference among different kinds of parallel group, when the configuration is set as `WORLD_SIZE=4`,
+`MOE_MODEL_PARALLEL_SIZE=2`, is shown here.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/Sn8FpmQPKIiBEq2.png"/>
+<figcaption>MoE process group</figcaption>
+</figure>
+
+
+As for gradient handling, we provide MoeGradientHandler to all-reduce every parameter of the model.
+If you use `colossalai.initialize` function to create your training engine, the MoE gradient handler will be added to your engine automatically.
+Otherwise, you should take care of gradient by yourself.
+All parameters of MoE running environment are stored in colossalai.global_variables.moe_env.
+You can access your configuration parameters to check whether your setup is correct.
+```python
+from colossalai.global_variables import moe_env
+```
+
+## Create MoE layer
+You can create a MoE layer from `colossalai.nn.moe`.
+But before doing that, you should set up random seeds for all processes like this.
+
+```python
+from colossalai.context.random import moe_set_seed
+from model_zoo.moe.models import Widenet
+
+moe_set_seed(42)
+model = Widenet(num_experts=4, capacity_factor=1.2)
+```
+
+`moe_set_seed` will set different seed for different processes in a moe model parallel group.
+This helps initialize parameters in experts.
+Then create an instance of experts and an instance of router.
+Here is the example in model zoo.
+
+```python
+from colossalai.nn.layer.moe import Experts, MoeLayer, Top2Router, NormalNoiseGenerator
+
+
+noisy_func = NormalNoiseGenerator(num_experts)
+shared_router = Top2Router(capacity_factor,
+                           noisy_func=noisy_func)
+shared_experts = Experts(expert=VanillaFFN,
+                         num_experts=num_experts,
+                         **moe_mlp_args(
+                             d_model=d_model,
+                             d_ff=d_ff,
+                             drop_rate=drop_rate
+                         ))
+ffn=MoeLayer(dim_model=d_model, num_experts=num_experts,
+             router=shared_router, experts=shared_experts)
+```
+
+Inside the initialization of `Experts`, the local expert number of each GPU will be calculated automatically. You just need to specify the class of each expert and its parameters used in its initialization. As for routers, we have provided a top1 router and a top2 router. You can find them in `colossalai.nn.layer.moe`. After creating the instances of experts and router, the only thing initialized in `MoeLayer` is the gate module. More definitions of each class can be found in our API document and code.
+
+
+## Train Your Model
+Do not forget to use the `colossalai.initialize` function in `colossalai` to add the gradient handler for the engine.
+We handle the back-propagation of MoE models for you.
+In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients.
+You can find more information about the handler `MoeGradientHandler` in colossal directory.
+
+The loss criterion should be wrapped by `MoeLoss` to add the auxiliary loss of MoE. Here is an example.
+```python
+criterion = MoeLoss(
+    aux_weight=0.01,
+    loss_fn=nn.CrossEntropyLoss,
+    label_smoothing=0.1
+)
+```
+
+Finally, just use trainer or engine in `colossalai` to do your training.
+Otherwise, you should take care of gradient by yourself.
diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md
new file mode 100644
index 000000000000..4889b30a6cf8
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/meet_gemini.md
@@ -0,0 +1,88 @@
+
+# Meet Gemini: The Heterogeneous Memory Manager of Colossal-AI
+
+Author: [Jiarui Fang](https://github.com/feifeibear), Yang You
+
+## Brief
+
+When you only have a few GPUs for large model training tasks, **heterogeneous training** is the most effective approach. By accommodating model data in CPU and GPU and moving the data to the computing device when necessary, it can break through the GPU memory wall by using GPU and CPU memory (composed of CPU DRAM or NVMe SSD memory) together at the same time. Moreover, the model scale can be further improved by combining heterogeneous training with the other parallel approaches, such as data parallel, tensor parallel and pipeline parallel. We now describe the design details of **Gemini**, the heterogeneous memory space manager of Colossal-AI. Its idea comes from [PatrickStar](https://arxiv.org/abs/2108.05818), which has been adapted to Colossal-AI.
+
+## Usage
+
+At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini. Set attribute of zero model_config, i.e., tensor_placement_policy='auto'.
+
+```
+zero = dict(
+    model_config=dict(
+        tensor_placement_policy='auto',
+        shard_strategy=BucketTensorShardStrategy()
+    ),
+    optimizer_config=dict(
+    ...)
+)
+```
+
+Note that Gemini and parallel strategies such as tensor parallelism, data parallelism, pipeline parallelism and zero should be decoupled. However, Colossal-AI requires users to use Gemini with ZeRO. Although they are not necessarily coupled, we will improve it in the near future.
+
+## Concepts
+
+**OP** (**OP**erator): operation of a neural network layer, such as linear, LayerNorm, etc. The operator can be a forward propagation calculation or a back-propagation calculation.
+
+Neural networks must manage two types of training data during training.
+**model data**: consists of parameters, gradients and optimizer states, and its scale is related to the definition of model structure.
+
+**Non-model data**: mainly composed of the intermediate tensor generated by the operator and the temporary variables of the operator. Non-model data changes dynamically according to the configuration of training tasks, such as batch size. Model data and non-model data compete with each other for GPU memory.
+
+## Design Details
+
+
+In some solutions, the [Zero-offload](https://arxiv.org/abs/2101.06840) adopted by DeepSpeed statically divides model data between CPU and GPU memory, and their memory layout is constant for different training configurations. As shown on the left of the figure below, when the GPU memory is insufficient to meet its corresponding model data requirements, the system will crash even if there is still available memory on the CPU at that time. While Colossal-AI can complete the training by moving part of the model data to the CPU.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gemini/deepspeed_compare.png"/>
+<figcaption>Comparison of the memory management of Zero-Offload and Gemini</figcaption>
+</figure>
+
+
+Colossal-AI designed Gemini, just like two-stars, which manages the memory space of CPU and GPU efficiently. It can make the tensor dynamically distributed in the storage space of CPU-GPU during training, so that the model training can break through the memory wall of GPU. The memory manager consists of two parts: **MemStatsCollector (MSC)** and **StatefulTensorMgr (STM)**.
+
+We take advantage of the iterative characteristics of the deep learning network training process. We divide iterations into two stages: warmup and non-warmup. One or several iterative steps at the beginning belong to the warmup stage, and the other iterative steps belong to the non-warmup stage. In the warmup stage, we collect information for the MSC, while in the non-warmup stage, STM gets the information collected by the MSC to move the tensor, so as to minimize the CPU-GPU data movement volume.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gemini/gemini_workflow.png"/>
+<figcaption>The workflow of Gemini during warmup and non-warmup phase</figcaption>
+</figure>
+
+
+### StatefulTensorMgr
+
+STM manages the information of all model data tensors. In the process of model construction, Colossal-AI registers all model data tensors with STM. The memory manager marks each tensor with state information. The state set includes three types: HOLD, COMPUTE and FREE. The functions of STM are as follows:
+
+**Query memory usage:** by traversing the locations of all tensors in heterogeneous space, obtain the memory occupation of CPU and GPU by model data.
+
+**Transition tensor state:** it marks the tensor as COMPUTE state before each model data tensor participates in the operator calculation, and as HOLD state after calculation. The FREE state is marked if the tensor is no longer in use.
+
+**Adjust tensor position:** tensor manager ensures that the tensor in COMPUTE state is placed on the computing device. If the storage space of the computing device is insufficient, it is necessary to move some tensors in HOLD state to other devices for storage. Tensor eviction strategy requires information from MSC, which will be introduced later.
+
+
+### MemStatsCollector
+In the warmup stage, the memory information statistician monitors the memory usage of model data and non-model data in CPU and GPU for reference in the non-warmup stage. We can obtain the memory usage of model data at a certain time by querying STM. However, the memory usage of non-model data is difficult to obtain. Owing to the life cycle of non-model data not being managed by users, the existing deep learning framework does not expose the tracking interface of non-model data to users. MSC obtains the usage of CPU and GPU memory by non-model in the warmup stage through sampling. The specific methods are as follows:
+
+We trigger the memory sampling operation at the beginning and end of the operator. We call this time point **sampling moment**, and the time between the two sampling moments is called **period**. The calculation process is a black box. Due to the possible allocation of temporary buffer, the memory usage is very complex. However, we can accurately obtain the maximum memory usage of the system during the period. The memory used by non-model data can then be obtained as the maximum system memory usage between two sampling moments minus the model data memory usage.
+
+How do we choose the sampling moment? We sample right before the model data layout adjustment of preOp, as shown in the figure below. We sample the system memory used in the previous period and the model data memory used in the next period. The parallel strategy can create obstacles for the MSC. As shown in the figure, for ZeRO or Tensor Parallel, for example, model data must be gathered before the Op is computed, which brings additional memory requirements. Therefore, we need to sample the system memory before the model data changes, so that the MSC captures the memory change caused by preOp within a period. For example, in period 2-3, we take into account the memory changes brought by tensor gather and shard.
+
+Although the sampling time can be placed in other locations, such as excluding the new information of the change of the gather buffer, it will cause trouble. There are differences in the implementation of Op in different parallel modes. For example, for Linear Op, gather buffer in Tensor Parallel is allocated in Op. For ZeRO, the allocation of gather buffer is in PreOp. Sampling at the beginning of PreOp helps to unify the two situations.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gemini/gemini_mem_curve.png"/>
+<figcaption>workflow</figcaption>
+</figure>
+
+### Tensor Eviction Strategy
+
+The important duty of MSC is to adjust the tensor layout position. For example, at S2 in the figure above, we reduce the model data on the device, and meet the peak memory requirement calculated in period 2-3.
+
+In the warmup stage, since we haven't finished a complete iteration yet, we don't know actual memory occupation. At this time, we limit the upper bound of memory usage of the model data. For example, only 30% of the GPU memory can be used. This ensures that we can successfully complete the warmup state.
+
+In the non-warmup stage, we need to use the memory information of non-model data collected in the warmup stage to reserve the peak memory required by the computing device for the next period, which requires us to move some model tensors. To avoid frequently swapping the same tensor in and out between CPU and GPU, which would cause a phenomenon similar to [cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science)), we exploit the iterative characteristics of DNN training and design the OPT cache swap-out strategy. Specifically, in the warmup stage, we record the sampling moments at which each tensor is required on the computing device. If we need to evict some HOLD tensors, we choose as the victim the tensor whose next use on this device is the latest.
diff --git a/docs/source/en/advanced_tutorials/opt_service.md b/docs/source/en/advanced_tutorials/opt_service.md
new file mode 100644
index 000000000000..b317de91bbdd
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/opt_service.md
@@ -0,0 +1,81 @@
+# Build an online OPT service using Colossal-AI in 5 minutes
+
+## Introduction
+
+This tutorial shows how to build your own service with OPT with the help of [Colossal-AI](https://github.com/hpcaitech/ColossalAI).
+
+## Colossal-AI Inference Overview
+Colossal-AI provides an inference subsystem [Energon-AI](https://github.com/hpcaitech/EnergonAI), a serving system built upon Colossal-AI, which has the following characteristics:
+
+- **Parallelism for Large-scale Models:** With the help of tensor parallel operations, pipeline parallel strategies from Colossal-AI, Colossal-AI inference enables efficient parallel inference for large-scale models.
+- **Pre-built large models:** There are pre-built implementations for popular models, such as OPT. It supports a caching technique for the generation task and checkpoints loading.
+- **Engine encapsulation:** There is an abstraction layer called an engine. It encapsulates the single instance multiple devices (SIMD) execution with the remote procedure call, making it act as the single instance single device (SISD) execution.
+- **An online service system:** Based on FastAPI, users can launch a web service of a distributed inference quickly. The online service makes special optimizations for the generation task. It adopts both left padding and bucket batching techniques to improve efficiency.
+
+## Basic Usage:
+
+1. Download OPT model
+
+To launch the distributed inference service quickly, you can download the OPT-125M from [here](https://huggingface.co/patrickvonplaten/opt_metaseq_125m/blob/main/model/restored.pt). You can get details for loading other sizes of models [here](https://github.com/hpcaitech/EnergonAI/tree/main/examples/opt/script).
+
+2. Prepare a prebuilt service image
+
+Pull a docker image from dockerhub installed with Colossal-AI inference.
+
+```bash
+docker pull hpcaitech/energon-ai:latest
+```
+
+3. Launch an HTTP service
+
+To launch a service, we need to provide python scripts to describe the model type and related configurations, and settings for the HTTP service.
+We have provided a set of [examples](https://github.com/hpcaitech/EnergonAI/tree/main/examples). We will use the [OPT example](https://github.com/hpcaitech/EnergonAI/tree/main/examples/opt) in this tutorial.
+The entrance of the service is a bash script server.sh.
+The config of the service is at opt_config.py, which defines the model type, the checkpoint file path, the parallel strategy, and http settings. You can adapt it for your own case.
+For example, set the model class as opt_125M and set the correct checkpoint path as follows.
+
+```bash
+model_class = opt_125M
+checkpoint = 'your_file_path'
+```
+
+Set the tensor parallelism degree the same as your gpu number.
+
+```bash
+tp_init_size = #gpu
+```
+
+Now, we can launch a service using docker. You can map the path of the checkpoint and directory containing configs to local disk path `/model_checkpoint` and `/config`.
+
+
+```bash
+export CHECKPOINT_DIR="your_opt_checkpoint_path"
+# the ${CONFIG_DIR} must contain a server.sh file as the entry of service
+export CONFIG_DIR="config_file_path"
+
+docker run --gpus all  --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host hpcaitech/energon-ai:latest
+```
+
+Then open `https://[IP-ADDRESS]:8020/docs#` in your browser to try out!
+
+
+## Advanced Features Usage:
+
+1. Batching Optimization
+
+To use our advanced batching technique to collect multiple queries in batches to serve, you can set the executor_max_batch_size as the max batch size. Note that only decoder tasks with the same top_k, top_p and temperature can be batched together.
+
+```
+executor_max_batch_size = 16
+```
+
+All queries are submitted to a FIFO queue. All consecutive queries whose number of decoding steps is less than or equal to that of the head of the queue can be batched together. Left padding is applied to ensure correctness. executor_max_batch_size should not be too large. This ensures batching won't increase latency. For opt-30b, `executor_max_batch_size=16` may be a good choice, while for opt-175b, `executor_max_batch_size=4` may be better.
+
+2. Cache Optimization.
+
+You can cache several recently served query results for each independent serving process. Set the cache_size and cache_list_size in config.py. The cache_size is the number of queries cached. The cache_list_size is the number of results stored for each query, and a random cached result will be returned. When the cache is full, LRU is applied to evict cached queries. `cache_size=0` means no cache is applied.
+
+```
+cache_size = 50
+cache_list_size = 2
+```
diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
new file mode 100644
index 000000000000..e7698e5e9d1b
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -0,0 +1,192 @@
+# Parallelize Your Training like Megatron-LM via ColoTensor
+
+Author: [Haichen Huang](https://github.com/1SAA) and [Jiarui Fang](https://github.com/feifeibear)
+
+**Prerequisite:**
+- [ColoTensor Concepts](../basics/colotensor_concept.md)
+
+## Introduction
+
+Thanks to the convenience given by ColoTensor, users can apply parallelism with minimal edits to their serial code.
+In this tutorial, we will illustrate how to modify the training model to automatically adapt the code to parallel training like Megatron-LM.
+We take the GPT-2 model offered by HuggingFace as an example and provide a way for you to pre-train the GPT-2 model on a single GPU.
+
+Megatron-LM provided a profound paradigm to parallelize large transformer language models.
+However, in order to train large transformer language models at scale, users have to build their models with those modules provided by Megatron.
+It imposes several difficult jobs on users, such as loading the weights from the pre-trained models and constructing the parallelized models.
+To mitigate users' trouble, we offer ColoTensor to enable the tensor model parallelism automatically.
+
+## Definitions of the model and the loss function
+
+First we use the GPTModel and GPTLoss directly from the HuggingFace library.
+
+```python
+import torch
+import torch.nn as nn
+from transformers import GPT2Config, GPT2LMHeadModel
+
+class GPTLMModel(nn.Module):
+    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=12, max_seq_len=1024, vocab_size=50257, checkpoint=False):
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.model = GPT2LMHeadModel(GPT2Config(n_embd=hidden_size, n_layer=num_layers,
+                                     n_head=num_attention_heads, n_positions=max_seq_len, n_ctx=max_seq_len, vocab_size=vocab_size))
+        if checkpoint:
+            self.model.gradient_checkpointing_enable()
+
+    def forward(self, input_ids, attention_mask):
+        # Only return lm_logits
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
+
+
+class GPTLMLoss(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+```
+
+## Brief Review of GPT-2
+
+Now, we recall the structure of each GPT-2 model.
+Every GPT-2 model can be represented as a DAG.
+As shown in the below pictures, each circle represents an operator and each square represents a weight.
+An arrow indicates the flow of the input data, and the notation alongside the arrow demonstrates the shape of the input data.
+
+Then, let's take an insight into this GPT-2 model. It consists of three parts.
+They are the **embedding module**, **transformer layers**, and the **classification head**.
+
+The embedding module contains two weights, token embedding weight and position embedding weight.
+After the forward operation of the embedding module, each word in all sequences of the raw input data will be embedded into a hidden state.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/omfkIEN6ui5jcL3.png"/>
+<figcaption>The embedding module</figcaption>
+</figure>
+
+Each transformer layer contains two blocks. The self-attention operation is called in the first block and a two-layer perceptron is located in the second block.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/LAVzDlpRcj4dYeb.png"/>
+<figcaption>The transformer layer</figcaption>
+</figure>
+
+In the end, the classification head is just a linear module without bias, which only has a weight inside.
+
+## Applied with ColoTensor
+
+Two steps make your serial code adapted to Megatron-LM tensor parallel style.
+1. Initialize the model in the context of ColoInitContext.
+2. Setting ColoTensorSpec for each parameter.
+
+### Initialize with ColoInitContext
+
+We should build the model in the ColoInitContext.
+In this context, any parameter initialized would be transformed to ColoParameter and moved to the corresponding device automatically.
+
+```python
+from colossalai.utils.model.colo_init_context import ColoInitContext
+
+with ColoInitContext(device=torch.device('cpu')):
+    model = GPTLMModel()
+```
+
+### Setting ColoTensorSpec for each parameter
+
+After the creation of the model, we establish the distributed environment through ProcessGroup.
+Here, we specify the degree of the tensor parallelism as the same as the number of all GPUs, which means the degree of data parallelism is 1.
+
+```python
+import torch.distributed as dist
+from colossalai.tensor import ProcessGroup
+
+pg = ProcessGroup(tp_degree=dist.get_world_size())
+```
+
+Now, some auxiliary functions are necessary for the next step. We define two functions to split a parameter.
+Megatron-LM-like tensor parallelism requires splitting a parameter tensor along its first dimension or its last dimension.
+
+```python
+from colossalai.tensor import ShardSpec, ComputeSpec, ComputePattern, ColoParameter, ProcessGroup
+
+def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
+    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    if param.process_group.tp_world_size() == 1:
+        param.set_process_group(pg)
+    param.set_tensor_spec(*spec)
+
+
+def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(0, param, pg)
+
+
+def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(-1, param, pg)
+```
+
+Then we adapt the model to the tensor parallelism.
+According to the tensor parallelism applied in Megatron, it is supposed to shard along the last dimension of tensors, including the weights of token embedding, position embedding, all linear weights and biases in self-attention blocks, and the first linear weight and bias in each MLP.
+And it shards the second linear weight along its first dimension.
+
+```python
+for mn, module in model.named_modules():
+    for pn, param in module.named_parameters(recurse=False):
+        # set process group for all parameters
+        param.set_process_group(pg)
+
+        if 'mlp.c_fc' in mn:
+            if 'weight' in pn or 'bias' in pn:
+                split_param_col_tp1d(param, pg)  # colmn slice
+                # keep the shape of the output from c_fc
+                param.compute_spec.set_output_replicate(False)
+        elif 'mlp.c_proj' in mn:
+            if 'weight' in pn:
+                split_param_row_tp1d(param, pg)  # row slice
+        elif 'wte' in mn or 'wpe' in mn:
+            split_param_col_tp1d(param, pg)  # colmn slice
+        elif 'c_attn' in mn or 'c_proj' in mn:
+            split_param_col_tp1d(param, pg)  # colmn slice
+```
+
+The modified model is illustrated below.
+
+The embedding module:
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/Yu2xzXEabHV7pwe.png"/>
+<figcaption>The modified embedding module</figcaption>
+</figure>
+
+The transformer layers:
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/4HWsA2xz51IhPFO.png"/>
+<figcaption>The modified transformer layer</figcaption>
+</figure>
+
+Once users have specified the distributed pattern of each parameter, ColoTensor is capable of inferring the computation patterns of all operators, including matrix multiplication, the linear function, other elementwise functions in torch.nn.functional, etc.
+In this way, users can train their models as usual.
+
+In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overhead and improve efficiency. For the details of this part, please refer to [ZeRO](../features/zero_with_chunk.md). You can combine these two parts to understand our entire training process:
+
+```python
+def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
+    from colossalai.nn.parallel import GeminiDDP
+    model = GeminiDDP(model,
+                        device=get_current_device(),
+                        placement_policy=placememt_policy,
+                        pin_memory=True,
+                        search_range_mb=32)
+    return model
+```
+
+## Pretrain GPT-2 On Single GPU
+
+The above optimization we made allows us to pretrain the GPT-2 model on a single GPU. We only need to set the parameter `GPUNUM`=1 in `run.sh`, and then we can complete the model training on a single GPU when running the file.
+
+The GPT-2 example is accessible at [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt).
diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
new file mode 100644
index 000000000000..715c15eb6300
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
@@ -0,0 +1,270 @@
+# Train GPT Using Hybrid Parallelism
+
+Author: Hongxin Liu, Yongbin Li
+
+**Example Code**
+- [ColossalAI-Examples GPT2](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt_2)
+- [ColossalAI-Examples GPT3](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt_3)
+
+**Related Paper**
+- [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training](https://arxiv.org/abs/2110.14883)
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473)
+
+## Introduction
+
+In the previous tutorial, we introduced how to train ViT with pipeline parallelism. In this tutorial, you will learn a more complex scenario -- training GPT with hybrid parallelism. In this case, GPT-3 is so large that even CPU memory cannot fit it. Therefore, you must split the model by yourself.
+
+## Table of contents
+
+In this tutorial we will cover:
+
+1. The definition of GPT model, based on colossalai/model_zoo
+2. Processing the dataset
+3. Training GPT using hybrid parallelism
+
+## Import libraries
+
+```python
+import json
+import os
+from typing import Callable
+
+import colossalai
+import colossalai.utils as utils
+import model_zoo.gpt.gpt as col_gpt
+import torch
+import torch.nn as nn
+from colossalai import nn as col_nn
+from colossalai.amp import AMP_TYPE
+from colossalai.builder.pipeline import partition_uniform
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+                                        PipelineSchedule)
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils.timer import MultiTimer
+from model_zoo.gpt import GPTLMLoss
+from torch.nn import functional as F
+from torch.utils.data import Dataset
+from transformers import GPT2Tokenizer
+```
+
+
+
+## Define GPT model
+
+In the previous tutorial, we introduced 3 ways to build a pipelined model. But for huge models like GPT-3, you can't even build the model in CPU. In this case, you must split the model by yourself.
+
+GPT dataloader returns `input_ids` and `attention_mask`, so we use two keyword arguments in `forward()` to get them. Note that for stages except the first stage, the first positional argument of `forward()` is the output tensor from the previous stage. So the `hidden_states` is from the previous stage, and for the first stage it's `None`.
+
+For GPT, the *word embedding layer* shares the weights with the *output head*. We provide `PipelineSharedModuleWrapper` to share parameters among pipeline stages. It takes a `list` of `int` as argument, which means those ranks share the parameters. You can use `register_module()` or `register_parameter()` to register a module or a parameter as the shared module or parameter. If you have multiple sets of shared modules / parameters, you should have multiple `PipelineSharedModuleWrapper` instances. If the parameter is shared within **one** stage, you should not use `PipelineSharedModuleWrapper`, and just use the same module / parameter instance. In this example, the *word embedding layer* is at the first stage, and the *output head* is at the last stage. Thus, they are shared among ranks `[0, pipeline_size - 1]`.
+
+For the first stage, it maintains the embedding layer and some transformer blocks. For the last stage, it maintains some transformer blocks and the output head layer. For other stages, they just maintain some transformer blocks. `partition_uniform(num_layers, pipeline_size, num_chunks)` returns the parts of all ranks, and the part is a `tuple` of `(start, end)` (exclude end). `start == 0` means that it's the first stage, and `end == num_layers` means it's the last stage.
+
+```python
+class PipelineGPTHybrid(nn.Module):
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: Callable = F.gelu,
+                 mlp_ratio: int = 4,
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 first: bool = False,
+                 last: bool = False):
+        super().__init__()
+        self.embedding = None
+        self.norm = None
+        self.head = None
+        if first:
+            self.embedding = col_gpt.GPTEmbedding(
+                hidden_size, vocab_size, max_position_embeddings, dropout=embed_drop_rate, dtype=dtype)
+        self.blocks = nn.ModuleList([
+            col_gpt.GPTBlock(hidden_size, num_attention_heads, mlp_ratio=mlp_ratio, attention_dropout=attn_drop_rate,
+                             dropout=drop_rate, dtype=dtype, checkpoint=checkpoint, activation=act_func)
+            for _ in range(num_layers)
+        ])
+        if last:
+            self.norm = col_nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            self.head = col_gpt.GPTLMHead(vocab_size=vocab_size,
+                                          dim=hidden_size,
+                                          dtype=dtype,
+                                          bias=False)
+
+    def forward(self, hidden_states=None, input_ids=None, attention_mask=None):
+        if self.embedding is not None:
+            hidden_states = self.embedding(input_ids=input_ids)
+        batch_size = hidden_states.shape[0]
+        attention_mask = attention_mask.view(batch_size, -1)
+        attention_mask = attention_mask[:, None, None, :]
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)  # fp16 compatibility
+        attention_mask = (1.0 - attention_mask) * -10000.0
+        for block in self.blocks:
+            hidden_states, attention_mask = block(hidden_states, attention_mask)
+        if self.norm is not None:
+            hidden_states = self.head(self.norm(hidden_states))
+        return hidden_states
+
+
+def build_gpt_pipeline(num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    logger = get_dist_logger()
+    pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
+    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
+    rank = gpc.get_global_rank()
+    wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1])
+    parts = partition_uniform(num_layers, pipeline_size, num_chunks)[pipeline_rank]
+    models = []
+    for start, end in parts:
+        kwargs['num_layers'] = end - start
+        kwargs['first'] = start == 0
+        kwargs['last'] = end == num_layers
+        logger.info(f'Rank{rank} build layer {start}-{end}, {end-start}/{num_layers} layers')
+        chunk = PipelineGPTHybrid(**kwargs).to(device)
+        if start == 0:
+            wrapper.register_module(chunk.embedding.word_embeddings)
+        elif end == num_layers:
+            wrapper.register_module(chunk.head)
+        models.append(chunk)
+    if len(models) == 1:
+        model = models[0]
+    else:
+        model = nn.ModuleList(models)
+    return model
+
+
+def GPT2_exlarge_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float):
+    cfg = dict(hidden_size=1600, num_attention_heads=32, checkpoint=checkpoint, dtype=dtype)
+    return build_gpt_pipeline(48, num_chunks, **cfg)
+
+
+def GPT3_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float):
+    cfg = dict(hidden_size=12288, num_attention_heads=96,
+               checkpoint=checkpoint, max_position_embeddings=2048, dtype=dtype)
+    return build_gpt_pipeline(96, num_chunks, **cfg)
+```
+
+## Process the dataset
+
+We provide a small GPT web-text dataset here. The original format is loose JSON, and we will save the processed dataset.
+
+```python
+class WebtextDataset(Dataset):
+    def __init__(self, path, seq_len=1024) -> None:
+        super().__init__()
+        root = os.path.dirname(path)
+        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
+        if os.path.isfile(encoded_data_cache_path):
+            seq_len_, data, attention_mask = torch.load(
+                encoded_data_cache_path)
+            if seq_len_ == seq_len:
+                self.data = data
+                self.attention_mask = attention_mask
+                return
+        raw_data = []
+        with open(path) as f:
+            for line in f.readlines():
+                raw_data.append(json.loads(line)['text'])
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.unk_token
+        encoded_data = tokenizer(
+            raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
+        self.data = encoded_data['input_ids']
+        self.attention_mask = encoded_data['attention_mask']
+        torch.save((seq_len, self.data, self.attention_mask),
+                   encoded_data_cache_path)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        return {
+            'input_ids': self.data[index],
+            'attention_mask': self.attention_mask[index]
+        }, self.data[index]
+```
+
+## Training GPT using hybrid parallelism
+
+In the previous tutorial, we explained the meanings of some pipeline arguments. In this case, we can determine the shape of each output tensor which is exchanged among pipeline stages. For GPT, the shape is `(MICRO BATCH SIZE, SEQUENCE LEN, HIDDEN SIZE)`. By setting this, we can avoid exchanging the tensor shape of each stage. When you are not sure of the tensor shape, you can just  leave it `None`, and the shape is inferred automatically. Make sure that the `dtype` of your model is correct. When you use `fp16`, the `dtype` of your model must be `torch.half`. Otherwise, the `dtype` must be `torch.float`. For pipeline parallelism, only `AMP_TYPE.NAIVE` is supported.
+
+You can easily use tensor parallel by setting `parallel` in `CONFIG`. The data parallelism size is automatically set based on the number of GPUs.
+
+```python
+NUM_EPOCHS = 60
+SEQ_LEN = 1024
+BATCH_SIZE = 192
+NUM_CHUNKS = None
+TENSOR_SHAPE = (1, 1024, 1600)
+# only pipeline parallel
+# CONFIG = dict(parallel=dict(pipeline=2), fp16=dict(mode=AMP_TYPE.NAIVE))
+# pipeline + 1D model parallel
+CONFIG = dict(NUM_MICRO_BATCHES = 192, parallel=dict(pipeline=2, tensor=dict(mode='1d', size=2)), fp16=dict(mode=AMP_TYPE.NAIVE))
+
+
+def train():
+    disable_existing_loggers()
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    colossalai.launch_from_torch(config=CONFIG, backend=args.backend)
+    logger = get_dist_logger()
+
+    train_ds = WebtextDataset(os.environ['DATA'], seq_len=SEQ_LEN)
+    train_dataloader = utils.get_dataloader(train_ds,
+                                            seed=42,
+                                            batch_size=BATCH_SIZE,
+                                            pin_memory=True,
+                                            shuffle=True,
+                                            drop_last=True)
+
+    use_interleaved = NUM_CHUNKS is not None
+    num_chunks = 1 if not use_interleaved else NUM_CHUNKS
+    model = GPT2_exlarge_pipeline_hybrid(num_chunks=num_chunks, checkpoint=True, dtype=torch.half)
+    # model = GPT3_pipeline_hybrid(num_chunks=num_chunks, checkpoint=True, dtype=torch.half)
+    if use_interleaved and not isinstance(model, nn.ModuleList):
+        model = nn.ModuleList([model])
+
+    criterion = GPTLMLoss()
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.00015, weight_decay=1e-2,)
+
+    engine, train_dataloader, _, _ = colossalai.initialize(model,
+                                                           optimizer,
+                                                           criterion,
+                                                           train_dataloader=train_dataloader)
+    global_batch_size = BATCH_SIZE * \
+        gpc.get_world_size(ParallelMode.DATA) * getattr(gpc.config, "gradient_accumulation", 1)
+    logger.info(f'Init done, global batch size = {global_batch_size}', ranks=[0])
+
+    timer = MultiTimer()
+
+    trainer = Trainer(
+        engine=engine,
+        logger=logger,
+        timer=timer
+    )
+
+    hook_list = [
+        hooks.LossHook(),
+        hooks.LogMetricByEpochHook(logger),
+        hooks.ThroughputHook(),
+        hooks.LogMetricByStepHook(),
+    ]
+
+    trainer.fit(
+        train_dataloader=train_dataloader,
+        epochs=NUM_EPOCHS,
+        test_interval=1,
+        hooks=hook_list,
+        display_progress=True,
+        return_output_label=False,
+    )
+```
diff --git a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md
new file mode 100644
index 000000000000..b26599740c5f
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md
@@ -0,0 +1,247 @@
+# Train ViT Using Pipeline Parallelism
+
+Author: Hongxin Liu, Yongbin Li
+
+**Example Code**
+- [ColossalAI-Examples Pipeline Parallel ViT](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/vision_transformer/pipeline_parallel)
+
+**Related Paper**
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473)
+
+## Introduction
+
+In this tutorial, you will learn how to train Vision Transformer for image classification from scratch, using pipeline parallelism.
+Pipeline parallelism is a kind of model parallelism, which is useful when your GPU memory cannot fit your model.
+By using it, we split the original model into multiple stages, and each stage maintains a part of the original model.
+We assume that the memory of a single GPU cannot fit ViT/L-16, but your CPU memory can fit this model.
+
+##  Table of contents
+
+In this tutorial we will cover:
+
+1. The definition of ViT model, based on [TIMM](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py)
+2. Processing the dataset
+3. Training ViT using pipeline
+
+## Import libraries
+
+```python
+import os
+from collections import OrderedDict
+from functools import partial
+
+import colossalai
+import colossalai.nn as col_nn
+import torch
+import torch.nn as nn
+from colossalai.builder import build_pipeline_model
+from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+                                        PipelineSchedule)
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils import MultiTimer, get_dataloader
+from timm.models import vision_transformer as vit
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+```
+
+
+
+## Define Vision Transformer model
+
+Generally, we provide 3 ways to build a pipelined model:
+
+1. `colossalai.builder.build_pipeline_model_from_cfg`
+2. `colossalai.builder.build_pipeline_model`
+3. Split the model by stages by yourself
+
+When your CPU memory can fit the whole model, you can use the first two methods to build your model, otherwise you must split the model by yourself. The first two methods first build the whole model on CPU, then split the model, and finally you can just move the corresponding part of the model to the GPU.
+
+`colossalai.builder.build_pipeline_model_from_cfg()` receives a config file of model, and it can split the model uniformly (by layer) or balanced (by parameter size).
+
+If you are familiar with `PyTorch`, you can use `colossalai.builder.build_pipeline_model()`, which receives a `torch.nn.Sequential` model and splits it by layer uniformly.
+
+In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.builder.build_pipeline_model()` to build the pipelined model.
+
+When the data is **one** `Tensor`, you can use the positional argument in `forward()` of your model to get the data tensor. For the first stage of pipeline, the first positional argument of `forward()` is the data tensor loaded from data loader. For other stages, the first positional argument of `forward()` is the output tensor from the previous stage. Note that if the stage is not the last stage, the return of `forward()` must be a `Tensor`.
+
+When the data is a `dict` of `Tensor`, you can use named keyword arguments in `forward()` of your model to get the data `dict`.
+
+```python
+class ViTEmbedding(nn.Module):
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, embed_layer=vit.PatchEmbed, drop_rate=0., distilled=False):
+        super().__init__()
+        self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_tokens = 2 if distilled else 1
+        self.patch_embed = embed_layer(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        self.init_weights()
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        if self.dist_token is None:
+            x = torch.cat((cls_token, x), dim=1)
+        else:
+            x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = self.pos_drop(x + self.pos_embed)
+        return x
+
+    def init_weights(self):
+        vit.trunc_normal_(self.pos_embed, std=.02)
+        if self.dist_token is not None:
+            vit.trunc_normal_(self.dist_token, std=.02)
+        vit.trunc_normal_(self.cls_token, std=.02)
+        self.apply(vit._init_vit_weights)
+
+
+class ViTHead(nn.Module):
+    def __init__(self, embed_dim=768, num_classes=1000, norm_layer=None, distilled=False, representation_size=None):
+        super().__init__()
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        self.norm = norm_layer(embed_dim)
+        self.num_classes = num_classes
+        self.distilled = distilled
+        self.num_features = embed_dim
+        # Representation layer
+        if representation_size and not distilled:
+            self.num_features = representation_size
+            self.pre_logits = nn.Sequential(OrderedDict([
+                ('fc', nn.Linear(embed_dim, representation_size)),
+                ('act', nn.Tanh())
+            ]))
+        else:
+            self.pre_logits = nn.Identity()
+        # Classifier head(s)
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        self.head_dist = None
+        if distilled:
+            self.head_dist = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        self.init_weights()
+
+    def forward(self, x):
+        x = self.norm(x)
+        if self.distilled:
+            x, x_dist = self.head(x[:, 0]), self.head_dist(x[:, 1])
+            if self.training and not torch.jit.is_scripting():
+                # during training, return both classifier predictions; during inference (else branch), return their average
+                return x, x_dist
+            else:
+                return (x + x_dist) / 2
+        else:
+            x = self.pre_logits(x[:, 0])
+            x = self.head(x)
+        return x
+
+    def init_weights(self):
+        self.apply(vit._init_vit_weights)
+
+
+def sequential_vit(img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+                   num_heads=12, mlp_ratio=4., qkv_bias=True, representation_size=None, distilled=False,
+                   drop_rate=0., attn_drop_rate=0., drop_path_rate=0., embed_layer=vit.PatchEmbed, norm_layer=None,
+                   act_layer=None):
+    norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+    act_layer = act_layer or nn.GELU
+    embedding = ViTEmbedding(img_size=img_size, patch_size=patch_size, in_chans=in_chans,
+                             embed_dim=embed_dim, embed_layer=embed_layer, drop_rate=drop_rate, distilled=distilled)
+    dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+    blocks = [vit.Block(
+        dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate,
+        attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer)
+        for i in range(depth)]
+    for block in blocks:
+        block.apply(vit._init_vit_weights)
+    head = ViTHead(embed_dim=embed_dim, num_classes=num_classes, norm_layer=norm_layer,
+                   distilled=distilled, representation_size=representation_size)
+    return nn.Sequential(embedding, *blocks, head)
+
+
+def vit_large_patch16_224(**kwargs):
+    model_kwargs = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return sequential_vit(**model_kwargs)
+```
+
+## Process the dataset
+
+Generally, we train ViT on a large dataset like ImageNet. For simplicity, we just use CIFAR-10 here, since this tutorial is just for pipeline training.
+
+```python
+def build_cifar(batch_size):
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(224, pad_if_needed=True),
+        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+
+    train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train)
+    test_dataset = CIFAR10(root=os.environ['DATA'], train=False, transform=transform_test)
+    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
+    test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True)
+    return train_dataloader, test_dataloader
+```
+
+## Training ViT using pipeline
+
+You can set the size of pipeline parallel and number of microbatches in config. `NUM_CHUNKS` is useful when using the interleaved pipeline schedule (for more details see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) ). The original batch will be split into `num_microbatches`, and each stage will load a micro batch each time. Then we will generate an appropriate schedule for you to execute the pipeline training. If you don't need the output and label of model, you can set `return_output_label` to `False` when calling `trainer.fit()` which can further reduce GPU memory usage.
+
+You should `export DATA=/path/to/cifar`.
+
+```python
+BATCH_SIZE = 16
+NUM_EPOCHS = 60
+NUM_CHUNKS = 1
+CONFIG = dict(NUM_MICRO_BATCHES=4, parallel=dict(pipeline=2))
+
+
+def train():
+    disable_existing_loggers()
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    colossalai.launch_from_torch(backend=args.backend, config=CONFIG)
+    logger = get_dist_logger()
+
+    # build model
+    model = vit_large_patch16_224()
+    model = build_pipeline_model(model, num_chunks=NUM_CHUNKS, verbose=True)
+
+    # build criterion
+    criterion = nn.CrossEntropyLoss()
+
+    # optimizer
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
+
+    # build dataloader
+    train_dataloader, test_dataloader = build_cifar(BATCH_SIZE)
+
+    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion,
+                                                                         train_dataloader, test_dataloader)
+    timer = MultiTimer()
+
+    trainer = Trainer(engine=engine, timer=timer, logger=logger)
+
+    hook_list = [
+        hooks.LossHook(),
+        hooks.AccuracyHook(col_nn.metric.Accuracy()),
+        hooks.LogMetricByEpochHook(logger),
+    ]
+
+    trainer.fit(train_dataloader=train_dataloader,
+                epochs=NUM_EPOCHS,
+                test_dataloader=test_dataloader,
+                test_interval=1,
+                hooks=hook_list,
+                display_progress=True)
+```
diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
new file mode 100644
index 000000000000..1f3086559939
--- /dev/null
+++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -0,0 +1,646 @@
+# Step By Step: Accelerate ViT Training With Colossal-AI (From Data Parallel to Hybrid Parallel)
+
+Author: Yuxuan Lou
+
+**Example Code**
+
+- [Colossal-AI Examples ViT on Cifar10](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/vision_transformer)
+
+**Related Paper**
+- [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf)
+
+
+## Introduction
+
+In this example for ViT model, Colossal-AI provides three different parallelism techniques which accelerate model training: data parallelism, pipeline parallelism and tensor parallelism.
+We will show you how to train ViT on CIFAR-10 dataset with these parallelism techniques. To run this example, you will need 2-4 GPUs.
+
+
+## Table of Contents
+1. Colossal-AI installation
+2. Steps to train ViT with data parallelism
+3. Steps to train ViT with pipeline parallelism
+4. Steps to train ViT with tensor parallelism or hybrid parallelism
+
+## Colossal-AI Installation
+You can install the Colossal-AI package and its dependencies from PyPI.
+```bash
+pip install colossalai
+```
+
+
+
+## Data Parallelism
+Data parallelism is one basic way to accelerate the model training process. You can apply data parallelism to training in only two steps:
+1. Define a configuration file
+2. Change a few lines of code in train script
+
+### Define your configuration file (`data_parallel/config.py`)
+To use Colossal-AI, the first step is to define a configuration file. And there are two kinds of variables here:
+
+1. **Colossal-AI feature specification**
+
+There is an array of features Colossal-AI provides to speed up training (parallel mode, mixed precision, ZeRO, etc.). Each feature is defined by a corresponding field in the config file. If we apply data parallel only, we do not need to specify the parallel mode. In this example, we use mixed precision training natively provided by PyTorch by defining the mixed precision configuration `fp16 = dict(mode=AMP_TYPE.TORCH)`.
+
+2. **Global hyper-parameters**
+
+Global hyper-parameters include model-specific hyper-parameters, training settings, dataset information, etc.
+
+```python
+from colossalai.amp import AMP_TYPE
+
+# ViT Base
+BATCH_SIZE = 256
+DROP_RATE = 0.1
+NUM_EPOCHS = 300
+
+# mix precision
+fp16 = dict(
+    mode=AMP_TYPE.TORCH,
+)
+
+gradient_accumulation = 16
+clip_grad_norm = 1.0
+
+dali = dict(
+    gpu_aug=True,
+    mixup_alpha=0.2
+)
+```
+
+### Modify train script (`/data_parallel/train_with_cifar10.py`)
+
+#### Import modules
+- Colossal-AI related modules
+```python
+import colossalai
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+from colossalai.nn.metric import Accuracy
+from colossalai.trainer import Trainer, hooks
+```
+
+- Other modules
+```python
+import os
+
+import torch
+from timm.models import vit_base_patch16_224
+
+
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+```
+
+#### Launch Colossal-AI
+
+In the train script, you need to initialize the distributed environment for Colossal-AI after your config file is prepared. We call this process `launch`. Colossal-AI provides several launch methods to initialize the distributed backend. In most cases, you can use `colossalai.launch` and `colossalai.get_default_parser` to pass the parameters via the command line. Besides, Colossal-AI can use the existing launch tool provided by PyTorch, which many users are familiar with, through `colossalai.launch_from_torch`. For more details, you can view the related [documents](https://www.colossalai.org/docs/basics/launch_colossalai).
+
+```python
+# initialize distributed setting
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+colossalai.launch_from_torch(config=args.config)
+
+disable_existing_loggers()
+logger = get_dist_logger()
+```
+
+After initialization, you can access the variables in the config file by using `colossalai.core.global_context`.
+
+```python
+#access parameters
+print(gpc.config.BATCH_SIZE)
+```
+
+#### Build Model
+
+If only data parallelism is required, you do not need to make any changes to your model. Here, we use `vit_base_patch16_224` from `timm`.
+```python
+# build model
+model = vit_base_patch16_224(drop_rate=0.1, num_classes=gpc.config.NUM_CLASSES)
+```
+
+#### Build CIFAR-10 Dataloader
+`colossalai.utils.get_dataloader` can help you build dataloader easily.
+
+```python
+def build_cifar(batch_size):
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(224, pad_if_needed=True),
+        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+
+    train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train)
+    test_dataset = CIFAR10(root=os.environ['DATA'], train=False, transform=transform_test)
+    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
+    test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True)
+    return train_dataloader, test_dataloader
+
+
+# build dataloader
+train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE)
+```
+
+#### Define optimizer, loss function and LR scheduler
+
+Colossal-AI provides its own optimizer, loss function and LR scheduler. Those from PyTorch are also compatible.
+
+```python
+# build optimizer
+optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1)
+
+# build loss
+criterion = torch.nn.CrossEntropyLoss()
+
+# lr_scheduler
+lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)
+```
+
+#### Start Colossal-AI engine
+
+Engine is essentially a wrapper class for model, optimizer and loss function. When we call `colossalai.initialize`, an engine object will be returned, and it has already been equipped with functionalities such as gradient clipping, gradient accumulation and zero optimizer as specified in your configuration file. Further model training is based on Colossal-AI engine.
+
+```python
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
+        model, optimizer, criterion, train_dataloader, test_dataloader
+    )
+```
+
+#### Train: Trainer API
+Trainer is a higher-level wrapper that lets the user execute training with fewer lines of code. It is easy to create a trainer object by passing the engine object.
+
+Besides, in the trainer, the user can customize some hooks and attach these hooks to the trainer object. A hook object will execute life-cycle methods periodically based on the training scheme. For example, the `LRSchedulerHook` will execute `lr_scheduler.step()` to update the learning rate of the model during either the `after_train_iter` or the `after_train_epoch` stage.
+
+```python
+# build trainer
+trainer = Trainer(engine=engine, logger=logger)
+
+# build hooks
+hook_list = [
+    hooks.LossHook(),
+    hooks.AccuracyHook(accuracy_func=MixupAccuracy()),
+    hooks.LogMetricByEpochHook(logger),
+    hooks.LRSchedulerHook(lr_scheduler, by_epoch=True),
+
+    # comment if you do not need to use the hooks below
+    hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
+    hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
+]
+```
+
+Use `trainer.fit` for training:
+
+```python
+# start training
+trainer.fit(
+    train_dataloader=train_dataloader,
+    test_dataloader=test_dataloader,
+    epochs=gpc.config.NUM_EPOCHS,
+    hooks=hook_list,
+    display_progress=True,
+    test_interval=1
+)
+```
+
+### Start training
+`DATA` is the filepath where CIFAR-10 dataset will be automatically downloaded and stored.
+
+`<NUM_GPUs>` is the number of GPUs you want to use to train ViT on CIFAR-10 with data parallelism.
+
+```bash
+export DATA=<path_to_data>
+# If your torch >= 1.10.0
+torchrun --standalone --nproc_per_node <NUM_GPUs>  train_dp.py --config ./configs/config_data_parallel.py
+# If your torch >= 1.9.0
+# python -m torch.distributed.run --standalone --nproc_per_node= <NUM_GPUs> train_dp.py --config ./configs/config_data_parallel.py
+# Otherwise
+# python -m torch.distributed.launch --nproc_per_node <NUM_GPUs> --master_addr <node_name> --master_port 29500 train_dp.py --config ./configs/config.py
+```
+
+
+
+## Pipeline Parallelism
+Aside from data parallelism, Colossal-AI also supports pipeline parallelism. Specifically, Colossal-AI uses the 1F1B pipeline schedule introduced by NVIDIA. For more details, you can view the related [documents](https://www.colossalai.org/tutorials/features/pipeline_parallel).
+
+### Define your configuration file(`hybrid_parallel/configs/vit_pipeline.py`)
+To apply pipeline parallelism on top of data parallelism, you only need to add a **parallel dict**:
+```python
+from colossalai.amp import AMP_TYPE
+
+parallel = dict(
+    pipeline=2
+)
+# pipeline config
+NUM_MICRO_BATCHES = parallel['pipeline']
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE)
+
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+```
+
+Other configs:
+```python
+# hyperparameters
+# BATCH_SIZE is as per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 256
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 300
+WARMUP_EPOCHS = 32
+
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 768
+DEPTH = 12
+NUM_HEADS = 12
+MLP_RATIO = 4
+NUM_CLASSES = 10
+CHECKPOINT = True
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1  # add 1 for cls token
+```
+
+### Build pipeline model (`/hybrid_parallel/model/vit.py`)
+Colossal-AI provides two methods to build a pipeline model from the existing model.
+- `colossalai.builder.build_pipeline_model_from_cfg`
+- `colossalai.builder.build_pipeline_model`
+
+Besides, you can also build a pipeline model from scratch with Colossal-AI.
+```python
+import math
+from typing import Callable
+
+import inspect
+import torch
+from colossalai import nn as col_nn
+from colossalai.registry import LAYERS, MODELS
+from colossalai.logging import get_dist_logger
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
+from colossalai.builder.pipeline import partition_uniform
+from torch import dtype, nn
+from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
+
+
+@MODELS.register_module
+class PipelineVisionTransformer(nn.Module):
+    def __init__(self,
+                 img_size: int = 224,
+                 patch_size: int = 16,
+                 in_chans: int = 3,
+                 num_classes: int = 1000,
+                 depth: int = 12,
+                 num_heads: int = 12,
+                 dim: int = 768,
+                 mlp_ratio: int = 4,
+                 attention_dropout: float = 0.,
+                 dropout: float = 0.1,
+                 drop_path: float = 0.,
+                 layernorm_epsilon: float = 1e-6,
+                 activation: Callable = nn.functional.gelu,
+                 representation_size: int = None,
+                 dtype: dtype = None,
+                 bias: bool = True,
+                 checkpoint: bool = False,
+                 init_method: str = 'torch',
+                 first_stage=True,
+                 last_stage=True,
+                 start_idx=None,
+                 end_idx=None,):
+        super().__init__()
+
+        layers = []
+
+        if first_stage:
+            embed = ViTEmbedding(img_size=img_size,
+                                 patch_size=patch_size,
+                                 in_chans=in_chans,
+                                 embedding_dim=dim,
+                                 dropout=dropout,
+                                 dtype=dtype,
+                                 init_method=init_method)
+            layers.append(embed)
+
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
+
+        if start_idx is None and end_idx is None:
+            start_idx = 0
+            end_idx = depth
+
+        blocks = [
+            ViTBlock(
+                dim=dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                attention_dropout=attention_dropout,
+                dropout=dropout,
+                drop_path=dpr[i],
+                activation=activation,
+                dtype=dtype,
+                bias=bias,
+                checkpoint=checkpoint,
+                init_method=init_method,
+            ) for i in range(start_idx, end_idx)
+        ]
+        layers.extend(blocks)
+
+        if last_stage:
+            norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
+            head = ViTHead(dim=dim,
+                           num_classes=num_classes,
+                           representation_size=representation_size,
+                           dtype=dtype,
+                           bias=bias,
+                           init_method=init_method)
+            layers.extend([norm, head])
+
+        self.layers = nn.Sequential(
+            *layers
+        )
+
+    def forward(self, x):
+        x = self.layers(x)
+        return x
+
+
+def _filter_kwargs(func, kwargs):
+    sig = inspect.signature(func)
+    return {k: v for k, v in kwargs.items() if k in sig.parameters}
+
+
+def _build_pipeline_vit(module_cls, num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    logger = get_dist_logger()
+    if gpc.is_initialized(ParallelMode.PIPELINE):
+        pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
+        pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
+    else:
+        pipeline_size = 1
+        pipeline_rank = 0
+    rank = gpc.get_global_rank()
+    parts = partition_uniform(num_layers, pipeline_size, num_chunks)[pipeline_rank]
+    models = []
+
+    for start, end in parts:
+        kwargs['first_stage'] = start == 0
+        kwargs['last_stage'] = end == num_layers
+        kwargs['start_idx'] = start
+        kwargs['end_idx'] = end
+        logger.info(f'Rank{rank} build layer {start}-{end}, {end-start}/{num_layers} layers')
+        chunk = module_cls(**_filter_kwargs(module_cls.__init__, kwargs)).to(device)
+        models.append(chunk)
+    if len(models) == 1:
+        model = models[0]
+    else:
+        model = nn.ModuleList(models)
+    return model
+
+
+def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    return _build_pipeline_vit(PipelineVisionTransformer, num_layers, num_chunks, device, **kwargs)
+```
+
+### Modify train script (`/hybrid_parallel/train_with_cifar10.py`)
+
+#### Import modules
+```python
+from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+                                        PipelineSchedule)
+from colossalai.utils import MultiTimer
+import os
+
+import colossalai
+
+import torch
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn import CrossEntropyLoss
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.utils import is_using_pp, get_dataloader
+from model.vit import build_pipeline_vit
+from model_zoo.vit.vit import _create_vit_model
+from tqdm import tqdm
+
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+```
+
+#### Launch Colossal-AI
+`colossalai.utils.is_using_pp` can help check whether pipeline parallelism is required in config file.
+
+```python
+# initialize distributed setting
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+
+# launch from torch
+colossalai.launch_from_torch(config=args.config)
+
+# get logger
+logger = get_dist_logger()
+logger.info("initialized distributed environment", ranks=[0])
+
+if hasattr(gpc.config, 'LOG_PATH'):
+    if gpc.get_global_rank() == 0:
+        log_path = gpc.config.LOG_PATH
+        if not os.path.exists(log_path):
+            os.mkdir(log_path)
+        logger.log_to_file(log_path)
+
+use_pipeline = is_using_pp()
+```
+
+#### Define model
+
+```python
+# create model
+model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
+                    patch_size=gpc.config.PATCH_SIZE,
+                    dim=gpc.config.HIDDEN_SIZE,
+                    depth=gpc.config.DEPTH,
+                    num_heads=gpc.config.NUM_HEADS,
+                    mlp_ratio=gpc.config.MLP_RATIO,
+                    num_classes=gpc.config.NUM_CLASSES,
+                    init_method='jax',
+                    checkpoint=gpc.config.CHECKPOINT)
+
+if use_pipeline:
+    model = build_pipeline_vit(num_layers=model_kwargs['depth'], num_chunks=1, **model_kwargs)
+else:
+    model = _create_vit_model(**model_kwargs)
+```
+
+#### Count number of parameters
+
+You can count model parameters on different pipeline stages easily.
+
+```
+# count number of parameters
+total_numel = 0
+for p in model.parameters():
+    total_numel += p.numel()
+if not gpc.is_initialized(ParallelMode.PIPELINE):
+    pipeline_stage = 0
+else:
+    pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
+logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
+```
+
+#### Build dataloader, optimizer, etc.
+
+```python
+def build_cifar(batch_size):
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(224, pad_if_needed=True),
+        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+
+    train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train)
+    test_dataset = CIFAR10(root=os.environ['DATA'], train=False, transform=transform_test)
+    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
+    test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True)
+    return train_dataloader, test_dataloader
+
+
+# create dataloaders
+train_dataloader , test_dataloader = build_cifar()
+
+# create loss function
+criterion = CrossEntropyLoss(label_smoothing=0.1)
+
+# create optimizer
+optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
+
+# create lr scheduler
+lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
+                                       total_steps=gpc.config.NUM_EPOCHS,
+                                       warmup_steps=gpc.config.WARMUP_EPOCHS)
+```
+
+#### Start Colossal-AI engine
+
+```python
+# initialize
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
+                                                                     optimizer=optimizer,
+                                                                     criterion=criterion,
+                                                                     train_dataloader=train_dataloader,
+                                                                     test_dataloader=test_dataloader)
+
+logger.info("Engine is built", ranks=[0])
+```
+
+#### Train: based on engine
+
+In the data parallelism example, we show how to train a model with Trainer API. We can also directly train a model based on engine. In this way, you can customize your training with more features.
+
+```python
+data_iter = iter(train_dataloader)
+
+for epoch in range(gpc.config.NUM_EPOCHS):
+    # training
+    engine.train()
+
+    if gpc.get_global_rank() == 0:
+        description = 'Epoch {} / {}'.format(
+            epoch,
+            gpc.config.NUM_EPOCHS
+        )
+        progress = tqdm(range(len(train_dataloader)), desc=description)
+    else:
+        progress = range(len(train_dataloader))
+    for _ in progress:
+        engine.zero_grad()
+        engine.execute_schedule(data_iter, return_output_label=False)
+        engine.step()
+        lr_scheduler.step()
+```
+
+### Start training
+```bash
+export DATA=<path_to_dataset>
+# If your torch >= 1.10.0
+torchrun --standalone --nproc_per_node <NUM_GPUs>  train_hybrid.py --config ./configs/config_pipeline_parallel.py
+# If your torch >= 1.9.0
+# python -m torch.distributed.run --standalone --nproc_per_node= <NUM_GPUs> train_hybrid.py --config ./configs/config_pipeline_parallel.py
+```
+
+
+
+
+## Tensor Parallelism and Hybrid Parallelism
+Tensor parallelism partitions each weight parameter across multiple devices in order to reduce memory load. Colossal-AI supports 1D, 2D, 2.5D and 3D tensor parallelism. Besides, you can combine tensor parallelism with pipeline parallelism and data parallelism to reach hybrid parallelism. Colossal-AI also provides an easy way to apply tensor parallelism and hybrid parallelism. On the basis of pipeline parallelism, changing a few lines of code in the config file is all you need.
+
+### Define your configuration file(`/hybrid_parallel/configs/vit_1d_tp2_pp2.py`)
+To use tensor parallelism, you only need to add related information to the **parallel dict**. To be specific, `TENSOR_PARALLEL_MODE` can be '1d', '2d', '2.5d' or '3d'. The sizes of the different kinds of parallelism should satisfy: `#GPUs = pipeline parallel size x tensor parallel size x data parallel size`. `data parallel size` will be automatically computed after you specify the number of GPUs, pipeline parallel size and tensor parallel size.
+
+```python
+from colossalai.amp import AMP_TYPE
+# parallel setting
+TENSOR_PARALLEL_SIZE = 2
+TENSOR_PARALLEL_MODE = '1d'
+
+parallel = dict(
+    pipeline=2,
+    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE)
+)
+
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+
+
+# pipeline config
+NUM_MICRO_BATCHES = parallel['pipeline']
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE)
+```
+
+Other configs:
+```python
+# hyperparameters
+# BATCH_SIZE is as per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 256
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 300
+WARMUP_EPOCHS = 32
+
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 768
+DEPTH = 12
+NUM_HEADS = 12
+MLP_RATIO = 4
+NUM_CLASSES = 10
+CHECKPOINT = True
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1  # add 1 for cls token
+```
+
+### Start training
+```bash
+export DATA=<path_to_dataset>
+# If your torch >= 1.10.0
+torchrun --standalone --nproc_per_node <NUM_GPUs>  train_hybrid.py --config ./configs/config_hybrid_parallel.py
+# If your torch >= 1.9.0
+# python -m torch.distributed.run --standalone --nproc_per_node= <NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
+```
diff --git a/docs/source/en/basics/colotensor_concept.md b/docs/source/en/basics/colotensor_concept.md
new file mode 100644
index 000000000000..2d8acd88dfd4
--- /dev/null
+++ b/docs/source/en/basics/colotensor_concept.md
@@ -0,0 +1,97 @@
+# ColoTensor Concepts
+
+Author: [Jiarui Fang](https://github.com/feifeibear), [Hongxin Liu](https://github.com/ver217) and [Haichen Huang](https://github.com/1SAA)
+
+**Prerequisite:**
+- [Colossal-AI Overview](../concepts/colossalai_overview.md)
+- [Distributed Training](../concepts/distributed_training.md)
+- [Paradigms of Parallelism](../concepts/paradigms_of_parallelism.md)
+
+## Introduction
+
+After ColossalAI version 0.1.8, [ColoTensor](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.html#colossalai.tensor.ColoTensor) becomes the basic data structure for tensors in ColossalAI. It is a subclass of torch.Tensor and can be used as a PyTorch Tensor. Additionally, some unique features make it possible to represent a Global Tensor with a payload distributed across multiple GPU devices. With the help of ColoTensor, users can write distributed DNN training programs in a way similar to serial ones.
+
+ColoTensor contains extra attributes capsuled in a [ColoTensorSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.tensor_spec.html#colossalai.tensor.tensor_spec.ColoTensorSpec) instance to describe the tensor's payload distribution and computing pattern.
+
+- ProcessGroup: how processes are organized as communication groups.
+- Distributed Spec: how tensor is distributed among process groups.
+- Compute Spec: how the tensor is used during computation.
+
+We elaborate on them one by one.
+
+## ProcessGroup
+
+An instance of class [ProcessGroup](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.html#colossalai.tensor.ProcessGroup) describes how processes are organized in process groups. Processes in a process group can participate in the same collective communication operations together, such as allgather, allreduce, etc. The way the process group is organized is dominated by the Tensor's parallelism strategy. For example, if the user defines the tensor parallel (TP) and data parallel (DP) modes of a tensor, then the process organization of the process group will be automatically deduced. The process group settings can vary among different tensors. Therefore, it enables us to support more complicated hybrid parallelism. The pipeline parallel (PP) definition is not in the ProcessGroup; it needs another set of mechanisms. We will supplement the related content of ColoTensor applied to PP in the future.
+
+Currently, a process group of ColoTensor is defined by two configurations, i.e. tp_degree and dp_degree. In the case of DP+TP hybrid parallelism, the device can be viewed as a 2D mesh. We place TP communication groups on the leading low dimension of the device mesh and then place the data parallel groups along the high dimension of the device mesh. The reason is that tensor parallelism has a larger communication overhead than data parallelism. Neighboring devices are placed inside a TP process group and are often placed in the same node.
+
+Considering that 8 processes are configured as tp_degree=4, and dp_degree=2, the layout is shown below. Process group tp0 contains gpu 0,1,2,3. Process group dp1 contains gpu 1 and 5.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ColoTensor_layout_demo.PNG"/>
+<figcaption>Process Group using tp_degree=4, dp_degree=2</figcaption>
+</figure>
+
+## Distributed Spec
+
+An instance of [Distributed Spec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.distspec.html) describes how a ColoTensor is distributed among the ProcessGroup.
+
+How tensors are distributed among DP process groups is automatically derived and does not need to be manually specified by the user. If this tensor is a model parameter, it is replicated within the DP process group. If it is an activation tensor, it is split along the highest dimension and the tensor payload is evenly distributed among processes in the DP process group.
+
+Therefore, when using Distributed Spec, we only need to describe the way that the tensor is distributed among TP process groups. There are currently two ways to distribute among TP process group, i.e. [ShardSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.distspec.html#colossalai.tensor.distspec.ShardSpec) and [ReplicaSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.distspec.html#colossalai.tensor.distspec.ReplicaSpec). ShardSpec needs to specify the dimension index dim of the partition and the number of partitions num_partitions. Currently, we only support the split on a single dim. Different dist specs on the TP process groups can be converted to each other through the set_dist_spec() interface. The spec conversions are recorded by the autograd mechanism and it will trigger corresponding reverse operations during backward propagation.
+
+## Compute Spec
+
+An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.compute_spec.html#colossalai.tensor.compute_spec.ComputeSpec) describes how a ColoTensor is used in DNN training. Currently, we will set the correct Compute Pattern for the ColoTensor as the parameters of the module. The specific application scenarios will be shown in the next document.
+
+## ColoParameter
+
+[ColoParameter](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.colo_parameter.html#colossalai.tensor.colo_parameter.ColoParameter) is a subclass of ColoTensor used to define a Global Parameter tensor. Its relationship with ColoTensor mirrors the relationship between torch.Tensor and torch.nn.Parameter. The latter allows the tensor to appear in the return values of the module's parameters() and named_parameters() methods.
+
+## Example
+
+Let's see an example. A ColoTensor is initialized and sharded on 4 GPUs using tp_degree=2, dp_degree=2. And then the tensor is sharded along the last dim among the TP process groups. Finally, we reshard it along the first dim (0 dim) among the TP process groups. We encourage users to run the code and observe the shape of each tensor.
+
+
+```python
+import torch
+import torch.multiprocessing as mp
+from colossalai.utils import free_port, print_rank_0
+from functools import partial
+
+import colossalai
+from colossalai.tensor import ProcessGroup, ColoTensor, ColoTensorSpec, ShardSpec, ComputeSpec, ComputePattern
+from colossalai.utils import free_port
+
+import torch
+
+def run_dist_tests(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    pg = ProcessGroup(tp_degree=2, dp_degree=2)
+
+    torch.manual_seed(0)
+    local_tensor = torch.randn(2, 3, 1).cuda()
+    print_rank_0(f"shape {local_tensor.shape}, {local_tensor.data}")
+
+    spec = ColoTensorSpec(pg, ShardSpec(dims=[-1], num_partitions=[pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    t1 = ColoTensor.from_torch_tensor(local_tensor, spec)
+    t1 = t1.to_replicate()
+    print_rank_0(f"shape {t1.shape}, {t1.data}")
+
+    spec2 = ShardSpec([0], [pg.tp_world_size()])
+    t1.set_dist_spec(spec2)
+    print_rank_0(f"shape {t1.shape}, {t1.data}")
+
+def test_dist_cases(world_size):
+    run_func = partial(run_dist_tests, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+if __name__ == '__main__':
+    test_dist_cases(4)
+```
+
+:::caution
+
+The ColoTensor is an experimental feature and may be updated.
+
+:::
diff --git a/docs/source/en/basics/command_line_tool.md b/docs/source/en/basics/command_line_tool.md
new file mode 100644
index 000000000000..48b199cf78e9
--- /dev/null
+++ b/docs/source/en/basics/command_line_tool.md
@@ -0,0 +1,53 @@
+# Command Line Tool
+
+Author: Shenggui Li
+
+**Prerequisite:**
+- [Distributed Training](../concepts/distributed_training.md)
+- [Colossal-AI Overview](../concepts/colossalai_overview.md)
+
+## Introduction
+
+Colossal-AI provides command-line utilities for the user.
+The current command line tools support the following features.
+
+- verify Colossal-AI build
+- launch distributed jobs
+- tensor parallel micro-benchmarking
+
+## Check Installation
+
+To verify whether your Colossal-AI is built correctly, you can use the command `colossalai check -i`.
+This command will show you information regarding the version compatibility and cuda extension.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/05/04/KJmcVknyPHpBofa.png"/>
+<figcaption>Check Installation Demo</figcaption>
+</figure>
+
+## Launcher
+
+To launch distributed jobs on single or multiple nodes, the command `colossalai run` can be used for process launching.
+You may refer to [Launch Colossal-AI](./launch_colossalai.md) for more details.
+
+## Tensor Parallel Micro-Benchmarking
+
+As Colossal-AI provides an array of tensor parallelism methods, it is not intuitive to choose one for your hardware and
+model. Therefore, we provide a simple benchmarking to evaluate the performance of various tensor parallelisms on your system.
+This benchmarking is run on a simple MLP model where the input data is of the shape `(batch_size, seq_length, hidden_size)`.
+Based on the number of GPUs, the CLI will look for all possible tensor parallel configurations and display the benchmarking results.
+You can customize the benchmarking configurations by checking out `colossalai benchmark --help`.
+
+```shell
+# run on 4 GPUs
+colossalai benchmark --gpus 4
+
+# run on 8 GPUs
+colossalai benchmark --gpus 8
+```
+
+:::caution
+
+Only single-node benchmarking is supported currently.
+
+:::
diff --git a/docs/source/en/basics/configure_parallelization.md b/docs/source/en/basics/configure_parallelization.md
new file mode 100644
index 000000000000..4ac0299eac14
--- /dev/null
+++ b/docs/source/en/basics/configure_parallelization.md
@@ -0,0 +1,156 @@
+# Configure Parallelization
+
+Author: Shenggui Li, Siqi Mai
+
+**Prerequisite:**
+- [Distributed Training](../concepts/distributed_training.md)
+- [Paradigms of Parallelism](../concepts/paradigms_of_parallelism.md)
+- [Define Your Configuration](./define_your_config.md)
+
+
+## Introduction
+
+We support multiple parallelization methods in Colossal-AI. Hybrid parallelism in our codebase refers to the combination
+of data parallelism, pipeline parallelism and tensor parallelism (1D, 2D, 2.5D, 3D).
+
+Each parallelism requires a different network topology and thus initializes different process groups.
+You can initialize the corresponding process group by setting `parallel` in the config file.
+The configuration for `parallel` must obey the following format. Data parallel size will be
+inferred automatically based on your inputs to pipeline parallelism and tensor parallelism.
+`colossalai.launch` will initialize these distributed process groups automatically based on your configuration.
+
+Some sample configurations are shown below:
+
+```python
+# sample format
+parallel = dict(
+    pipeline=dict("size": int),
+    tensor=dict("size": int, "mode": '1d' or '2d' or '2.5d' or '3d', "kwargs": Any)
+)
+
+# this is ok
+parallel = dict(
+    pipeline=dict(size=2),
+    tensor=dict(size=4, mode='2d')
+)
+
+# this is ok
+parallel = dict(
+    pipeline=2,
+    tensor=dict(size=4, mode='2d')
+)
+
+# this is not ok
+# as you need to specify the mode for tensor parallelism
+parallel = dict(
+    pipeline=2,
+    tensor=4
+)
+
+# this is ok as well, as tensor will default to size 1
+# and mode None
+parallel = dict(
+    pipeline=2
+)
+
+# this is ok as well as pipeline will default to size 1
+parallel = dict(
+    tensor=dict(size=4, mode='2d')
+)
+
+```
+
+The key name `size` refers to the parallel size of the parallelism dimension. For example, pipeline size 2 means there
+will be 2 pipeline stages. The key name `mode` in tensor parallel config means the corresponding tensor parallelism
+will be initialized.
+
+**You can choose to not have 'parallel' in your configuration and both pipeline and tensor will default to size 1.**
+
+**Total number of GPUs must be equal to `data parallel size * tensor parallel size * pipeline parallel size`**
+
+## Data Parallel
+
+Data parallel is the most common way to distribute your training task by splitting data into several shards and training on
+a single shard on each device. The configuration for data parallel is detected automatically and set for you. You do not
+have to explicitly set them in your configurations. There are two ways to handle the all-reduce in data parallel in Colossal-AI.
+
+1. If you specify gradient handlers, gradients will be all-reduced according to the gradient handlers
+2. Otherwise, PyTorch DistributedDataParallel will be used
+
+In most cases, you will be using the second mode unless you have complex handling of the gradients.
+
+## 1D, 2D, 2.5D and 3D Parallel
+
+To enable hybrid parallelism, we provide an array of tensor parallelism. We provide the list of papers which match each
+tensor parallel method. These parallel modes need to work with the distributed layers provided by Colossal-AI.
+
+- 1D: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
+
+- 2D: [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343)
+  2D parallel relies on the SUMMA matrix multiplication algorithm and splits the input data, model weights and layer
+  outputs along two different dimensions. The tensor chunks are distributed over a 2D mesh of `P = N^2` devices where
+  `N` is the number of tensor chunks in a single dimension.
+
+- 2.5D: [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500)
+  Inspired by the 2.5D matrix multiplication algorithm, 2.5D parallel introduces a novel tensor parallelism which
+  further parallelizes 2D tensor parallelism. An amount of `P = N^2 ∗ d` processors are arranged into `d` layers, where
+  each layer performs matrix multiplication operations independently with a dimension `N`.
+
+- 3D: [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450)
+  We also introduce a 3D tensor parallelism that parallelizes neural networks on a 3D processor cube. This method
+  achieves the optimal `O(P^{1/3})` communication overhead on `P` processors, while both computation and memory usage
+  are evenly distributed through optimized load balancing of parameters as well as activations.
+
+```python
+# 1D parallel
+parallel = dict(
+    tensor=dict(size=4, mode='1d')
+)
+
+# 2D parallel
+parallel = dict(
+    tensor=dict(size=4, mode='2d')
+)
+
+# 2.5D parallel
+parallel = dict(
+    tensor=dict(size=8, mode='2.5d', depth=2)
+)
+
+# 3D parallel
+parallel = dict(
+    tensor=dict(size=8, mode='3d')
+)
+```
+
+Once you specify the tensor parallel mode in your configuration, you can proceed to use its corresponding distributed
+operator. For example, if your mode is '2d', you can use `colossalai.nn.Linear2D` in your model construction.
+
+
+## Pipeline Parallel
+
+Pipeline parallelism is to split the model into several partitions by layer. For example, let's assume we have a simple
+model which consists of two linear layers. We have two GPUs, and we can allocate the first linear layer to the first GPU
+and the second layer to the second GPU.
+
+You can set the number of pipeline stages in your configuration file. When pipeline size is larger than 1, Colossal-AI
+will automatically create the pipeline schedule which defines the forward and backward step.
+
+```python
+parallel = dict(
+    pipeline=dict(size=4), # number of pipeline stages
+)
+```
+
+## Sequence Parallel
+
+Sequence parallel is to support long-sequence modelling such as document-level text understanding and medical imaging.
+This method is proposed in [Sequence Parallelism: Making 4D Parallelism Possible](https://arxiv.org/abs/2105.13120).
+You can specify the mode to be `sequence` to initialize its process group.
+
+
+```python
+parallel = dict(
+    tensor=dict(size=4, mode='sequence')
+)
+```
diff --git a/docs/source/en/basics/define_your_config.md b/docs/source/en/basics/define_your_config.md
new file mode 100644
index 000000000000..d2569691b7dc
--- /dev/null
+++ b/docs/source/en/basics/define_your_config.md
@@ -0,0 +1,82 @@
+# Define Your Configuration
+
+Author: Guangyang Lu, Shenggui Li, Siqi Mai
+
+**Prerequisite:**
+- [Distributed Training](../concepts/distributed_training.md)
+- [Colossal-AI Overview](../concepts/colossalai_overview.md)
+
+
+## Introduction
+
+In Colossal-AI, a configuration file is required to specify the features the system will inject into the training process.
+In this tutorial, we will show you how to construct your configuration file and how this config file will be used.
+Using a configuration file has several advantages:
+
+1. You can store your feature configuration and training hyper-parameters in different configuration files
+2. New features released in the future can be specified in the configuration without code change in the training script
+
+In this tutorial, we will cover how to define your configuration file.
+
+## Configuration Definition
+
+In a configuration file, there are two types of variables. One serves as feature specification and the other serves
+as hyper-parameters. All feature-related variables are reserved keywords. For example, if you want to use mixed precision
+training, you need to use the variable name `fp16` in the config file and follow a pre-defined format.
+
+### Feature Specification
+
+There is an array of features Colossal-AI provides to speed up training. Each feature is defined by a corresponding field
+in the config file. In this tutorial, we are not giving the config details for all the features, but rather we are providing
+an illustration of how to specify a feature. **The details of each feature can be found in its respective tutorial.**
+
+To illustrate the use of config file, we use mixed precision training as an example here. In order to do so, you need to
+follow the steps below.
+
+1. create a configuration file (e.g. `config.py`, the file name can be anything)
+2. define the mixed precision configuration in the config file. For example, in order to use mixed precision training
+natively provided by PyTorch, you can just write these lines of code below into your config file.
+
+   ```python
+   from colossalai.amp import AMP_TYPE
+
+   fp16 = dict(
+     mode=AMP_TYPE.TORCH
+   )
+   ```
+
+3. Tell Colossal-AI where your config file is when launching the distributed environment. For example, the config file is in
+the current directory.
+
+   ```python
+   import colossalai
+
+   colossalai.launch(config='./config.py', ...)
+   ```
+
+In this way, Colossal-AI knows what features you want to use and will inject this feature during `colossalai.initialize`.
+
+### Global Hyper-parameters
+
+Besides feature specification, the config file can also serve as a place to define your training hyper-parameters. This
+comes in handy when you want to perform multiple experiments; the details of each experiment can be put into a single config file
+to avoid confusion. These parameters will be stored in the global parallel context and can be accessed in the training script.
+
+For example, you can specify the batch size in your config file.
+
+```python
+BATCH_SIZE = 32
+```
+
+After launch, you are able to access your hyper-parameters through global parallel context.
+
+```python
+import colossalai
+from colossalai.core import global_context as gpc
+
+colossalai.launch(config='./config.py', ...)
+
+# access your parameter
+print(gpc.config.BATCH_SIZE)
+
+```
diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md
new file mode 100644
index 000000000000..39792f622aa9
--- /dev/null
+++ b/docs/source/en/basics/engine_trainer.md
@@ -0,0 +1,387 @@
+# Use Engine and Trainer in Training
+
+Author: Shenggui Li, Siqi Mai
+
+**Prerequisite:**
+- [Initialize Features](./initialize_features.md)
+
+## Introduction
+
+In this tutorial, you will learn how to use the engine and trainer provided in Colossal-AI to train your model.
+Before we delve into the details, we would like to first explain the concept of engine and trainer.
+
+### Engine
+
+Engine is essentially a wrapper class for model, optimizer and loss function.
+When we call `colossalai.initialize`, an engine object will be returned, and it has already been equipped with
+functionalities such as gradient clipping, gradient accumulation and zero optimizer as specified in your configuration file.
+An engine object will use similar APIs to those of PyTorch training components such that the user has minimum change
+to their code.
+
+Below is a table which shows the commonly used APIs for the engine object.
+
+| Component                             | Function                                      | PyTorch                         | Colossal-AI                            |
+| ------------------------------------- | --------------------------------------------- | ------------------------------- | -------------------------------------- |
+| optimizer                             | Set all gradients to zero before an iteration | optimizer.zero_grad()           | engine.zero_grad()                     |
+| optimizer                             | Update the parameters                         | optimizer.step()                | engine.step()                          |
+| model                                 | Run a forward pass                            | outputs = model(inputs)         | outputs = engine(inputs)               |
+| criterion                             | Calculate the loss value                      | loss = criterion(output, label) | loss = engine.criterion(output, label) |
+| criterion                             | Execute back-propagation on the model         | loss.backward()                 | engine.backward(loss)                  |
+
+The reason why we need such an engine class is that we can add more functionalities while hiding the implementations in
+the `colossalai.initialize` function.
+Imagine we are going to add a new feature: we can manipulate the model, optimizer, dataloader and loss function in the
+`colossalai.initialize` function and only expose an engine object to the user.
+The user only needs to modify their code to the minimum extent by adapting the normal PyTorch APIs to the Colossal-AI
+engine APIs. In this way, they can enjoy more features for efficient training.
+
+A normal training iteration using engine can be:
+
+```python
+import colossalai
+
+# build your model, optimizer, criterion, dataloaders
+...
+
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
+                                                                    optimizer,
+                                                                    criterion,
+                                                                    train_dataloader,
+                                                                    test_dataloader)
+for img, label in train_dataloader:
+    engine.zero_grad()
+    output = engine(img)
+    loss = engine.criterion(output, label)
+    engine.backward(loss)
+    engine.step()
+```
+
+### Trainer
+
+Trainer is a more high-level wrapper for the user to execute training with fewer lines of code. However, in pursuit of more abstraction, it loses some flexibility compared to engine. The trainer is designed to execute a forward and backward step to perform model weight update. It is easy to create a trainer object by passing the engine object. The trainer has a default value `None` for the argument `schedule`. In most cases, we leave this value to `None` unless we want to use pipeline parallelism. If you wish to explore more about this parameter, you can go to the tutorial on pipeline parallelism.
+
+```python
+from colossalai.logging import get_dist_logger
+from colossalai.trainer import Trainer, hooks
+
+# build components and initialize with colossalai.initialize
+...
+
+# create a logger so that trainer can log on the console
+logger = get_dist_logger()
+
+# create a trainer object
+trainer = Trainer(
+    engine=engine,
+    logger=logger
+)
+```
+
+
+
+In trainer, the user can customize some hooks and attach these hooks to the trainer object. A hook object will execute life-cycle methods periodically based on the training scheme. For example, the `LRSchedulerHook` will execute `lr_scheduler.step()` to update the learning rate of the model during either `after_train_iter` or `after_train_epoch` stages depending on whether the user wants to update the learning rate after each training iteration or only after the entire training epoch. You can store the hook objects in a list and pass it to the `trainer.fit` method. The `trainer.fit` method will execute training and testing based on your parameters. If `display_progress` is True, a progress bar will be displayed on your console to show the training process.
+
+```python
+# define the hooks to attach to the trainer
+hook_list = [
+    hooks.LossHook(),
+    hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
+    hooks.AccuracyHook(accuracy_func=Accuracy()),
+    hooks.LogMetricByEpochHook(logger),
+]
+
+# start training
+trainer.fit(
+    train_dataloader=train_dataloader,
+    epochs=NUM_EPOCHS,
+    test_dataloader=test_dataloader,
+    test_interval=1,
+    hooks=hook_list,
+    display_progress=True
+)
+```
+
+If you want to customize your own hook class, you can inherit `hooks.BaseHook` and override the life-cycle methods of your interest. A dummy example to demonstrate how to create a simple log message hook is provided below for your reference.
+
+```python
+from colossalai.logging import get_dist_logger
+from colossalai.trainer import hooks
+
+class LogMessageHook(hooks.BaseHook):
+
+    def __init__(self, priority=10):
+        self._logger = get_dist_logger()
+
+    def before_train(self, trainer):
+        self._logger.info('training starts')
+
+    def after_train(self, trainer):
+        self._logger.info('training finished')
+
+
+...
+
+# then in your training script
+hook_list.append(LogMessageHook())
+```
+
+
+
+In the sections below, I will guide you through the steps required to train a ResNet model with both engine and trainer.
+
+
+
+## Explain with ResNet
+
+### Overview
+
+In this section we will cover:
+
+1. Use an engine object to train a ResNet34 model on CIFAR10 dataset
+2. Use a trainer object to train a ResNet34 model on CIFAR10 dataset
+
+The project structure will be like:
+
+```bash
+-- config.py
+-- run_resnet_cifar10_with_engine.py
+-- run_resnet_cifar10_with_trainer.py
+```
+
+Steps 1-4 below are commonly used regardless of using engine or trainer. Thus, steps 1-4 + step 5 will be your `run_resnet_cifar10_with_engine.py` and steps 1-4 + step 6 will form `run_resnet_cifar10_with_trainer.py`.
+
+### Hands-on Practice
+
+#### Step 1. Create a Config File
+
+In your project folder, create a `config.py`. This file is to specify some features you may want to use to train your model. A sample config file is as below:
+
+```python
+from colossalai.amp import AMP_TYPE
+
+BATCH_SIZE = 128
+NUM_EPOCHS = 200
+
+fp16=dict(
+    mode=AMP_TYPE.TORCH
+)
+```
+
+In this config file, we specify that we want to use batch size 128 per GPU and run for 200 epochs. These two parameters are exposed by `gpc.config`. For example, you can use `gpc.config.BATCH_SIZE` to access the value you store in your config file. The `fp16` configuration tells `colossalai.initialize` to use mixed precision training provided by PyTorch to train the model with better speed and lower memory consumption.
+
+#### Step 2. Initialize Distributed Environment
+
+We need to initialize the distributed training environment. This has been introduced in the tutorial on how to
+[launch Colossal-AI](./launch_colossalai.md). For this demonstration, we use `launch_from_torch` and the PyTorch launch utility.
+
+```python
+import colossalai
+
+# ./config.py refers to the config file we just created in step 1
+colossalai.launch_from_torch(config='./config.py')
+```
+
+#### Step 3. Create all the training components
+
+In this step, we can create all the components used for training. These components include:
+
+1. Model
+2. Optimizer
+3. Criterion/loss function
+4. Training/Testing dataloaders
+5. Learning rate Scheduler
+6. Logger
+
+
+
+To build these components, you need to import the following modules:
+
+```python
+from pathlib import Path
+from colossalai.logging import get_dist_logger
+import torch
+import os
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_dataloader
+from torchvision import transforms
+from colossalai.nn.lr_scheduler import CosineAnnealingLR
+from torchvision.datasets import CIFAR10
+from torchvision.models import resnet34
+```
+
+
+
+Then build your components in the same way as how to normally build them in your PyTorch scripts. In the script below, we set the root path for CIFAR10 dataset as an environment variable `DATA`. You can change it to any path you like, for example, you can change `root=Path(os.environ['DATA'])` to `root='./data'` so that there is no need to set the environment variable.
+
+```python
+# build logger
+logger = get_dist_logger()
+
+# build resnet
+model = resnet34(num_classes=10)
+
+# build datasets
+train_dataset = CIFAR10(
+    root='./data',
+    download=True,
+    transform=transforms.Compose(
+        [
+            transforms.RandomCrop(size=32, padding=4),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
+                0.2023, 0.1994, 0.2010]),
+        ]
+    )
+)
+
+test_dataset = CIFAR10(
+    root='./data',
+    train=False,
+    transform=transforms.Compose(
+        [
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
+                0.2023, 0.1994, 0.2010]),
+        ]
+    )
+)
+
+# build dataloaders
+train_dataloader = get_dataloader(dataset=train_dataset,
+                                  shuffle=True,
+                                  batch_size=gpc.config.BATCH_SIZE,
+                                  num_workers=1,
+                                  pin_memory=True,
+                                  )
+
+test_dataloader = get_dataloader(dataset=test_dataset,
+                                 add_sampler=False,
+                                 batch_size=gpc.config.BATCH_SIZE,
+                                 num_workers=1,
+                                 pin_memory=True,
+                                 )
+
+# build criterion
+criterion = torch.nn.CrossEntropyLoss()
+
+# optimizer
+optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
+
+# lr_scheduler
+lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)
+```
+
+#### Step 4. Initialize with Colossal-AI
+
+Next, the essential step is to obtain the engine class by calling `colossalai.initialize`. As stated in `config.py`, we will be using mixed precision training for training ResNet34 model. `colossalai.initialize` will automatically check your config file and assign relevant features to your training components. In this way, our engine object has already been able to train with mixed precision, but you do not have to explicitly take care of it.
+
+```python
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
+                                                                     optimizer,
+                                                                     criterion,
+                                                                     train_dataloader,
+                                                                     test_dataloader,
+                                                                     )
+```
+
+
+
+#### Step 5. Train with engine
+
+With all the training components ready, we can train ResNet34 just like how to normally deal with PyTorch training.
+
+```python
+for epoch in range(gpc.config.NUM_EPOCHS):
+    # execute a training iteration
+    engine.train()
+    for img, label in train_dataloader:
+        img = img.cuda()
+        label = label.cuda()
+
+        # set gradients to zero
+        engine.zero_grad()
+
+        # run forward pass
+        output = engine(img)
+
+        # compute loss value and run backward pass
+        train_loss = engine.criterion(output, label)
+        engine.backward(train_loss)
+
+        # update parameters
+        engine.step()
+
+    # update learning rate
+    lr_scheduler.step()
+
+    # execute a testing iteration
+    engine.eval()
+    correct = 0
+    total = 0
+    for img, label in test_dataloader:
+        img = img.cuda()
+        label = label.cuda()
+
+        # run prediction without back-propagation
+        with torch.no_grad():
+            output = engine(img)
+            test_loss = engine.criterion(output, label)
+
+        # compute the number of correct prediction
+        pred = torch.argmax(output, dim=-1)
+        correct += torch.sum(pred == label)
+        total += img.size(0)
+
+    logger.info(
+        f"Epoch {epoch} - train loss: {train_loss:.5}, test loss: {test_loss:.5}, acc: {correct / total:.5}, lr: {lr_scheduler.get_last_lr()[0]:.5g}", ranks=[0])
+```
+
+#### Step 6. Train with trainer
+
+If you wish to train with a trainer object, you can follow the code snippet below:
+
+```python
+from colossalai.nn.metric import Accuracy
+from colossalai.trainer import Trainer, hooks
+
+
+# create a trainer object
+trainer = Trainer(
+    engine=engine,
+    logger=logger
+)
+
+# define the hooks to attach to the trainer
+hook_list = [
+    hooks.LossHook(),
+    hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
+    hooks.AccuracyHook(accuracy_func=Accuracy()),
+    hooks.LogMetricByEpochHook(logger),
+    hooks.LogMemoryByEpochHook(logger)
+]
+
+# start training
+# run testing every 1 epoch
+trainer.fit(
+    train_dataloader=train_dataloader,
+    epochs=gpc.config.NUM_EPOCHS,
+    test_dataloader=test_dataloader,
+    test_interval=1,
+    hooks=hook_list,
+    display_progress=True
+)
+```
+
+
+
+#### Step 7. Start Distributed Training
+
+Lastly, we can invoke the scripts using the distributed launcher provided by PyTorch as we used `launch_from_torch` in Step 2. You need to replace `<num_gpus>` with the number of GPUs available on your machine. This number can be 1 if you only want to use 1 GPU. If you wish to use other launchers, you can refer to the tutorial on How to Launch Colossal-AI.
+
+```bash
+# with engine
+python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_engine.py
+# with trainer
+python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
+```
diff --git a/docs/source/en/basics/initialize_features.md b/docs/source/en/basics/initialize_features.md
new file mode 100644
index 000000000000..e768d2022ad8
--- /dev/null
+++ b/docs/source/en/basics/initialize_features.md
@@ -0,0 +1,49 @@
+# Initialize Features
+
+Author: Shenggui Li, Siqi Mai
+
+**Prerequisite:**
+- [Distributed Training](../concepts/distributed_training.md)
+- [Colossal-AI Overview](../concepts/colossalai_overview.md)
+
+## Introduction
+
+In this tutorial, we will cover the use of `colossalai.initialize` which injects features into your training components
+(e.g. model, optimizer, dataloader) seamlessly. Calling `colossalai.initialize` is the standard procedure before you run
+into your training loops.
+
+In the section below, I will cover how `colossalai.initialize` works and what we should take note of.
+
+## Usage
+
+In a typical workflow, we will launch distributed environment at the beginning of our training script.
+Afterwards, we will instantiate our objects such as model, optimizer, loss function, dataloader etc. At this moment, `colossalai.initialize`
+can come in to inject features into these objects. A pseudo-code example is like below:
+
+```python
+import colossalai
+import torch
+...
+
+
+# launch distributed environment
+colossalai.launch(config='./config.py', ...)
+
+# create your objects
+model = MyModel()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+criterion = torch.nn.CrossEntropyLoss()
+train_dataloader = MyTrainDataloader()
+test_dataloader = MyTrainDataloader()
+
+# initialize features
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
+                                                                     optimizer,
+                                                                     criterion,
+                                                                     train_dataloader,
+                                                                     test_dataloader)
+```
+
+The `colossalai.initialize` function will return an `Engine` object. The engine object is a wrapper
+for model, optimizer and loss function. **The engine object will run with features specified in the config file.**
+More details about the engine can be found in the [Use Engine and Trainer in Training](./engine_trainer.md).
diff --git a/docs/source/en/basics/launch_colossalai.md b/docs/source/en/basics/launch_colossalai.md
new file mode 100644
index 000000000000..be487f8539a5
--- /dev/null
+++ b/docs/source/en/basics/launch_colossalai.md
@@ -0,0 +1,232 @@
+# Launch Colossal-AI
+
+Author: Chuanrui Wang, Shenggui Li, Siqi Mai
+
+**Prerequisite:**
+- [Distributed Training](../concepts/distributed_training.md)
+- [Colossal-AI Overview](../concepts/colossalai_overview.md)
+
+
+## Introduction
+
+As mentioned in the previous tutorials stated in the prerequisite, you need to initialize the distributed environment
+for Colossal-AI after your config file is prepared.
+We call this process `launch`.
+In this tutorial, you will learn how to launch Colossal-AI on your server, be it a small one or big one.
+
+In Colossal-AI, we provided several launch methods to initialize the distributed backend.
+In most cases, you can use `colossalai.launch` and `colossalai.get_default_parser` to pass the
+parameters via command line.
+If you happen to use launchers such as SLURM, OpenMPI and PyTorch launch utility,
+we also provide several launching helper methods to access the rank and world size from the environment variables
+set by these launchers directly for your convenience.
+
+In this tutorial we will cover how to launch Colossal-AI to initialize the distributed backends:
+- Launch with `colossalai.launch`
+- Launch with Colossal-AI CLI
+- Launch with SLURM
+- Launch with OpenMPI
+
+## Launch Distributed Environment
+
+In order to launch Colossal-AI, we need two types of arguments:
+1. config file
+2. distributed settings
+
+The config file is always required regardless of the launch method but distributed settings can vary. The config file
+can be a path to the configuration file or a Python dictionary. The distributed settings can be passed via command line
+or multi-process launchers.
+
+### Command Line Parser
+
+Before we jump to `launch`, we firstly need to understand what parameters we need for initialization.
+As stated in the `Basic Concepts in Distributed Training` section of [Distributed Training](../concepts/distributed_training.md),
+the important parameters are:
+
+1. host
+2. port
+3. rank
+4. world_size
+5. backend
+
+In Colossal-AI, we provided a command line parser which has added these arguments in advance. You can get this parser by calling
+`colossalai.get_default_parser()`. This parser is usually used with `colossalai.launch`.
+
+```python
+# add these lines in your train.py
+import colossalai
+
+# get default parser
+parser = colossalai.get_default_parser()
+
+# if you want to add your own arguments
+parser.add_argument(...)
+
+# parse arguments
+args = parser.parse_args()
+```
+
+Then in your terminal, you can pass in these arguments:
+```shell
+
+python train.py --host <host> --rank <rank> --world_size <world_size> --port <port> --backend <backend>
+```
+
+`backend` is optional and the default value is `nccl`.
+
+### Native Launch
+
+To initialize the distributed environment, we provided a general `colossalai.launch` API. The `colossalai.launch` function takes in the parameters
+listed above and creates a default process group in the communication network. This function is often used with the default
+parser for convenience.
+
+```python
+import colossalai
+
+# parse arguments
+args = colossalai.get_default_parser().parse_args()
+
+# launch distributed environment
+colossalai.launch(config=<CONFIG>,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  host=args.host,
+                  port=args.port,
+                  backend=args.backend
+)
+
+```
+
+
+### Launch with Colossal-AI CLI
+
+To enable easy launching on both single or multi nodes, we have implemented a launcher for Colossal-AI. This launcher is
+a wrapper of the torch distributed launch utility but enhanced with the capability of launching multi-node jobs easily.
+
+First, we need to set the launch method in our code. As this is a wrapper of the torch distributed launch utility, we will
+use `colossalai.launch_from_torch`. The arguments required for distributed environment such as rank, world size, host and port are all set by the PyTorch
+launcher and can be read from the environment variable directly.
+
+```python
+import colossalai
+
+colossalai.launch_from_torch(
+    config=<CONFIG>,
+)
+```
+
+Next, we can easily start multiple processes with `colossalai run` in your terminal. Below is an example to run the code
+on a single node with 4 GPUs. You can change the number of GPUs by `nproc_per_node` and the default port by `master_port`.
+
+```shell
+# run on the local node with 4 GPUs (default port: 29500)
+colossalai run --nproc_per_node 4 train.py
+
+# run on the local node with 4 GPUs with a different port
+colossalai run --nproc_per_node 4 --master_port 29505 test.py
+```
+
+If you are in a cluster and want to launch multi-node training, the CLI can help you start processes on different nodes
+with one simple command. There are two ways you can launch multi-node jobs.
+
+- Run with `--host`
+
+This is suitable when you only have a few nodes. Let's say I have two nodes, namely `host1` and `host2`,  I can start
+multi-node training with the following command. Compared to single-node training, you must specify the `master_addr`
+option, which is auto-set to localhost if running on a single node only.
+
+:::caution
+
+`master_addr` cannot be localhost when running on multiple nodes, it should be the hostname or IP address of a node.
+
+:::
+
+```shell
+# run on these two nodes
+colossalai run --nproc_per_node 4 --host host1,host2 --master_addr host1 test.py
+```
+- Run with `--hostfile`
+
+This method is suitable when you have a lot of nodes. The host file is a simple text file listing the available nodes.
+The list of nodes is commonly provided by cluster managers such as SLURM and PBS Pro. For example, you can get the list
+of nodes allocated to you via the environment variable `SLURM_NODELIST` in SLURM and `PBS_NODEFILE` in PBS Pro.
+Just do `echo $SLURM_NODELIST` or `cat $PBS_NODEFILE` to check it out. If you do not have such cluster managers, you can
+manually create one for your own use.
+
+The host file given to Colossal-AI launcher must be in the following format where each line is the host name of a node.
+
+```text
+host1
+host2
+```
+
+With the host file ready, we can launch multi-node jobs with the following commands. Just like using `--host`, you also
+need to specify the `master_addr` option. Some extra options are provided for `--hostfile` as listed below:
+
+- `--include`: specify the hosts to include for multi-node jobs. For example, if your host file has 8 nodes, but you
+happen to only want to run on 6 nodes instead, you can add `--include host1,host2,host3,...,host6` so that the job will only
+be launched on the 6 nodes.
+- `--exclude`: specify the hosts to exclude for multi-node jobs. This is useful when some nodes are faulty. For example,
+if host1 GPU has some problems and you do not wish to run on host1 but all other nodes, you can add `--exclude host1` so that
+the job will only be launched on the remaining nodes.
+
+```shell
+# run with a hostfile
+colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1  test.py
+
+# only include certain hosts to execute commands
+# this is used to manually select nodes to run
+colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1  --include host1 test.py
+
+# exclude certain hosts to execute commands
+# this can be used when certain nodes are faulty
+colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1  --exclude host2 test.py
+```
+
+### Launch with SLURM
+
+If you are on a system managed by the SLURM scheduler, you can also rely on the `srun` launcher to kickstart your Colossal-AI scripts.
+We provided the helper function `launch_from_slurm` for compatibility with the SLURM scheduler.
+`launch_from_slurm` will automatically read the rank and world size from the environment variables `SLURM_PROCID` and `SLURM_NPROCS` respectively
+and use them to start the distributed backend.
+Do this in your training script:
+
+```python
+import colossalai
+
+colossalai.launch_from_slurm(
+    config=<CONFIG>,
+    host=args.host,
+    port=args.port
+)
+```
+
+You can initialize the distributed environment by using this command in terminal.
+
+```bash
+srun python train.py --host <master_node> --port 29500
+```
+
+### Launch with OpenMPI
+If you are more familiar with OpenMPI, you can use `launch_from_openmpi` instead.
+`launch_from_openmpi` will automatically read the local rank, global rank and world size from the environment variables
+`OMPI_COMM_WORLD_LOCAL_RANK`, `OMPI_COMM_WORLD_RANK` and `OMPI_COMM_WORLD_SIZE` respectively and
+use them to start the distributed backend.
+
+Do this in your train.py:
+```python
+colossalai.launch_from_openmpi(
+    config=<CONFIG>,
+    host=args.host,
+    port=args.port
+)
+```
+
+A sample command to launch multiple processes with OpenMPI would be:
+
+```bash
+mpirun --hostfile <my_hostfile> -np <num_process> python train.py --host <node name or ip> --port 29500
+```
+
+- --hostfile: use this option to specify a list of hosts on which to run
+- -np: set the number of processes (GPUs) to launch in total. For example, if -np 4, 4 python processes will be initialized to run train.py.
diff --git a/docs/source/en/basics/model_checkpoint.md b/docs/source/en/basics/model_checkpoint.md
new file mode 100644
index 000000000000..09d44e7c2709
--- /dev/null
+++ b/docs/source/en/basics/model_checkpoint.md
@@ -0,0 +1,61 @@
+# Model Checkpoint
+
+Author : Guangyang Lu
+
+**Prerequisite:**
+- [Launch Colossal-AI](./launch_colossalai.md)
+- [Initialize Colossal-AI](./initialize_features.md)
+
+**Example Code:**
+- [ColossalAI-Examples Model Checkpoint](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/utils/checkpoint)
+
+**This function is experimental.**
+
+## Introduction
+
+In this tutorial, you will learn how to save and load model checkpoints.
+
+To leverage the power of parallel strategies in Colossal-AI, modifications to models and tensors are needed, for which you cannot directly use `torch.save` or `torch.load` to save or load model checkpoints. Therefore, we have provided you with the API to achieve the same thing.
+
+Moreover, when loading, you are not required to use the same parallel strategy as when saving.
+
+## How to use
+
+### Save
+
+There are two ways to train a model in Colossal-AI, by engine or by trainer.
+**Be aware that we only save the `state_dict`.** Therefore, when loading the checkpoints, you need to define the model first.
+
+#### Save when using engine
+
+```python
+from colossalai.utils import save_checkpoint
+model = ...
+engine, _, _, _ = colossalai.initialize(model=model, ...)
+for epoch in range(num_epochs):
+    ... # do some training
+    save_checkpoint('xxx.pt', epoch, model)
+```
+
+#### Save when using trainer
+```python
+from colossalai.trainer import Trainer, hooks
+model = ...
+engine, _, _, _ = colossalai.initialize(model=model, ...)
+trainer = Trainer(engine, ...)
+hook_list = [
+            hooks.SaveCheckpointHook(1, 'xxx.pt', model)
+            ...]
+
+trainer.fit(...
+            hook=hook_list)
+```
+
+### Load
+
+```python
+from colossalai.utils import load_checkpoint
+model = ...
+load_checkpoint('xxx.pt', model)
+... # train or test
+```
diff --git a/docs/source/en/concepts/colossalai_overview.md b/docs/source/en/concepts/colossalai_overview.md
new file mode 100644
index 000000000000..d75d20196b08
--- /dev/null
+++ b/docs/source/en/concepts/colossalai_overview.md
@@ -0,0 +1,36 @@
+# Colossal-AI Overview
+
+Author: Shenggui Li, Siqi Mai
+
+## About Colossal-AI
+
+With the development of deep learning model size, it is important to shift to a new training paradigm. The traditional training method with no parallelism and optimization became a thing of the past and new training methods are the key to make training large-scale models efficient and cost-effective.
+
+Colossal-AI is designed to be a unified system to provide an integrated set of training skills and utilities to the user. You can find the common training utilities such as mixed precision training and gradient accumulation. Besides, we provide an array of parallelism including data, tensor and pipeline parallelism. We optimize tensor parallelism with different multi-dimensional distributed matrix-matrix multiplication algorithms. We also provide different pipeline parallelism methods to allow the user to scale their model across nodes efficiently. More advanced features such as offloading can be found in this tutorial documentation in detail as well.
+
+## General Usage
+
+We aim to make Colossal-AI easy to use and non-intrusive to user code. There is a simple general workflow if you want to use Colossal-AI.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/ZK7ICWzbMsVuJof.png"/>
+<figcaption>Workflow</figcaption>
+</figure>
+
+1. Prepare a configuration file which specifies the features you want to use and your parameters.
+2. Initialize distributed backend with `colossalai.launch`
+3. Inject the training features into your training components (e.g. model, optimizer) with `colossalai.initialize`.
+4. Run training and testing
+
+We will cover the whole workflow in the `basic tutorials` section.
+
+## Future Development
+
+The Colossal-AI system will be expanded to include more training skills, these new developments may include but are not limited to:
+
+1. optimization of distributed operations
+2. optimization of training on heterogeneous systems
+3. implementation of training utilities to reduce model size and speed up training while preserving model performance
+4. expansion of existing parallelism methods
+
+We welcome ideas and contribution from the community and you can post your idea for future development in our forum.
diff --git a/docs/source/en/concepts/distributed_training.md b/docs/source/en/concepts/distributed_training.md
new file mode 100644
index 000000000000..5038714f754b
--- /dev/null
+++ b/docs/source/en/concepts/distributed_training.md
@@ -0,0 +1,120 @@
+# Distributed Training
+
+Author: Shenggui Li, Siqi Mai
+
+## What is a distributed system?
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/sE5daHf2ohIy9wX.png"/>
+<figcaption>Image source: <a href="https://towardsdatascience.com/distributed-training-in-the-cloud-cloud-machine-learning-engine-9e264ddde27f">Towards Data Science</a></figcaption>
+</figure>
+
+A distributed system consists of multiple software components which run on multiple machines. For example, the traditional
+database runs on a single machine. As the amount of data gets incredibly large, a single machine can no longer deliver desirable
+performance to the business, especially in situations such as Black Friday where network traffic can be unexpectedly high.
+To handle such pressure, modern high-performance databases are designed to run on multiple machines, and they work together to provide
+high throughput and low latency to the user.
+
+One important evaluation metric for distributed system is scalability. For example, when we run an application on 4 machines,
+we naturally expect that the application can run 4 times faster. However, due to communication overhead and difference in
+hardware performance, it is difficult to achieve linear speedup. Thus, it is important to consider how to make the application
+faster when we implement it. Algorithms of good design and system optimization can help to deliver good performance. Sometimes,
+it is even possible to achieve linear and super-linear speedup.
+
+
+## Why we need distributed training for machine learning?
+
+Back in 2012, [AlexNet](https://arxiv.org/abs/1404.5997) won the ImageNet competition, and it was trained
+on two GTX 580 3GB GPUs.
+Today, most models that appear in the top AI conferences are trained on multiple GPUs. Distributed training is definitely
+a common practice when researchers and engineers develop AI models. There are several reasons behind this trend.
+
+1. Model size increases rapidly. [ResNet50](https://arxiv.org/abs/1512.03385) has 20 million parameters in 2015,
+[BERT-Large](https://arxiv.org/abs/1810.04805) has 345 million parameters in 2018,
+[GPT-2](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
+has 1.5 billion parameters in 2018, and [GPT-3](https://arxiv.org/abs/2005.14165) has 175 billion parameters in 2020.
+It is obvious that the model size grows exponentially with time. The current largest model has exceeded 1000
+billion parameters. Super large models generally deliver superior performance compared to their smaller counterparts.
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/sCyreJ9PF1EdZYf.jpg"/>
+<figcaption>Image source: <a href="https://huggingface.co/blog/large-language-models">HuggingFace</a></figcaption>
+</figure>
+
+
+2. Dataset size increases rapidly. For most machine learning developers, MNIST and CIFAR10 datasets are often the first few
+datasets on which they train their models. However, these datasets are very small compared to well-known ImageNet datasets.
+Google even has its own (unpublished) JFT-300M dataset which has around 300 million images, and this is close to 300 times
+larger than the ImageNet-1k dataset.
+
+
+3. Computing power gets stronger. With the advancement in the semiconductor industry, graphics cards become more and more
+powerful. Due to its larger number of cores, GPU is the most common compute platform for deep learning.
+From K10 GPU in 2012 to A100 GPU in 2020, the computing power has increased several hundred times. This allows us to perform
+compute-intensive tasks faster and deep learning is exactly such a task.
+
+Nowadays, the model can be too large to fit into a single GPU, and the dataset can be large enough to train for a hundred
+days on a single GPU. Only by training our models on multiple GPUs with different parallelization techniques, we are able
+to speed up the training process and obtain results in a reasonable amount of time.
+
+
+## Basic Concepts in Distributed Training
+
+Distributed training requires multiple machines/GPUs. During training, there will be communication among these devices.
+To understand distributed training better, there are several important terms to be made clear.
+
+- host: host is the main device in the communication network. It is often required as an argument when initializing the
+distributed environment.
+- port: port here mainly refers to master port on the host for communication.
+- rank: the unique ID given to a device in the network.
+- world size: the number of devices in the network.
+- process group: a process group is a communication network which includes a subset of the devices. There is always a default
+process group which contains all the devices. A subset of devices can form a process group so that they only communicate among
+the devices within the group.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/qnNBKh8AjzgM5sY.png"/>
+<figcaption>A distributed system example</figcaption>
+</figure>
+
+To illustrate these concepts, let's assume we have 2 machines (also called nodes), and each machine has 4 GPUs. When we
+initialize distributed environment over these two machines, we essentially launch 8 processes (4 processes on each machine)
+and each process is bound to a GPU.
+
+Before initializing the distributed environment, we need to specify the host (master address) and port (master port). In
+this example, we can let host be node 0 and port be a number such as 29500. All the 8 processes will then look for the
+address and port and connect to one another.
+The default process group will then be created. The default process group has a world size of 8 and details are as follows:
+
+| process ID | rank | Node index | GPU index |
+| ---------- | ---- | ---------- | --------- |
+| 0          | 0    | 0          | 0         |
+| 1          | 1    | 0          | 1         |
+| 2          | 2    | 0          | 2         |
+| 3          | 3    | 0          | 3         |
+| 4          | 4    | 1          | 0         |
+| 5          | 5    | 1          | 1         |
+| 6          | 6    | 1          | 2         |
+| 7          | 7    | 1          | 3         |
+
+
+We can also create a new process group. This new process group can contain any subset of the processes.
+For example, we can create one containing only even-number processes, and the details of this new group will be:
+
+| process ID | rank | Node index | GPU index |
+| ---------- | ---- | ---------- | --------- |
+| 0          | 0    | 0          | 0         |
+| 2          | 1    | 0          | 2         |
+| 4          | 2    | 1          | 0         |
+| 6          | 3    | 1          | 2         |
+
+**Please note that rank is relative to the process group and one process can have a different rank in different process
+groups. The max rank is always `world size of the process group - 1`.**
+
+In the process group, the processes can communicate in two ways:
+1. peer-to-peer: one process sends data to another process
+2. collective: a group of processes performs operations such as scatter, gather, all-reduce, broadcast together.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/zTmlxgc3oeAdn97.png"/>
+<figcaption>Collective communication, source: <a href="https://pytorch.org/tutorials/intermediate/dist_tuto.html">PyTorch distributed tutorial</a></figcaption>
+</figure>
diff --git a/docs/source/en/concepts/paradigms_of_parallelism.md b/docs/source/en/concepts/paradigms_of_parallelism.md
new file mode 100644
index 000000000000..ced7a544a7b0
--- /dev/null
+++ b/docs/source/en/concepts/paradigms_of_parallelism.md
@@ -0,0 +1,123 @@
+# Paradigms of Parallelism
+
+Author: Shenggui Li, Siqi Mai
+
+## Introduction
+
+With the development of deep learning, there is an increasing demand for parallel training. This is because models
+and datasets are getting larger and larger and training time becomes a nightmare if we stick to single-GPU training. In
+this section, we will provide a brief overview of existing methods to parallelize training. If you wish to add on to this
+post, you may create a discussion in the [GitHub forum](https://github.com/hpcaitech/ColossalAI/discussions).
+
+## Data Parallel
+
+Data parallel is the most common form of parallelism due to its simplicity. In data parallel training, the dataset is split
+into several shards, and each shard is allocated to a device. This is equivalent to parallelizing the training process along the
+batch dimension. Each device will hold a full replica of the model and train on the dataset shard allocated to it. After
+back-propagation, the gradients of the model will be all-reduced so that the model parameters on different devices can stay
+synchronized.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/WSAensMqjwHdOlR.png"/>
+<figcaption>Data parallel illustration</figcaption>
+</figure>
+
+## Model Parallel
+
+In data parallel training, one prominent feature is that each GPU holds a copy of the whole model weights. This brings
+a redundancy issue. Another paradigm of parallelism is model parallelism, where the model is split and distributed over an array
+of devices. There are generally two types of parallelism: tensor parallelism and pipeline parallelism. Tensor parallelism is
+to parallelize computation within an operation such as matrix-matrix multiplication. Pipeline parallelism is to parallelize
+computation between layers. Thus, from another point of view, tensor parallelism can be seen as intra-layer parallelism and
+pipeline parallelism can be seen as inter-layer parallelism.
+
+### Tensor Parallel
+
+Tensor parallel training is to split a tensor into `N` chunks along a specific dimension and each device only holds `1/N`
+of the whole tensor while not affecting the correctness of the computation graph. This requires additional communication
+to make sure that the result is correct.
+
+Taking a general matrix multiplication as an example, let's say we have C = AB. We can split B along the column dimension
+into `[B0 B1 B2 ... Bn]` and each device holds a column. We then multiply `A` with each column in `B` on each device, we
+will get `[AB0 AB1 AB2 ... ABn]`. At this moment, each device still holds partial results, e.g. device rank 0 holds `AB0`.
+To make sure the result is correct, we need to all-gather the partial result and concatenate the tensor along the column
+dimension. In this way, we are able to distribute the tensor over devices while making sure the computation flow remains
+correct.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/2ZwyPDvXANW4tMG.png"/>
+<figcaption>Tensor parallel illustration</figcaption>
+</figure>
+
+In Colossal-AI, we provide an array of tensor parallelism methods, namely 1D, 2D, 2.5D and 3D tensor parallelism. We will
+talk about them in detail in `advanced tutorials`.
+
+
+Related paper:
+- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668)
+- [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
+- [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343)
+- [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500)
+- [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450)
+
+### Pipeline Parallel
+
+Pipeline parallelism is generally easy to understand. If you recall your computer architecture course, this indeed exists
+in the CPU design.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/at3eDv7kKBusxbd.png"/>
+<figcaption>Pipeline parallel illustration</figcaption>
+</figure>
+
+The core idea of pipeline parallelism is that the model is split by layer into several chunks, each chunk is
+given to a device. During the forward pass, each device passes the intermediate activation to the next stage. During the backward pass,
+each device passes the gradient of the input tensor back to the previous pipeline stage. This allows devices to compute simultaneously,
+and increases the training throughput. One drawback of pipeline parallel training is that there will be some bubble time where
+some devices are idle rather than engaged in computation, leading to a waste of computational resources.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/sDNq51PS3Gxbw7F.png"/>
+<figcaption>Source: <a href="https://arxiv.org/abs/1811.06965">GPipe</a></figcaption>
+</figure>
+
+Related paper:
+- [PipeDream: Fast and Efficient Pipeline Parallel DNN Training](https://arxiv.org/abs/1806.03377)
+- [GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism](https://arxiv.org/abs/1811.06965)
+- [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
+- [Chimera: Efficiently Training Large-Scale Neural Networks with Bidirectional Pipelines](https://arxiv.org/abs/2107.06925)
+
+
+## Optimizer-Level Parallel
+
+Another paradigm works at the optimizer level, and the current most famous method of this paradigm is ZeRO which stands
+for [zero redundancy optimizer](https://arxiv.org/abs/1910.02054). ZeRO works at three levels to remove memory redundancy
+(fp16 training is required for ZeRO):
+
+- Level 1: The optimizer states are partitioned across the processes
+- Level 2: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process
+only stores the gradients corresponding to its partition of the optimizer states.
+- Level 3: The 16-bit model parameters are partitioned across the processes
+
+Related paper:
+- [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
+
+
+## Parallelism on Heterogeneous System
+
+The methods mentioned above generally require a large number of GPUs to train a large model. However, it is often neglected
+that CPU has a much larger memory compared to GPU. On a typical server, CPU can easily have several hundred GB RAM while each GPU
+typically only has 16 or 32 GB RAM. This prompts the community to think why CPU memory is not utilized for distributed training.
+
+Recent advances rely on CPU and even NVMe disk to train large models. The main idea is to offload tensors back to CPU memory
+or NVMe disk when they are not used. By using the heterogeneous system architecture, it is possible to accommodate a huge
+model on a single machine.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/qLHD5lk97hXQdbv.png"/>
+<figcaption>Heterogeneous system illustration</figcaption>
+</figure>
+
+Related paper:
+- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+- [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
diff --git a/docs/source/en/features/1D_tensor_parallel.md b/docs/source/en/features/1D_tensor_parallel.md
new file mode 100644
index 000000000000..530c2e7b64bc
--- /dev/null
+++ b/docs/source/en/features/1D_tensor_parallel.md
@@ -0,0 +1,111 @@
+# 1D Tensor Parallelism
+
+Author: Zhengda Bian, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Configure Parallelization](../basics/configure_parallelization.md)
+
+**Example Code**
+- [ColossalAI-Examples 1D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_1d.py)
+
+**Related Paper**
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://deepakn94.github.io/assets/papers/megatron-sc21.pdf)
+
+## Introduction
+
+Tensor parallelism partitions model weights across multiple devices in order to reduce memory load.
+An efficient 1D tensor parallelism implementation was introduced by [Megatron-LM](https://deepakn94.github.io/assets/papers/megatron-sc21.pdf).
+
+Let's take a linear layer as an example, which consists of a GEMM $Y = XA$. Given 2 processors, we split the columns of $A$ into $[A_1 ~ A_2]$, and calculate $Y_i = XA_i$ on each processor, which then forms $[Y_1 ~ Y_2] = [XA_1 ~ XA_2]$. This is called a column-parallel fashion.
+
+When a second linear layer $Z=YB$ follows the column-parallel one, we split $B$ into $\left[\begin{matrix} B_1 \\ B_2 \end{matrix} \right]$,
+which is called a row-parallel fashion.
+To calculate $Z = [Y_1 ~ Y_2] \left[\begin{matrix} B_1 \\ B_2 \end{matrix} \right]$, we first calculate $Y_iB_i$ on each processor, then use an all-reduce to aggregate the results as $Z=Y_1B_1+Y_2B_2$.
+
+We also need to note that in the backward pass, the column-parallel linear layer needs to aggregate the gradients of the input tensor $X$, because on each processor $i$ we only have $\dot{X_i}=\dot{Y_i}A_i^T$.
+Thus, we apply an all-reduce across the processors to get $\dot{X}=\dot{Y}A^T=\dot{Y_1}A_1^T+\dot{Y_2}A_2^T$.
+
+## Efficiency
+Given $P$ processors, we present the theoretical computation and memory cost, as well as the communication cost based on the ring algorithm in both the forward and backward pass of 1D tensor parallelism.
+
+| Computation | Memory (parameters) | Memory (activations) | Communication (bandwidth) | Communication (latency) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/P)$    | $O(1/P)$         | $O(1)$               | $O(2(P-1)/P)$             | $O(2(P-1))$             |
+
+## Usage
+
+To enable 1D tensor parallelism for our model, e.g. on 2 GPUs, we need to configure the parallelism setting as below.
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=2, mode='1d'),
+))
+```
+Then Colossal-AI will automatically apply 1D parallelism to all the layers from `colossalai.nn`.
+
+Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below.
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.transpose(0, 1).shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.transpose(0, 1).shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+
+Launch Colossal-AI on 2 GPUs and build the model.
+
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+We will see the shapes of partitioned parameters (e.g. weights) in the MLP model.
+```shell
+Weight of the first linear layer: torch.Size([256, 512])
+Weight of the second linear layer: torch.Size([512, 256])
+```
+The complete weight of the first linear layer is supposed to have the shape `[256, 1024]`. After the column-parallel partitioning, it becomes `[256, 512]`.
+Similarly, the second row-parallel layer partitions the weight `[1024, 256]` into `[512, 256]`.
+
+We can run the model with some random inputs.
+```python
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+torch.distributed.broadcast(x, src=0)  # synchronize input
+
+x = m(x)
+```
+Then we can see the shapes of activation results.
+```shell
+Output of the first linear layer: torch.Size([16, 512])
+Output of the second linear layer: torch.Size([16, 256])
+```
+The output of the first linear layer is split into 2 partitions (each has the shape `[16, 512]`), while the second layer has identical outputs across the GPUs.
diff --git a/docs/source/en/features/2D_tensor_parallel.md b/docs/source/en/features/2D_tensor_parallel.md
new file mode 100644
index 000000000000..582614c2f2f4
--- /dev/null
+++ b/docs/source/en/features/2D_tensor_parallel.md
@@ -0,0 +1,142 @@
+# 2D Tensor Parallelism
+
+Author: Zhengda Bian, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Configure Parallelization](../basics/configure_parallelization.md)
+- [1D Tensor Parallelism](./1D_tensor_parallel.md)
+
+**Example Code**
+- [ColossalAI-Examples - 2D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_2d.py)
+
+**Related Paper**
+- [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/pdf/2104.05343.pdf)
+
+## Introduction
+
+1D tensor parallelism does not partition activations, which can also consume a great amount of memory in terms of large-scale models.
+To evenly distribute the computation and memory load, [an efficient 2D tensor parallelism algorithm](https://arxiv.org/pdf/2104.05343.pdf) was introduced based on SUMMA (Scalable Universal Matrix Multiplication Algorithm).
+
+Let's still take a linear layer $Y = XA$ as an example.
+Given $P=q\times q$ processors (necessary condition), e.g. $q=2$, we split both the input $X$ and weight $A$ into
+
+$$
+\left[\begin{matrix} X_{10} & X_{11} \\ X_{00} & X_{01} \end{matrix} \right]
+\text{~and~}
+\left[\begin{matrix} A_{10} & A_{11} \\ A_{00} & A_{01} \end{matrix} \right].
+$$
+
+The calculation includes $q$ steps. When $t=1$, $X_{i0}$ is broadcasted in its row, and $A_{0j}$ is broadcasted in its column. So, we have
+
+$$
+\left[\begin{matrix} X_{10},A_{00} & X_{10},A_{01} \\ X_{00},A_{00} & X_{00},A_{01} \end{matrix} \right].
+$$
+
+Then we multiply $X_{i0}$ and $A_{0j}$ on each processor $(i, j)$ as
+
+$$
+\left[\begin{matrix} X_{10}A_{00} & X_{10}A_{01} \\ X_{00}A_{00} & X_{00}A_{01} \end{matrix} \right] (1).
+$$
+
+Similarly, when $t=2$, $X_{i1}$ is broadcasted in its row, $A_{1j}$ is broadcasted in its column, and we multiply them as
+
+$$
+\left[\begin{matrix} X_{11}A_{10} & X_{11}A_{11} \\ X_{01}A_{10} & X_{01}A_{11} \end{matrix} \right] (2).
+$$
+
+By adding $(1)$ and $(2)$ up, we have
+
+$$
+Y = XA = \left[\begin{matrix} X_{10}A_{00}+X_{11}A_{10} & X_{10}A_{01}+X_{11}A_{11} \\ X_{00}A_{00}+X_{01}A_{10} & X_{00}A_{01}+X_{01}A_{11} \end{matrix} \right].
+$$
+
+## Efficiency
+Given $P=q\times q$ processors, we present the theoretical computation and memory cost, as well as the communication cost based on the ring algorithm in both the forward and backward pass of 2D tensor parallelism.
+
+| Computation | Memory (parameters) | Memory (activations) | Communication (bandwidth) | Communication (latency) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/q^2)$  | $O(1/q^2)$       | $O(1/q^2)$           | $O(6(q-1)/q)$             | $O(6(q-1))$             |
+
+## Usage
+
+To enable 2D tensor parallelism for our model, e.g. on 4 GPUs, we need to configure the parallelism setting as below.
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=4, mode='2d'),
+))
+```
+Then Colossal-AI will automatically apply 2D parallelism to all the layers from `colossalai.nn`.
+
+Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below.
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+Launch Colossal-AI on 4 GPUs and build the model
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+We will see the shapes of partitioned parameters (e.g. weights) in the MLP model.
+```shell
+Weight of the first linear layer: torch.Size([128, 512])
+Weight of the second linear layer: torch.Size([512, 128])
+```
+The complete weight of the first linear layer is supposed to have the shape `[256, 1024]`. After the partitioning of 2D parallelism, it becomes `[128, 512]` on each GPU.
+Similarly, the second layer partitions the weight `[1024, 256]` into `[512, 128]`.
+
+We can run the model with some random inputs.
+```python
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+# partition input
+torch.distributed.broadcast(x, src=0)
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)]
+x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)]
+print_rank_0(f'Input: {x.shape}')
+
+x = m(x)
+```
+Then we can see the shapes of activation results.
+```shell
+Input: torch.Size([8, 128])
+Output of the first linear layer: torch.Size([8, 512])
+Output of the second linear layer: torch.Size([8, 128])
+```
+The activation tensors in 2D parallelism are all split in both row and column.
+E.g. the output of the first linear layer has the shape `[8, 512]`, while the second layer has the output of `[8, 128]`.
diff --git a/docs/source/en/features/2p5D_tensor_parallel.md b/docs/source/en/features/2p5D_tensor_parallel.md
new file mode 100644
index 000000000000..34a261ea0aa0
--- /dev/null
+++ b/docs/source/en/features/2p5D_tensor_parallel.md
@@ -0,0 +1,142 @@
+# 2.5D Tensor Parallelism
+
+Author: Zhengda Bian, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Configure Parallelization](../basics/configure_parallelization.md)
+- [1D Tensor Parallelism](./1D_tensor_parallel.md)
+- [2D Tensor Parallelism](./2D_tensor_parallel.md)
+
+**Example Code**
+- [ColossalAI-Examples - 2.5D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_2p5d.py)
+
+**Related Paper**
+- [2.5-dimensional distributed model training](https://arxiv.org/pdf/2105.14500.pdf)
+
+## Introduction
+
+Compared with 1D tensor parallelism, 2D parallelism reduces the memory cost, but may introduce more communication.
+Therefore, a [2.5D tensor parallelism algorithm](https://arxiv.org/pdf/2105.14500.pdf) was proposed based on 2.5D SUMMA to reduce communication by using more devices.
+
+Let's still take a linear layer $Y = XA$ as an example.
+Given $P=q \times q \times d$ processors (necessary condition), e.g. $q=d=2$, we split the input $X$ into $d\times q$ rows and $q$ columns as
+
+$$
+\left[\begin{matrix} X_{30} & X_{31} \\ X_{20} & X_{21} \\ X_{10} & X_{11} \\ X_{00} & X_{01}\end{matrix} \right],
+$$
+which can be reshaped into $d$ layers as
+
+$$
+\left[\begin{matrix} X_{10} & X_{11} \\ X_{00} & X_{01} \end{matrix} \right] \text{~and~}\left[\begin{matrix} X_{30} & X_{31} \\ X_{20} & X_{21} \end{matrix} \right].
+$$
+
+Also, the weight $A$ is split into
+
+$$
+\left[\begin{matrix} A_{10} & A_{11} \\ A_{00} & A_{01} \end{matrix} \right].
+$$
+
+For each layer of $X$, we use the SUMMA algorithm to multiply $X$ and $A$.
+Then, we have the output
+
+$$
+\left[\begin{matrix} Y_{10}=X_{10}A_{00}+X_{11}A_{10} & Y_{11}=X_{10}A_{01}+X_{11}A_{11} \\ Y_{00}=X_{00}A_{00}+X_{01}A_{10} & Y_{01}=X_{00}A_{01}+X_{01}A_{11} \end{matrix} \right]
+\text{~and~}
+$$
+$$
+\left[\begin{matrix} Y_{30}=X_{30}A_{00}+X_{31}A_{10} & Y_{31}=X_{30}A_{01}+X_{31}A_{11} \\ Y_{20}=X_{20}A_{00}+X_{21}A_{10} & Y_{21}=X_{20}A_{01}+X_{21}A_{11} \end{matrix} \right].
+$$
+
+## Efficiency
+Given $P=q \times q \times d$ processors, we present the theoretical computation and memory cost, as well as the communication cost based on the ring algorithm in both the forward and backward pass of 2.5D tensor parallelism.
+
+| Computation | Memory (parameters) | Memory (activations) | Communication (bandwidth) | Communication (latency) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/dq^2)$ | $O(1/q^2)$       | $O(1/dq^2)$          | $\small O(3(q-1)(d+1)/dq)$       | $O(6(q-1))$             |
+
+## Usage
+
+To enable 2.5D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below.
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=8, mode='2.5d', depth=2),
+))
+
+```
+Then Colossal-AI will automatically apply 2.5D parallelism to all the layers from `colossalai.nn`.
+
+Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below.
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+Launch Colossal-AI on 8 GPUs and build the model
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+We will see the shapes of partitioned parameters (e.g. weights) in the MLP model.
+```shell
+Weight of the first linear layer: torch.Size([128, 512])
+Weight of the second linear layer: torch.Size([512, 128])
+```
+The complete weight of the first linear layer is supposed to have the shape `[256, 1024]`. After the partitioning of 2.5D parallelism, it becomes `[128, 512]` on each GPU.
+Similarly, the second layer partitions the weight `[1024, 256]` into `[512, 128]`.
+
+We can run the model with some random inputs.
+```python
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+# partition input
+torch.distributed.broadcast(x, src=0)
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)]
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)]
+x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)]
+print_rank_0(f'Input: {x.shape}')
+
+x = m(x)
+```
+Then we can see the shapes of activation results.
+```shell
+Input: torch.Size([4, 128])
+Output of the first linear layer: torch.Size([4, 512])
+Output of the second linear layer: torch.Size([4, 128])
+```
+The activation tensors in 2.5D parallelism are all split by $d \times q$ in the row and $q$ in the column.
+E.g. the output of the first linear layer has the shape `[4, 512]`, while the second layer has the output of `[4, 128]`.
+Note, 2.5D parallelism uses the same partition method as 2D parallelism for weights, where the difference is the partition of input.
diff --git a/docs/source/en/features/3D_tensor_parallel.md b/docs/source/en/features/3D_tensor_parallel.md
new file mode 100644
index 000000000000..1207376335ce
--- /dev/null
+++ b/docs/source/en/features/3D_tensor_parallel.md
@@ -0,0 +1,151 @@
+# 3D Tensor Parallelism
+
+Author: Zhengda Bian, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Configure Parallelization](../basics/configure_parallelization.md)
+- [1D Tensor Parallelism](./1D_tensor_parallel.md)
+- [2D Tensor Parallelism](./2D_tensor_parallel.md)
+
+**Example Code**
+- [ColossalAI-Examples - 3D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_3d.py)
+
+**Related Paper**
+- [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/pdf/2105.14450.pdf)
+
+## Introduction
+
+The [3D tensor parallelism](https://arxiv.org/pdf/2105.14450.pdf) is an approach to parallelize the computation of neural models, hoping to obtain the optimal communication cost.
+
+Let's still take a linear layer $Y = XA$ as an example.
+Given $P=q \times q \times q$ processors (necessary condition), e.g. $q=2$, we split the input $X$ and weight $A$ into
+
+$$
+\left[\begin{matrix}
+            X_{000} & X_{001} \\
+            X_{010} & X_{011} \\
+            X_{100} & X_{101} \\
+            X_{110} & X_{111} \end{matrix}
+\right]
+\text{~and~}
+\left[\begin{matrix}
+            A_{000} & A_{001} & A_{010} & A_{011} \\
+            A_{100} & A_{101} & A_{110} & A_{111} \end{matrix}
+\right]
+\text{~respectively,}$$
+where each $X_{ijl}$ and $A_{lji}$ are stored at processor $(i,j,l)$, as shown in the figure below.
+
+<center>
+<img src="https://s2.loli.net/2022/02/17/JevO6SED5z4PFdp.png" width = "200" height = "250" />
+<img src="https://s2.loli.net/2022/02/17/qvtwjdfNXMAb4nF.png" width = "200" height = "250" />
+<img src="https://s2.loli.net/2022/02/17/WFzm2N4IwKf1jXZ.png" width = "200" height = "250" />
+<img src="https://s2.loli.net/2022/02/17/r2dZQ4hKxwTuIv6.png" width = "200" height = "250" />
+</center>
+
+Then we all-gather $X_{ijl}$ across $(i, 0...q,l)$, as well as $A_{lji}$ across $(0...q, j, l)$.
+So, we have $X_{il}$ and $A_{lj}$ on each processor $(i,j,l)$ to get $X_{il}A_{lj}$.
+Finally, we reduce-scatter the results across $(i, j, 0...q)$ to get $Y_{ijl}$, which forms
+$$
+Y=
+\left[\begin{matrix}
+            Y_{000} & Y_{001} \\
+            Y_{010} & Y_{011} \\
+            Y_{100} & Y_{101} \\
+            Y_{110} & Y_{111} \end{matrix}
+\right].
+$$
+
+We also need to note that in the backward pass, we need to all-gather the gradient $\dot{Y_{ijl}}$, and then reduce-scatter the gradient $\dot{X_{il}}=\dot{Y_{ij}}A_{lj}^T$ and $\dot{A_{lj}}=X_{il}^T\dot{Y_{ij}}$.
+
+## Efficiency
+Given $P=q \times q \times q$ processors, we present the theoretical computation and memory cost, as well as the communication cost based on the ring algorithm in both the forward and backward pass of 3D tensor parallelism.
+
+| Computation | Memory (parameters) | Memory (activations) | Communication (bandwidth) | Communication (latency) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/q^3)$  | $O(1/q^3)$       | $O(1/q^3)$           | $O(6(q-1)/q^3)$           | $O(6(q-1))$             |
+
+## Usage
+
+To enable 3D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below.
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=8, mode='3d'),
+))
+```
+Then Colossal-AI will automatically apply 3D parallelism to all the layers from `colossalai.nn`.
+
+Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below.
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+Launch Colossal-AI on 8 GPUs and build the model
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+We will see the shapes of partitioned parameters (e.g. weights) in the MLP model.
+```shell
+Weight of the first linear layer: torch.Size([128, 256])
+Weight of the second linear layer: torch.Size([512, 64])
+```
+The complete weight of the first linear layer is supposed to have the shape `[256, 1024]`. After the partitioning of 3D parallelism, it becomes `[128, 256]` on each GPU.
+Similarly, the second layer partitions the weight `[1024, 256]` into `[512, 64]`.
+
+We can run the model with some random inputs.
+```python
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+# partition input
+torch.distributed.broadcast(x, src=0)
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)]
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)]
+x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)]
+print_rank_0(f'Input: {x.shape}')
+
+x = m(x)
+```
+Then we can see the shapes of activation results.
+```shell
+Input: torch.Size([4, 128])
+Output of the first linear layer: torch.Size([4, 512])
+Output of the second linear layer: torch.Size([4, 128])
+```
+The activation tensors in 3D parallelism are all split by $q^2$ in the row and $q$ in the column.
+E.g. the output of the first linear layer has the shape `[4, 512]`, while the second layer has the output of `[4, 128]`.
+Note, although the activation results of 3D parallelism have the same shape as those of 2.5D parallelism here, the content of each partition is different.
diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md
new file mode 100644
index 000000000000..d8781ee691bc
--- /dev/null
+++ b/docs/source/en/features/gradient_accumulation.md
@@ -0,0 +1,45 @@
+# Gradient Accumulation
+
+Author: Shenggui Li, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Use Engine and Trainer in Training](../basics/engine_trainer.md)
+
+**Example Code**
+- [ColossalAI-Examples Gradient Accumulation](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_accumulation)
+
+## Introduction
+
+Gradient accumulation is a common way to enlarge your batch size for training.
+When training large-scale models, memory can easily become the bottleneck and the batch size can be very small (e.g. 2),
+leading to unsatisfactory convergence. Gradient accumulation works by adding up the gradients calculated in multiple iterations,
+and only updating the parameters in the preset iteration.
+
+## Usage
+
+It is simple to use gradient accumulation in Colossal-AI. Just add the following configuration into your config file.
+The integer represents the number of iterations to accumulate gradients.
+
+```python
+gradient_accumulation = <int>
+```
+
+## Hands-on Practice
+
+We provide a [runnable example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_accumulation)
+to demonstrate gradient accumulation. In this example, we set the gradient accumulation size to be 4. You can run the script using this command:
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500  run_resnet_cifar10_with_engine.py
+```
+
+You will see output similar to the text below. This shows gradient is indeed accumulated as the parameter is not updated
+in the first 3 steps, but only updated in the last step.
+
+```text
+iteration 0, first 10 elements of param: tensor([-0.0208,  0.0189,  0.0234,  0.0047,  0.0116, -0.0283,  0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=<SliceBackward0>)
+iteration 1, first 10 elements of param: tensor([-0.0208,  0.0189,  0.0234,  0.0047,  0.0116, -0.0283,  0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=<SliceBackward0>)
+iteration 2, first 10 elements of param: tensor([-0.0208,  0.0189,  0.0234,  0.0047,  0.0116, -0.0283,  0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=<SliceBackward0>)
+iteration 3, first 10 elements of param: tensor([-0.0141,  0.0464,  0.0507,  0.0321,  0.0356, -0.0150,  0.0172, -0.0118, 0.0222,  0.0473], device='cuda:0', grad_fn=<SliceBackward0>)
+```
diff --git a/docs/source/en/features/gradient_clipping.md b/docs/source/en/features/gradient_clipping.md
new file mode 100644
index 000000000000..f606dde6c393
--- /dev/null
+++ b/docs/source/en/features/gradient_clipping.md
@@ -0,0 +1,62 @@
+# Gradient Clipping
+
+Author: Boxiang Wang, Haichen Huang, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Use Engine and Trainer in Training](../basics/engine_trainer.md)
+
+**Example Code**
+- [ColossalAI-Examples Gradient Clipping](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_clipping)
+
+**Related Paper**
+- [On the difficulty of training Recurrent Neural Networks](https://arxiv.org/abs/1211.5063)
+
+## Introduction
+
+In order to speed up the training process and seek a global optimum for better performance, more and more learning
+rate schedulers have been proposed. People control the learning rate to adjust the descent pace during training,
+which makes it desirable for the gradient vector to have a uniform length in every step. In that case, the descent pace can be
+controlled as expected. As a result, gradient clipping, a technique which normalizes the gradient vector
+to constrain it to a uniform length, becomes indispensable for those who want better
+performance from their models.
+
+You do not have to worry about implementing gradient clipping when using Colossal-AI, we support gradient
+clipping in a powerful and convenient way. All you need is just an additional command in your configuration
+file.
+
+## Why you should use gradient clipping provided by Colossal-AI
+
+The reason we do not recommend that users write gradient clipping by themselves is that naive gradient clipping
+may fail when applying tensor parallelism, pipeline parallelism or MoE.
+
+According to the illustration below, each GPU only owns a portion of parameters of the weight in a linear layer.
+To get correct norm of gradient vector of the weight of the linear layer, the norm of every gradient vector in each GPU
+should be summed together.
+To complicate matters further, the distribution of the bias is different from the distribution of the weight.
+The communication group is different in the sum operation.
+
+(PS: This situation is an old version of 2D parallelism, the implementation in the code is not the same.
+But it is a good example about the difficulty to unify all communication in gradient clipping.)
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/KXiJPHt3Dum82cA.png"/>
+<figcaption>Layout of parameters</figcaption>
+</figure>
+
+Do not worry about it, since Colossal-AI has handled it for you.
+
+### Usage
+To use gradient clipping, you can just simply add gradient clipping norm in your configuration file.
+```python
+clip_grad_norm = 1.0
+```
+
+### Hands-On Practice
+
+We provide a [runnable example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_clipping)
+to demonstrate gradient clipping. In this example, we set the gradient clipping vector norm to be 1.0. You can run the script using this command:
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500  train_with_engine.py
+```
diff --git a/docs/source/en/features/gradient_handler.md b/docs/source/en/features/gradient_handler.md
new file mode 100644
index 000000000000..757016fcb53a
--- /dev/null
+++ b/docs/source/en/features/gradient_handler.md
@@ -0,0 +1,63 @@
+# Gradient Handler
+
+Author: Shenggui Li, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Use Engine and Trainer in Training](../basics/engine_trainer.md)
+
+**Example Code**
+- [ColossalAI-Examples Gradient Handler](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_handler)
+
+## Introduction
+
+In distributed training, gradient synchronization is required at the end of each iteration. This is important because we
+need to make sure the parameters are updated with the same gradients in different machines so that the resulting parameters
+are the same. This is often seen in data parallel as the model is replicated across data parallel ranks.
+
+In Colossal-AI, we provide an interface for users to customize how they want to handle the synchronization. This brings
+flexibility in cases such as implementing a new parallelism method.
+
+When gradient handlers are used, PyTorch `DistributedDataParallel` will not be used as it will synchronize automatically.
+
+## Customize Your Gradient Handlers
+
+To implement a customized gradient handler, you need to follow these steps.
+1. inherit `BaseGradientHandler` in Colossal-AI.
+2. register the gradient handler into the `GRADIENT_HANDLER`.
+3. implement `handle_gradient` method.
+
+```python
+from colossalai.registry import GRADIENT_HANDLER
+from colossalai.engine.gradient_handler import BaseGradientHandler
+
+
+@GRADIENT_HANDLER.register_module
+class MyGradientHandler(BaseGradientHandler):
+
+    def handle_gradient(self):
+        do_something()
+
+
+```
+
+
+## Usage
+
+To use a gradient handler, you need to specify your gradient handler in the config file. The gradient handler
+will be automatically built and attached to the engine.
+
+```python
+gradient_handler = [dict(type='MyGradientHandler')]
+```
+
+
+### Hands-On Practice
+
+We provide a [runnable example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_handler)
+to demonstrate the use of gradient handler. In this example, we used `DataParallelGradientHandler` instead of PyTorch
+`DistributedDataParallel` for data parallel training.
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500  train_with_engine.py
+```
diff --git a/docs/source/en/features/mixed_precision_training.md b/docs/source/en/features/mixed_precision_training.md
new file mode 100644
index 000000000000..71cb6971d346
--- /dev/null
+++ b/docs/source/en/features/mixed_precision_training.md
@@ -0,0 +1,367 @@
+# Auto Mixed Precision Training
+
+Author: Chuanrui Wang, Shenggui Li, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Use Engine and Trainer in Training](../basics/engine_trainer.md)
+
+**Example Code**
+- [ColossalAI-Examples AMP](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/amp)
+
+**Related Paper**
+- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
+
+
+## Introduction
+
+AMP stands for automatic mixed precision training.
+In Colossal-AI, we have incorporated different implementations of mixed precision training:
+
+1. torch.cuda.amp
+2. apex.amp
+3. naive amp
+
+
+| Colossal-AI | support tensor parallel | support pipeline parallel | fp16 extent |
+| ----------- | ----------------------- | ------------------------- | ----------- |
+| AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
+| AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 |
+| AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 |
+
+The first two rely on the original implementation of PyTorch (version 1.6 and above) and NVIDIA Apex.
+The last method is similar to Apex O2 level.
+Among these methods, apex AMP is not compatible with tensor parallelism.
+This is because tensors are split across devices in tensor parallelism, so the processes must communicate with each other to check whether inf or nan occurs anywhere in the model weights.
+We modified the torch amp implementation so that it is compatible with tensor parallelism now.
+
+> ❌️ fp16 and zero configuration are not compatible
+>
+> ⚠️ Pipeline only supports naive AMP currently
+
+We recommend you to use torch AMP as it generally gives better accuracy than naive AMP if no pipeline is used.
+
+## Table of Contents
+
+In this tutorial we will cover:
+
+1. AMP introduction
+2. AMP in Colossal-AI
+3. Hands-on Practice
+
+## AMP Introduction
+
+Automatic Mixed Precision training is a mixture of FP16 and FP32 training.
+
+Half-precision floating-point format (FP16) has lower arithmetic complexity and higher compute efficiency.
+Besides, fp16 requires half of the storage needed by fp32 and saves memory & network bandwidth, which makes more memory
+available for large batch size and model size.
+
+However, there are other operations, like reductions, which require the dynamic range of fp32 to avoid numeric overflow/underflow. That's the reason why we introduce automatic mixed precision, attempting to match each operation to its appropriate data type, which can reduce the memory footprint and augment training efficiency.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/URzLJ3MPeDQbtck.png"/>
+<figcaption>Illustration of an ordinary AMP (figure from <a href="https://arxiv.org/abs/2108.05818">PatrickStar paper</a>)</figcaption>
+</figure>
+
+## AMP in Colossal-AI
+
+We support three AMP training methods and allow users to train with AMP without changing their training code. You can simply add the `fp16`
+configuration to your configuration file to use AMP.
+
+
+```python
+from colossalai.amp import AMP_TYPE
+
+# use Torch AMP
+fp16=dict(
+    mode = AMP_TYPE.TORCH
+)
+
+# use naive AMP
+fp16=dict(
+    mode = AMP_TYPE.NAIVE
+)
+
+# use NVIDIA Apex AMP
+fp16=dict(
+    mode = AMP_TYPE.APEX
+)
+
+```
+
+> These are the minimum configurations; the full configurations are described in the sections below.
+
+### AMP Modularity
+
+AMP module is designed to be completely modular and can be used independently.
+If you wish to only use AMP in your code base without `colossalai.initialize`,
+you can use `colossalai.amp.convert_to_amp`.
+
+```python
+from colossalai.amp import AMP_TYPE
+
+# example of using torch amp
+model, optimizer, criterion = colossalai.amp.convert_to_amp(model,
+                                                            optimizer,
+                                                            criterion,
+                                                            AMP_TYPE.TORCH)
+```
+
+### Torch AMP Configuration
+
+```python
+from colossalai.amp import AMP_TYPE
+
+fp16=dict(
+    mode=AMP_TYPE.TORCH,
+
+    # below are default values for grad scaler
+    init_scale=2.**16,
+    growth_factor=2.0,
+    backoff_factor=0.5,
+    growth_interval=2000,
+    enabled=True
+)
+```
+
+With optional arguments:
+- init_scale(float, optional, default=2.**16): Initial scale factor
+- growth_factor(float, optional, default=2.0): Factor by which the scale is multiplied during `update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+- backoff_factor(float, optional, default=0.5): Factor by which the scale is multiplied during `update` if inf/NaN gradients occur in an iteration.
+- growth_interval(int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by ``growth_factor``.
+- enabled(bool, optional, default=True): If ``False``, disables gradient scaling. `step` simply invokes the underlying ``optimizer.step()``, and other methods become no-ops.
+
+### Apex AMP Configuration
+
+For this mode, we rely on the Apex implementation for mixed precision training.
+We support this plugin because it allows for finer control on the granularity of mixed precision.
+For example, O2 level (optimization level 2) will keep batch normalization in fp32.
+
+If you look for more details, please refer to [Apex Documentation](https://nvidia.github.io/apex/).
+
+```python
+from colossalai.amp import AMP_TYPE
+
+fp16 = dict(
+    mode=AMP_TYPE.APEX,
+
+    # below are the default values
+    enabled=True,
+    opt_level='O1',
+    cast_model_type=None,
+    patch_torch_functions=None,
+    keep_batchnorm_fp32=None,
+    master_weights=None,
+    loss_scale=None,
+    cast_model_outputs=None,
+    num_losses=1,
+    verbosity=1,
+    min_loss_scale=None,
+    max_loss_scale=16777216.0
+)
+```
+
+Parameters:
+- enabled(bool, optional, default=True): If False, renders all AMP calls no-ops, so your script should run as if Amp were not present.
+
+- opt_level(str, optional, default="O1" ): Pure or mixed precision optimization level.
+Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail in the Apex AMP documentation linked above.
+
+- num_losses(int, optional, default=1): Option to tell AMP in advance how many losses/backward passes you plan to use.
+When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per
+loss/backward pass, which can improve stability. If num_losses is left to 1, Amp will still support multiple
+losses/backward passes, but use a single global loss scale for all of them.
+
+- verbosity(int, default=1): Set to 0 to suppress Amp-related output.
+
+- min_loss_scale(float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling.
+The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored.
+
+- max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss
+scaling. If dynamic loss scaling is not used, max_loss_scale is ignored.
+
+Currently, the under-the-hood properties that govern pure or mixed precision training are the following:
+cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale.
+They are optional properties that override the defaults once opt_level is determined.
+
+- cast_model_type: Casts your model’s parameters and buffers to the desired type.
+- patch_torch_functions: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32.
+- keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm (which improves performance), it’s often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16.
+- master_weights: Maintain FP32 master weights to accompany any FP16 model weights. FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients.
+- loss_scale: If loss_scale is a float value, use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic", adaptively adjust the loss scale over time. Dynamic loss scale adjustments are performed by Amp automatically.
+
+
+### Naive AMP Configuration
+
+In Naive AMP mode, we achieved mixed precision training while maintaining compatibility with complex tensor and pipeline parallelism.
+This AMP mode will cast all operations into fp16.
+The following code block shows the `config.py` file for this mode.
+
+```python
+from colossalai.amp import AMP_TYPE
+
+fp16 = dict(
+    mode=AMP_TYPE.NAIVE,
+
+    # below are the default values
+    log_num_zeros_in_grad=False,
+    initial_scale=2 ** 32,
+    min_scale=1,
+    growth_factor=2,
+    backoff_factor=0.5,
+    growth_interval=1000,
+    hysteresis=2
+)
+```
+
+The default parameters of Naive AMP:
+- log_num_zeros_in_grad(bool): return number of zeros in the gradients.
+- initial_scale(int): initial scale of gradient scaler
+- growth_factor(int): the growth rate of loss scale
+- backoff_factor(float): the decrease rate of loss scale
+- hysteresis(int): delay shift in dynamic loss scaling
+- max_scale(int): maximum loss scale allowed
+- verbose(bool): if set to `True`, will print debug info
+
+When using `colossalai.initialize`, you are required to first instantiate a model, an optimizer and a criterion.
+The output model is converted to AMP model of smaller memory consumption.
+If your input model is already too large to fit in a GPU, please instantiate your model weights in `dtype=torch.float16`.
+Otherwise, try smaller models or check out more parallel training techniques!
+
+
+## Hands-on Practice
+
+We provide a [runnable example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/amp) which demonstrates
+the use of AMP with Colossal-AI. In this practice, we will use Torch AMP as an example, but do note that config files are provided for all AMP modes.
+
+### Step 1. Create a config file
+
+Create a `config.py` and add the `fp16` configuration.
+
+```python
+# in config.py
+from colossalai.amp import AMP_TYPE
+
+BATCH_SIZE = 128
+DROP_RATE = 0.1
+NUM_EPOCHS = 300
+
+fp16 = dict(
+    mode=AMP_TYPE.TORCH,
+)
+
+clip_grad_norm = 1.0
+```
+
+### Step 2. Import libraries in train_with_engine.py
+
+Create a `train_with_engine.py` and import the necessary dependencies. Remember to install `scipy` and `timm` by running
+`pip install timm scipy`.
+
+```python
+import os
+import colossalai
+import torch
+from pathlib import Path
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.utils import get_dataloader
+from colossalai.trainer import Trainer, hooks
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+from timm.models import vit_base_patch16_224
+from torchvision import datasets, transforms
+
+```
+
+### Step 3. Initialize Distributed Environment
+
+We then need to initialize the distributed environment. For demo purposes, we use `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md)
+for other initialization methods.
+
+```python
+# initialize distributed setting
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+
+# launch from torch
+colossalai.launch_from_torch(config=args.config)
+
+```
+
+### Step 4. Create training components
+
+Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is
+obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])`
+to a path on your machine. Data will be automatically downloaded to the root path.
+
+```python
+# build model
+    model = vit_base_patch16_224(drop_rate=0.1)
+
+    # build dataloader
+    train_dataset = datasets.Caltech101(
+        root=Path(os.environ['DATA']),
+        download=True,
+        transform=transforms.Compose([
+            transforms.Resize(256),
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            Gray2RGB(),
+            transforms.Normalize([0.5, 0.5, 0.5],
+                                 [0.5, 0.5, 0.5])
+        ]))
+
+    train_dataloader = get_dataloader(dataset=train_dataset,
+                                      shuffle=True,
+                                      batch_size=gpc.config.BATCH_SIZE,
+                                      num_workers=1,
+                                      pin_memory=True,
+                                      )
+
+    # build optimizer
+    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=0.1)
+
+    # build loss
+    criterion = torch.nn.CrossEntropyLoss()
+
+    # lr_scheduler
+    lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)
+```
+
+### Step 5. Inject AMP Feature
+
+Call `colossalai.initialize` to convert the training components to be running with FP16.
+
+```python
+engine, train_dataloader, _, _ = colossalai.initialize(
+        model, optimizer, criterion, train_dataloader,
+    )
+```
+
+### Step 6. Train with Engine
+
+Use the engine in a normal training loop.
+
+```python
+engine.train()
+for epoch in range(gpc.config.NUM_EPOCHS):
+    for img, label in enumerate(train_dataloader):
+        img = img.cuda()
+        label = label.cuda()
+        engine.zero_grad()
+        output = engine(img)
+        loss = engine.criterion(output, label)
+        engine.backward(loss)
+        engine.step()
+        lr_scheduler.step()
+```
+
+### Step 7. Invoke Training Scripts
+
+Use the following command to start the training scripts. You can change `--nproc_per_node` to use a different number of GPUs.
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py --config config/config_AMP_torch.py
+```
diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md
new file mode 100644
index 000000000000..fb491b063c03
--- /dev/null
+++ b/docs/source/en/features/nvme_offload.md
@@ -0,0 +1,42 @@
+# NVMe offload
+
+Author: Hongxin Liu
+
+**Prerequisite:**
+- [Zero Redundancy Optimizer with chunk-based memory management](../features/zero_with_chunk.md)
+
+## Introduction
+
+If a model has `N` parameters, when using Adam, it has `8N` optimizer states. For billion-scale models, optimizer states take at least 32 GB memory. GPU memory limits the model scale we can train, which is called GPU memory wall. If we offload optimizer states to the disk, we can break through GPU memory wall.
+
+We implement a user-friendly and efficient asynchronous Tensor I/O library: [TensorNVMe](https://github.com/hpcaitech/TensorNVMe). With this library, we can simply implement NVMe offload.
+
+> This library is compatible with all kinds of disk (HDD, SATA SSD, and NVMe SSD). As I/O bandwidth of HDD or SATA SSD is low, it's recommended to use this lib only on NVMe disk.
+
+When optimizing a parameter, we can divide the optimization process into three stages: read, compute and offload. We perform the optimization process in a pipelined fashion, which can overlap computation and I/O.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/16/CvRnowrsNyB4hza.jpg"/>
+<figcaption>Optimization process</figcaption>
+</figure>
+
+## Usage
+
+First, please make sure you installed [TensorNVMe](https://github.com/hpcaitech/TensorNVMe):
+
+```shell
+pip install packaging
+pip install tensornvme
+```
+
+We implement NVMe offload of optimizer states for Adam ([CPUAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.cpu_adam.html) and [HybridAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.hybrid_adam.html)).
+
+```python
+from colossalai.nn.optimizer import CPUAdam, HybridAdam
+
+optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, nvme_offload_dir='./')
+```
+
+`nvme_offload_fraction` is the fraction of optimizer states to be offloaded to NVMe. `nvme_offload_dir` is the directory to save NVMe offload files. If `nvme_offload_dir` is `None`, a random temporary directory will be used.
+
+It's compatible with all parallel methods in ColossalAI.
diff --git a/docs/source/en/features/pipeline_parallel.md b/docs/source/en/features/pipeline_parallel.md
new file mode 100644
index 000000000000..ac49863b3c71
--- /dev/null
+++ b/docs/source/en/features/pipeline_parallel.md
@@ -0,0 +1,159 @@
+# Pipeline Parallel
+
+Author: Guangyang Lu, Hongxin Liu, Yongbin Li
+
+**Prerequisite**
+- [Define Your Configuration](../basics/define_your_config.md)
+- [Use Engine and Trainer in Training](../basics/engine_trainer.md)
+- [Configure Parallelization](../basics/configure_parallelization.md)
+
+**Example Code**
+- [ColossalAI-Examples ResNet with pipeline](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/pipeline_parallel)
+
+**Related Paper**
+- [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training](https://arxiv.org/abs/2110.14883)
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473)
+- [GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism](https://arxiv.org/abs/1811.06965)
+
+## Quick introduction
+
+In this tutorial, you will learn how to use pipeline parallelism. In Colossal-AI, we use the 1F1B pipeline introduced by Nvidia. ViT and ImageNet are too large for this tutorial, so we use ResNet and CIFAR as an example instead.
+
+## Table Of Content
+
+In this tutorial we will cover:
+
+1. Introduction of 1F1B pipeline.
+2. Usage of non-interleaved and interleaved schedule.
+3. Training ResNet with pipeline.
+
+## Introduction of 1F1B pipeline
+
+First, we introduce GPipe to give you some background.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/OAucPF6mWYynUtV.png"/>
+<figcaption>Figure1: GPipe. This figure is from <a href="https://arxiv.org/pdf/2104.04473.pdf">Megatron-LM</a> paper.</figcaption>
+</figure>
+
+
+As you can see, in GPipe the backward passes are executed only after the forward passes of all microbatches in a batch have finished.
+
+In general, 1F1B (one forward pass followed by one backward pass) is more efficient than GPipe (in memory, or in both memory and time). There are two schedules of the 1F1B pipeline: the non-interleaved and the interleaved. Both are shown in the figure below.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/iJrVkp2HLcahjsT.png"/>
+<figcaption>Figure2: This figure is from <a href="https://arxiv.org/pdf/2104.04473.pdf">Megatron-LM</a> paper. The top part shows the default non-interleaved schedule. And the bottom part shows the interleaved schedule.</figcaption>
+</figure>
+
+### Non-interleaved Schedule
+
+The non-interleaved schedule can be divided into three stages. The first stage is the warm-up stage, where workers perform differing numbers of forward passes. At the following stage, workers perform one forward pass followed by one backward pass. Workers will finish backward passes at the last stage.
+
+This mode is more memory-efficient than GPipe. However, it would take the same time to finish a turn of passes as GPipe.
+
+### Interleaved Schedule
+
+This schedule requires **the number of microbatches to be an integer multiple of the number of pipeline stages**.
+
+In this schedule, each device can perform computation for multiple subsets of layers (each called a model chunk) instead of a single contiguous set of layers. For example, previously device 1 had layers 1-4, device 2 had layers 5-8, and so on; now device 1 has layers 1, 2, 9, 10, device 2 has layers 3, 4, 11, 12, and so on. With this scheme, each device in the pipeline is assigned multiple pipeline stages and each pipeline stage has less computation.
+
+This mode is both memory-efficient and time-efficient.
+
+## Usage of non-interleaved and interleaved schedule
+
+In Colossal-AI, we provided both non-interleaved(as `PipelineSchedule`) and interleaved schedule(as  `InterleavedPipelineSchedule`).
+
+You just need to set `NUM_MICRO_BATCHES` in the config file, and additionally set `NUM_CHUNKS` if you want to use the interleaved pipeline schedule. If you know the shape of each pipeline stage's output tensor for certain and the shapes are all the same, you can set `TENSOR_SHAPE` in the config file to further reduce communication. Otherwise, you can simply omit `TENSOR_SHAPE`, and the shapes will be exchanged between pipeline stages automatically. We will then generate an appropriate schedule for you.
+
+## Training ResNet with pipeline
+
+Let's build the `ResNet` model first with Colossal PipelinableContext:
+```python
+import os
+from typing import Callable, List, Optional, Type, Union
+import torch
+import torch.nn as nn
+import colossalai
+import colossalai.nn as col_nn
+
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils import MultiTimer, get_dataloader
+from colossalai.context import ParallelMode
+from colossalai.pipeline.pipelinable import PipelinableContext
+
+from titans.dataloader.cifar10 import build_cifar
+from torchvision.models import resnet50
+from torchvision.models.resnet import BasicBlock, Bottleneck, conv1x1
+
+# Define some config
+BATCH_SIZE = 64
+NUM_EPOCHS = 2
+NUM_CHUNKS = 1
+CONFIG = dict(NUM_MICRO_BATCHES=4, parallel=dict(pipeline=2))
+
+# Train
+disable_existing_loggers()
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+colossalai.launch_from_torch(backend=args.backend, config=CONFIG)
+logger = get_dist_logger()
+pipelinable = PipelinableContext()
+
+# build model
+with pipelinable:
+    model = resnet50()
+```
+
+Define an execution sequence.
+```python
+exec_seq = [
+    'conv1', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'avgpool',
+    (lambda x: torch.flatten(x, 1), "behind"), 'fc'
+]
+pipelinable.to_layer_list(exec_seq)
+```
+
+Partition the model into pipeline.
+```python
+model = pipelinable.partition(NUM_CHUNKS, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
+```
+
+In this tutorial, we use `Trainer` to train `ResNet`:
+```python
+# build criterion
+criterion = nn.CrossEntropyLoss()
+
+# optimizer
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+# build dataloader
+root = os.environ.get('DATA', './data')
+train_dataloader, test_dataloader = build_cifar(BATCH_SIZE, root, padding=4, crop=32, resize=32)
+
+lr_scheduler = col_nn.lr_scheduler.LinearWarmupLR(optimizer, NUM_EPOCHS, warmup_steps=1)
+engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(model, optimizer, criterion,
+                                                                                train_dataloader, test_dataloader,
+                                                                                lr_scheduler)
+timer = MultiTimer()
+
+trainer = Trainer(engine=engine, timer=timer, logger=logger)
+
+hook_list = [
+    hooks.LossHook(),
+    hooks.AccuracyHook(col_nn.metric.Accuracy()),
+    hooks.LogMetricByEpochHook(logger),
+    hooks.LRSchedulerHook(lr_scheduler, by_epoch=True)
+]
+
+trainer.fit(train_dataloader=train_dataloader,
+            epochs=NUM_EPOCHS,
+            test_dataloader=test_dataloader,
+            test_interval=1,
+            hooks=hook_list,
+            display_progress=True)
+```
+
+We use `2` pipeline stages and the batch will be split into `4` micro batches.
diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md
new file mode 100644
index 000000000000..8492631bc0d3
--- /dev/null
+++ b/docs/source/en/features/zero_with_chunk.md
@@ -0,0 +1,262 @@
+# Zero Redundancy Optimizer with chunk-based memory management
+
+Author: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY)
+**Prerequisite:**
+- [Define Your Configuration](../basics/define_your_config.md)
+
+**Example Code**
+
+- [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt)
+
+**Related Paper**
+- [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
+- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
+- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+- [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
+
+## Introduction
+
+The Zero Redundancy Optimizer (ZeRO) removes the memory redundancies across data-parallel processes by partitioning three
+model states (optimizer states, gradients, and parameters) instead of replicating them.
+By doing so, memory efficiency is boosted drastically compared to classic data parallelism, while the computational granularity
+and communication efficiency are retained.
+
+1. **Shard Optimizer States**: The optimizer states (e.g., for [Adam optimizer](https://arxiv.org/abs/1412.6980), 32-bit weights,
+and the first and second momentum estimates) are partitioned across the processes, so that each process updates only its partition.
+
+
+2. **Shard Gradient**: After reduction inside the data parallel process group, gradient tensors are also partitioned such that each process only stores the gradients corresponding to its partition of the optimizer states. Note that Colossal-AI converts gradients into fp32 format to participate in parameter updating.
+
+3. **Shard Parameter**: The 16-bit model parameters are partitioned across the processes of a data parallel group.
+
+4. **[Gemini](../advanced_tutorials/meet_gemini.md)**: Dynamic heterogeneous memory space manager for parameters, gradients and optimizer states.
+
+Besides, this article will introduce the Zero Redundancy Optimizer with chunk-based memory management.
+
+When using ZeRO, we distribute the model by sharding the parameters. The advantage of this method is that the memory of each node is load balanced. But this approach has two significant disadvantages. First, during communication, a temporary memory buffer needs to be allocated and released afterwards, leading to the memory fragmentation problem. Second, using a tensor as the granularity for communication leaves the network bandwidth underutilized. Generally, the longer the transmitted message, the higher the bandwidth utilization.
+
+Using the Chunk mechanism introduced in ColossalAI v0.1.8, we can improve the efficiency of ZeRO. We store a continuous set of parameters in initialization order into a Chunk (a chunk is a continuous memory space), and each Chunk has the same size. Organizing memory in chunks can lead to efficient use of network bandwidth between PCI-e and GPU-GPU, reduce the number of communications, and avoid potential memory fragmentation.
+
+Before v0.1.8, ZeRO had a high communication cost for parameter communications. If a parameter was used multiple times by several consecutive operators, the communication operations were repeated, and efficiency suffered badly. This situation is very common when using the Gradient Checkpoint technique, where the forward propagation is recomputed during backward propagation and the parameters are therefore used again.
+
+Taking GPT as an example, its Checkpoint will be applied to each GPT Block, and each GPT Block contains a Self-Attention layer and an MLP layer. During the backward pass, the forward of the Self-Attention layer and the MLP layer will be computed in turn, and then the backward of the MLP layer and the Self-Attention layer will be computed in turn.
+
+In addition, due to the communication and memory movement of small Tensors, the bandwidth of NVLINK and PCI-E cannot be fully utilized, and each communication and memory movement has the overhead of kernel launch. After using Chunk, multiple small Tensor communication and memory movement can be changed into one large Tensor communication and memory movement, which not only improves bandwidth utilization but also reduces the overhead of kernel launch.
+
+We also provide a lightweight chunk search mechanism to help users automatically find the chunk size with the smallest memory fragmentation.
+
+## Usage
+
+### GeminiDDP
+
+We will use `GeminiDDP` to use ZeRO with chunk-based memory management. This is our new `torch.nn.Module` wrapper which uses ZeRO-DP and Gemini. ZeRO is for parallelism and Gemini is for memory management.
+
+Also make sure that your model is initialized under the context of `ColoInitContext`.
+
+```python
+with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+  model = gpt2_medium(checkpoint=True)
+```
+
+Define the model parameters as follows:
+
+```python
+chunk_manager = init_chunk_manager(model=module,
+                                           init_device=device,
+                                           hidden_dim=hidden_dim,
+                                           search_range_mb=search_range_mb,
+                                           min_chunk_size_mb=min_chunk_size_mb)
+gemini_manager = GeminiManager(placement_policy, chunk_manager)
+```
+
+`hidden_dim` is the hidden dimension of the DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, that is fine: we will use a default value of 1024. `min_chunk_size_mb` is the minimum chunk size in megabytes. If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk.
+
+Initialization of the optimizer.
+```python
+optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+```
+
+Training
+```python
+optimizer.zero_grad()
+outputs = model(input_ids, attn_mask)
+loss = criterion(outputs, input_ids)
+optimizer.backward(loss)
+optimizer.step()
+```
+> ⚠️ Note: Please do not use `loss.backward()`, the standard way of writing is `optimizer.backward(loss)`.
+
+### Train GPT
+
+In this example, we use `Hugging Face Transformers`. You have to install `transformers` before running this example. We will take `GPT2 Medium` as an example here.
+
+For simplicity, we just use randomly generated data here.
+
+First we only need to import `GPT2LMHeadModel` from `Huggingface transformers` to define our model, which does not require users to define or modify the model, so that users can use it more conveniently.
+
+```python
+class GPTLMModel(nn.Module):
+
+    def __init__(self,
+                 hidden_size=768,
+                 num_layers=12,
+                 num_attention_heads=12,
+                 max_seq_len=1024,
+                 vocab_size=50257,
+                 checkpoint=False):
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.model = GPT2LMHeadModel(
+            GPT2Config(n_embd=hidden_size,
+                       n_layer=num_layers,
+                       n_head=num_attention_heads,
+                       n_positions=max_seq_len,
+                       n_ctx=max_seq_len,
+                       vocab_size=vocab_size))
+        if checkpoint:
+            self.model.gradient_checkpointing_enable()
+
+    def forward(self, input_ids, attention_mask):
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
+
+def gpt2_medium(checkpoint=False):
+    return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16, checkpoint=checkpoint)
+```
+
+Define our loss function:
+
+```python
+class GPTLMLoss(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+```
+
+Define tensor parallel and parameter sharding strategies for tensor parallelism:
+
+```python
+def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
+    for mn, module in model.named_modules():
+        for pn, param in module.named_parameters(recurse=False):
+            if hasattr(param, 'visited'):
+                continue
+            param.set_dist_spec(ReplicaSpec())
+            if 'mlp.c_fc' in mn:
+                if 'weight' in pn or 'bias' in pn:
+                    split_param_col_tp1d(param, pg)
+                    param.compute_spec.set_output_replicate(False)
+                else:
+                    param.set_dist_spec(ReplicaSpec())
+            elif 'mlp.c_proj' in mn:
+                if 'weight' in pn:
+                    split_param_row_tp1d(param, pg)
+                else:
+                    param.set_dist_spec(ReplicaSpec())
+            elif 'wte' in mn or 'wpe' in mn:
+                split_param_col_tp1d(param, pg)
+            elif 'c_attn' in mn or 'c_proj' in mn:
+                split_param_col_tp1d(param, pg)
+            else:
+                param.set_dist_spec(ReplicaSpec())
+
+            param.visited = True
+def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
+    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    param.set_tensor_spec(*spec)
+
+
+def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(0, param, pg)
+
+
+def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(-1, param, pg)
+```
+
+Define a model which uses Gemini + ZeRO DDP:
+
+```python
+def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
+    cai_version = colossalai.__version__
+    if version.parse(cai_version) > version.parse("0.1.10"):
+        from colossalai.nn.parallel import GeminiDDP
+        model = GeminiDDP(model,
+                          device=get_current_device(),
+                          placement_policy=placememt_policy,
+                          pin_memory=True,
+                          search_range_mb=32)
+    elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
+        from colossalai.gemini import ChunkManager, GeminiManager
+        chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)
+        chunk_manager = ChunkManager(chunk_size,
+                                     pg,
+                                     enable_distributed_storage=True,
+                                     init_device=GeminiManager.get_default_device(placememt_policy))
+        gemini_manager = GeminiManager(placememt_policy, chunk_manager)
+        model = ZeroDDP(model, gemini_manager)
+    else:
+        raise NotImplementedError(f"CAI version {cai_version} is not supported")
+    return model
+```
+
+As we pre-train GPT in this example, we just use a simple language model loss.
+
+Write a function to get random inputs:
+
+```python
+def get_data(batch_size, seq_len, vocab_size):
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+    attention_mask = torch.ones_like(input_ids)
+    return input_ids, attention_mask
+```
+
+Finally, we can define our training loop:
+
+```python
+def main():
+    args = parse_args()
+    BATCH_SIZE = 8
+    SEQ_LEN = 1024
+    VOCAB_SIZE = 50257
+    NUM_STEPS = 10
+    colossalai.launch_from_torch(config={})
+
+    # build criterion
+    criterion = GPTLMLoss()
+
+    torch.manual_seed(123)
+    default_pg = ProcessGroup(tp_degree=args.tp_degree)
+    default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
+    # build GPT model
+    with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+      model = gpt2_medium(checkpoint=True)
+    pg = default_pg
+    # Tensor Parallelism (TP)
+    tensor_parallelize(model, pg)
+    # Gemini + ZeRO DP, Note it must be used after TP
+    model = gemini_zero_dpp(model, pg, args.placement)
+    # build optimizer
+    optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+    numel = sum([p.numel() for p in model.parameters()])
+    get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN)
+    torch.cuda.synchronize()
+    model.train()
+    for n in range(NUM_STEPS):
+        # we just use randomly generated data here
+        input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
+        optimizer.zero_grad()
+        outputs = model(input_ids, attn_mask)
+        loss = criterion(outputs, input_ids)
+        optimizer.backward(loss)
+        optimizer.step()
+
+    torch.cuda.synchronize()
+```
+> ⚠️ Note: If you want to use the Gemini module, please do not use the [Gradient Accumulation](../features/gradient_accumulation.md) we mentioned before.
+The complete example can be found on [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt).
diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md
new file mode 100644
index 000000000000..b4285a40e194
--- /dev/null
+++ b/docs/source/en/get_started/installation.md
@@ -0,0 +1,37 @@
+# Setup
+
+## Download From PyPI
+
+You can install Colossal-AI with
+
+```shell
+pip install colossalai
+```
+
+If you want to build PyTorch extensions during installation, you can use the command below. Otherwise, the PyTorch extensions will be built during runtime.
+
+```shell
+CUDA_EXT=1 pip install colossalai
+```
+
+
+## Download From Source
+
+> The version of Colossal-AI will be in line with the main branch of the repository. Feel free to raise an issue if you encounter any problem. :)
+
+```shell
+git clone https://github.com/hpcaitech/ColossalAI.git
+cd ColossalAI
+
+# install dependency
+pip install -r requirements/requirements.txt
+
+# install colossalai
+pip install .
+```
+
+If you want to install and enable CUDA kernel fusion during installation (compulsory installation when using fused optimizer):
+
+```shell
+CUDA_EXT=1 pip install .
+```
diff --git a/docs/source/en/get_started/reading_roadmap.md b/docs/source/en/get_started/reading_roadmap.md
new file mode 100644
index 000000000000..476c524ac011
--- /dev/null
+++ b/docs/source/en/get_started/reading_roadmap.md
@@ -0,0 +1,19 @@
+# Reading Roadmap
+
+Colossal-AI provides a collection of parallel training components for you. We aim to support you with your development
+of distributed deep learning models just like how you write single-GPU deep learning models. ColossalAI provides easy-to-use
+APIs to help you kickstart your training process. To better understand how Colossal-AI works, we recommend you to read this documentation
+in the following order.
+
+- If you are not familiar with distributed systems or have never used Colossal-AI, you should first jump into the `Concepts`
+section to get a sense of what we are trying to achieve. This section can provide you with some background knowledge on
+distributed training as well.
+- Next, you can follow the `basics` tutorials. This section will cover the details about how to use Colossal-AI.
+- Afterwards, you can try out the features provided in Colossal-AI by reading `features` section. We will provide a codebase for each tutorial. These tutorials will cover the
+basic usage of Colossal-AI to realize simple functions such as data parallel and mixed precision training.
+- Lastly, if you wish to apply more complicated techniques such as how to run hybrid parallel on GPT-3,  the
+`advanced tutorials` section is the place to go!
+
+**We always welcome suggestions and discussions from the community, and we would be more than willing to help you if you
+encounter any issue. You can raise an [issue](https://github.com/hpcaitech/ColossalAI/issues) here or create a discussion
+topic in the [forum](https://github.com/hpcaitech/ColossalAI/discussions).**
diff --git a/docs/source/en/get_started/run_demo.md b/docs/source/en/get_started/run_demo.md
new file mode 100644
index 000000000000..f47bdbbd62fc
--- /dev/null
+++ b/docs/source/en/get_started/run_demo.md
@@ -0,0 +1,43 @@
+# Quick Demo
+
+Colossal-AI is an integrated large-scale deep learning system with efficient parallelization techniques. The system can
+accelerate model training on distributed systems with multiple GPUs by applying parallelization techniques. The system
+can also run on systems with only one GPU. Quick demos showing how to use Colossal-AI are given below.
+
+## Single GPU
+
+Colossal-AI can be used to train deep learning models on systems with only one GPU and achieve baseline
+performances. We provided an example to [train ResNet on CIFAR10 dataset](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet)
+with only one GPU. You can find the example in [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples).
+Detailed instructions can be found in its `README.md`.
+
+## Multiple GPUs
+
+Colossal-AI can be used to train deep learning models on distributed systems with multiple GPUs and accelerate the
+training process drastically by applying efficient parallelization techniques. We have several kinds of parallelism for you
+to try out.
+
+#### 1. data parallel
+
+You can use the same [ResNet example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet) as the
+single-GPU demo above. By setting `--nproc_per_node` to be the number of GPUs you have on your machine, the example
+is turned into a data parallel example.
+
+#### 2. hybrid parallel
+
+Hybrid parallel includes data, tensor, and pipeline parallelism. In Colossal-AI, we support different types of tensor
+parallelism (i.e. 1D, 2D, 2.5D and 3D). You can switch between different tensor parallelism by simply changing the configuration
+in the `config.py`. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt).
+Detailed instructions can be found in its `README.md`.
+
+#### 3. MoE parallel
+
+We provided [an example of WideNet](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet) to demonstrate
+MoE parallelism. WideNet uses mixture of experts (MoE) to achieve better performance. More details can be found in
+[Tutorial: Integrate Mixture-of-Experts Into Your Model](../advanced_tutorials/integrate_mixture_of_experts_into_your_model.md)
+
+#### 4. sequence parallel
+
+Sequence parallel is designed to tackle memory efficiency and sequence length limit problems in NLP tasks. We provided
+[an example of BERT](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/bert/sequene_parallel) in
+[ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples). You can follow the `README.md` to execute the code.
diff --git a/docs/source/zh/Colossal-Auto/feature/auto_checkpoint.md b/docs/source/zh/Colossal-Auto/feature/auto_checkpoint.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/zh/Colossal-Auto/feature/device_mesh.md b/docs/source/zh/Colossal-Auto/feature/device_mesh.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/zh/Colossal-Auto/feature/shape_consistency.md b/docs/source/zh/Colossal-Auto/feature/shape_consistency.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/zh/Colossal-Auto/feature/tracer.md b/docs/source/zh/Colossal-Auto/feature/tracer.md
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/docs/source/zh/Colossal-Auto/get_started/installation.md b/docs/source/zh/Colossal-Auto/get_started/installation.md
new file mode 100644
index 000000000000..054b709c92d0
--- /dev/null
+++ b/docs/source/zh/Colossal-Auto/get_started/installation.md
@@ -0,0 +1,28 @@
+# 安装
+
+## 声明
+
+我们的自动并行功能处于alpha版本，仍在快速的开发迭代中。我们会在兼容性和稳定性上做持续地改进。如果您遇到任何问题，欢迎随时提issue给我们。
+
+
+## 要求
+
+我们需要一些额外的依赖来支持自动并行功能。请在使用自动并行之前安装它们。
+
+### 安装PyTorch
+
+我们仅支持Pytorch 1.12，现在未测试其他版本。 将来我们将支持更多版本。
+
+```bash
+#conda
+conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch
+#pip
+pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113
+```
+
+### 安装pulp和coin-or-cbc
+
+```bash
+pip install pulp
+conda install -c conda-forge coin-or-cbc
+```
diff --git a/docs/source/zh/Colossal-Auto/get_started/introduction.md b/docs/source/zh/Colossal-Auto/get_started/introduction.md
new file mode 100644
index 000000000000..1d41e3b501e6
--- /dev/null
+++ b/docs/source/zh/Colossal-Auto/get_started/introduction.md
@@ -0,0 +1,43 @@
+# 介绍
+
+近年来，大规模机器学习模型的部署受到越来越多的重视。然而，目前常见的分布式大模型训练方案，都依赖用户**人工反复尝试**和系统专家的经验来进行配置部署。这对绝大多数AI开发者来说十分不友好，因为他们不希望将时间精力花费在研究分布式系统和试错上。
+Colossal-AI的**Colossal-Auto** 帮助AI开发者简化了大规模机器学习模型的部署过程。相比现有其他手动配置复杂并行策略和修改模型的解决方案，Colossal-Auto 仅需增加一行代码，提供 cluster 信息以及单机训练模型即可获得分布式训练能力，并且**原生支持包括 Hugging Face，Timm 等热门 AI 模型库**。
+
+
+
+## 概览
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/auto_parallel.png"/>
+</figure>
+
+## 用法
+```python
+# wrap the model using auto_engine
+model = autoparallelize(model, meta_input_samples)
+# normal training loop
+...
+```
+
+
+## 图追踪
+Colossal-Auto 是**首个基于 PyTorch 框架使用静态图分析的自动并行系统**。PyTorch 作为一个动态图框架，获取其静态的执行计划是机器学习系统领域被长期研究的问题。Colossal-Auto 使用基于 torch.FX Tracer 的 ColoTracer 来完成对于最优并行策略的搜索。在 tracing 过程中推导并记录了每个 tensor 的元信息，例如 tensor shape，dims，dtype 等。因此 Colossal-AI 具有更好的模型泛化能力，而不是依靠模型名或手动修改来适配并行策略。
+
+
+## 细粒度分布式训练策略搜索
+Colossal-AI 的自动并行策略会在满足内存预算的限制下，以最快运行时间为目标，为每个 op 进行策略搜索，最终得到真实训练时的策略，包括每个 tensor 的切分策略，不同计算节点间需要插入的通信算子类型，是否要进行算子替换等。现有系统中的张量并行，数据并行，NVIDIA 在 Megatron-LM 等并行系统中使用的 column 切分和 row 切分并行等混合并行，都是自动并行可以搜索到的策略的子集。除了这些可以手动指定的并行方式外，Colossal-AI 有能力为每个 op 指定独特的并行方式，因此有可能找到比依赖专家经验和试错配置的手动切分更好的并行策略。
+
+
+
+## 分布式 tensor 与 shape consistency 系统
+
+与 PyTorch 最新发布的 DTensor 类似，Colossal-AI 也使用了 device mesh 对集群进行了抽象管理。具体来说，Colossal-AI 使用 sharding spec 对 tensor 的分布式存储状态进行标注，使用 shape consistency manager 自动地对同一 tensor 在不同 sharding spec 间进行转换。这让 Colossal-AI 的通用性和易用性极大地提升，借助 shape consistency manager 可以没有负担地切分 tensor，而不用担心上游 op 的 output 与下游的 input 在集群中的存储方式不同。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/shape_consistency.png"/>
+</figure>
+
+相较于 PyTorch DTensor，Colossal-AI 有以下优势：
++ Colossal-AI 的 device mesh 可以 profiling 到集群性能指标，对不同的通信算子进行耗时估算。
++ Colossal-AI 的 shape consistency 会贪心地搜索 sharding spec 间的转换方式，而不是朴素地逐 dimension 进行转换，这样能找到更高效的转换路径，进而使得 sharding spec 间的转换通信开销更小。
++ 加入了 all_to_all 操作，使得 Colossal-AI 的扩展性更强，这在大规模集群上进行训练时，可以展现出很大的优势。
diff --git a/docs/source/zh/Colossal-Auto/get_started/run_demo.md b/docs/source/zh/Colossal-Auto/get_started/run_demo.md
new file mode 100644
index 000000000000..cdeb227eb261
--- /dev/null
+++ b/docs/source/zh/Colossal-Auto/get_started/run_demo.md
@@ -0,0 +1,16 @@
+# 快速上手
+
+Colossal-AI 提供了业界急需的一套高效易用自动并行系统。相比现有其他手动配置复杂并行策略和修改模型的解决方案，Colossal-AI 仅需增加一行代码，提供 cluster 信息以及单机训练模型即可获得分布式训练能力。Colossal-Auto的快速上手示例如下。
+
+### 1. 基本用法
+Colossal-Auto 可被用于为每一次操作寻找一个包含数据、张量（如1D、2D、序列化）的混合SPMD并行策略。您可参考[GPT 示例](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/experiments/auto_parallel)。
+详细的操作指引见其 `README.md`。
+
+### 2. 与 activation checkpoint 结合
+
+作为大模型训练中必不可少的显存压缩技术，Colossal-AI 也提供了对于 activation checkpoint 的自动搜索功能。相比于大部分将最大显存压缩作为目标的技术方案，Colossal-AI 的搜索目标是在显存预算以内，找到最快的 activation checkpoint 方案。同时，为了避免将 activation checkpoint 的搜索一起建模到 SPMD solver 中导致搜索时间爆炸，Colossal-AI 做了 2-stage search 的设计，因此可以在合理的时间内搜索到有效可行的分布式训练方案。 您可参考 [Resnet 示例](TBA)。
+详细的操作指引见其 `README.md`。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/auto_ckpt.jpg"/>
+</figure>
diff --git a/docs/source/zh/advanced_tutorials/add_your_parallel.md b/docs/source/zh/advanced_tutorials/add_your_parallel.md
new file mode 100644
index 000000000000..4825a6fa1d6c
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/add_your_parallel.md
@@ -0,0 +1,112 @@
+# 添加你自己的并行模式
+
+作者: Shenggui Li, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [并行配置](../basics/configure_parallelization.md)
+
+## 引言
+
+为了使研究人员和工程师能够以更少的努力将我们的系统扩展到其他新颖的大规模分布式训练算法，我们已经将训练生命周期中的各种组件解耦。你可以通过简单地继承基类来实现你自己的并行模式。
+
+主要组件有:
+
+1. `ProcessGroupInitializer`
+2. `GradientHandler`
+3. `Schedule`
+
+**目前这需要对源代码进行一些改动，因此我们建议你用`-e`标志从源代码安装。`-e`标志使得安装是可编辑的，因此，你的代码变化将反映在你的Python运行时中。我们将在这方面努力，以避免在未来的版本中改变源代码。**
+
+
+## 进程组初始化器
+
+并行通常由进程组来管理，参与相同并行算法的进程被置于同一进程组。对于不同的并行算法，需要创建不同的进程组。
+Colossal-AI 为用户提供了一个全局 context，使他们能够轻松地管理进程组。如果你想添加新的进程组，你可以很容易地定义一个新的类并在你的配置文件中设置它。为了定义你自己的进程组创建方式，你可以按照下面的步骤来创建一个新的分布式初始化。
+
+1. 在 `colossalai.context.parallel_mode.ParallelMode` 中添加你自己的并行模式。
+    ```python
+    class ParallelMode(Enum):
+        GLOBAL = 'global'
+        DATA = 'data'
+        PIPELINE = 'pipe'
+        ...
+
+        NEW_MODE = 'new_mode'  # define your mode here
+    ```
+
+2. 创建一个 `ProcessGroupInitializer`。 你可以参考 `colossalai.context.dist_group_initializer` 中给出的例子，前六个参数是固定的。
+`ParallelContext` 将为你传入这些参数。如果你需要设置其他参数，可以像下面的例子中的 `arg1, arg2` 一样，在后面添加它。
+最后，通过添加装饰器 `@DIST_GROUP_INITIALIZER.register_module` 将你的初始化程序注册到注册表。
+    ```python
+    # sample initializer class
+    @DIST_GROUP_INITIALIZER.register_module
+    class MyParallelInitializer(ProcessGroupInitializer):
+
+        def __init__(self,
+                    rank: int,
+                    world_size: int,
+                    config: Config,
+                    data_parallel_size: int,
+                    pipeline_parlalel_size: int,
+                    tensor_parallel_size: int,
+                    arg1,
+                    arg2):
+            super().__init__(rank, world_size, config)
+            self.arg1 = arg1
+            self.arg2 = arg2
+            # ... your variable init
+
+        def init_parallel_groups(self):
+            # initialize your process groups
+            pass
+
+    ```
+    然后，你可以将你的新初始化器插入到 `colossalai.constants.INITIALIZER_MAPPING` 当前的模式与初始化映射中。你可以修改该文件或动态插入新的键值对。
+
+    ```python
+    colossalai.constants.INITIALIZER_MAPPING['new_mode'] = 'MyParallelInitializer'
+    ```
+
+3. 在你的配置文件中设置你的初始化器。你可以传入你的自定义参数。这允许
+   `ParallelContext` 创建你的初始化器并初始化你期望的进程组。
+
+    ```python
+    parallel = dict(
+        pipeline=dict(size=1),
+        tensor=dict(size=x, mode='new_mode')  # this is where you enable your new parallel mode
+    )
+    ```
+
+## 梯度 Handler
+
+梯度 handler 是对参数的梯度执行 all-reduce 操作的对象。由于不同的 all-reduce 策略或许在不同的并行中被执行，用户可以继承
+`colossalai.engine.gradient_handler.BaseGradientHandler` 来实现其策略。目前，Colossal-AI 使用普通的数据并行梯度 handler 在数据并行的 rank 间 all-reduce 梯度。
+如果数据并行被检测到，梯度 handler 会被自动添加进 engine。
+
+你可以添加你自己的梯度 handler，如下所示：
+
+```python
+from colossalai.registry import GRADIENT_HANDLER
+from colossalai.engine import BaseGradientHandler
+
+@GRADIENT_HANDLER.register_module
+class YourGradientHandler(BaseGradientHandler):
+
+    def handle_gradient(self):
+        do_something()
+
+```
+
+之后，你可以在配置文件中指定你要使用的梯度 handler。
+
+```python
+gradient_handlers = [
+    dict(type='YourGradientHandler'),
+]
+```
+
+## Schedule
+
+Schedule 包含了如何执行前向和后向计算。目前， Colossal-AI 提供了流水和非流水的 schedule。
+如果你想修改前向和后向计算的执行方式，你可以继承 `colossalai.engine.schedule.BaseSchedule` 并实现 `forward_back_step` 函数。
diff --git a/docs/source/zh/advanced_tutorials/define_your_own_parallel_model.md b/docs/source/zh/advanced_tutorials/define_your_own_parallel_model.md
new file mode 100644
index 000000000000..64e8d8bcd14a
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/define_your_own_parallel_model.md
@@ -0,0 +1,31 @@
+# 定义你自己的并行模型
+
+作者: Zhengda Bian, Yongbin Li
+
+> ⚠️ 我们正在编写此文档以使其更加详细。 我们将介绍不同并行的机制以及如何使用它们来编写模型。
+
+假设您有一个具有数十亿参数的巨大 MLP 模型，其极大的隐藏层大小使其无法直接被单个 GPU 容纳。别担心，Colossal-AI 可以帮你解决这个问题。
+在 Colossal-AI 的帮助下，您可以用所熟悉的为单个 GPU 编写模型的方式编写大模型，而 Colossal-AI 会自动拆分您的模型权重，并将它们完美地分配到一组 GPU 中。我们给出一个简单的示例，展示如何在 Colossal-AI 中编写简单的 2D 并行模型。
+
+## 写一个简单的2D并行模型
+
+```python
+from colossalai.nn import Linear2D
+import torch.nn as nn
+
+class MLP_2D(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.linear_1 = Linear2D(in_features=1024, out_features=16384)
+        self.linear_2 = Linear2D(in_features=16384, out_features=1024)
+
+    def forward(self, x):
+        x = self.linear_1(x)
+        x = self.linear_2(x)
+        return x
+```
+
+## 使用预定义的模型
+
+为了方便您的使用，我们在 Colossal-AI 的 Model Zoo 中提供一些流行的模型，如*BERT*, *ViT*, *MoE* 和 *GPT*，请自由地将它们定制为不同的尺寸，以满足您的特殊需求。
diff --git a/docs/source/zh/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/zh/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
new file mode 100644
index 000000000000..456878caa147
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
@@ -0,0 +1,140 @@
+# 将 MoE 整合进你的模型
+
+作者: Haichen Huang, Yongbin Li
+
+**前置教程**
+- [ColossalAI-Examples WideNet](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet)
+
+**相关论文**
+- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961)
+- [Go Wider Instead of Deeper](https://arxiv.org/abs/2107.11817)
+
+（中文版教程将会在近期提供）
+
+## Introduction
+
+Since the advent of Switch Transformer, the AI community has found Mixture of Experts (MoE) a useful technique to enlarge the capacity of deep learning models.
+
+Colossal-AI provides an early access version of parallelism specifically designed for MoE models.
+The most prominent advantage of MoE in Colossal-AI is convenience.
+We aim to help our users to easily combine MoE with model parallelism and data parallelism.
+
+However, the current implementation has two main drawbacks now.
+The first drawback is its poor efficiency in large batch size and long sequence length training.
+The second drawback is incompatibility with tensor parallelism.
+We are working on system optimization to overcome the training efficiency problem.
+The compatibility problem with tensor parallelism requires more adaptation, and we will tackle this issue in the future.
+
+Here, we will introduce how to use MoE with model parallelism and data parallelism.
+
+## Table of Content
+In this tutorial we will cover:
+1. Set up MoE running environment
+2. Create MoE layer
+3. Train your model
+
+We provided the [example code](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet) for this tutorial in [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples).
+This example uses [WideNet](https://arxiv.org/abs/2107.11817) as an example of MoE-based model.
+
+
+## Set up MoE running environment
+In your project folder, create a `config.py`.
+
+This file is to specify some features you may want to use to train your model.
+In order to enable MoE, you need to add a dict called parallel and specify the value of key moe.
+You can assign a value for the key size of moe, which represents the model parallel size of experts (i.e. the number of experts in one group to parallelize training).
+
+For example, if the size is 4, 4 processes will be assigned to 4 consecutive GPUs and these 4 processes form a moe model parallel group.
+Each process on the 4 GPUs will only get a portion of experts. Increasing the model parallel size will reduce communication cost, but increase computation cost in each GPU and activation cost in memory.
+The total data parallel size is auto-detected and set as the number of GPUs by default.
+
+```python
+MOE_MODEL_PARALLEL_SIZE = ...
+parallel = dict(
+    moe=dict(size=MOE_MODEL_PARALLEL_SIZE)
+)
+```
+
+If `MOE_MODEL_PARALLEL_SIZE = E` and the number of experts is also set to `E`, where `E` is a constant, the forward pass of a transformer encoder within a model parallel group proceeds as shown below.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/oI59QcxdteKUTks.png"/>
+<figcaption>MoE Transformer, image source: <a href="https://arxiv.org/abs/2006.16668">GShard</a></figcaption>
+</figure>
+
+Since all experts are allocated to all GPUs in a model parallel group and a GPU only owns a portion of experts,
+original data parallel groups are no longer correct for the parameters of experts during gradient handling in backward pass anymore.
+So we create a new kind of parallel group called moe data parallel group.
+The difference among different kinds of parallel group, when the configuration is set as `WORLD_SIZE=4`,
+`MOE_MODEL_PARALLEL_SIZE=2`, is shown here.
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/Sn8FpmQPKIiBEq2.png"/>
+<figcaption>MoE process group</figcaption>
+</figure>
+
+
+As for gradient handling, we provide MoeGradientHandler to all-reduce every parameter of the model.
+If you use `colossalai.initialize` function to create your training engine, the MoE gradient handler will be added to your engine automatically.
+Otherwise, you should take care of gradient by yourself.
+All parameters of MoE running environment are stored in colossalai.global_variables.moe_env.
+You can access your configuration parameters to check whether your setup is correct.
+```python
+from colossalai.global_variables import moe_env
+```
+
+## Create MoE layer
+You can create a MoE layer from `colossalai.nn.moe`.
+But before doing that, you should set up random seeds for all processes like this.
+
+```python
+from colossalai.context.random import moe_set_seed
+from model_zoo.moe.models import Widenet
+
+moe_set_seed(42)
+model = Widenet(num_experts=4, capacity_factor=1.2)
+```
+
+`moe_set_seed` will set different seed for different processes in a moe model parallel group.
+This helps initialize parameters in experts.
+Then create an instance of experts and an instance of router.
+Here is the example in model zoo.
+
+```python
+from colossalai.nn.layer.moe import Experts, MoeLayer, Top2Router, NormalNoiseGenerator
+
+
+noisy_func = NormalNoiseGenerator(num_experts)
+shared_router = Top2Router(capacity_factor,
+                           noisy_func=noisy_func)
+shared_experts = Experts(expert=VanillaFFN,
+                         num_experts=num_experts,
+                         **moe_mlp_args(
+                             d_model=d_model,
+                             d_ff=d_ff,
+                             drop_rate=drop_rate
+                         ))
+ffn=MoeLayer(dim_model=d_model, num_experts=num_experts,
+             router=shared_router, experts=shared_experts)
+```
+
+Inside the initialization of `Experts`, the local expert number of each GPU will be calculated automatically. You just need to specify the class of each expert and the parameters used in its initialization. As for routers, we have provided a top-1 router and a top-2 router. You can find them in `colossalai.nn.layer.moe`. After creating the instances of experts and router, the only thing initialized in `MoeLayer` is the gate module. More definitions of each class can be found in our API document and code.
+
+
+## Train Your Model
+Do not forget to use the `colossalai.initialize` function in `colossalai` to add the gradient handler for the engine.
+We handle the back-propagation of MoE models for you.
+In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients.
+You can find more information about the handler `MoeGradientHandler` in colossal directory.
+
+The loss criterion should be wrapped by `MoeLoss` to add the auxiliary loss of MoE. An example is shown below.
+```python
+criterion = MoeLoss(
+    aux_weight=0.01,
+    loss_fn=nn.CrossEntropyLoss,
+    label_smoothing=0.1
+)
+```
+
+Finally, just use trainer or engine in `colossalai` to do your training.
+Otherwise, you should take care of gradient by yourself.
diff --git a/docs/source/zh/advanced_tutorials/meet_gemini.md b/docs/source/zh/advanced_tutorials/meet_gemini.md
new file mode 100644
index 000000000000..2bf0a9c98c3f
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/meet_gemini.md
@@ -0,0 +1,96 @@
+# 认识Gemini：ColossalAI的异构内存空间管理器
+
+作者: [Jiarui Fang](https://github.com/feifeibear)
+
+## 简介
+
+在GPU数量不足情况下，想要增加模型规模，异构训练是最有效的手段。它通过在 CPU 和 GPU 中容纳模型数据，并仅在必要时将数据移动到当前设备，可以同时利用 GPU 内存、CPU 内存（由 CPU DRAM 或 NVMe SSD内存组成）来突破单GPU内存墙的限制。并且，在大规模训练下，其他方案如数据并行、模型并行、流水线并行都可以在异构训练基础上进一步扩展GPU规模。这篇文章描述ColossalAI的异构内存空间管理模块Gemini的设计细节，它的思想来源于[PatrickStar](https://arxiv.org/abs/2108.05818)，ColossalAI根据自身情况进行了重新实现。
+
+## 用法
+
+目前Gemini支持和ZeRO并行方式兼容，它的使用方法很简单，在训练策略的配置文件里设置zero的model_config属性tensor_placement_policy='auto'
+
+```
+zero = dict(
+    model_config=dict(
+        reduce_scatter_bucket_size_mb=25,
+        fp32_reduce_scatter=False,
+        gradient_predivide_factor=1.0,
+        tensor_placement_policy="auto",
+        shard_strategy=TensorShardStrategy(),
+        ...
+    ),
+    optimizer_config=dict(
+        ...
+    )
+)
+```
+
+注意，Gemini和并行策略，如Tensor Parallelism，Data Parallelism，Pipeline Parallelism，ZeRO是解耦合的。对TP，PP的支持还在开发中。
+
+## 术语
+
+**算子**(**OP**erator)：一个神经网络层的计算操作，比如Linear，LayerNorm等。算子可以是正向传播的计算，也可以是反向传播的计算。
+
+神经网络在训练期间必须管理的两种类型的训练数据。
+
+**模型数据(model data)**: 由参数、梯度和优化器状态组成，其规模与模型结构定义相关
+
+**非模型数据(non-model data)**: 主要由算子生成的中间张量和算子的临时变量组成。非模型数据根据训练任务的配置动态变化，例如批量大小。模型数据和非模型数据相互竞争 GPU 内存。
+
+## 设计
+
+目前的一些解决方案，DeepSpeed采用的[Zero-offload](https://arxiv.org/abs/2101.06840)在CPU和GPU内存之间静态划分模型数据，并且它们的内存布局对于不同的训练配置是恒定的。如下图左边所示，当 GPU 内存不足以满足其相应的模型数据要求时，即使当时CPU上仍有可用内存，系统也会崩溃。而ColossalAI可以通过将一部分模型数据换出到CPU上来完成训练。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gemini/deepspeed_compare.png"/>
+<figcaption>比较Zero-Offload和Gemini的内存管理方案</figcaption>
+</figure>
+
+
+ColossalAI设计了Gemini，就像双子星一样，它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内，从而让模型训练突破GPU的内存墙。内存管理器由两部分组成，分别是MemStatsCollector(MSC)和StatefulTensorMgr(STM)。
+
+
+我们利用了深度学习网络训练过程的迭代特性。我们将迭代分为warmup和non-warmup两个阶段，开始时的一个或若干迭代步属于预热阶段，其余的迭代步属于正式阶段。在warmup阶段我们为MSC收集信息，而在non-warmup阶段STM利用MSC收集的信息来移动tensor，以达到最小化CPU-GPU数据移动volume的目的。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gemini/gemini_workflow.png"/>
+<figcaption>Gemini在不同训练阶段的运行流程</figcaption>
+</figure>
+
+
+### StatefulTensorMgr
+
+STM管理所有model data tensor的信息。在模型的构造过程中，ColossalAI把所有model data张量注册给STM。内存管理器给每个张量标记一个状态信息。状态集合包括HOLD，COMPUTE，FREE三种状态。STM的功能如下：
+
+**查询内存使用：**通过遍历所有tensor的在异构空间的位置，获取模型数据对CPU和GPU的内存占用。
+
+**转换张量状态：**它在每个模型数据张量参与算子计算之前，将张量标记为COMPUTE状态，在计算之后标记为HOLD状态。如果张量不再使用则标记为FREE状态。
+
+**调整张量位置：**张量管理器保证COMPUTE状态的张量被放置在计算设备上，如果计算设备的存储空间不足，则需要移动出一些HOLD状态的张量到其他设备上存储。Tensor eviction strategy需要MSC的信息，我们将在后面介绍。
+
+
+### MemStatsCollector
+在预热阶段，内存信息统计器监测CPU和GPU中模型数据和非模型数据的内存使用情况，供正式训练阶段参考。我们通过查询STM可以获得模型数据在某个时刻的内存使用。但是非模型的内存使用却难以获取。因为非模型数据的生存周期并不归用户管理，现有的深度学习框架没有暴露非模型数据的追踪接口给用户。MSC通过采样方式在预热阶段获得非模型对CPU和GPU内存的使用情况。具体方法如下：
+
+我们在算子的开始和结束计算时，触发内存采样操作，我们称这个时间点为**采样时刻（sampling moment)**，两个采样时刻之间的时间我们称为**period**。计算过程是一个黑盒，由于可能分配临时buffer，内存使用情况很复杂。但是，我们可以较准确的获取period的系统最大内存使用。非模型数据的使用可以通过两个统计时刻之间系统最大内存使用-模型内存使用获得。
+
+我们如何设计采样时刻呢？我们选择preOp的model data layout adjust之前，如下图所示。我们采样获得上一个period的system memory used，和下一个period的model data memory used。并行策略会给MSC的工作造成障碍。如图所示，比如对于ZeRO或者Tensor Parallel，由于Op计算前需要gather模型数据，会带来额外的内存需求。因此，我们要求在模型数据变化前对系统内存进行采样，这样在一个period内，MSC会把preOp的模型变化内存捕捉。比如在period 2-3内，我们考虑了tensor gather和shard带来的内存变化。
+尽管可以将采样时刻放在其他位置，比如排除gather buffer的变动信息，但是这会造成麻烦。不同并行方式Op的实现有差异，比如对于Linear Op，Tensor Parallel中gather buffer的分配在Op中。而对于ZeRO，gather buffer的分配是在PreOp中。将采样放在PreOp开始时有利于将两种情况统一。
+
+
+尽管可以将采样时刻放在其他位置，比如排除gather buffer的变动信息，但是这会造成麻烦。不同并行方式Op的实现有差异，比如对于Linear Op，Tensor Parallel中gather buffer的分配在Op中。而对于ZeRO，gather buffer的分配是在PreOp中。将采样放在PreOp开始时有利于将两种情况统一。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gemini/gemini_mem_curve.png"/>
+<figcaption>Sampling based MemStatsCollector</figcaption>
+</figure>
+
+### Tensor Eviction Strategy
+
+MSC的重要职责是在调整tensor layout位置，比如在上图S2时刻，我们减少设备上model data数据，Period 2-3计算的峰值内存得到满足。
+
+在warmup阶段，由于还没执行完毕一个完整的迭代，我们对内存的真实使用情况尚一无所知。我们此时限制模型数据的内存使用上限，比如只使用30%的GPU内存。这样保证我们可以顺利完成预热状态。
+
+在non-warmup阶段，我们需要利用预热阶段采集的非模型数据内存信息，预留出下一个Period在计算设备上需要的峰值内存，这需要我们移动出一些模型张量。
+为了避免频繁在CPU-GPU换入换出相同的tensor，引起类似[cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science))的现象。我们利用DNN训练迭代特性，设计了OPT cache换出策略。具体来说，在warmup阶段，我们记录每个tensor被计算设备需要的采样时刻。如果我们需要驱逐一些HOLD tensor，那么我们选择在本设备上最晚被需要的tensor作为受害者。
diff --git a/docs/source/zh/advanced_tutorials/opt_service.md b/docs/source/zh/advanced_tutorials/opt_service.md
new file mode 100644
index 000000000000..a213584fd41d
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/opt_service.md
@@ -0,0 +1,79 @@
+# Colossal-AI使用指南：5分钟搭建在线OPT服务
+
+## 介绍
+
+本指导手册将说明如何利用[Colossal-AI](https://github.com/hpcaitech/ColossalAI)搭建您自己的OPT服务。
+
+## Colossal-AI 推理概述
+Colossal-AI 提供了一个推理子系统 [Energon-AI](https://github.com/hpcaitech/EnergonAI)， 这是一个基于Colossal-AI的服务系统，拥有以下特性：
+
+- **大模型并行：** 在Colossal-AI的张量并行和流水线并行策略的帮助下，Colossal-AI的推理可实现大模型的高效并行推理。
+- **预构建大模型：** Colossal-AI提供热门模型的预构建部署，例如OPT。其支持用于生成任务和加载检查点的缓存技术。
+- **引擎封装：** Colossal-AI中有一个抽象层被称作引擎。其将单实例多设备(SIMD) 执行与远程过程调用封装在一起。
+- **在线服务系统：** 基于FastAPI，用户可以快速启动分布式推理的网络服务。 在线服务对生成任务进行了特殊优化。它采用left padding和bucket batching两种技术来提高效率。
+
+## 基本用法
+
+1. 下载OPT模型
+
+想要快速发布分布式推理服务，您可以从[此处](https://huggingface.co/patrickvonplaten/opt_metaseq_125m/blob/main/model/restored.pt)下载OPT-125M。有关加载其他体量模型的详细方法，您可访问[此处](https://github.com/hpcaitech/EnergonAI/tree/main/examples/opt/script)。
+
+2. 准备提前构建的服务镜像
+
+从dockerhub拉取一个已经安装Colossal-AI推理的docker镜像。
+
+```bash
+docker pull hpcaitech/energon-ai:latest
+```
+
+3. 发布HTTP服务
+
+若想发布服务，我们需要准备python脚本来描述模型的类型和相关的部署，以及HTTP服务的设置。 我们为您提供了一组[示例](https://github.com/hpcaitech/EnergonAI/tree/main/examples)。 我们将在本指导手册中使用[OPT 示例](https://github.com/hpcaitech/EnergonAI/tree/main/examples/opt)。
+服务的入口是一个bash脚本 server.sh。
+本服务的配置文件参考 opt_config.py，该文件定义了模型的类型、 检查点文件路径、并行策略和http设置。您能按照您的需求来修改这些设置。
+例如，将模型的大小设置为opt_125M，将正确的检查点路径按照如下设置：
+
+```bash
+model_class = opt_125M
+checkpoint = 'your_file_path'
+```
+
+将张量并行度设置为您的gpu数量。
+
+```bash
+tp_init_size = #gpu
+```
+
+现在，我们就能利用docker发布一个服务。您能在`/model_checkpoint` 和 `/config`路径下找到检查点文件和配置文件。
+
+
+```bash
+export CHECKPOINT_DIR="your_opt_checkpoint_path"
+# the ${CONFIG_DIR} must contain a server.sh file as the entry of service
+export CONFIG_DIR="config_file_path"
+
+docker run --gpus all  --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:lastest
+```
+
+接下来，您就可以在您的浏览器中打开 `https://[IP-ADDRESS]:8020/docs#` 进行测试。
+
+## 高级特性用法
+
+1. 批处理优化
+
+若想使用我们的高级批处理技术来批量收集多个查询，您可以将executor_max_batch_size设置为最大批处理大小。 请注意，只有具有相同 top_k、top_p 和温度的解码任务才能一起批处理。
+
+```
+executor_max_batch_size = 16
+```
+
+所有的查询将进入FIFO队列。解码步数小于或等于队列头部解码步数的所有连续查询可以一起批处理。  应用左填充以确保正确性。 executor_max_batch_size 不应该过大，从而确保批处理不会增加延迟。 以opt-30b为例， `executor_max_batch_size=16` 合适，但对于opt-175b而言， `executor_max_batch_size=4` 更合适。
+
+2. 缓存优化
+
+对于每一个独立的服务过程，您能将最近的多个查询结果缓存在一起。在config.py中设置 cache_size 和 cache_list_size。缓存的大小应为缓存的查询数目。cache_list_size 应为每次查询存储的结果数。一个随机缓存的结果将会被返回。当缓存已满，LRU策略被用于清理缓存过的查询。cache_size=0意味着不缓存。
+
+```
+cache_size = 50
+cache_list_size = 2
+```
diff --git a/docs/source/zh/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh/advanced_tutorials/parallelize_your_training_like_Megatron.md
new file mode 100644
index 000000000000..f3c6247c38e4
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -0,0 +1,176 @@
+# 使用ColoTensor让串行程序像Megatron-LM一样并行
+
+Author: [Haichen Huang](https://github.com/1SAA) and [Jiarui Fang](https://github.com/feifeibear)
+
+**Prerequisite:**
+- [ColoTensor Concepts](../basics/colotensor_concept.md)
+
+## 介绍
+
+在新版本中，我们引入了ColoTensor。ColoTensor为用户使用并行训练提供了极大的便利，使得用户可以在原本的串行代码上，通过较小的修改将训练改为并行。在本教程中，我们将说明如何修改训练模型以自动使代码采取像 Megatron-LM 一样的方式并行训练。我们以 HuggingFace 提供的 GPT-2 模型为例，并提供一种方式让你可以在单个GPU上预训练GPT-2模型。
+
+Megatron-LM 提供了一个具有影响力的并行化范式，这个范式主要应用于Transformer大模型的训练。然而，为了大规模训练 Transformer 语言大模型，用户必须使用Megatron-LM提供的特殊模块来构建他们的模型。这给用户带来了一些困难的工作，例如从预先训练的模型中加载权重，或是构建自己的并行训练模型。为了减轻用户的麻烦，我们提供 ColoTensor 类，以完成自动启用张量模型并行。
+
+## 定义模型和损失函数
+
+首先，我们直接调用 HuggingFace 库中的 GPTModel 和 GPTLoss。
+
+```python
+import torch
+import torch.nn as nn
+from transformers import GPT2Config, GPT2LMHeadModel
+
+class GPTLMModel(nn.Module):
+    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=12, max_seq_len=1024, vocab_size=50257, checkpoint=False):
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.model = GPT2LMHeadModel(GPT2Config(n_embd=hidden_size, n_layer=num_layers,
+                                     n_head=num_attention_heads, n_positions=max_seq_len, n_ctx=max_seq_len, vocab_size=vocab_size))
+        if checkpoint:
+            self.model.gradient_checkpointing_enable()
+
+    def forward(self, input_ids, attention_mask):
+        # Only return lm_logits
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
+
+
+class GPTLMLoss(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+```
+
+## 对GPT-2的简短回顾
+
+现在，我们回顾一下 GPT-2 模型的结构。每个 GPT-2 模型都可以表示为一个 DAG。如下图所示，每个圆圈代表一个算子，每个方块代表一个权重。每个箭头表示输入数据的流向，而箭头旁边的符号表示输入数据的形状。
+
+然后，让我们深入了解一下这个 GPT-2 模型。它由三部分组成，分别是**嵌入模块**、**转换器层**和**分类头**。
+
+嵌入模块包含两个权重，符号嵌入权重和位置嵌入权重。在嵌入模块的前向操作之后，原始输入数据的所有序列中的每个单词都会被嵌入到隐藏状态。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/omfkIEN6ui5jcL3.png"/>
+<figcaption>嵌入模块</figcaption>
+</figure>
+
+每个转换器层包含两个块。自注意操作在第一个块中调用，同时一个双层感知器位于第二个块中。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/LAVzDlpRcj4dYeb.png"/>
+<figcaption>转换器层</figcaption>
+</figure>
+
+最后，分类头只是一个不加偏差的线性模块，里面只有一个线性权重。
+
+## 应用ColoTensor
+
+两个步骤使您的串行代码采取 Megatron-LM 张量并行风格。
+1. 在ColoInitContext的上下文中初始化模型。
+2. 为每个参数设置 ColoTensorSpec。
+
+### 使用 ColoInitContext 初始化
+
+我们应该在 ColoInitContext 中构建模型。在该种上下文中，任何初始化的参数都将转换为 ColoParameter 并自动移动到相应的设备上。
+
+```python
+from colossalai.utils.model.colo_init_context import ColoInitContext
+
+with ColoInitContext(device=torch.device('cpu')):
+    model = GPTLMModel()
+```
+
+### 为每个参数设置 ColoTensorSpec
+
+模型创建完成后，我们通过ProcessGroup建立分布式环境。这里，我们将张量并行度指定为所有GPU的数量，即数据并行度为一。
+
+```python
+import torch.distributed as dist
+from colossalai.tensor import ProcessGroup
+
+pg = ProcessGroup(tp_degree=dist.get_world_size())
+```
+
+现在，我们需要一些辅助函数为下一步做准备。我们定义了两个函数来切分参数。Megatron-LM张量并行需要沿参数的第一维或最后一维切分参数张量。
+
+```python
+from colossalai.tensor import ShardSpec, ComputeSpec, ComputePattern, ColoParameter, ProcessGroup
+
+def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
+    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    if param.process_group.tp_world_size() == 1:
+        param.set_process_group(pg)
+    param.set_tensor_spec(*spec)
+
+
+def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(0, param, pg)
+
+
+def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(-1, param, pg)
+```
+
+然后我们使模型采用张量并行。根据 Megatron 中使用的张量并行，应该沿着张量的最后一个维度进行切片，包括符号嵌入的权重，位置嵌入的权重，自注意力块中的所有线性权重和偏差，以及每个双层感知器中的第一个线性权重和偏差。且需要沿第一个维度切分双层感知器中的第二个线性权重。
+
+```python
+for mn, module in model.named_modules():
+    for pn, param in module.named_parameters(recurse=False):
+        # set process group for all parameters
+        param.set_process_group(pg)
+
+        if 'mlp.c_fc' in mn:
+            if 'weight' in pn or 'bias' in pn:
+                split_param_col_tp1d(param, pg)  # colmn slice
+                # keep the shape of the output from c_fc
+                param.compute_spec.set_output_replicate(False)
+        elif 'mlp.c_proj' in mn:
+            if 'weight' in pn:
+                split_param_row_tp1d(param, pg)  # row slice
+        elif 'wte' in mn or 'wpe' in mn:
+            split_param_col_tp1d(param, pg)  # colmn slice
+        elif 'c_attn' in mn or 'c_proj' in mn:
+            split_param_col_tp1d(param, pg)  # colmn slice
+```
+
+修改后的模型如下图所示。
+
+嵌入模块:
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/Yu2xzXEabHV7pwe.png"/>
+<figcaption>修改后的嵌入模块</figcaption>
+</figure>
+
+转换器层:
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/17/4HWsA2xz51IhPFO.png"/>
+<figcaption>修改后的转换器层</figcaption>
+</figure>
+
+一旦用户指定了每个参数的在并行中的分布模式，ColoTensor 就能够推断出所有算子的计算模式，包括矩阵乘法、线性函数、torch.nn.functional 中的其他逐元素函数，以及其他的一些常用函数。这样，用户可以像往常一样训练他们的模型。
+
+在我们最新示例中还定义了一个Gemini + ZeRO DDP 的模型从而减小开销，提升效率。这一部分的详细内容可以参考[ZeRO](../features/zero_with_chunk.md)，你可以将这两部分内容结合起来看从而理解我们整个训练流程：
+
+```python
+def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
+    from colossalai.nn.parallel import GeminiDDP
+    model = GeminiDDP(model,
+                        device=get_current_device(),
+                        placement_policy=placememt_policy,
+                        pin_memory=True,
+                        search_range_mb=32)
+    return model
+```
+
+## 在单个GPU上预训练GPT-2
+
+我们做的上述优化让我们可以在单GPU上训练GPT-2模型，只需要将`run.sh`中的参数`GPUNUM`设置为1，再运行该文件，就可以在单个GPU上完成模型的训练。
+
+GPT-2 示例可在 [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt) 获得。
diff --git a/docs/source/zh/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
new file mode 100644
index 000000000000..6c6dcf6e850d
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
@@ -0,0 +1,275 @@
+# 使用混合并行训练 GPT
+
+作者: Hongxin Liu, Yongbin Li
+
+**示例代码**
+- [ColossalAI-Examples GPT2](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt_2)
+- [ColossalAI-Examples GPT3](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt_3)
+
+**相关论文**
+- [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training](https://arxiv.org/abs/2110.14883)
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473)
+
+## 引言
+
+在上一篇教程中，我们介绍了如何用流水并行训练 ViT。在本教程中，你将学习一个更复杂的场景--用混合并行方式训练GPT。在这种情况下，由于GPT-3过大，即使CPU内存也无法容纳它。因此，你必须自己分割模型。
+
+## 目录
+
+在本教程中，我们将介绍:
+
+1. 基于 colossalai/model_zoo 定义 GPT 模型
+2. 处理数据集
+3. 使用混合并行训练 GPT
+
+## 导入依赖库
+
+```python
+import json
+import os
+from typing import Callable
+
+import colossalai
+import colossalai.utils as utils
+import model_zoo.gpt.gpt as col_gpt
+import torch
+import torch.nn as nn
+from colossalai import nn as col_nn
+from colossalai.amp import AMP_TYPE
+from colossalai.builder.pipeline import partition_uniform
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+                                        PipelineSchedule)
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils.timer import MultiTimer
+from model_zoo.gpt import GPTLMLoss
+from torch.nn import functional as F
+from torch.utils.data import Dataset
+from transformers import GPT2Tokenizer
+```
+
+
+
+## 定义 GPT 模型
+
+在前面的教程中，我们介绍了3种建立流水并行模型的方法，但对于像 GPT-3 这样的巨大模型，你甚至不能在 CPU 中建立模型。在这种情况下，你必须自己分割模型。
+
+GPT 数据加载器返回 `input_ids` 和 `attention_mask`, 因此我们在 `forward()` 中使用两个关键字参数来获得它们。请注意，对于除第一阶段以外的其他阶段， `forward()` 的第一个位置参数是上一阶段的输出张量。所以 `hidden_states` 来自前一阶段，并且对于第一阶段来说，它是 `None`。
+
+对于 GPT, *word embedding layer* 与 *output head* 共享权重。我们提供 `PipelineSharedModuleWrapper` 在流水阶段间共享参数。它需要一个 `int` 型的 `list` 作为参数, 这意味着 rank 们共享这些参数。你可以使用 `register_module()`
+或 `register_parameter()` 来注册一个模块或一个参数作为共享模块或参数。如果你有多组共享模块/参数，你应该有多个 `PipelineSharedModuleWrapper` 实例。 如果参数在**一个**阶段内共享, 你不应该使用
+`PipelineSharedModuleWrapper`, 而只是使用同一个模块/参数实例。在这个例子中，*word embedding layer* 在第一阶段, 而 *output head* 在最后一个阶段。因此，他们在 rank `[0, pipeline_size - 1]` 之间共享参数。
+
+对于第一阶段，它维护 embedding layer 和一些 transformer blocks。对于最后一个阶段，它维护一些 transformer blocks 和 output head layer。对于其他阶段，他们只维护一些 transformer blocks。
+`partition_uniform(num_layers, pipeline_size, num_chunks)` 返回所有 rank 的 parts, part 是一个 `(start, end)` (不包括end) 的 `tuple`。`start == 0` 表示这是第一阶段, 而 `end == num_layers` 表示这是最后一个阶段。
+
+```python
+class PipelineGPTHybrid(nn.Module):
+    def __init__(self,
+                 num_layers: int = 12,
+                 hidden_size: int = 768,
+                 num_attention_heads: int = 12,
+                 vocab_size: int = 50304,
+                 embed_drop_rate: float = 0.,
+                 act_func: Callable = F.gelu,
+                 mlp_ratio: int = 4,
+                 attn_drop_rate: float = 0.,
+                 drop_rate: float = 0.,
+                 dtype: torch.dtype = torch.float,
+                 checkpoint: bool = False,
+                 max_position_embeddings: int = 1024,
+                 layer_norm_epsilon: float = 1e-5,
+                 first: bool = False,
+                 last: bool = False):
+        super().__init__()
+        self.embedding = None
+        self.norm = None
+        self.head = None
+        if first:
+            self.embedding = col_gpt.GPTEmbedding(
+                hidden_size, vocab_size, max_position_embeddings, dropout=embed_drop_rate, dtype=dtype)
+        self.blocks = nn.ModuleList([
+            col_gpt.GPTBlock(hidden_size, num_attention_heads, mlp_ratio=mlp_ratio, attention_dropout=attn_drop_rate,
+                             dropout=drop_rate, dtype=dtype, checkpoint=checkpoint, activation=act_func)
+            for _ in range(num_layers)
+        ])
+        if last:
+            self.norm = col_nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
+            self.head = col_gpt.GPTLMHead(vocab_size=vocab_size,
+                                          dim=hidden_size,
+                                          dtype=dtype,
+                                          bias=False)
+
+    def forward(self, hidden_states=None, input_ids=None, attention_mask=None):
+        if self.embedding is not None:
+            hidden_states = self.embedding(input_ids=input_ids)
+        batch_size = hidden_states.shape[0]
+        attention_mask = attention_mask.view(batch_size, -1)
+        attention_mask = attention_mask[:, None, None, :]
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)  # fp16 compatibility
+        attention_mask = (1.0 - attention_mask) * -10000.0
+        for block in self.blocks:
+            hidden_states, attention_mask = block(hidden_states, attention_mask)
+        if self.norm is not None:
+            hidden_states = self.head(self.norm(hidden_states))
+        return hidden_states
+
+
+def build_gpt_pipeline(num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    logger = get_dist_logger()
+    pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
+    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
+    rank = gpc.get_global_rank()
+    wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1])
+    parts = partition_uniform(num_layers, pipeline_size, num_chunks)[pipeline_rank]
+    models = []
+    for start, end in parts:
+        kwargs['num_layers'] = end - start
+        kwargs['first'] = start == 0
+        kwargs['last'] = end == num_layers
+        logger.info(f'Rank{rank} build layer {start}-{end}, {end-start}/{num_layers} layers')
+        chunk = PipelineGPTHybrid(**kwargs).to(device)
+        if start == 0:
+            wrapper.register_module(chunk.embedding.word_embeddings)
+        elif end == num_layers:
+            wrapper.register_module(chunk.head)
+        models.append(chunk)
+    if len(models) == 1:
+        model = models[0]
+    else:
+        model = nn.ModuleList(models)
+    return model
+
+
+def GPT2_exlarge_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float):
+    cfg = dict(hidden_size=1600, num_attention_heads=32, checkpoint=checkpoint, dtype=dtype)
+    return build_gpt_pipeline(48, num_chunks, **cfg)
+
+
+def GPT3_pipeline_hybrid(num_chunks=1, checkpoint=False, dtype=torch.float):
+    cfg = dict(hidden_size=12288, num_attention_heads=96,
+               checkpoint=checkpoint, max_position_embeddings=2048, dtype=dtype)
+    return build_gpt_pipeline(96, num_chunks, **cfg)
+```
+
+## 处理数据集
+
+我们在这里提供了一个小型 GPT web-text 数据集。 原始格式是 loose JSON, 我们将保存处理后的数据集。
+
+```python
+class WebtextDataset(Dataset):
+    def __init__(self, path, seq_len=1024) -> None:
+        super().__init__()
+        root = os.path.dirname(path)
+        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
+        if os.path.isfile(encoded_data_cache_path):
+            seq_len_, data, attention_mask = torch.load(
+                encoded_data_cache_path)
+            if seq_len_ == seq_len:
+                self.data = data
+                self.attention_mask = attention_mask
+                return
+        raw_data = []
+        with open(path) as f:
+            for line in f.readlines():
+                raw_data.append(json.loads(line)['text'])
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.unk_token
+        encoded_data = tokenizer(
+            raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
+        self.data = encoded_data['input_ids']
+        self.attention_mask = encoded_data['attention_mask']
+        torch.save((seq_len, self.data, self.attention_mask),
+                   encoded_data_cache_path)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        return {
+            'input_ids': self.data[index],
+            'attention_mask': self.attention_mask[index]
+        }, self.data[index]
+```
+
+## 使用混合并行训练 GPT
+
+在上一个教程中，我们解释了一些流水并行的参数含义。在本例中，我们可以确定在流水阶段之间交换的每个输出张量的形状。对于 GPT，该形状为
+`(MICRO BATCH SIZE, SEQUENCE LEN, HIDDEN SIZE)`。通过设置该参数，我们可以避免交换每个阶段的张量形状。当你不确定张量的形状时，你可以把它保留为
+`None`, 形状会被自动推测。请确保你的模型的 `dtype` 是正确的：当你使用 `fp16`，模型的 `dtype` 必须是 `torch.half`；否则，`dtype` 必须是 `torch.float`。对于流水并行，仅支持 `AMP_TYPE.NAIVE`。
+
+你可以通过在 `CONFIG` 里使用 `parallel` 来轻松使用张量并行。数据并行的大小是根据 GPU 的数量自动设置的。
+
+```python
+NUM_EPOCHS = 60
+SEQ_LEN = 1024
+BATCH_SIZE = 192
+NUM_CHUNKS = None
+TENSOR_SHAPE = (1, 1024, 1600)
+# only pipeline parallel
+# CONFIG = dict(NUM_MICRO_BATCHES = 192, parallel=dict(pipeline=2), fp16=dict(mode=AMP_TYPE.NAIVE))
+# pipeline + 1D model parallel
+CONFIG = dict(NUM_MICRO_BATCHES = 192, parallel=dict(pipeline=2, tensor=dict(mode='1d', size=2)), fp16=dict(mode=AMP_TYPE.NAIVE))
+
+
+def train():
+    disable_existing_loggers()
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    colossalai.launch_from_torch(config=CONFIG, backend=args.backend)
+    logger = get_dist_logger()
+
+    train_ds = WebtextDataset(os.environ['DATA'], seq_len=SEQ_LEN)
+    train_dataloader = utils.get_dataloader(train_ds,
+                                            seed=42,
+                                            batch_size=BATCH_SIZE,
+                                            pin_memory=True,
+                                            shuffle=True,
+                                            drop_last=True)
+
+    use_interleaved = NUM_CHUNKS is not None
+    num_chunks = 1 if not use_interleaved else NUM_CHUNKS
+    model = GPT2_exlarge_pipeline_hybrid(num_chunks=num_chunks, checkpoint=True, dtype=torch.half)
+    # model = GPT3_pipeline_hybrid(num_chunks=num_chunks, checkpoint=True, dtype=torch.half)
+    if use_interleaved and not isinstance(model, nn.ModuleList):
+        model = nn.ModuleList([model])
+
+    criterion = GPTLMLoss()
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.00015, weight_decay=1e-2,)
+
+    engine, train_dataloader, _, _ = colossalai.initialize(model,
+                                                           optimizer,
+                                                           criterion,
+                                                           train_dataloader=train_dataloader)
+    global_batch_size = BATCH_SIZE * \
+        gpc.get_world_size(ParallelMode.DATA) * getattr(gpc.config, "gradient_accumulation", 1)
+    logger.info(f'Init done, global batch size = {global_batch_size}', ranks=[0])
+
+    timer = MultiTimer()
+
+    trainer = Trainer(
+        engine=engine,
+        logger=logger,
+        timer=timer
+    )
+
+    hook_list = [
+        hooks.LossHook(),
+        hooks.LogMetricByEpochHook(logger),
+        hooks.ThroughputHook(),
+        hooks.LogMetricByStepHook(),
+    ]
+
+    trainer.fit(
+        train_dataloader=train_dataloader,
+        epochs=NUM_EPOCHS,
+        test_interval=1,
+        hooks=hook_list,
+        display_progress=True,
+        return_output_label=False,
+    )
+```
diff --git a/docs/source/zh/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/zh/advanced_tutorials/train_vit_using_pipeline_parallelism.md
new file mode 100644
index 000000000000..495c7fa36cc1
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/train_vit_using_pipeline_parallelism.md
@@ -0,0 +1,246 @@
+# 使用流水并行训练 ViT
+
+作者: Hongxin Liu, Yongbin Li
+
+**示例代码**
+- [ColossalAI-Examples Pipeline Parallel ViT](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/vision_transformer/pipeline_parallel)
+
+**相关论文**
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473)
+
+## 引言
+
+在本教程中，你将学习如何使用流水并行从头开始训练用于图像分类的 Vision Transformer (ViT)。流水并行是一种模型并行，主要针对 GPU 内存不能满足模型容量的情况。
+通过使用流水并行，我们将原始模型分割成多个阶段，每个阶段保留原始模型的一部分。我们假设你的 GPU 内存不能容纳 ViT/L-16，而你的 CPU 内存可以容纳这个模型。
+
+##  目录
+
+在本教程中，我们将介绍:
+
+1. 基于 [TIMM](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) 定义 ViT 模型
+2. 处理数据集
+3. 使用流水并行训练 ViT
+
+## 导入依赖库
+
+```python
+import os
+from collections import OrderedDict
+from functools import partial
+
+import colossalai
+import colossalai.nn as col_nn
+import torch
+import torch.nn as nn
+from colossalai.builder import build_pipeline_model
+from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+                                        PipelineSchedule)
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils import MultiTimer, get_dataloader
+from timm.models import vision_transformer as vit
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+```
+
+
+## 定义 Vision Transformer 模型
+
+总的来说, 我们提供3种方法来建立一个流水并行的模型:
+
+1. `colossalai.builder.build_pipeline_model_from_cfg`
+2. `colossalai.builder.build_pipeline_model`
+3. 自己按阶段拆分模型
+
+当你的 CPU 内存能够容纳整个模型时，你可以使用前两种方法来建立你的模型，否则你必须自己分割模型。前两种方法首先在 CPU 上建立整个模型，然后分割模型，最后你可以直接把模型的相应部分移到 GPU 上。
+
+`colossalai.builder.build_pipeline_model_from_cfg()` 接收一个模型的配置文件，它可以均匀地（按层）或平衡地（按参数大小）分割模型。
+
+如果你熟悉 `PyTorch`，你可以使用 `colossalai.builder.build_pipeline_model()`，它接收一个 `torch.nn.Sequential` 模型并按层均匀分割。
+
+在本教程中，我们将把 [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) 修改为 `torch.nn.Sequential`，然后使用 `colossalai.builder.build_pipeline_model()` 来建立流水线模型。
+
+当数据是 **一个** `Tensor`, 你可以使用你的模型 `forward()` 中的位置参数来获得数据张量。对于流水线的第一阶段，`forward()` 的第一个位置参数是从数据加载器加载的数据张量。对于其他阶段，`forward()` 的第一个位置参数是上一阶段的输出张量。注意，如果该阶段不是最后一个阶段，则 `forward()` 的返回必须是一个 `Tensor`。
+
+当数据是一个 `Tensor` 的 `dict`, 你可以使用你模型 `forward()` 的命名关键字参数来获得数据的 `dict`。
+
+```python
+class ViTEmbedding(nn.Module):
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, embed_layer=vit.PatchEmbed, drop_rate=0., distilled=False):
+        super().__init__()
+        self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_tokens = 2 if distilled else 1
+        self.patch_embed = embed_layer(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        self.init_weights()
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        if self.dist_token is None:
+            x = torch.cat((cls_token, x), dim=1)
+        else:
+            x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = self.pos_drop(x + self.pos_embed)
+        return x
+
+    def init_weights(self):
+        vit.trunc_normal_(self.pos_embed, std=.02)
+        if self.dist_token is not None:
+            vit.trunc_normal_(self.dist_token, std=.02)
+        vit.trunc_normal_(self.cls_token, std=.02)
+        self.apply(vit._init_vit_weights)
+
+
+class ViTHead(nn.Module):
+    def __init__(self, embed_dim=768, num_classes=1000, norm_layer=None, distilled=False, representation_size=None):
+        super().__init__()
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        self.norm = norm_layer(embed_dim)
+        self.num_classes = num_classes
+        self.distilled = distilled
+        self.num_features = embed_dim
+        # Representation layer
+        if representation_size and not distilled:
+            self.num_features = representation_size
+            self.pre_logits = nn.Sequential(OrderedDict([
+                ('fc', nn.Linear(embed_dim, representation_size)),
+                ('act', nn.Tanh())
+            ]))
+        else:
+            self.pre_logits = nn.Identity()
+        # Classifier head(s)
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        self.head_dist = None
+        if distilled:
+            self.head_dist = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        self.init_weights()
+
+    def forward(self, x):
+        x = self.norm(x)
+        if self.distilled:
+            x, x_dist = self.head(x[:, 0]), self.head_dist(x[:, 1])
+            if self.training and not torch.jit.is_scripting():
+                # during inference, return the average of both classifier predictions
+                return x, x_dist
+            else:
+                return (x + x_dist) / 2
+        else:
+            x = self.pre_logits(x[:, 0])
+            x = self.head(x)
+        return x
+
+    def init_weights(self):
+        self.apply(vit._init_vit_weights)
+
+
+def sequential_vit(img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+                   num_heads=12, mlp_ratio=4., qkv_bias=True, representation_size=None, distilled=False,
+                   drop_rate=0., attn_drop_rate=0., drop_path_rate=0., embed_layer=vit.PatchEmbed, norm_layer=None,
+                   act_layer=None):
+    norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+    act_layer = act_layer or nn.GELU
+    embedding = ViTEmbedding(img_size=img_size, patch_size=patch_size, in_chans=in_chans,
+                             embed_dim=embed_dim, embed_layer=embed_layer, drop_rate=drop_rate, distilled=distilled)
+    dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+    blocks = [vit.Block(
+        dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate,
+        attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer)
+        for i in range(depth)]
+    for block in blocks:
+        block.apply(vit._init_vit_weights)
+    head = ViTHead(embed_dim=embed_dim, num_classes=num_classes, norm_layer=norm_layer,
+                   distilled=distilled, representation_size=representation_size)
+    return nn.Sequential(embedding, *blocks, head)
+
+
+def vit_large_patch16_224(**kwargs):
+    model_kwargs = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return sequential_vit(**model_kwargs)
+```
+
+## 处理数据集
+
+一般来说，我们在大型数据集如 ImageNet 上训练 ViT。为了简单起见，我们在这里只使用 CIFAR-10，因为本教程只是用于演示流水并行训练。
+
+```python
+def build_cifar(batch_size):
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(224, pad_if_needed=True),
+        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+
+    train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train)
+    test_dataset = CIFAR10(root=os.environ['DATA'], train=False, transform=transform_test)
+    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
+    test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True)
+    return train_dataloader, test_dataloader
+```
+
+## 使用流水并行训练 ViT
+
+你可以在配置文件中设置流水并行的大小。`NUM_CHUNKS` 在使用交错流水线时很有用 (更多细节见 [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) )。
+原始 batch 将会被分割为 `num_microbatches` 个 micro batch，每个阶段每次将加载一个 micro batch。如果你确切知道每个阶段输出张量的形状，你可以在配置文件中设置 `tensor_shape` 来减少通信。
+我们的仓库会自动为用户生成合适的schedule来支持流水并行训练。如果你不需要模型的输出和标签，你可以在调用 `trainer.fit()` 时，将 `return_output_label` 设置为 `False`，这样能进一步减少 GPU 显存使用。
+
+你应当使用 `export DATA=/path/to/cifar`。
+
+```python
+BATCH_SIZE = 16
+NUM_EPOCHS = 60
+NUM_CHUNKS = 1
+CONFIG = dict(NUM_MICRO_BATCHES=4, parallel=dict(pipeline=2))
+
+
+def train():
+    disable_existing_loggers()
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    colossalai.launch_from_torch(backend=args.backend, config=CONFIG)
+    logger = get_dist_logger()
+
+    # build model
+    model = vit_large_patch16_224()
+    model = build_pipeline_model(model, num_chunks=NUM_CHUNKS, verbose=True)
+
+    # build criterion
+    criterion = nn.CrossEntropyLoss()
+
+    # optimizer
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
+
+    # build dataloader
+    train_dataloader, test_dataloader = build_cifar(BATCH_SIZE)
+
+    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion,
+                                                                         train_dataloader, test_dataloader)
+    timer = MultiTimer()
+
+    trainer = Trainer(engine=engine, timer=timer, logger=logger)
+
+    hook_list = [
+        hooks.LossHook(),
+        hooks.AccuracyHook(col_nn.metric.Accuracy()),
+        hooks.LogMetricByEpochHook(logger),
+    ]
+
+    trainer.fit(train_dataloader=train_dataloader,
+                epochs=NUM_EPOCHS,
+                test_dataloader=test_dataloader,
+                test_interval=1,
+                hooks=hook_list,
+                display_progress=True)
+```
diff --git a/docs/source/zh/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh/advanced_tutorials/train_vit_with_hybrid_parallelism.md
new file mode 100644
index 000000000000..6dc5eccf4421
--- /dev/null
+++ b/docs/source/zh/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -0,0 +1,591 @@
+# 使用 Colossal-AI （从数据并行到异构并行）加速 ViT 训练详解
+
+作者：Yuxuan Lou
+
+**示例代码**
+
+- [Colossal-AI Examples ViT on Cifar10](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/vision_transformer)
+
+**相关文献**
+- [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf)
+
+
+## 引言
+
+在这个ViT模型的样例中，Colossal-AI 提供了三种不同的并行技术来加速模型训练：数据并行，流水线并行和张量并行。我们将展示如何使用这三种并行技术在 CIFAR-10 数据集上训练 ViT。为了运行项目，需要2-4个 GPU。
+
+
+## 目录
+1. Colossal-AI 安装方法
+2. 使用数据并行训练 ViT 步骤
+3. 使用流水线并行训练 ViT 步骤
+4. 使用张量并行或异构并行训练 ViT 步骤
+
+## Colossal-AI 安装
+可以通过 Python 的官方索引来安装 Colossal-AI 软件包。
+```bash
+pip install colossalai
+```
+
+
+
+## 数据并行
+数据并行是实现加速模型训练的基本方法。通过两步可以实现训练的数据并行：
+1. 构建一个配置文件
+2. 在训练脚本中修改很少的几行代码
+
+### 构建配置文件 (`data_parallel/config.py`)
+为了使用 Colossal-AI，第一步是构建配置文件。配置文件中有两类变量：
+
+1. **Colossal-AI 功能配置**
+
+Colossal-AI 提供了一系列的功能来加快训练速度（包括模型并行，混合精度，零冗余优化器等）。每个功能都是由配置文件中的相应字段定义的。如果我们只用到数据并行，那么我们不需要额外指定并行模式。在本例中，我们使用 PyTorch 原生提供的混合精度训练，只需要定义混合精度配置 `fp16 = dict(mode=AMP_TYPE.TORCH)` 。
+
+2. **全局超参数**
+
+全局超参数包括特定于模型的超参数、训练设置、数据集信息等。
+
+```python
+from colossalai.amp import AMP_TYPE
+# ViT Base
+BATCH_SIZE = 256
+DROP_RATE = 0.1
+NUM_EPOCHS = 300
+# mix precision
+fp16 = dict(
+    mode=AMP_TYPE.TORCH,
+)
+gradient_accumulation = 16
+clip_grad_norm = 1.0
+dali = dict(
+    gpu_aug=True,
+    mixup_alpha=0.2
+)
+```
+
+### 修改训练脚本 (`/data_parallel/train_with_cifar10.py`)
+
+#### 导入模块
+- Colossal-AI 相关模块
+```python
+import colossalai
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+from colossalai.nn.metric import Accuracy
+from colossalai.trainer import Trainer, hooks
+```
+
+- 其他模块
+```python
+import os
+import torch
+from timm.models import vit_base_patch16_224
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+```
+
+#### 启动 Colossal-AI
+
+在训练脚本中，构建好配置文件后，需要为 Colossal-AI 初始化分布式环境。我们将此过程称为 `launch`。在 Colossal-AI 中，我们提供了几种启动方法来初始化分布式后端。在大多数情况下，您可以使用 `colossalai.launch` 和 `colossalai.get_default_parser` 来通过命令行传递参数。此外，Colossal-AI 还可以利用许多用户熟知的 PyTorch 现有启动工具，即使用 `colossalai.launch_from_torch`。更多详细信息，您可以查看相关[文档](https://www.colossalai.org/docs/basics/launch_colossalai)。
+
+
+```python
+# initialize distributed setting
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+colossalai.launch_from_torch(config=args.config)
+disable_existing_loggers()
+logger = get_dist_logger()
+```
+
+初始化后，您可以使用 `colossalai.core.global_context` 访问配置文件中的变量。
+
+```python
+#access parameters
+print(gpc.config.BATCH_SIZE)
+```
+
+#### 构建模型
+
+如果只需要数据并行性，则无需对模型代码进行任何更改。这里，我们使用 `timm` 中的 `vit_base_patch16_224`。
+
+```python
+# build model
+model = vit_base_patch16_224(drop_rate=0.1, num_classes=gpc.config.NUM_CLASSES)
+```
+
+#### 构建 CIFAR-10 数据加载器
+`colossalai.utils.get_dataloader` 可以帮助您轻松构建数据加载器。
+
+```python
+def build_cifar(batch_size):
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(224, pad_if_needed=True),
+        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train)
+    test_dataset = CIFAR10(root=os.environ['DATA'], train=False, transform=transform_test)
+    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
+    test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True)
+    return train_dataloader, test_dataloader
+# build dataloader
+train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE)
+```
+
+#### 定义优化器，损失函数和学习率调度器
+
+Colossal-AI 提供了自己的优化器、损失函数和学习率调度器。PyTorch 的这些组件与Colossal-AI也兼容。
+
+```python
+# build optimizer
+optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1)
+# build loss
+criterion = torch.nn.CrossEntropyLoss()
+# lr_scheduler
+lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)
+```
+
+#### 启动用于训练的 Colossal-AI 引擎
+
+Engine 本质上是对模型、优化器和损失函数的封装类。当我们使用 `colossalai.initialize` ，将返回一个 engine 对象，并且它已经按照配置文件中的指定内容，配置了梯度剪裁、梯度累积和零冗余优化器等功能。之后，基于 Colossal-AI 的 engine 我们可以进行模型训练。
+
+```python
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
+        model, optimizer, criterion, train_dataloader, test_dataloader
+    )
+```
+
+#### 训练：Trainer 应用程序编程接口
+Trainer 是一个更高级的封装类，用户可以用更少的代码实现训练。通过传递 engine 对象可以很容易地创建 trainer 对象。
+
+此外，在 trainer 中，用户可以自定义一些钩子，并将这些钩子附加到 trainer 对象。钩子对象将根据训练方案定期执行生命周期方法。例如，`LRSchedulerHook` 将在 `after_train_iter` 或 `after_train_epoch` 阶段执行 `lr_scheduler.step()` 来更新模型的学习率。
+
+```python
+# build trainer
+trainer = Trainer(engine=engine, logger=logger)
+# build hooks
+hook_list = [
+    hooks.LossHook(),
+    hooks.AccuracyHook(accuracy_func=MixupAccuracy()),
+    hooks.LogMetricByEpochHook(logger),
+    hooks.LRSchedulerHook(lr_scheduler, by_epoch=True),
+    # comment if you do not need to use the hooks below
+    hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
+    hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
+]
+```
+
+使用 `trainer.fit` 进行训练:
+
+```python
+# start training
+trainer.fit(
+    train_dataloader=train_dataloader,
+    test_dataloader=test_dataloader,
+    epochs=gpc.config.NUM_EPOCHS,
+    hooks=hook_list,
+    display_progress=True,
+    test_interval=1
+)
+```
+
+### 开始训练
+`DATA` 是自动下载和存储 CIFAR-10 数据集的文件路径。
+
+`<NUM_GPUs>` 是使用 CIFAR-10 数据集、以数据并行方式训练 ViT 时所用的 GPU 数量。
+
+```bash
+export DATA=<path_to_data>
+# If your torch >= 1.10.0
+torchrun --standalone --nproc_per_node <NUM_GPUs>  train_dp.py --config ./configs/config_data_parallel.py
+# If your torch >= 1.9.0
+# python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_dp.py --config ./configs/config_data_parallel.py
+# Otherwise
+# python -m torch.distributed.launch --nproc_per_node <NUM_GPUs> --master_addr <node_name> --master_port 29500 train_dp.py --config ./configs/config.py
+```
+
+
+
+## 流水线并行
+除了数据并行性，Colossal-AI 还支持流水线并行。具体而言，Colossal-AI 使用 NVIDIA 引入的 1F1B 流水线。更多详细信息，您可以查看相关[文档](https://www.colossalai.org/tutorials/features/pipeline_parallel)。
+
+### 构建配置文件(`hybrid_parallel/configs/vit_pipeline.py`)
+要在数据并行的基础上应用流水线并行，只需添加一个 **parallel dict**
+```python
+from colossalai.amp import AMP_TYPE
+parallel = dict(
+    pipeline=2
+)
+# pipeline config
+NUM_MICRO_BATCHES = parallel['pipeline']
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE)
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+```
+
+其他配置：
+```python
+# hyperparameters
+# BATCH_SIZE is as per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 256
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 300
+WARMUP_EPOCHS = 32
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 768
+DEPTH = 12
+NUM_HEADS = 12
+MLP_RATIO = 4
+NUM_CLASSES = 10
+CHECKPOINT = True
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1  # add 1 for cls token
+```
+
+### 构建流水线模型 (`/hybrid_parallel/model/vit.py`)
+Colossal-AI 提供了两种从现有模型构建流水线模型的方法。
+- `colossalai.builder.build_pipeline_model_from_cfg`
+- `colossalai.builder.build_pipeline_model`
+
+此外，您还可以使用 Colossal-AI 从头开始构建流水线模型。
+```python
+import math
+from typing import Callable
+import inspect
+import torch
+from colossalai import nn as col_nn
+from colossalai.registry import LAYERS, MODELS
+from colossalai.logging import get_dist_logger
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
+from colossalai.builder.pipeline import partition_uniform
+from torch import dtype, nn
+from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
+@MODELS.register_module
+class PipelineVisionTransformer(nn.Module):
+    def __init__(self,
+                 img_size: int = 224,
+                 patch_size: int = 16,
+                 in_chans: int = 3,
+                 num_classes: int = 1000,
+                 depth: int = 12,
+                 num_heads: int = 12,
+                 dim: int = 768,
+                 mlp_ratio: int = 4,
+                 attention_dropout: float = 0.,
+                 dropout: float = 0.1,
+                 drop_path: float = 0.,
+                 layernorm_epsilon: float = 1e-6,
+                 activation: Callable = nn.functional.gelu,
+                 representation_size: int = None,
+                 dtype: dtype = None,
+                 bias: bool = True,
+                 checkpoint: bool = False,
+                 init_method: str = 'torch',
+                 first_stage=True,
+                 last_stage=True,
+                 start_idx=None,
+                 end_idx=None,):
+        super().__init__()
+        layers = []
+        if first_stage:
+            embed = ViTEmbedding(img_size=img_size,
+                                 patch_size=patch_size,
+                                 in_chans=in_chans,
+                                 embedding_dim=dim,
+                                 dropout=dropout,
+                                 dtype=dtype,
+                                 init_method=init_method)
+            layers.append(embed)
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
+        if start_idx is None and end_idx is None:
+            start_idx = 0
+            end_idx = depth
+        blocks = [
+            ViTBlock(
+                dim=dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                attention_dropout=attention_dropout,
+                dropout=dropout,
+                drop_path=dpr[i],
+                activation=activation,
+                dtype=dtype,
+                bias=bias,
+                checkpoint=checkpoint,
+                init_method=init_method,
+            ) for i in range(start_idx, end_idx)
+        ]
+        layers.extend(blocks)
+        if last_stage:
+            norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
+            head = ViTHead(dim=dim,
+                           num_classes=num_classes,
+                           representation_size=representation_size,
+                           dtype=dtype,
+                           bias=bias,
+                           init_method=init_method)
+            layers.extend([norm, head])
+        self.layers = nn.Sequential(
+            *layers
+        )
+    def forward(self, x):
+        x = self.layers(x)
+        return x
+def _filter_kwargs(func, kwargs):
+    sig = inspect.signature(func)
+    return {k: v for k, v in kwargs.items() if k in sig.parameters}
+def _build_pipeline_vit(module_cls, num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    logger = get_dist_logger()
+    if gpc.is_initialized(ParallelMode.PIPELINE):
+        pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
+        pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
+    else:
+        pipeline_size = 1
+        pipeline_rank = 0
+    rank = gpc.get_global_rank()
+    parts = partition_uniform(num_layers, pipeline_size, num_chunks)[pipeline_rank]
+    models = []
+    for start, end in parts:
+        kwargs['first_stage'] = start == 0
+        kwargs['last_stage'] = end == num_layers
+        kwargs['start_idx'] = start
+        kwargs['end_idx'] = end
+        logger.info(f'Rank{rank} build layer {start}-{end}, {end-start}/{num_layers} layers')
+        chunk = module_cls(**_filter_kwargs(module_cls.__init__, kwargs)).to(device)
+        models.append(chunk)
+    if len(models) == 1:
+        model = models[0]
+    else:
+        model = nn.ModuleList(models)
+    return model
+def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kwargs):
+    return _build_pipeline_vit(PipelineVisionTransformer, num_layers, num_chunks, device, **kwargs)
+```
+
+### 修改训练脚本 (`/hybrid_parallel/train_with_cifar10.py`)
+
+#### 导入模块
+```python
+from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+                                        PipelineSchedule)
+from colossalai.utils import MultiTimer
+import os
+import colossalai
+import torch
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn import CrossEntropyLoss
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.utils import is_using_pp, get_dataloader
+from model.vit import build_pipeline_vit
+from model_zoo.vit.vit import _create_vit_model
+from tqdm import tqdm
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+```
+
+#### 启动 Colossal-AI
+`colossalai.utils.is_using_pp` 可以帮您检查配置文件是否满足流水线并行的要求。
+
+```python
+# initialize distributed setting
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+# launch from torch
+colossalai.launch_from_torch(config=args.config)
+# get logger
+logger = get_dist_logger()
+logger.info("initialized distributed environment", ranks=[0])
+if hasattr(gpc.config, 'LOG_PATH'):
+    if gpc.get_global_rank() == 0:
+        log_path = gpc.config.LOG_PATH
+        if not os.path.exists(log_path):
+            os.mkdir(log_path)
+        logger.log_to_file(log_path)
+use_pipeline = is_using_pp()
+```
+
+#### 定义模型
+
+```python
+# create model
+model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
+                    patch_size=gpc.config.PATCH_SIZE,
+                    dim=gpc.config.HIDDEN_SIZE,
+                    depth=gpc.config.DEPTH,
+                    num_heads=gpc.config.NUM_HEADS,
+                    mlp_ratio=gpc.config.MLP_RATIO,
+                    num_classes=gpc.config.NUM_CLASSES,
+                    init_method='jax',
+                    checkpoint=gpc.config.CHECKPOINT)
+if use_pipeline:
+    model = build_pipeline_vit(num_layers=model_kwargs['depth'], num_chunks=1, **model_kwargs)
+else:
+    model = _create_vit_model(**model_kwargs)
+```
+
+#### 计算参数个数
+
+您可以轻松计算不同流水线阶段上的模型参数个数。
+
+```python
+# count number of parameters
+total_numel = 0
+for p in model.parameters():
+    total_numel += p.numel()
+if not gpc.is_initialized(ParallelMode.PIPELINE):
+    pipeline_stage = 0
+else:
+    pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
+logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
+```
+
+#### 构建数据加载器，优化器等组件
+
+```python
+def build_cifar(batch_size):
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(224, pad_if_needed=True),
+        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train)
+    test_dataset = CIFAR10(root=os.environ['DATA'], train=False, transform=transform_test)
+    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
+    test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True)
+    return train_dataloader, test_dataloader
+
+
+# create dataloaders
+train_dataloader , test_dataloader = build_cifar()
+# create loss function
+criterion = CrossEntropyLoss(label_smoothing=0.1)
+# create optimizer
+optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
+# create lr scheduler
+lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
+                                       total_steps=gpc.config.NUM_EPOCHS,
+                                       warmup_steps=gpc.config.WARMUP_EPOCHS)
+```
+
+#### 启动 Colossal-AI 引擎
+
+```python
+# initialize
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
+                                                                     optimizer=optimizer,
+                                                                     criterion=criterion,
+                                                                     train_dataloader=train_dataloader,
+                                                                     test_dataloader=test_dataloader)
+logger.info("Engine is built", ranks=[0])
+```
+
+#### 训练：基于engine
+
+在数据并行示例中，我们展示了如何使用 Trainer API 训练模型。我们还可以直接基于 engine 训练模型。通过这种方式，您可以使用更多功能自定义训练方法。
+
+```python
+data_iter = iter(train_dataloader)
+for epoch in range(gpc.config.NUM_EPOCHS):
+    # training
+    engine.train()
+    if gpc.get_global_rank() == 0:
+        description = 'Epoch {} / {}'.format(
+            epoch,
+            gpc.config.NUM_EPOCHS
+        )
+        progress = tqdm(range(len(train_dataloader)), desc=description)
+    else:
+        progress = range(len(train_dataloader))
+    for _ in progress:
+        engine.zero_grad()
+        engine.execute_schedule(data_iter, return_output_label=False)
+        engine.step()
+        lr_scheduler.step()
+```
+
+### 开始训练
+```bash
+export DATA=<path_to_dataset>
+# If your torch >= 1.10.0
+torchrun --standalone --nproc_per_node <NUM_GPUs>  train_hybrid.py --config ./configs/config_pipeline_parallel.py
+# If your torch >= 1.9.0
+# python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_pipeline_parallel.py
+```
+
+
+
+
+## 张量并行和异构并行
+张量并行将每个权重参数跨多个设备进行分区，以减少内存负载。Colossal-AI 支持 1D、2D、2.5D 和 3D 张量并行。此外，还可以将张量并行、流水线并行和数据并行结合起来，实现混合并行。Colossal-AI 还提供了一种简单的方法来应用张量并行和混合并行：在流水线并行配置的基础上，只需在配置文件中更改几行代码即可。
+
+### 构造您的配置文件 (`/hybrid_parallel/configs/vit_1d_tp2_pp2.py`)
+使用张量并行，只需将相关信息添加到 **parallel dict**。具体而言，`TENSOR_PARALLEL_MODE` 可以是“1d”、“2d”、“2.5d”、“3d”。不同并行度的大小应满足：`#GPUs = pipeline parallel size x tensor parallel size x data parallel size`。在指定 GPU 数量、流水线并行大小和张量并行大小后，`data parallel size` 会自动计算。
+
+```python
+from colossalai.amp import AMP_TYPE
+# parallel setting
+TENSOR_PARALLEL_SIZE = 2
+TENSOR_PARALLEL_MODE = '1d'
+parallel = dict(
+    pipeline=2,
+    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE)
+)
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+# pipeline config
+NUM_MICRO_BATCHES = parallel['pipeline']
+TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE)
+```
+
+其他配置:
+```python
+# hyperparameters
+# BATCH_SIZE is as per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 256
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 300
+WARMUP_EPOCHS = 32
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 768
+DEPTH = 12
+NUM_HEADS = 12
+MLP_RATIO = 4
+NUM_CLASSES = 10
+CHECKPOINT = True
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1  # add 1 for cls token
+```
+
+### 开始训练
+```bash
+export DATA=<path_to_dataset>
+# If your torch >= 1.10.0
+torchrun --standalone --nproc_per_node <NUM_GPUs>  train_hybrid.py --config ./configs/config_hybrid_parallel.py
+# If your torch >= 1.9.0
+# python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
+```
diff --git a/docs/source/zh/basics/colotensor_concept.md b/docs/source/zh/basics/colotensor_concept.md
new file mode 100644
index 000000000000..cac5b9a4b40d
--- /dev/null
+++ b/docs/source/zh/basics/colotensor_concept.md
@@ -0,0 +1,98 @@
+# ColoTensor Concepts
+
+Author: [Jiarui Fang](https://github.com/feifeibear), [Hongxin Liu](https://github.com/ver217) and [Haichen Huang](https://github.com/1SAA)
+
+**Prerequisite:**
+- [Colossal-AI Overview](../concepts/colossalai_overview.md)
+- [Distributed Training](../concepts/distributed_training.md)
+- [Paradigms of Parallelism](../concepts/paradigms_of_parallelism.md)
+
+## Introduction
+
+在ColossalAI 0.1.8 版本之后，[ColoTensor](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.html#colossalai.tensor.ColoTensor) 成为 ColossalAI 中张量的基本数据结构。 它是 torch.Tensor 的子类，可以当做 PyTorch Tensor使用。 此外，一些独特的功能使其能够表示一个payload分布在多个 GPU 设备上的Global Tensor，并提供一系列方式操作这个Global Tensor。 在 ColoTensor 的帮助下，用户可以以类似编写串行程序的方式，编写分布式 DNN 训练程序。
+
+ColoTensor 包含额外的属性[ColoTensorSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.tensor_spec.html#colossalai.tensor.tensor_spec.ColoTensorSpec)
+来描述张量的payload分布和计算模式。
+
+- ProcessGroup：如何将进程组织为通信组。
+- Distributed Spec：张量如何在进程组之间分布。
+- Compute Spec：计算过程中如何使用张量。
+
+我们一一详述。
+
+## ProcessGroup
+
+[ProcessGroup](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.html#colossalai.tensor.ProcessGroup) 类的一个实例描述了如何在进程组中组织进程。进程组内的进程可以一起参与同一个集合通信，比如allgather, allreduce等。进程组组织方式被张量的并行策略支配。比如，如果用户定义了Tensor的张量并行（TP），数据并行（DP）方式，那么进程组的进程组织方式将被自动推导出来。 进程组设置可能因不同的张量而异。 因此，它使我们能够支持更复杂的混合并行。流水线并行(PP)定义不在ProcessGroup中描述，它需要另一套机制，我们将在未来补充ColoTensor应用于PP的相关内容。
+
+目前，ColoTensor 的一个进程组由 tp_degree 和 dp_degree 两种配置定义。 在 DP+TP 混合并行的情况下，可以将设备视为 2D 网格。 我们将 TP 通信组放置在设备网格的前导低维上，然后将数据并行组放置在设备网格的高维上。 原因是张量并行比数据并行具有更大的通信开销。 相邻设备放置在一个 TP 进程组内，并且通常放置在同一个节点中。
+
+考虑到8个进程配置为tp_degree=4，dp_degree=2，布局如下图。 进程组 tp0 包含 gpu 0,1,2,3。 进程组 dp1 包含 gpu 1 和 5。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ColoTensor_layout_demo.PNG"/>
+<figcaption>Process Group using tp_degree=4, dp_degree=2</figcaption>
+</figure>
+
+## Distributed Spec
+
+[Distributed Spec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.distspec.html)描述了 ColoTensor 如何在 ProcessGroup 中分布。
+
+张量在 DP 进程组之间的分布方式是自动导出的，不需要用户手动指定。 如果这个张量是一个模型参数，它会在 DP 进程组中被复制。 如果是activation张量，则沿tensor最高维度在DP进程组中进行平均分割。
+
+因此，在使用 Distributed Spec 时，我们只需要描述张量在 TP 进程组之间的分布方式即可。 TP 进程组目前有两种分布式规范，即 [ShardSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.distspec.html#colossalai.tensor.distspec.ShardSpec)和[ReplicaSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.distspec.html#colossalai.tensor.distspec.ReplicaSpec)。 ShardSpec 需要指定分区的维度索引 dim 和分区个数 num_partitions。 目前，我们仅支持在单个dim上进行拆分。 TP进程组上不同的dist spec可以通过set_dist_spec()接口相互转换。这些转化操作可以被记录在PyTorch的自动求导机制中，并在反向传播时候触发对应的反向操作。
+
+## Compute Spec
+
+[ComputeSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.compute_spec.html#colossalai.tensor.compute_spec.ComputeSpec)类描述Tensor如何参与计算。目前，我们为作为module parameter的ColoTensor设置正确的Compute Pattern，即可触发正确的计算模式。具体应用方式我们会在接下来的文档中展示。
+
+## ColoParameter
+
+[ColoParameter](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.colo_parameter.html#colossalai.tensor.colo_parameter.ColoParameter)是ColoTensor的子类，用来声明Parameter。它和ColoTensor的关系，与torch.nn.Parameter和torch.Tensor的关系一致。torch.nn.Parameter可以让tensor出现在module的parameters()和named_parameters() 的返回值中。
+
+## Example
+
+让我们看一个例子。 使用 tp_degree=2, dp_degree=2 在 4 个 GPU 上初始化并Shard一个ColoTensor。 然后tensor被沿着 TP 进程组中的最后一个维度进行分片。 最后，我们沿着 TP 进程组中的第一个维度（dim 0）对其进行重新Shard。 我们鼓励用户运行代码并观察每个张量的形状。
+
+
+```python
+import torch
+import torch.multiprocessing as mp
+from colossalai.utils import free_port, print_rank_0
+from functools import partial
+
+import colossalai
+from colossalai.tensor import ProcessGroup, ColoTensor, ColoTensorSpec, ShardSpec, ComputeSpec, ComputePattern
+from colossalai.utils import free_port
+
+import torch
+
+def run_dist_tests(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    pg = ProcessGroup(tp_degree=2, dp_degree=2)
+
+    torch.manual_seed(0)
+    local_tensor = torch.randn(2, 3, 1).cuda()
+    print_rank_0(f"shape {local_tensor.shape}, {local_tensor.data}")
+
+    spec = ColoTensorSpec(pg, ShardSpec(dims=[-1], num_partitions=[pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    t1 = ColoTensor.from_torch_tensor(local_tensor, spec)
+    t1 = t1.to_replicate()
+    print_rank_0(f"shape {t1.shape}, {t1.data}")
+
+    spec2 = ShardSpec([0], [pg.tp_world_size()])
+    t1.set_dist_spec(spec2)
+    print_rank_0(f"shape {t1.shape}, {t1.data}")
+
+def test_dist_cases(world_size):
+    run_func = partial(run_dist_tests, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+if __name__ == '__main__':
+    test_dist_cases(4)
+```
+
+:::caution
+
+The ColoTensor is an experimental feature and may be updated.
+
+:::
diff --git a/docs/source/zh/basics/command_line_tool.md b/docs/source/zh/basics/command_line_tool.md
new file mode 100644
index 000000000000..9b0275a6cedd
--- /dev/null
+++ b/docs/source/zh/basics/command_line_tool.md
@@ -0,0 +1,47 @@
+# 命令行工具
+
+作者: Shenggui Li
+
+**预备知识:**
+- [Distributed Training](../concepts/distributed_training.md)
+- [Colossal-AI Overview](../concepts/colossalai_overview.md)
+
+## 简介
+
+Colossal-AI给用户提供了命令行工具，目前命令行工具可以用来支持以下功能。
+- 检查Colossal-AI是否安装正确
+- 启动分布式训练
+- 张量并行基准测试
+
+## 安装检查
+
+用户可以使用`colossalai check -i`这个命令来检查目前环境里的版本兼容性以及CUDA Extension的状态。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/05/04/KJmcVknyPHpBofa.png"/>
+<figcaption>Check Installation Demo</figcaption>
+</figure>
+
+## 启动分布式训练
+
+在分布式训练时，我们可以使用`colossalai run`来启动单节点或者多节点的多进程，详细的内容可以参考[启动 Colossal-AI](./launch_colossalai.md)。
+
+## 张量并行基准测试
+
+Colossal-AI提供了多种张量并行，想要充分理解这些方法需要一定的学习成本，对于新手来说很难靠经验选择一个并行方式。
+所以我们提供了一个简单的基准测试，能够让用户在自己的机器上测试不同张量并行的性能。这个基准测试跑一个并行的MLP模型，
+输入数据的维度为`（批大小，序列长度，隐藏层维度）`。通过指定GPU的数量，Colossal-AI会搜索所有可行的并行配置。用户可以通过查看`colossalai benchmark --help`来自定义相关的测试参数。
+
+```shell
+# 使用4个GPU
+colossalai benchmark --gpus 4
+
+# 使用8个GPU
+colossalai benchmark --gpus 8
+```
+
+:::caution
+
+目前仅支持单节点的基准测试。
+
+:::
diff --git a/docs/source/zh/basics/configure_parallelization.md b/docs/source/zh/basics/configure_parallelization.md
new file mode 100644
index 000000000000..eb4b38f48ddb
--- /dev/null
+++ b/docs/source/zh/basics/configure_parallelization.md
@@ -0,0 +1,136 @@
+# 并行配置
+
+作者: Shenggui Li, Siqi Mai
+
+**预备知识:**
+- [分布式训练](../concepts/distributed_training.md)
+- [并行技术](../concepts/paradigms_of_parallelism.md)
+- [构建配置文件](./define_your_config.md)
+
+
+## 简介
+
+我们在 Colossal-AI 中支持多种并行技术。代码库中的混合并行是指您可以轻松地结合数据并行、流水线并行和张量并行（1D、2D、2.5D、3D）的优势共同来进行并行训练。
+
+每种并行方式需要不同的网络拓扑结构，因此要初始化不同的进程组。您可以通过在配置文件中设置 `parallel` 来初始化相应的进程组。 `parallel` 的配置必须遵从以下格式。数据并行度的大小将被根据您对流水线并行和张量并行的输入自动推断。`colossalai.launch` 将根据您的配置自动初始化这些分布式进程组。
+
+我们为您提供了一些配置的例子以供参考。
+
+```python
+# sample format
+parallel = dict(
+    pipeline=dict("size": int),
+    tensor=dict("size": int, "mode": '1d' or '2d' or '2.5d' or '3d', "kwargs": Any)
+)
+
+# this is ok
+parallel = dict(
+    pipeline=dict(size=2),
+    tensor=dict(size=4, mode='2d')
+)
+
+# this is ok
+parallel = dict(
+    pipeline=2,
+    tensor=dict(size=4, mode='2d')
+)
+
+# this is not ok
+# as you need to specify the mode for tensor parallelism
+parallel = dict(
+    pipeline=2,
+    tensor=4
+)
+
+# this is ok as well as tensor will be default to size 1
+# and mode None
+parallel = dict(
+    pipeline=2
+)
+
+# this is ok as well as pipeline will default to size 1
+parallel = dict(
+    tensor=dict(size=4, mode='2d')
+)
+
+```
+
+关键字 `size` 指的是并行维度的并行大小。 例如，流水线大小为2意味着
+将有2个流水线阶段。张量并行配置中的关键字 `mode` 意味着相应的张量并行技术
+将被初始化，如1D、2D、2.5D、3D。
+
+**您也可以选择不在您的配置中使用 `parallel`，此时流水线和张量的并行度都将默认为大小1。**
+
+**GPU的总数量必须等于` 数据并行大小 x 张量并行大小 x 流水线并行大小` 。**
+
+## 数据并行
+
+数据并行是最常见的分布式训练方式。它将数据分割成几个碎片分别在每个设备上进行训练。数据并行的配置会自动检测并为您设置。您不需要在您的配置中明确地设置它们。在Colossal-AI 中，有两种方法来处理数据并行的 all-reduce。
+
+1. 如果您设置了梯度handler，梯度handler将会all-reduce梯度。
+2. 若没有指定相应的配置，Colossal-AI 将会使用 PyTorch 的 DistributedDataParallel。
+
+在大多数情况下，若您对梯度没有复杂的处理的需求，您将会使用第二种模式。
+
+## 1D, 2D, 2.5D 和 3D 并行
+
+为了实现混合并行，我们提供了一系列张量并行方法。您可以阅读相应的学术论文进行深入的了解。这些并行模式需要和 Colossal-AI 提供的分布式层一同工作。
+
+- 1D: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
+
+- 2D: [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343)
+  2D 并行基于 SUMMA 矩阵乘法，它将输入数据、模型权重和层输出切分成两个不同的维度。 这些张量块分布在 `P = N^2` 设备的二维网格上，其中 `N` 是单一维度上张量块的数量。
+
+- 2.5D: [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500)
+  在 2.5D 矩阵乘法的启发下，2.5D 并行引入了一种新的张量并行，进一步将2D张量并行化。其中，`P = N^2 ∗ d` 个处理器被分配到 `d` 层， 每层独立进行矩阵乘法运算，维度为 `N`。
+
+- 3D: [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450)
+  我们还介绍了一种 3D 张量并行方法，在三维处理器立方体上并行化神经网络。这种方法在数量为 `P` 的处理器上实现了最佳的 `O(P^{1/3})` 通信开销，同时，通过优化参数和 activations 的负载平衡，计算和内存的使用都是均匀分布的。
+
+```python
+# 1D parallel
+parallel = dict(
+    tensor=dict(size=4, mode='1d')
+)
+
+# 2D parallel
+parallel = dict(
+    tensor=dict(size=4, mode='2d')
+)
+
+# 2.5D parallel
+parallel = dict(
+    tensor=dict(size=8, mode='2.5d', depth=2)
+)
+
+# 3D parallel
+parallel = dict(
+    tensor=dict(size=8, mode='3d')
+)
+```
+
+当您在配置中指定了张量并行模式，您就可以使用其相应的分布式算子。例如，若您设置模式为 `2d`，那么在模型构建中就能使用 `colossalai.nn.Linear2D` 了。
+
+
+## 流水线并行
+
+流水线并行是将模型按层分成几个部分。例如，假设我们有一个简单的模型，它由两个线性层组成。我们有两个 GPU，我们可以将第一个线性层分配给第一个 GPU 而第二层则分配给第二个 GPU。
+
+您可以在您的配置文件中设置流水线并行度的大小。当流水线并行度大于1，Colossal-AI 将会自动地创建流水线并行的 schedule，这将会为您定义好模型训练的 `forward` 和 `backward`。
+
+```python
+parallel = dict(
+    pipeline=dict(size=4), # number of pipeline stages
+)
+```
+
+## 序列并行
+
+针对处理大图片、视频、长文本、长时间医疗监控等数据的需要，Colossal-AI 还提供了序列并行的方法。该方法是在论文[Sequence Parallelism: Making 4D Parallelism Possible](https://arxiv.org/abs/2105.13120)中提出的。您可以指定模式为 `sequence` 来初始化进程组。
+
+
+```python
+parallel = dict(
+    tensor=dict(size=4, mode='sequence')
+)
+```
diff --git a/docs/source/zh/basics/define_your_config.md b/docs/source/zh/basics/define_your_config.md
new file mode 100644
index 000000000000..d7e49cbf23de
--- /dev/null
+++ b/docs/source/zh/basics/define_your_config.md
@@ -0,0 +1,71 @@
+# 构建配置文件
+
+作者: Guangyang Lu, Shenggui Li, Siqi Mai
+
+**预备知识:**
+- [分布式训练](../concepts/distributed_training.md)
+- [Colossal-AI 总览](../concepts/colossalai_overview.md)
+
+
+## 简介
+
+在 Colossal-AI 中，我们需要一个配置文件来指定系统在训练过程中要注入的特征。在本教程中，我们将向您介绍如何构建您的配置文件以及如何使用这个配置文件。使用配置文件有以下一些好处：
+
+1. 您可以在不同的配置文件中存储您的特征配置和训练超参数。
+2. 对于我们未来发布的新功能，您亦可以在配置中指定，而无需改变训练脚本的代码。
+
+在本教程中，我们将向您介绍如何构建您的配置文件。
+
+## 配置定义
+
+在一个配置文件中，有两种类型的变量。一种是作为特征说明，另一种是作为超参数。所有与特征相关的变量都是保留关键字。例如，如果您想使用混合精度训练，需要在 config 文件中使用变量名`fp16`，并遵循预先定义的格式。
+
+### 功能配置
+
+Colossal-AI 提供了一系列的功能来加快训练速度。每个功能都是由配置文件中的相应字段定义的。在本教程中，我们不会给出所有功能的配置细节，而是提供一个如何指定一个功能的说明。**每个功能的细节可以在其各自的教程中找到。**
+
+为了说明配置文件的使用，我们在这里使用混合精度训练作为例子。您需要遵循以下步骤。
+
+1. 创建一个配置文件（例如 `config.py`，您可以指定任意的文件名）。
+2. 在配置文件中定义混合精度的配置。例如，为了使用 PyTorch 提供的原始混合精度训练，您只需将下面这几行代码写入您的配置文件中。
+
+   ```python
+   from colossalai.amp import AMP_TYPE
+
+   fp16 = dict(
+     mode=AMP_TYPE.TORCH
+   )
+   ```
+
+3. 当启动分布式环境时，向 Colossal-AI 指定您的配置文件的位置。比如下面的例子是配置文件在当前目录下。
+
+   ```python
+   import colossalai
+
+   colossalai.launch(config='./config.py', ...)
+   ```
+
+这样，Colossal-AI 便知道您想使用什么功能，并会在 `colossalai.initialize` 期间注入您所需要的功能。
+
+### 全局超参数
+
+除了功能的配置，您还可以在配置文件中定义训练的超参数。当您想进行多个实验时，这将会变得非常方便。每个实验的细节都可以放在独立的配置文件中，以避免混乱。这些参数将被存储在全局并行环境中，可以在训练脚本中访问。
+
+例如，您可以在配置文件中指定批量大小。
+
+```python
+BATCH_SIZE = 32
+```
+
+启动后，您能够通过全局并行上下文访问您的超参数。
+
+```python
+import colossalai
+from colossalai.core import global_context as gpc
+
+colossalai.launch(config='./config.py', ...)
+
+# access your parameter
+print(gpc.config.BATCH_SIZE)
+
+```
diff --git a/docs/source/zh/basics/engine_trainer.md b/docs/source/zh/basics/engine_trainer.md
new file mode 100644
index 000000000000..a7519bfca14f
--- /dev/null
+++ b/docs/source/zh/basics/engine_trainer.md
@@ -0,0 +1,384 @@
+# 如何在训练中使用 Engine 和 Trainer
+
+作者: Shenggui Li, Siqi Mai
+
+**预备知识:**
+- [初始化功能](./initialize_features.md)
+
+## 简介
+
+在本教程中，您将学习如何使用 Colossal-AI 中提供的 Engine 和 Trainer 来训练您的模型。在深入研究细节之前，我们想先解释一下 Engine 和 Trainer 的概念。
+
+### Engine
+
+Engine 本质上是一个模型、优化器和损失函数的封装类。当我们调用 `colossalai.initialize` 时，一个 Engine 对象将被返回，并且配备了在您的配置文件中指定的梯度剪裁、梯度累计和 ZeRO 优化器等功能。
+
+Engine 将使用与 PyTorch 训练组件类似的 API，因此您只需对代码进行微小的修改即可。
+
+下表展示了Engine的常用API。
+
+| 组件                             | 功能                                      | PyTorch                         | Colossal-AI                            |
+| ------------------------------------- | --------------------------------------------- | ------------------------------- | -------------------------------------- |
+| optimizer                             | 迭代前将所有梯度设置为零 | optimizer.zero_grad()           | engine.zero_grad()                     |
+| optimizer                             | 更新参数                         | optimizer.step()                | engine.step()                          |
+| model                                 | 进行一次前向计算                            | outputs = model(inputs)         | outputs = engine(inputs)               |
+| criterion                             | 计算loss值                      | loss = criterion(output, label) | loss = engine.criterion(output, label) |
+| criterion                             | 反向计算         | loss.backward()                 | engine.backward(loss)                  |
+
+我们需要这样一个 Engine 类的原因是，我们可以添加更多的功能，同时将实现隐藏在
+`colossalai.initialize` 函数中。
+假如我们要添加一个新的功能，我们可以在 `colossalai.initialize` 函数中对模型、优化器、数据加载器和损失函数进行相应的处理。不管中间的过程有多复杂，最终我们呈现的以及用户需要使用的只有一个 Engine 类，这将十分便捷。
+用户只需要在最小范围内修改他们的代码，将普通的 PyTorch APIs 调整为 Colossal-AI
+Engine 的 API。通过这种方式，他们可以享受更多的功能来进行有效的训练。
+
+以下是一个简单的例子：
+
+```python
+import colossalai
+
+# build your model, optimizer, criterion, dataloaders
+...
+
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
+                                                                    optimizer,
+                                                                    criterion,
+                                                                    train_dataloader,
+                                                                    test_dataloader)
+for img, label in train_dataloader:
+    engine.zero_grad()
+    output = engine(img)
+    loss = engine.criterion(output, label)
+    engine.backward(loss)
+    engine.step()
+```
+
+### Trainer
+
+Trainer 是一个更高级的封装器，用户可以用更少的代码行来执行训练。 由于 Trainer 的使用会更加简单，相较于 Engine，它会缺少一点灵活性。 Trainer 被设计为进行前向和反向计算来进行模型权重的更新。通过传递 Engine 对象，我们可以很容易地创建一个 Trainer。
+Trainer 的参数 `schedule` 默认值是 `None` 。在大多数情况下，除非我们想使用流水线并行，否则我们把这个值设为 `None`。如果您想探索更多关于这个参数的内容，您可以前往流水线并行的相关教程。
+
+```python
+from colossalai.logging import get_dist_logger
+from colossalai.trainer import Trainer, hooks
+
+# build components and initialize with colossalai.initialize
+...
+
+# create a logger so that trainer can log on the console
+logger = get_dist_logger()
+
+# create a trainer object
+trainer = Trainer(
+    engine=engine,
+    logger=logger
+)
+```
+
+在 Trainer 中，用户可以定制一些 hooks，并将这些 hooks 附加到 Trainer 上。hook 将根据训练方案定期地执行生命周期函数。例如，基于用户是想在每次训练迭代后还是只在整个训练周期后更新学习率，
+`LRSchedulerHook` 将会在 `after_train_iter` 或 `after_train_epoch` 阶段执行 `lr_scheduler.step()` 去为用户更新学习率。您可以将 hook 存储在一个列表中并将其传递给 `trainer.fit` 方法。`trainer.fit` 方法将根据您的参数执行训练和测试。如果 `display_progress` 为 True，将在您的控制台显示一个进度条，以显示训练的过程。
+
+
+```python
+# define the hooks to attach to the trainer
+hook_list = [
+    hooks.LossHook(),
+    hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
+    hooks.AccuracyHook(accuracy_func=Accuracy()),
+    hooks.LogMetricByEpochHook(logger),
+]
+
+# start training
+trainer.fit(
+    train_dataloader=train_dataloader,
+    epochs=NUM_EPOCHS,
+    test_dataloader=test_dataloader,
+    test_interval=1,
+    hooks=hook_list,
+    display_progress=True
+)
+```
+
+如果您想定制您的 hook 类，您可以继承 `hooks.BaseHook` 并重写您想要的生命周期方法。下面提供了一个例子来演示如何创建一个简单的关于日志信息的 hook，以供您参考。
+
+```python
+from colossalai.logging import get_dist_logger
+from colossalai.trainer import hooks
+
+class LogMessageHook(hooks.BaseHook):
+
+    def __init__(self, priority=10):
+        self._logger = get_dist_logger()
+
+    def before_train(self, trainer):
+        self._logger.info('training starts')
+
+    def after_train(self, trainer):
+        self._logger.info('training finished')
+
+
+...
+
+# then in your training script
+hook_list.append(LogMessageHook())
+```
+
+
+
+在下面的章节中，您将会详细地了解到如何用 Engine 和 Trainer 来训练 ResNet 模型。
+
+
+## ResNet
+
+### 总览
+
+在本节中，我们将介绍：
+
+1. 使用一个 Engine 在 CIFAR10 数据集上训练 ResNet34 模型
+2. 使用一个 Trainer 在 CIFAR10 数据集上训练 ResNet34 模型
+
+项目结构如下：
+
+```bash
+-- config.py
+-- run_resnet_cifar10_with_engine.py
+-- run_resnet_cifar10_with_trainer.py
+```
+
+对于使用 Engine 或 Trainer，步骤 1-4 是通用的。 因此，步骤 1-4 + 步骤 5 将会是对应 `run_resnet_cifar10_with_engine.py` 而 步骤 1-4 + 步骤6 则对应 `run_resnet_cifar10_with_trainer.py`。
+
+### 牛刀小试
+
+#### 步骤 1. 创建配置文件
+
+在你的项目文件夹中，创建一个 `config.py`。这个文件是用来指定一些您可能想用来训练您的模型的特征。下面是一个配置文件的例子。
+
+```python
+from colossalai.amp import AMP_TYPE
+
+BATCH_SIZE = 128
+NUM_EPOCHS = 200
+
+fp16=dict(
+    mode=AMP_TYPE.TORCH
+)
+```
+
+在这个配置文件中，我们指定要在每个 GPU 上使用批大小为128，并运行200个 epoch。这两个参数是在 `gpc.config` 中体现的。例如，您可以使用 `gpc.config.BATCH_SIZE` 来访问您存储在配置文件中的批大小值。而 `fp16` 配置则会告诉 `colossalai.initialize` 使用 PyTorch 提供的混合精度训练，以更好的速度和更低的内存消耗来训练模型。
+
+#### 步骤 2. 初始化分布式环境
+
+我们需要初始化分布式训练环境。这在 [启动 Colossal-AI](./launch_colossalai.md) 中有相应的教程。在当前的演示中，我们使用 `launch_from_torch` 和 PyTorch 启动工具。
+
+```python
+import colossalai
+
+# ./config.py refers to the config file we just created in step 1
+colossalai.launch_from_torch(config='./config.py')
+```
+
+#### 步骤 3. 创建所有的训练组件
+
+这时，我们可以创建用于训练的所有组件，包括：
+
+1. 模型
+2. 优化器
+3. 损失函数
+4. 训练/测试数据加载器
+5. 学习率调度器
+6. 日志记录器
+
+
+
+为了构建这些组件，您需要导入以下模块。
+
+```python
+from pathlib import Path
+from colossalai.logging import get_dist_logger
+import torch
+import os
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_dataloader
+from torchvision import transforms
+from colossalai.nn.lr_scheduler import CosineAnnealingLR
+from torchvision.datasets import CIFAR10
+from torchvision.models import resnet34
+```
+
+
+
+然后按照通常在PyTorch脚本中构建组件的方式来构建组件。在下面的脚本中，我们将CIFAR10数据集的根路径设置为 `./data`。您可以把它改为您想要的任何路径，例如，您可以把 `root='./data'` 改为 `root=Path(os.environ['DATA'])` ，这样就可以通过环境变量 `DATA` 来指定数据集路径。
+
+```python
+# build logger
+logger = get_dist_logger()
+
+# build resnet
+model = resnet34(num_classes=10)
+
+# build datasets
+train_dataset = CIFAR10(
+    root='./data',
+    download=True,
+    transform=transforms.Compose(
+        [
+            transforms.RandomCrop(size=32, padding=4),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
+                0.2023, 0.1994, 0.2010]),
+        ]
+    )
+)
+
+test_dataset = CIFAR10(
+    root='./data',
+    train=False,
+    transform=transforms.Compose(
+        [
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
+                0.2023, 0.1994, 0.2010]),
+        ]
+    )
+)
+
+# build dataloaders
+train_dataloader = get_dataloader(dataset=train_dataset,
+                                  shuffle=True,
+                                  batch_size=gpc.config.BATCH_SIZE,
+                                  num_workers=1,
+                                  pin_memory=True,
+                                  )
+
+test_dataloader = get_dataloader(dataset=test_dataset,
+                                 add_sampler=False,
+                                 batch_size=gpc.config.BATCH_SIZE,
+                                 num_workers=1,
+                                 pin_memory=True,
+                                 )
+
+# build criterion
+criterion = torch.nn.CrossEntropyLoss()
+
+# optimizer
+optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
+
+# lr_scheduler
+lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)
+```
+
+#### 步骤 4. 用 Colossal-AI 进行初始化
+
+接下来，重要的一步是通过调用 `colossalai.initialize` 获得 Engine。正如 `config.py` 中所述，我们将使用混合精度训练来训练 ResNet34 模型。`colossalai.initialize` 将自动检查您的配置文件，并将相关特征分配给您的训练组件。这样一来，我们的 Engine 已经能够进行混合精度训练，而您不需要进行额外的处理。
+
+```python
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
+                                                                     optimizer,
+                                                                     criterion,
+                                                                     train_dataloader,
+                                                                     test_dataloader,
+                                                                     )
+```
+
+
+
+#### 步骤 5. 用 Engine 进行训练
+
+当所有的训练组件都准备好后，我们就可以像使用 PyTorch 一样训练 ResNet34 了。
+
+```python
+for epoch in range(gpc.config.NUM_EPOCHS):
+    # execute a training iteration
+    engine.train()
+    for img, label in train_dataloader:
+        img = img.cuda()
+        label = label.cuda()
+
+        # set gradients to zero
+        engine.zero_grad()
+
+        # run forward pass
+        output = engine(img)
+
+        # compute loss value and run backward pass
+        train_loss = engine.criterion(output, label)
+        engine.backward(train_loss)
+
+        # update parameters
+        engine.step()
+
+    # update learning rate
+    lr_scheduler.step()
+
+    # execute a testing iteration
+    engine.eval()
+    correct = 0
+    total = 0
+    for img, label in test_dataloader:
+        img = img.cuda()
+        label = label.cuda()
+
+        # run prediction without back-propagation
+        with torch.no_grad():
+            output = engine(img)
+            test_loss = engine.criterion(output, label)
+
+        # compute the number of correct prediction
+        pred = torch.argmax(output, dim=-1)
+        correct += torch.sum(pred == label)
+        total += img.size(0)
+
+    logger.info(
+        f"Epoch {epoch} - train loss: {train_loss:.5}, test loss: {test_loss:.5}, acc: {correct / total:.5}, lr: {lr_scheduler.get_last_lr()[0]:.5g}", ranks=[0])
+```
+
+#### 步骤 6. 用 Trainer 进行训练
+
+如果您想用 Trainer 进行训练，您可以参考下面的代码进行您的实验。
+
+
+```python
+from colossalai.nn.metric import Accuracy
+from colossalai.trainer import Trainer, hooks
+
+
+# create a trainer object
+trainer = Trainer(
+    engine=engine,
+    logger=logger
+)
+
+# define the hooks to attach to the trainer
+hook_list = [
+    hooks.LossHook(),
+    hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
+    hooks.AccuracyHook(accuracy_func=Accuracy()),
+    hooks.LogMetricByEpochHook(logger),
+    hooks.LogMemoryByEpochHook(logger)
+]
+
+# start training
+# run testing every 1 epoch
+trainer.fit(
+    train_dataloader=train_dataloader,
+    epochs=gpc.config.NUM_EPOCHS,
+    test_dataloader=test_dataloader,
+    test_interval=1,
+    hooks=hook_list,
+    display_progress=True
+)
+```
+
+
+
+#### 步骤 7. 开始分布式训练
+
+最后，我们可以使用 PyTorch 提供的分布式启动器来调用脚本，因为我们在步骤2中使用了 `launch_from_torch`。您需要把`<num_gpus>` 替换成您机器上可用的GPU数量。如果您只想使用一个 GPU，您可以把这个数字设为1。如果您想使用其他的启动器，请您参考如何启动 Colossal-AI 的教程。
+
+
+```bash
+# with engine
+python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_engine.py
+# with trainer
+python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
+```
diff --git a/docs/source/zh/basics/initialize_features.md b/docs/source/zh/basics/initialize_features.md
new file mode 100644
index 000000000000..67ea114b42b2
--- /dev/null
+++ b/docs/source/zh/basics/initialize_features.md
@@ -0,0 +1,46 @@
+# 初始化功能
+
+作者: Shenggui Li, Siqi Mai
+
+**预备知识:**
+- [分布式训练](../concepts/distributed_training.md)
+- [Colossal-AI 总览](../concepts/colossalai_overview.md)
+
+## 简介
+
+在本教程中，我们将介绍 `colossalai.initialize` 的使用。 它使您能够将特征无缝注入您的训练组件（例如，模型、优化器、数据加载器）中。 调用 `colossalai.initialize` 是您进入训练循环前的基本操作。
+
+在下面一节中，我们将介绍 `colossalai.initialize` 是如何工作的以及使用中我们要注意的细节。
+
+## 使用
+
+在一个典型的工作流程中，我们将在训练脚本的开始启动分布式环境。
+之后，我们将实例化我们的对象，如模型、优化器、损失函数、数据加载器等。此时，我们可以使用 `colossalai.initialize` 便捷地为这些对象注入特征。
+具体细节请看以下的伪代码例子。
+
+```python
+import colossalai
+import torch
+...
+
+
+# launch distributed environment
+colossalai.launch(config='./config.py', ...)
+
+# create your objects
+model = MyModel()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+criterion = torch.nn.CrossEntropyLoss()
+train_dataloader = MyTrainDataloader()
+test_dataloader = MyTrainDataloader()
+
+# initialize features
+engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
+                                                                     optimizer,
+                                                                     criterion,
+                                                                     train_dataloader,
+                                                                     test_dataloader)
+```
+
+ `colossalai.initialize` 将返回一个 `Engine` 对象。 该对象把模型、优化器和损失函数封装起来。 **`Engine` 对象会以配置文件中指定的特征运行。**
+关于 `Engine` 的更多使用细节可以在 [在训练中使用Engine和Trainer](./engine_trainer.md) 中获取。
diff --git a/docs/source/zh/basics/launch_colossalai.md b/docs/source/zh/basics/launch_colossalai.md
new file mode 100644
index 000000000000..ca927de578d5
--- /dev/null
+++ b/docs/source/zh/basics/launch_colossalai.md
@@ -0,0 +1,212 @@
+# 启动 Colossal-AI
+
+作者: Chuanrui Wang, Shenggui Li, Siqi Mai
+
+**预备知识:**
+- [分布式训练](../concepts/distributed_training.md)
+- [Colossal-AI 总览](../concepts/colossalai_overview.md)
+
+
+## 简介
+
+正如我们在前面的教程中所提到的，在您的配置文件准备好后，您需要为 Colossal-AI 初始化分布式环境。我们把这个过程称为 `launch`。在本教程中，您将学习如何在您的服务器上启动 Colossal-AI，不管是小型的还是大型的。
+
+在 Colossal-AI 中，我们提供了几种启动方法来初始化分布式后端。
+在大多数情况下，您可以使用 `colossalai.launch` 和 `colossalai.get_default_parser` 来通过命令行传递参数。如果您想使用 SLURM、OpenMPI 和 PyTorch 等启动工具，我们也提供了几个启动的辅助方法以便您的使用。您可以直接从这些启动工具设置的环境变量中访问 rank 和 world size 大小。
+
+在本教程中，我们将介绍如何启动 Colossal-AI 来初始化分布式后端：
+- 用 colossalai.launch 启动
+- 用 Colossal-AI命令行 启动
+- 用 SLURM 启动
+- 用 OpenMPI 启动
+
+## 启动分布式环境
+
+为了启动 Colossal-AI，我们需要两类参数:
+1. 配置文件
+2. 分布式设置
+
+无论我们使用何种启动方式，配置文件是必须要求的，而分布式设置有可能依情况而定。配置文件可以是配置文件的路径或 Python dictionary 的形式。分布式设置可以通过命令行或多进程启动器传递。
+
+### 命令行解析器
+
+在使用 `launch` 之前, 我们首先需要了解我们需要哪些参数来进行初始化。
+如[分布式训练](../concepts/distributed_training.md) 中 `基本概念` 一节所述 ，涉及的重要参数是:
+
+1. host
+2. port
+3. rank
+4. world_size
+5. backend
+
+在 Colossal-AI 中，我们提供了一个命令行解析器，它已经提前添加了这些参数。您可以通过调用 `colossalai.get_default_parser()` 来获得这个解析器。这个解析器通常与 `colossalai.launch` 一起使用。
+
+```python
+# add these lines in your train.py
+import colossalai
+
+# get default parser
+parser = colossalai.get_default_parser()
+
+# if you want to add your own arguments
+parser.add_argument(...)
+
+# parse arguments
+args = parser.parse_args()
+```
+
+您可以在您的终端传入以下这些参数。
+```shell
+
+python train.py --host <host> --rank <rank> --world_size <world_size> --port <port> --backend <backend>
+```
+
+`backend` 是用户可选的，默认值是 nccl。
+
+### 本地启动
+
+为了初始化分布式环境，我们提供了一个通用的 `colossalai.launch` API。`colossalai.launch` 函数接收上面列出的参数，并在通信网络中创建一个默认的进程组。方便起见，这个函数通常与默认解析器一起使用。
+
+```python
+import colossalai
+
+# parse arguments
+args = colossalai.get_default_parser().parse_args()
+
+# launch distributed environment
+colossalai.launch(config=<CONFIG>,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  host=args.host,
+                  port=args.port,
+                  backend=args.backend
+)
+
+```
+
+
+### 用 Colossal-AI命令行工具 启动
+
+为了更好地支持单节点以及多节点的训练，我们通过封装PyTorch的启动器实现了一个更加方便的启动器。
+PyTorch自带的启动器需要在每个节点上都启动命令才能启动多节点训练，而我们的启动器只需要一次调用即可启动训练。
+
+首先，我们需要在代码里指定我们的启动方式。由于这个启动器是PyTorch启动器的封装，那么我们自然而然应该使用`colossalai.launch_from_torch`。
+分布式环境所需的参数，如 rank, world size, host 和 port 都是由 PyTorch 启动器设置的，可以直接从环境变量中读取。
+
+```python
+import colossalai
+
+colossalai.launch_from_torch(
+    config=<CONFIG>,
+)
+```
+
+接下来，我们可以轻松地在终端使用`colossalai run`来启动训练。下面的命令可以在当前机器上启动一个4卡的训练任务。
+你可以通过设置`nproc_per_node`来调整使用的GPU的数量，也可以改变`master_port`的参数来选择通信的端口。
+
+```shell
+# 在当前节点上启动4卡训练 （默认使用29500端口）
+colossalai run --nproc_per_node 4 train.py
+
+# 在当前节点上启动4卡训练，并使用一个不同的端口
+colossalai run --nproc_per_node 4 --master_port 29505 test.py
+```
+
+如果你在使用一个集群，并且想进行多节点的训练，你需要使用Colossal-AI的命令行工具进行一键启动。我们提供了两种方式来启动多节点任务
+
+- 通过`--host`来启动
+
+这个方式适合节点数不多的情况。假设我们有两个节点，分别为`host1`和`host2`。我们可以用以下命令进行多节点训练。
+比起单节点训练，多节点训练需要手动设置`--master_addr` （在单节点训练中`master_addr`默认为`127.0.0.1`）。
+
+:::caution
+
+多节点训练时，`master_addr`不能为`localhost`或者`127.0.0.1`，它应该是一个节点的名字或者IP地址。
+
+:::
+
+```shell
+# 在两个节点上训练
+colossalai run --nproc_per_node 4 --host host1,host2 --master_addr host1 test.py
+```
+
+
+- 通过`--hostfile`来启动
+
+这个方式适用于节点数很大的情况。host file是一个简单的文本文件，这个文件里列出了可以使用的节点的名字。
+在一个集群中，可用节点的列表一般由SLURM或者PBS Pro这样的集群资源管理器来提供。比如，在SLURM中，
+你可以从`SLURM_NODELIST`这个环境变量中获取到当前分配列表。在PBS Pro中，这个环境变量为`PBS_NODEFILE`。
+可以通过`echo $SLURM_NODELIST` 或者 `cat $PBS_NODEFILE` 来尝试一下。如果你没有这样的集群管理器，
+那么你可以自己手动写一个这样的文本文件即可。
+
+提供给Colossal-AI的host file需要遵循以下格式，每一行都是一个节点的名字。
+
+```text
+host1
+host2
+```
+
+如果host file准备好了，那么我们就可以用以下命令开始多节点训练了。和使用`--host`一样，你也需要指定一个`master_addr`。
+当使用host file时，我们可以使用一些额外的参数：
+- `--include`: 设置你想要启动训练的节点。比如，你的host file里有8个节点，但是你只想用其中的6个节点进行训练，
+  你可以添加`--include host1,host2,host3,...,host6`，这样训练任务只会在这6个节点上启动。
+
+- `--exclude`: 设置你想排除在训练之外的节点。当你的某一些节点坏掉时，这个参数会比较有用。比如假如host1的GPU有一些问题，无法正常使用，
+  那么你就可以使用`--exclude host1`来将其排除在外，这样训练任务就只会在剩余的节点上启动。
+
+```shell
+# 使用hostfile启动
+colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1  test.py
+
+# 只使用部分节点进行训练
+colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1  --include host1 test.py
+
+# 不使用某些节点进行训练
+colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1  --exclude host2 test.py
+```
+
+
+### 用 SLURM 启动
+
+如果您是在一个由 SLURM 调度器管理的系统上， 您也可以使用 `srun` 启动器来启动您的 Colossal-AI 脚本。我们提供了辅助函数 `launch_from_slurm` 来与 SLURM 调度器兼容。
+`launch_from_slurm` 会自动从环境变量 `SLURM_PROCID` 和 `SLURM_NPROCS` 中分别读取 rank 和 world size ，并使用它们来启动分布式后端。
+
+您可以在您的训练脚本中尝试以下操作。
+
+```python
+import colossalai
+
+colossalai.launch_from_slurm(
+    config=<CONFIG>,
+    host=args.host,
+    port=args.port
+)
+```
+
+您可以通过在终端使用这个命令来初始化分布式环境。
+
+```bash
+srun python train.py --host <master_node> --port 29500
+```
+
+### 用 OpenMPI 启动
+如果您对OpenMPI比较熟悉，您也可以使用 `launch_from_openmpi` 。
+`launch_from_openmpi` 会自动从环境变量
+`OMPI_COMM_WORLD_LOCAL_RANK`， `OMPI_COMM_WORLD_RANK` 和 `OMPI_COMM_WORLD_SIZE` 中分别读取local rank、global rank 和 world size，并利用它们来启动分布式后端。
+
+您可以在您的训练脚本中尝试以下操作。
+```python
+colossalai.launch_from_openmpi(
+    config=<CONFIG>,
+    host=args.host,
+    port=args.port
+)
+```
+
+以下是用 OpenMPI 启动多个进程的示例命令。
+```bash
+mpirun --hostfile <my_hostfile> -np <num_process> python train.py --host <node name or ip> --port 29500
+```
+
+- --hostfile: 指定一个要运行的主机列表。
+- --np: 设置总共要启动的进程（GPU）的数量。例如，如果 --np 4，4个 python 进程将被初始化以运行 train.py。
diff --git a/docs/source/zh/basics/model_checkpoint.md b/docs/source/zh/basics/model_checkpoint.md
new file mode 100644
index 000000000000..cec12d451989
--- /dev/null
+++ b/docs/source/zh/basics/model_checkpoint.md
@@ -0,0 +1,61 @@
+# 模型检查点
+
+作者 : Guangyang Lu
+
+**预备知识:**
+- [Launch Colossal-AI](./launch_colossalai.md)
+- [Initialize Colossal-AI](./initialize_features.md)
+
+**示例代码:**
+- [ColossalAI-Examples Model Checkpoint](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/utils/checkpoint)
+
+**该功能目前是实验性的。**
+
+## 简介
+
+本教程将介绍如何保存和加载模型检查点。
+
+为了充分利用Colossal-AI的强大并行策略，我们需要修改模型和张量，因此无法直接使用 `torch.save` 或者 `torch.load` 保存或加载模型检查点。为此，我们在Colossal-AI中提供了相应的应用程序接口来实现同样的效果。
+
+但是，在加载时，你不需要使用与保存时相同的并行策略。
+
+## 使用方法
+
+### 保存
+
+有两种方法可以使用Colossal-AI训练模型，即使用engine或使用trainer。
+**注意我们只保存 `state_dict`.** 因此，在加载检查点时，需要首先定义模型。
+
+#### 用 engine 保存
+
+```python
+from colossalai.utils import save_checkpoint
+model = ...
+engine, _, _, _ = colossalai.initialize(model=model, ...)
+for epoch in range(num_epochs):
+    ... # do some training
+    save_checkpoint('xxx.pt', epoch, model)
+```
+
+#### 用 trainer 保存
+```python
+from colossalai.trainer import Trainer, hooks
+model = ...
+engine, _, _, _ = colossalai.initialize(model=model, ...)
+trainer = Trainer(engine, ...)
+hook_list = [
+            hooks.SaveCheckpointHook(1, 'xxx.pt', model)
+            ...]
+
+trainer.fit(...
+            hook=hook_list)
+```
+
+### 加载
+
+```python
+from colossalai.utils import load_checkpoint
+model = ...
+load_checkpoint('xxx.pt', model)
+... # train or test
+```
diff --git a/docs/source/zh/concepts/colossalai_overview.md b/docs/source/zh/concepts/colossalai_overview.md
new file mode 100755
index 000000000000..cfb35e59e64a
--- /dev/null
+++ b/docs/source/zh/concepts/colossalai_overview.md
@@ -0,0 +1,36 @@
+# Colossal-AI 总览
+
+作者: Shenggui Li, Siqi Mai
+
+## 关于 Colossal-AI
+
+随着深度学习模型规模的发展，向新的训练模式转变是非常重要的。没有并行和优化的传统训练方法将成为过去，新的训练方法是使训练大规模模型高效和节省成本的关键。
+
+Colossal-AI 是一个集成的系统，为用户提供一套综合的训练方法。您可以找到常见的训练方法，如混合精度训练和梯度累积。此外，我们提供了一系列的并行技术，包括数据并行、张量并行和流水线并行。我们通过不同的多维分布式矩阵乘法算法来优化张量并行。我们还提供了不同的流水线并行方法，使用户能够有效地跨节点扩展他们的模型。更多的高级功能，如卸载，也可以在这个教程文档中找到详细的内容。
+
+## Colossal-AI 的使用
+
+我们的目标是使 Colossal-AI 易于使用，并且对用户的代码不产生干扰。如果您想使用Colossal-AI，这里有一个简单的一般工作流程。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/ZK7ICWzbMsVuJof.png"/>
+<figcaption>Workflow</figcaption>
+</figure>
+
+1. 准备一个配置文件，指定您要使用的功能和参数。
+2. 用 `colossalai.launch` 初始化分布式后端。
+3. 用 `colossalai.initialize` 将训练特征注入您的训练组件（如模型、优化器）中。
+4. 进行训练和测试.
+
+我们将在`基本教程`部分介绍整个工作流程。
+
+## 未来计划
+
+Colossal-AI 系统将会进一步拓展和优化，包括但不限于:
+
+1. 分布式操作的优化
+2. 异构系统训练的优化
+3. 从模型大小的维度切入，提升训练速度并维持精度
+4. 拓展现有的并行方法
+
+**我们始终欢迎社区的建议和讨论，如果您遇到任何问题，我们将非常愿意帮助您。您可以在GitHub 提 [issue](https://github.com/hpcaitech/ColossalAI/issues) ，或在[论坛](https://github.com/hpcaitech/ColossalAI/discussions)上创建一个讨论主题。**
diff --git a/docs/source/zh/concepts/distributed_training.md b/docs/source/zh/concepts/distributed_training.md
new file mode 100755
index 000000000000..97b3844daa16
--- /dev/null
+++ b/docs/source/zh/concepts/distributed_training.md
@@ -0,0 +1,88 @@
+# 分布式训练
+
+作者: Shenggui Li, Siqi Mai
+
+## 什么是分布式系统？
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/sE5daHf2ohIy9wX.png"/>
+<figcaption>图片来源: <a href="https://towardsdatascience.com/distributed-training-in-the-cloud-cloud-machine-learning-engine-9e264ddde27f">Towards Data Science</a></figcaption>
+</figure>
+
+分布式系统由多个软件组件组成，在多台机器上运行。例如，传统的数据库运行在一台机器上。随着数据量的爆发式增长，单台机器已经不能为企业提供理想的性能。特别是在双十一这样的网络狂欢节，网络流量会出乎意料的大。为了应对这种压力，现代高性能数据库被设计成在多台机器上运行，它们共同为用户提供高吞吐量和低延迟。
+
+分布式系统的一个重要评价指标是可扩展性。例如，当我们在4台机器上运行一个应用程序时，我们自然希望该应用程序的运行速度能提高4倍。然而，由于通信开销和硬件性能的差异，很难实现线性提速。因此，当我们实现应用程序时，必须考虑如何使其更快。良好的设计和系统优化的算法可以帮助我们提供良好的性能。有时，甚至有可能实现线性和超线性提速。
+
+
+## 为什么我们需要机器学习的分布式训练？
+
+早在2012年，[AlexNet](https://arxiv.org/abs/1404.5997) 就赢得了ImageNet比赛的冠军，而它是在两张 GTX 580 3GB GPU 上训练的。今天，大多数出现在顶级人工智能会议上的模型都是在多个GPU上训练的。当研究人员和工程师开发人工智能模型时，分布式训练无疑是一种常见的做法。这一趋势背后有几个原因。
+
+1. 模型规模迅速增加。2015年的 [ResNet50](https://arxiv.org/abs/1512.03385) 有2000万的参数，
+2018年的 [BERT-Large](https://arxiv.org/abs/1810.04805)有3.45亿的参数，2018年的
+[GPT-2](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
+有15亿的参数，而2020年的 [GPT-3](https://arxiv.org/abs/2005.14165) 有1750亿个参数。很明显，模型规模随着时间的推移呈指数级增长。目前最大的模型已经超过了1万亿个参数。而与较小的模型相比，超大型模型通常能提供更优越的性能。
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/sCyreJ9PF1EdZYf.jpg"/>
+<figcaption>图片来源: <a href="https://huggingface.co/blog/large-language-models">HuggingFace</a></figcaption>
+</figure>
+
+
+2. 数据集规模迅速增加。对于大多数机器学习开发者来说，MNIST 和 CIFAR10 数据集往往是他们训练模型的前几个数据集。然而，与著名的 ImageNet 数据集相比，这些数据集非常小。谷歌甚至有自己的（未公布的）JFT-300M 数据集，它有大约3亿张图片，这比 ImageNet-1k 数据集大了近300倍。
+
+
+3. 计算能力越来越强。随着半导体行业的进步，显卡变得越来越强大。由于核的数量增多，GPU是深度学习最常见的算力资源。从2012年的 K10 GPU 到2020年的 A100 GPU，计算能力已经增加了几百倍。这使我们能够更快地执行计算密集型任务，而深度学习正是这样一项任务。
+
+如今，我们接触到的模型可能太大，以致于无法装入一个GPU，而数据集也可能大到足以在一个GPU上训练一百天。这时，只有用不同的并行化技术在多个GPU上训练我们的模型，我们才能完成并加快模型训练，以追求在合理的时间内获得想要的结果。
+
+
+## 分布式训练的基本概念
+
+分布式训练需要多台机器/GPU。在训练期间，这些设备之间会有通信。为了更好地理解分布式训练，有几个重要的术语需要我们了解清楚。
+
+- host: 主机(host)是通信网络中的主要设备。在初始化分布式环境时，经常需要它作为一个参数。
+- port: 这里的端口(port)主要是指主机上用于通信的主端口。
+- rank: 在网络中赋予设备的唯一ID。
+- world size: 网络中设备的数量。
+- process group: 进程组(process group)是一个通信网络，包括设备的一个子集。总是有一个默认的进程组，它包含所有的设备。一个子集的设备可以形成一个进程组，以便它们只在组内的设备之间进行通信。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/qnNBKh8AjzgM5sY.png"/>
+<figcaption>一个分布式系统的例子</figcaption>
+</figure>
+
+为了说明这些概念，让我们假设我们有2台机器（也称为节点），每台机器有4个 GPU。当我们在这两台机器上初始化分布式环境时，我们基本上启动了8个进程（每台机器上有4个进程），每个进程被绑定到一个 GPU 上。
+
+在初始化分布式环境之前，我们需要指定主机（主地址）和端口（主端口）。在这个例子中，我们可以让主机为节点0，端口为一个数字，如29500。所有的8个进程将寻找地址和端口并相互连接，默认的进程组将被创建。默认进程组的 world size 为8，细节如下。
+
+| process ID | rank | Node index | GPU index |
+| ---------- | ---- | ---------- | --------- |
+| 0          | 0    | 0          | 0         |
+| 1          | 1    | 0          | 1         |
+| 2          | 2    | 0          | 2         |
+| 3          | 3    | 0          | 3         |
+| 4          | 4    | 1          | 0         |
+| 5          | 5    | 1          | 1         |
+| 6          | 6    | 1          | 2         |
+| 7          | 7    | 1          | 3         |
+
+
+我们还可以创建一个新的进程组。这个新的进程组可以包含任何进程的子集。例如，我们可以创建一个只包含偶数进程的组:
+
+| process ID | rank | Node index | GPU index |
+| ---------- | ---- | ---------- | --------- |
+| 0          | 0    | 0          | 0         |
+| 2          | 1    | 0          | 2         |
+| 4          | 2    | 1          | 0         |
+| 6          | 3    | 1          | 2         |
+
+**请注意，rank 是相对于进程组而言的，一个进程在不同的进程组中可以有不同的 rank。最大的 rank 始终是 `world size of the process group - 1`。**
+
+在进程组中，各进程可以通过两种方式进行通信。
+1. peer-to-peer: 一个进程向另一个进程发送数据。
+2. collective: 一组进程一起执行分散、聚集、all-reduce、广播等操作。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/zTmlxgc3oeAdn97.png"/>
+<figcaption>Collective communication， 来源: <a href="https://pytorch.org/tutorials/intermediate/dist_tuto.html">PyTorch distributed tutorial</a></figcaption>
+</figure>
diff --git a/docs/source/zh/concepts/paradigms_of_parallelism.md b/docs/source/zh/concepts/paradigms_of_parallelism.md
new file mode 100755
index 000000000000..0d6d58fd281c
--- /dev/null
+++ b/docs/source/zh/concepts/paradigms_of_parallelism.md
@@ -0,0 +1,91 @@
+# 并行技术
+
+作者: Shenggui Li, Siqi Mai
+
+## 简介
+
+随着深度学习的发展，对并行训练的需求越来越大。这是因为模型和数据集越来越大，如果我们坚持使用单 GPU 训练，训练过程的等待将会成为一场噩梦。在本节中，我们将对现有的并行训练方法进行简要介绍。如果您想对这篇文章进行补充，欢迎在[GitHub论坛](https://github.com/hpcaitech/ColossalAI/discussions)上进行讨论。
+
+## 数据并行
+
+数据并行是最常见的并行形式，因为它很简单。在数据并行训练中，数据集被分割成几个碎片，每个碎片被分配到一个设备上。这相当于沿批次维度对训练过程进行并行化。每个设备将持有一个完整的模型副本，并在分配的数据集碎片上进行训练。在反向传播之后，模型的梯度将进行 all-reduce（全局规约），以便在不同设备上的模型参数能够保持同步。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/WSAensMqjwHdOlR.png"/>
+<figcaption>数据并行</figcaption>
+</figure>
+
+## 模型并行
+
+在数据并行训练中，一个明显的特点是每个 GPU 持有整个模型权重的副本。这就带来了冗余问题。另一种并行模式是模型并行，即模型被分割并分布在一个设备阵列上。通常有两种类型的并行：张量并行和流水线并行。张量并行是在一个操作中进行并行计算，如矩阵-矩阵乘法。流水线并行是在各层之间进行并行计算。因此，从另一个角度来看，张量并行可以被看作是层内并行，流水线并行可以被看作是层间并行。
+
+### 张量并行
+
+张量并行训练是将一个张量沿特定维度分成 `N` 块，每个设备只持有整个张量的 `1/N`，同时不影响计算图的正确性。这需要额外的通信来确保结果的正确性。
+
+以一般的矩阵乘法为例，假设我们有 `C = AB`。我们可以将B沿着列分割成 `[B0 B1 B2 ... Bn]`，每个设备持有一列。然后我们将 `A` 与每个设备上 `B` 中的每一列相乘，我们将得到 `[AB0 AB1 AB2 ... ABn]` 。此刻，每个设备仍然持有一部分的结果，例如，设备(rank=0)持有 `AB0`。为了确保结果的正确性，我们需要收集全部的结果，并沿列维串联张量。通过这种方式，我们能够将张量分布在设备上，同时确保计算流程保持正确。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/2ZwyPDvXANW4tMG.png"/>
+<figcaption>张量并行</figcaption>
+</figure>
+
+在 Colossal-AI 中，我们提供了一系列的张量并行方法，即 1D、2D、2.5D 和 3D 张量并行。我们将在`高级教程`中详细讨论它们。
+
+
+相关文章:
+- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668)
+- [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
+- [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343)
+- [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500)
+- [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450)
+
+### 流水线并行
+
+流水线并行一般来说很容易理解。请您回忆一下您的计算机结构课程，这确实存在于 CPU 设计中。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/at3eDv7kKBusxbd.png"/>
+<figcaption>流水线并行</figcaption>
+</figure>
+
+流水线并行的核心思想是，模型按层分割成若干块，每块都交给一个设备。在前向传递过程中，每个设备将中间的激活传递给下一个阶段。在后向传递过程中，每个设备将输入张量的梯度传回给前一个流水线阶段。这允许设备同时进行计算，并增加了训练的吞吐量。流水线并行训练的一个缺点是，会存在一些设备空闲等待的气泡时间（bubble time），导致计算资源的浪费。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/sDNq51PS3Gxbw7F.png"/>
+<figcaption>Source: <a href="https://arxiv.org/abs/1811.06965">GPipe</a></figcaption>
+</figure>
+
+相关文章:
+- [PipeDream: Fast and Efficient Pipeline Parallel DNN Training](https://arxiv.org/abs/1806.03377)
+- [GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism](https://arxiv.org/abs/1811.06965)
+- [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
+- [Chimera: Efficiently Training Large-Scale Neural Networks with Bidirectional Pipelines](https://arxiv.org/abs/2107.06925)
+
+
+## 优化器相关的并行
+
+另一种并行方法和优化器相关，目前这种并行最流行的方法是 `ZeRO`，即[零冗余优化器](https://arxiv.org/abs/1910.02054)。 ZeRO 在三个层面上工作，以消除内存冗余（ZeRO需要进行fp16训练）。
+
+- Level 1: 优化器状态在各进程中被划分。
+- Level 2: 用于更新模型权重的32位梯度也被划分，因此每个进程只存储与其优化器状态划分相对应的梯度。
+- Level 3: 16位模型参数在各进程中被划分。
+
+相关文章:
+- [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
+
+
+## 异构系统的并行
+
+上述方法通常需要大量的 GPU 来训练一个大型模型。然而，人们常常忽略的是，与 GPU 相比，CPU 的内存要大得多。在一个典型的服务器上，CPU 可以轻松拥有几百GB的内存，而每个 GPU 通常只有16或32GB的内存。这促使人们思考为什么 CPU 内存没有被用于分布式训练。
+
+最近的进展是依靠 CPU 甚至是 NVMe 磁盘来训练大型模型。主要的想法是，在不使用张量时，将其卸载回 CPU 内存或 NVMe 磁盘。通过使用异构系统架构，有可能在一台机器上容纳一个巨大的模型。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/qLHD5lk97hXQdbv.png"/>
+<figcaption>异构系统</figcaption>
+</figure>
+
+相关文章:
+- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+- [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
diff --git a/docs/source/zh/features/1D_tensor_parallel.md b/docs/source/zh/features/1D_tensor_parallel.md
new file mode 100644
index 000000000000..8f3a3c6209da
--- /dev/null
+++ b/docs/source/zh/features/1D_tensor_parallel.md
@@ -0,0 +1,111 @@
+# 1D 张量并行
+
+作者: Zhengda Bian, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [并行配置](../basics/configure_parallelization.md)
+
+**示例代码**
+- [ColossalAI-Examples 1D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_1d.py)
+
+**相关论文**
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://deepakn94.github.io/assets/papers/megatron-sc21.pdf)
+
+## 引言
+
+张量并行将模型参数划分到多个设备上，以减少内存负荷。
+[Megatron-LM](https://deepakn94.github.io/assets/papers/megatron-sc21.pdf) 介绍了一种高效的一维张量并行化实现。
+
+让我们以一个线性层为例，它包括一个 GEMM $Y = XA$。 给定2个处理器，我们把 $A$ 按列划分为 $[A_1 ~ A_2]$, 并在每个处理器上计算 $Y_i = XA_i$，然后形成 $[Y_1 ~ Y_2] = [XA_1 ~ XA_2]$。这被称为列并行方式。
+
+当第二个线性层 $Z=YB$ 跟随上述列并行层的时候, 我们把 $B$ 划分为 $\left[\begin{matrix} B_1 \\ B_2 \end{matrix} \right]$,
+这就是所谓的行并行方式.
+为了计算 $Z = [Y_1 ~ Y_2] \left[\begin{matrix} B_1 \\ B_2 \end{matrix} \right]$, 我们首先在每个处理器上计算 $Y_iB_i$ 然后使用一个all-reduce操作将结果汇总为 $Z=Y_1B_1+Y_2B_2$。
+
+我们还需要注意，在后向计算中，列并行线性层需要聚合输入张量 $X$ 的梯度, 因为在每个处理器 $i$ 上，我们只有 $\dot{X_i}=\dot{Y_i}A_i^T$，因此，我们在各处理器之间进行all-reduce，得到 $\dot{X}=\dot{Y}A^T=\dot{Y_1}A_1^T+\dot{Y_2}A_2^T$。
+
+## 效率
+给定 $P$ 个处理器, 我们展现理论上的计算和内存成本，以及基于环形算法的1D张量并行的前向和后向的通信成本。
+
+| 计算 | 内存 (参数) | 内存 (activations) | 通信 (带宽) | 通信 (时延) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/P)$    | $O(1/P)$         | $O(1)$               | $O(2(P-1)/P)$             | $O(2(P-1))$             |
+
+## 使用
+
+为了使模型能够实现一维张量并行, 如在2个 GPU 上, 我们需要配置如下的并行设置。
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=2, mode='1d'),
+))
+```
+
+然后 Colossal-AI 会自动对所有来自 `colossalai.nn` 的层应用1D张量并行。
+
+让我们定义一个由两层多层感知器 (MLP) 组成的模型，如下所示。
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.transpose(0, 1).shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.transpose(0, 1).shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+
+在2个 GPU 上启动 Colossal-AI 并建立模型。
+
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+我们将会看到 MLP 模型中被划分的参数（如权重）的形状。
+```shell
+Weight of the first linear layer: torch.Size([256, 512])
+Weight of the second linear layer: torch.Size([512, 256])
+```
+第一个线性层的完整权重形状应该为 `[256, 1024]`. 经过列-并行分割，它变成了 `[256, 512]`。
+同样地，第二个行并行层将权重 `[1024, 256]` 划分为 `[512, 256]`。
+
+我们可以用一些随机输入来运行这个模型。
+```python
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+torch.distributed.broadcast(x, src=0)  # synchronize input
+
+x = m(x)
+```
+然后我们可以看到 activation 结果的形状。
+```shell
+Output of the first linear layer: torch.Size([16, 512])
+Output of the second linear layer: torch.Size([16, 256])
+```
+第一个线性层的输出被划分成2块 (每个形状为 `[16, 512]`), 而第二层在所有 GPU 上的输出是相同的。
diff --git a/docs/source/zh/features/2D_tensor_parallel.md b/docs/source/zh/features/2D_tensor_parallel.md
new file mode 100644
index 000000000000..c942f82bf9d2
--- /dev/null
+++ b/docs/source/zh/features/2D_tensor_parallel.md
@@ -0,0 +1,141 @@
+# 2D 张量并行
+
+作者: Zhengda Bian, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [并行配置](../basics/configure_parallelization.md)
+- [1D 张量并行](./1D_tensor_parallel.md)
+
+**示例代码**
+- [ColossalAI-Examples - 2D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_2d.py)
+
+**相关论文**
+- [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/pdf/2104.05343.pdf)
+
+## 引言
+
+1D张量并行没有对 activations 进行划分，就大规模模型而言，这也会消耗大量的内存。
+为了平均分配计算和内存负荷，在 SUMMA（可扩展的通用矩阵乘法算法）的基础上， [2D张量并行](https://arxiv.org/pdf/2104.05343.pdf) 被引入。
+
+我们还是以线性层 $Y = XA$ 为例。
+给定 $P=q\times q$ 个处理器（必要条件）, 如 $q=2$, 我们把输入 $X$ 和权重 $A$ 都划分为
+
+$$
+\left[\begin{matrix} X_{10} & X_{11} \\ X_{00} & X_{01} \end{matrix} \right]
+\text{~and~}
+\left[\begin{matrix} A_{10} & A_{11} \\ A_{00} & A_{01} \end{matrix} \right]。
+$$
+
+该计算包括 $q$ 步。 当 $t=1$ 时, $X_{i0}$ 在其行中被广播, 而 $A_{0j}$ 在其列中被广播。因此，我们有
+
+$$
+\left[\begin{matrix} X_{10},A_{00} & X_{10},A_{01} \\ X_{00},A_{00} & X_{00},A_{01} \end{matrix} \right]。
+$$
+
+然后我们在每个处理器 $(i, j)$ 上将 $X_{i0}$ 和 $A_{0j}$ 相乘为
+
+$$
+\left[\begin{matrix} X_{10}A_{00} & X_{10}A_{01} \\ X_{00}A_{00} & X_{00}A_{01} \end{matrix} \right] (1)。
+$$
+
+同样，当 $t=2$ 时, $X_{i1}$ 在其行中被广播, $A_{1j}$ 在其列中被广播, 我们将它们相乘为
+
+$$
+\left[\begin{matrix} X_{11}A_{10} & X_{11}A_{11} \\ X_{01}A_{10} & X_{01}A_{11} \end{matrix} \right] (2)。
+$$
+
+通过将 $(1)$ 和 $(2)$ 相加，我们有
+
+$$
+Y = XA = \left[\begin{matrix} X_{10}A_{00}+X_{11}A_{10} & X_{10}A_{01}+X_{11}A_{11} \\ X_{00}A_{00}+X_{01}A_{10} & X_{00}A_{01}+X_{01}A_{11} \end{matrix} \right]。
+$$
+
+## 效率
+给定 $P=q\times q$ 个处理器, 我们展现理论上的计算和内存成本，以及基于环形算法的2D张量并行的前向和后向的通信成本。
+
+| 计算 | 内存 (参数) | 内存 (activations) | 通信 (带宽) | 通信 (时延) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/q^2)$  | $O(1/q^2)$       | $O(1/q^2)$           | $O(6(q-1)/q)$             | $O(6(q-1))$             |
+
+## 使用
+
+为了使我们的模型能够实现二维张量并行，例如在4个 GPU 上，我们需要配置如下的并行设置。
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=4, mode='2d'),
+))
+```
+然后 Colossal-AI 会自动对所有来自 `colossalai.nn` 的层应用2D张量并行。
+
+让我们定义一个由两层多层感知器 (MLP) 组成的模型，如下所示。
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+在4个 GPU 上启动 Colossal-AI 并建立模型。
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+我们将会看到 MLP 模型中被划分的参数（如权重）的形状。
+```shell
+Weight of the first linear layer: torch.Size([128, 512])
+Weight of the second linear layer: torch.Size([512, 128])
+```
+第一个线性层的完整权重形状应该为 `[256, 1024]`. 经过2D并行划分后，它在每个 GPU 上变成了 `[128, 512]` 。
+同样地，第二层将权重 `[1024, 256]` 划分为 `[512, 128]`.
+
+我们可以用一些随机输入来运行这个模型。
+```python
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+# partition input
+torch.distributed.broadcast(x, src=0)
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)]
+x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)]
+print_rank_0(f'Input: {x.shape}')
+
+x = m(x)
+```
+然后我们可以看到 activation 结果的形状。
+```shell
+Input: torch.Size([8, 128])
+Output of the first linear layer: torch.Size([8, 512])
+Output of the second linear layer: torch.Size([8, 128])
+```
+2D并行中的 activation 张量都是同时在行和列分割的。例如，第一个线性层的输出是 `[8, 512]`, 而第二层的输出为 `[8, 128]`。
diff --git a/docs/source/zh/features/2p5D_tensor_parallel.md b/docs/source/zh/features/2p5D_tensor_parallel.md
new file mode 100644
index 000000000000..59a4be02ce47
--- /dev/null
+++ b/docs/source/zh/features/2p5D_tensor_parallel.md
@@ -0,0 +1,145 @@
+# 2.5D 张量并行
+
+作者: Zhengda Bian, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [并行配置](../basics/configure_parallelization.md)
+- [1D 张量并行](./1D_tensor_parallel.md)
+- [2D 张量并行](./2D_tensor_parallel.md)
+
+**示例代码**
+- [ColossalAI-Examples - 2.5D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_2p5d.py)
+
+**相关论文**
+- [2.5-dimensional distributed model training](https://arxiv.org/pdf/2105.14500.pdf)
+
+## 引言
+
+与一维张量并行相比，二维并行降低了内存成本，但可能引入更多的通信。因此，[2.5D张量并行](https://arxiv.org/pdf/2105.14500.pdf) 在 2.5D SUMMA 的基础上被提出，它通过使用更多的设备来减少通信。
+
+我们还是以线性层 $Y = XA$ 为例。
+给定 $P=q \times q \times d$ 个处理器（必要条件）, 如 $q=d=2$, 我们把输入 $X$ 划分为 $d\times q$ 行和 $q$ 列
+
+$$
+\left[\begin{matrix} X_{30} & X_{31} \\ X_{20} & X_{21} \\ X_{10} & X_{11} \\ X_{00} & X_{01}\end{matrix} \right],
+$$
+它可以被重塑为 $d$ 层
+
+$$
+\left[\begin{matrix} X_{10} & X_{11} \\ X_{00} & X_{01} \end{matrix} \right] \text{~and~}\left[\begin{matrix} X_{30} & X_{31} \\ X_{20} & X_{21} \end{matrix} \right].
+$$
+
+另外，权重 $A$ 被分割为
+
+$$
+\left[\begin{matrix} A_{10} & A_{11} \\ A_{00} & A_{01} \end{matrix} \right].
+$$
+
+对于 $X$ 相关的每一层, 我们使用SUMMA算法将 $X$ 与 $A$ 相乘。
+然后，我们得到输出
+
+$$
+\left[\begin{matrix} Y_{10}=X_{10}A_{00}+X_{11}A_{10} & Y_{11}=X_{10}A_{01}+X_{11}A_{11} \\ Y_{00}=X_{00}A_{00}+X_{01}A_{10} & Y_{01}=X_{00}A_{01}+X_{01}A_{11} \end{matrix} \right]
+\text{~and~}
+$$
+$$
+\left[\begin{matrix} Y_{30}=X_{30}A_{00}+X_{31}A_{10} & Y_{31}=X_{30}A_{01}+X_{31}A_{11} \\ Y_{20}=X_{20}A_{00}+X_{21}A_{10} & Y_{21}=X_{20}A_{01}+X_{21}A_{11} \end{matrix} \right].
+$$
+
+## 效率
+
+给定 $P=q \times q \times d$ 个处理器, 我们展现理论上的计算和内存成本，以及基于环形算法的2.5D张量并行的前向和后向的通信成本。
+
+| 计算 | 内存 (参数) | 内存 (activations) | 通信 (带宽) | 通信 (时延) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/dq^2)$ | $O(1/q^2)$       | $O(1/dq^2)$          | $\small O(3(q-1)(d+1)/dq)$       | $O(6(q-1))$             |
+
+## 使用
+
+为了使我们的模型能够实现2.5D张量并行，例如在8个 GPU 上，我们需要配置如下的并行设置。
+
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=8, mode='2.5d', depth=2),
+))
+
+```
+
+然后 Colossal-AI 会自动对所有来自 `colossalai.nn` 的层应用2.5D张量并行。
+
+让我们定义一个由两层多层感知器 (MLP) 组成的模型，如下所示。
+
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+在8个 GPU 上启动 Colossal-AI 并建立模型。
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+我们将会看到 MLP 模型中被划分的参数（如权重）的形状。
+```shell
+Weight of the first linear layer: torch.Size([128, 512])
+Weight of the second linear layer: torch.Size([512, 128])
+```
+
+第一个线性层的完整权重形状应该为 `[256, 1024]`. 经过2.5D并行划分后，它在每个 GPU 上变成了 `[128, 512]` 。
+同样地，第二层将权重 `[1024, 256]` 划分为 `[512, 128]`.
+
+我们可以用一些随机输入来运行这个模型。
+```python
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+# partition input
+torch.distributed.broadcast(x, src=0)
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)]
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)]
+x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)]
+print_rank_0(f'Input: {x.shape}')
+
+x = m(x)
+```
+然后我们可以看到 activation 结果的形状。
+```shell
+Input: torch.Size([4, 128])
+Output of the first linear layer: torch.Size([4, 512])
+Output of the second linear layer: torch.Size([4, 128])
+```
+2.5D并行中的 activation 张量都是同时在$d \times q$行和$q$列分割的。例如，第一个线性层的输出是 `[4, 512]`, 而第二层的输出为 `[4, 128]`。
+注意，2.5D并行使用与2D并行相同的划分方法来处理权重，区别在于对输入的划分。
diff --git a/docs/source/zh/features/3D_tensor_parallel.md b/docs/source/zh/features/3D_tensor_parallel.md
new file mode 100644
index 000000000000..440121c94243
--- /dev/null
+++ b/docs/source/zh/features/3D_tensor_parallel.md
@@ -0,0 +1,154 @@
+# 3D 张量并行
+
+作者: Zhengda Bian, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [并行配置](../basics/configure_parallelization.md)
+- [1D 张量并行](./1D_tensor_parallel.md)
+- [2D 张量并行](./2D_tensor_parallel.md)
+
+**示例代码**
+- [ColossalAI-Examples - 3D Tensor Parallelism](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/tensor_parallel/tensor_parallel_3d.py)
+
+**相关论文**
+- [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/pdf/2105.14450.pdf)
+
+## 引言
+
+[3D 张量并行](https://arxiv.org/pdf/2105.14450.pdf) 是一种将神经网络模型的计算并行化，以期望获得最佳通信成本优化的方法。
+
+我们还是以线性层 $Y = XA$ 为例。
+给定 $P=q \times q \times q$ 个处理器（必要条件）, 如 $q=2$, 我们把输入 $X$ 和权重 $A$ 划分为
+
+$$
+\left[\begin{matrix}
+            X_{000} & X_{001} \\
+            X_{010} & X_{011} \\
+            X_{100} & X_{101} \\
+            X_{110} & X_{111} \end{matrix}
+\right]
+\text{~and~}
+\left[\begin{matrix}
+            A_{000} & A_{001} & A_{010} & A_{011} \\
+            A_{100} & A_{101} & A_{110} & A_{111} \end{matrix}
+\right]
+\text{~respectively,}$$
+其中每个 $X_{ijl}$ 和 $A_{lji}$ 都被存储在处理器 $(i,j,l)$ 上, 如下图所示。
+
+<center>
+<img src="https://s2.loli.net/2022/02/17/JevO6SED5z4PFdp.png" width = "200" height = "250" />
+<img src="https://s2.loli.net/2022/02/17/qvtwjdfNXMAb4nF.png" width = "200" height = "250" />
+<img src="https://s2.loli.net/2022/02/17/WFzm2N4IwKf1jXZ.png" width = "200" height = "250" />
+<img src="https://s2.loli.net/2022/02/17/r2dZQ4hKxwTuIv6.png" width = "200" height = "250" />
+</center>
+
+然后我们在 $(i, 0...q,l)$ 上收集 $X_{ijl}$, 以及在$(0...q, j, l)$ 上收集 $A_{lji}$。
+因此，我们在每个处理器 $(i,j,l)$ 上都有 $X_{il}$ 和 $A_{lj}$ 以获得 $X_{il}A_{lj}$。
+最后，我们在 $(i, j, 0...q)$ 对结果进行 reduce-scatter 得到 $Y_{ijl}$, 形成
+$$
+Y=
+\left[\begin{matrix}
+            Y_{000} & Y_{001} \\
+            Y_{010} & Y_{011} \\
+            Y_{100} & Y_{101} \\
+            Y_{110} & Y_{111} \end{matrix}
+\right].
+$$
+
+我们还需要注意，在后向传播中, 我们需要 all-gather 梯度 $\dot{Y_{ijl}}$, 然后 reduce-scatter 梯度 $\dot{X_{il}}=\dot{Y_{ij}}A_{lj}^T$ 和 $\dot{A_{lj}}=X_{il}^T\dot{Y_{ij}}$。
+
+## 效率
+给定 $P=q \times q \times q$ 个处理器, 我们展现理论上的计算和内存成本，以及基于环形算法的3D张量并行的前向和后向的通信成本。
+
+| 计算 | 内存 (参数) | 内存 (activations) | 通信 (带宽) | 通信 (时延) |
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     |
+| $O(1/q^3)$  | $O(1/q^3)$       | $O(1/q^3)$           | $O(6(q-1)/q^3)$           | $O(6(q-1))$             |
+
+## 使用
+
+为了使我们的模型能够实现3D张量并行，例如在8个 GPU 上，我们需要配置如下的并行设置。
+
+```python
+CONFIG = dict(parallel=dict(
+    data=1,
+    pipeline=1,
+    tensor=dict(size=8, mode='3d'),
+))
+```
+然后 Colossal-AI 会自动对所有来自 `colossalai.nn` 的层应用3D张量并行。
+
+让我们定义一个由两层多层感知器 (MLP) 组成的模型，如下所示。
+
+```python
+import colossalai
+import colossalai.nn as col_nn
+import torch
+from colossalai.utils import print_rank_0
+
+class MLP(torch.nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        intermediate_dim = dim * 4
+        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
+        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
+        self.activation = torch.nn.GELU()
+        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
+        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
+        self.dropout = col_nn.Dropout(0.1)
+
+    def forward(self, x):
+        x = self.dense_1(x)
+        print_rank_0(f'Output of the first linear layer: {x.shape}')
+        x = self.activation(x)
+        x = self.dense_2(x)
+        print_rank_0(f'Output of the second linear layer: {x.shape}')
+        x = self.dropout(x)
+        return x
+```
+在8个 GPU 上启动 Colossal-AI 并建立模型。
+```python
+parser = colossalai.get_default_parser()
+colossalai.launch(config=CONFIG,
+                  rank=args.rank,
+                  world_size=args.world_size,
+                  local_rank=args.local_rank,
+                  host=args.host,
+                  port=args.port)
+
+m = MLP()
+```
+我们将会看到 MLP 模型中被划分的参数（如权重）的形状。
+```shell
+Weight of the first linear layer: torch.Size([128, 256])
+Weight of the second linear layer: torch.Size([512, 64])
+```
+
+第一个线性层的完整权重形状应该为 `[256, 1024]`. 经过3D并行划分后，它在每个 GPU 上变成了 `[128, 256]` 。
+同样地，第二层将权重 `[1024, 256]` 划分为 `[512, 64]`.
+
+我们可以用一些随机输入来运行这个模型。
+
+```python
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+x = torch.randn((16, 256), device=get_current_device())
+# partition input
+torch.distributed.broadcast(x, src=0)
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)]
+x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)]
+x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)]
+print_rank_0(f'Input: {x.shape}')
+
+x = m(x)
+```
+然后我们可以看到 activation 结果的形状。
+```shell
+Input: torch.Size([4, 128])
+Output of the first linear layer: torch.Size([4, 512])
+Output of the second linear layer: torch.Size([4, 128])
+```
+3D并行中的 activation 张量都是同时在$q^2$行和$q$列分割的。例如，第一个线性层的输出是 `[4, 512]`, 而第二层的输出为 `[4, 128]`。
+注意，虽然这里3D并行的结果与2.5D并行的结果形状相同，但每个划分的内容是不同的。
diff --git a/docs/source/zh/features/gradient_accumulation.md b/docs/source/zh/features/gradient_accumulation.md
new file mode 100644
index 000000000000..e21e5fcd43d8
--- /dev/null
+++ b/docs/source/zh/features/gradient_accumulation.md
@@ -0,0 +1,40 @@
+# 梯度累积
+
+作者: Shenggui Li, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [在训练中使用Engine和Trainer](../basics/engine_trainer.md)
+
+**示例代码**
+- [ColossalAI-Examples Gradient Accumulation](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_accumulation)
+
+## 引言
+
+梯度累积是一种常见的增大训练 batch size 的方式。 在训练大模型时，内存经常会成为瓶颈，并且 batch size 通常会很小（如2），这导致收敛性无法保证。梯度累积将多次迭代的梯度累加，并仅在达到预设迭代次数时更新参数。
+
+## 使用
+
+在 Colossal-AI 中使用梯度累积非常简单，仅需将下列配置添加进 config 文件。其中，整数值代表期望梯度累积的次数。
+
+```python
+gradient_accumulation = <int>
+```
+
+## 实例
+
+我们提供了一个 [运行实例](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_accumulation)
+来展现梯度累积。在这个例子中，梯度累积次数被设置为4，你可以通过以下命令启动脚本
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500  run_resnet_cifar10_with_engine.py
+```
+
+你将会看到类似下方的文本输出。这展现了梯度虽然在前3个迭代中被计算，但直到最后一次迭代，参数才被更新。
+
+```text
+iteration 0, first 10 elements of param: tensor([-0.0208,  0.0189,  0.0234,  0.0047,  0.0116, -0.0283,  0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=<SliceBackward0>)
+iteration 1, first 10 elements of param: tensor([-0.0208,  0.0189,  0.0234,  0.0047,  0.0116, -0.0283,  0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=<SliceBackward0>)
+iteration 2, first 10 elements of param: tensor([-0.0208,  0.0189,  0.0234,  0.0047,  0.0116, -0.0283,  0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=<SliceBackward0>)
+iteration 3, first 10 elements of param: tensor([-0.0141,  0.0464,  0.0507,  0.0321,  0.0356, -0.0150,  0.0172, -0.0118, 0.0222,  0.0473], device='cuda:0', grad_fn=<SliceBackward0>)
+```
diff --git a/docs/source/zh/features/gradient_clipping.md b/docs/source/zh/features/gradient_clipping.md
new file mode 100644
index 000000000000..203f66a3fea2
--- /dev/null
+++ b/docs/source/zh/features/gradient_clipping.md
@@ -0,0 +1,51 @@
+# 梯度裁剪
+
+作者: Boxiang Wang, Haichen Huang, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [在训练中使用Engine和Trainer](../basics/engine_trainer.md)
+
+**示例代码**
+- [ColossalAI-Examples Gradient Clipping](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_clipping)
+
+**相关论文**
+- [On the difficulty of training Recurrent Neural Networks](https://arxiv.org/abs/1211.5063)
+
+## 引言
+
+为了加快训练过程和寻求全局最优以获得更好的性能，越来越多的学习率调度器被提出。人们通过控制学习率来调整训练中的下降速度。这使得梯度向量在每一步都能更好地统一。在这种情况下，下降速度可以按预期被控制。
+因此，梯度裁剪，一种可以将梯度向量归一化，以将其限制在统一长度的技术，对于那些希望模型性能更好的人来说是不可或缺的。
+
+在使用 Colossal-AI 时，你不必担心实现梯度裁剪，我们以一种有效而方便的方式支持梯度裁剪。你所需要的只是在你的配置文件中增加一行配置。
+
+## 为什么应该使用 Colossal-AI 中的梯度裁剪
+
+我们不建议用户自己编写梯度裁剪，因为朴素的梯度裁剪在应用张量并行、流水线并行、MoE 等功能时可能会失败。
+
+根据下图，每个 GPU 只拥有线性层中权重的一部分参数。为了得到线性层权重的梯度向量的正确范数，每个 GPU 中的每个梯度向量的范数应该相加。更复杂的是，偏置的分布不同于权重的分布。通信组在求和运算中有所不同。
+
+(注: 这种情况是旧版本的 2D 并行，在代码中的实现是不一样的。但这是一个很好的例子，能够说明在梯度裁剪中统一所有通信的困难。)
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/KXiJPHt3Dum82cA.png"/>
+<figcaption>参数分布</figcaption>
+</figure>
+
+不用担心它，因为 Colossal-AI 已经为你处理好。
+
+### 使用
+要使用梯度裁剪，只需在配置文件中添加梯度裁剪范数即可。
+
+```python
+clip_grad_norm = 1.0
+```
+
+### 实例
+
+我们提供了一个展现梯度裁剪的[运行实例](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_clipping)
+。在本例中，我们将梯度裁剪范数设置为1.0，你可以使用以下命令运行脚本：
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500  train_with_engine.py
+```
diff --git a/docs/source/zh/features/gradient_handler.md b/docs/source/zh/features/gradient_handler.md
new file mode 100644
index 000000000000..701c60fed57f
--- /dev/null
+++ b/docs/source/zh/features/gradient_handler.md
@@ -0,0 +1,59 @@
+# 梯度 Handler
+
+作者: Shenggui Li, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [在训练中使用Engine和Trainer](../basics/engine_trainer.md)
+
+**示例代码**
+- [ColossalAI-Examples Gradient Handler](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_handler)
+
+## 引言
+
+在分布式训练中，每次迭代结束时都需要梯度同步。这很重要，因为我们需要确保在不同的机器中使用相同的梯度更新参数，以便生成的参数都一样。这通常在数据并行中看到，因为在数据并行中的模型是直接复制的。
+
+在 Colossal-AI 中，我们为用户提供了一个接口来定制他们想要如何处理同步。这为实现新的并行方法等情况带来了灵活性。
+
+当梯度 Handler 被使用时, PyTorch 的 `DistributedDataParallel` 将不再被使用，因为它会自动同步梯度.
+
+## 定制你的梯度 Handler
+
+要实现定制的梯度Handler，需要遵循以下步骤。
+1. 继承Colossal-AI中的 `BaseGradientHandler`
+2. 将梯度Handler注册进 `GRADIENT_HANDLER`
+3. 实现 `handle_gradient`
+
+```python
+from colossalai.registry import GRADIENT_HANDLER
+from colossalai.engine.gradient_handler import BaseGradientHandler
+
+
+@GRADIENT_HANDLER.register_module
+class MyGradientHandler(BaseGradientHandler):
+
+    def handle_gradient(self):
+        do_something()
+
+
+```
+
+
+## 使用
+
+要使用梯度 Handler，需要在配置文件中指定梯度 Handler。梯度 Handler 将自动构建并连接到 Engine。
+
+```python
+gradient_handler = [dict(type='MyGradientHandler')]
+```
+
+
+### 实例
+
+我们提供了一个 [运行实例](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_handler)
+展现梯度 Handler 的使用. 在这个例子中，我们使用 `DataParallelGradientHandler` 而不是 PyTorch 的
+`DistributedDataParallel` 实现数据并行.
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500  train_with_engine.py
+```
diff --git a/docs/source/zh/features/mixed_precision_training.md b/docs/source/zh/features/mixed_precision_training.md
new file mode 100644
index 000000000000..c9db3a59c1c3
--- /dev/null
+++ b/docs/source/zh/features/mixed_precision_training.md
@@ -0,0 +1,344 @@
+# 自动混合精度训练 (AMP)
+
+作者: Chuanrui Wang, Shenggui Li, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [在训练中使用Engine和Trainer](../basics/engine_trainer.md)
+
+**示例代码**
+- [ColossalAI-Examples AMP](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/amp)
+
+**相关论文**
+- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
+
+
+## 引言
+
+AMP 代表自动混合精度训练。
+在 Colossal-AI 中, 我们结合了混合精度训练的不同实现:
+
+1. torch.cuda.amp
+2. apex.amp
+3. naive amp
+
+
+| Colossal-AI | 支持张量并行 | 支持流水并行 | fp16范围 |
+| ----------- | ----------------------- | ------------------------- | ----------- |
+| AMP_TYPE.TORCH | ✅ | ❌ | 在前向和反向传播期间，模型参数、激活和梯度向下转换至fp16 |
+| AMP_TYPE.APEX | ❌ | ❌ | 更细粒度，我们可以选择 opt_level O0, O1, O2, O3 |
+| AMP_TYPE.NAIVE | ✅ | ✅ | 模型参数、前向和反向操作，全都向下转换至fp16 |
+
+前两个依赖于 PyTorch (1.6及以上) 和 NVIDIA Apex 的原始实现。最后一种方法类似 Apex O2。在这些方法中，Apex-AMP 与张量并行不兼容。这是因为张量是以张量并行的方式在设备之间拆分的，因此，需要在不同的进程之间进行通信，以检查整个模型权重中是否出现inf或nan。我们修改了torch amp实现，使其现在与张量并行兼容。
+
+> ❌️ fp16与ZeRO配置不兼容
+>
+> ⚠️ 流水并行目前仅支持naive amp
+
+我们建议使用 torch AMP，因为在不使用流水并行时，它通常比 NVIDIA AMP 提供更好的准确性。
+
+## 目录
+
+在本教程中，我们将介绍:
+
+1. AMP 介绍
+2. Colossal-AI 中的 AMP
+3. 练习实例
+
+## AMP 介绍
+
+自动混合精度训练是混合 FP16 和 FP32 训练。
+
+半精度浮点格式（FP16）具有较低的算术复杂度和较高的计算效率。此外，FP16 仅需要 FP32 所需的一半存储空间，并节省了内存和网络带宽，从而为大 batch size 和大模型提供了更多内存。
+
+然而，还有其他操作，如归约（reduction），需要 FP32 的动态范围，以避免数值溢出/下溢。因此，我们引入自动混合精度，尝试将每个操作与其相应的数据类型相匹配，这可以减少内存占用并提高训练效率。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/URzLJ3MPeDQbtck.png"/>
+<figcaption>AMP 示意图 (图片来自 <a href="https://arxiv.org/abs/2108.05818">PatrickStar 论文</a>)</figcaption>
+</figure>
+
+## Colossal-AI 中的 AMP
+
+我们支持三种 AMP 训练方法，并允许用户在没有改变代码的情况下使用 AMP 进行训练。只需在配置文件中添加'fp16'配置即可使用 AMP。
+
+```python
+from colossalai.amp import AMP_TYPE
+
+# 使用 Torch AMP
+fp16=dict(
+    mode = AMP_TYPE.TORCH
+)
+
+# 使用 naive AMP
+fp16=dict(
+    mode = AMP_TYPE.NAIVE
+)
+
+# 使用 Nvidia Apex AMP
+fp16=dict(
+    mode = AMP_TYPE.APEX
+)
+
+```
+
+> 这些是最低配置，完整配置将在后面的部分中说明
+
+### AMP 模块化
+
+AMP 模块设计为完全模块化，可以独立使用。如果你想在你的代码库中只使用 AMP 而不使用`colossalai.initialize`，你可以导入`colossalai.amp.convert_to_amp`。
+
+```python
+from colossalai.amp import AMP_TYPE
+
+# 使用torch amp的例子
+model, optimizer, criterion = colossalai.amp.convert_to_amp(model,
+                                                            optimizer,
+                                                            criterion,
+                                                            AMP_TYPE.TORCH)
+```
+
+### Torch AMP 配置
+
+```python
+from colossalai.amp import AMP_TYPE
+
+fp16=dict(
+    mode=AMP_TYPE.TORCH,
+
+    # 下列是grad scaler的默认值
+    init_scale=2.**16,
+    growth_factor=2.0,
+    backoff_factor=0.5,
+    growth_interval=2000,
+    enabled=True
+)
+```
+
+可选参数:
+- init_scale(float, optional, default=2.**16): 初始缩放因子；
+- growth_factor(float, optional, default=2.0): 如果在``growth_interval``连续迭代过程中没有出现 inf/NaN 梯度，则在`update`中乘以比例系数；
+- backoff_factor(float, optional, default=0.5): 如果在迭代中出现 inf/NaN 梯度，则在`update`中乘以比例系数；
+- growth_interval(int, optional, default=2000): 在指定次数的连续迭代中，若没有出现 inf/NaN 梯度，则乘以``growth_factor``.
+- enabled(bool, optional, default=True):  ``False``则使梯度缩放无效，`step` 仅调用底层的 ``optimizer.step()``, 其他方法成为空操作。
+
+### Apex AMP 配置
+
+对于这种模式，我们依靠 Apex 实现混合精度训练。我们支持这个插件，因为它允许对混合精度的粒度进行更精细的控制。
+例如, O2 水平 (优化器水平2) 将保持 batch normalization 为 FP32。
+
+如果你想了解更多细节，请参考 [Apex Documentation](https://nvidia.github.io/apex/)。
+
+```python
+from colossalai.amp import AMP_TYPE
+
+fp16 = dict(
+    mode=AMP_TYPE.APEX,
+
+    # 下列是默认值
+    enabled=True,
+    opt_level='O1',
+    cast_model_type=None,
+    patch_torch_functions=None,
+    keep_batchnorm_fp32=None,
+    master_weights=None,
+    loss_scale=None,
+    cast_model_outputs=None,
+    num_losses=1,
+    verbosity=1,
+    min_loss_scale=None,
+    max_loss_scale=16777216.0
+)
+```
+
+参数:
+- enabled(bool, optional, default=True): False 会使所有 AMP 调用成为空操作, 程序将会像没有使用 AMP 一样运行。
+
+- opt_level(str, optional, default="O1" ): 纯精度或混合精度优化水平。可选值 “O0”, “O1”, “O2” 和 “O3”, 详细解释见上方 Apex AMP 文档。
+
+- num_losses(int, optional, default=1): 选择提前告知 AMP 您计划使用多少次损失/反向计算。
+当`amp.scale_loss`与 loss_id 参数一起使用时，使 AMP 在每次损失/反向计算时使用不同的损失比例，这可以提高稳定性。如果 num_losses 被设置为1，AMP 仍支持多次损失/反向计算，但对他们都使用同一个全局损失比例。
+
+- verbosity(int, default=1): 设置为0抑制 AMP 相关输出。
+
+- min_loss_scale(float, default=None): 为可通过动态损失比例选择的损失比例值设置下限。
+默认值“None”意味着不设置任何下限。如果不使用动态损失比例，则忽略 min_loss_scale 。
+
+- max_loss_scale(float, default=2.**24 ): 为可通过动态损失比例选择的损失比例值设置上限。如果不使用动态损失比例，则 max_loss_scale 被忽略.
+
+目前，管理纯精度或混合精度训练的幕后属性有以下几种:
+cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale.
+一旦 opt_level 被确定，它们是可选的可覆盖属性
+
+- cast_model_type: 将模型的参数和缓冲区强制转换为所需的类型。
+- patch_torch_functions: 补全所有的 Torch 函数和张量方法，以便在FP16中执行张量核心友好的操作，如 GEMMs 和卷积，以及在 FP32 中执行任何受益于 FP32 精度的操作。
+- keep_batchnorm_fp32: 为了提高精度并启用 cudnn batchnorm (这会提高性能),在 FP32 中保留 batchnorm 权重通常是有益的，即使模型的其余部分是 FP16。
+- master_weights: 保持 FP32 主权重以配合任何 FP16 模型权重。 FP32 主权重由优化器执行更新（step），以提高精度并捕捉小梯度。
+- loss_scale: 如果 loss_scale 是一个浮点数，则使用这个值作为静态（固定）的损失比例。如果 loss_scale 是字符串 "dynamic"，则随着时间的推移自适应地调整损失比例。动态损失比例调整由 AMP 自动执行。
+
+
+### Naive AMP 配置
+
+在 Naive AMP 模式中, 我们实现了混合精度训练，同时保持了与复杂张量和流水并行的兼容性。该 AMP 模式将所有操作转为 FP16 。下列代码块展示了该模式的`config.py`。
+
+```python
+from colossalai.amp import AMP_TYPE
+
+fp16 = dict(
+    mode=AMP_TYPE.NAIVE,
+
+    # below are the default values
+    log_num_zeros_in_grad=False,
+    initial_scale=2 ** 32,
+    min_scale=1,
+    growth_factor=2,
+    backoff_factor=0.5,
+    growth_interval=1000,
+    hysteresis=2
+)
+```
+
+Naive AMP 的默认参数:
+- log_num_zeros_in_grad(bool): 返回0值梯度的个数.
+- initial_scale(int): gradient scaler 的初始值
+- growth_factor(int): loss scale 的增长率
+- backoff_factor(float): loss scale 的下降率
+- hysteresis(int): 动态 loss scaling 的延迟偏移
+- max_scale(int): loss scale 的最大允许值
+- verbose(bool): 如果被设为`True`,将打印调试信息
+
+当使用`colossalai.initialize`时, 首先需要实例化一个模型、一个优化器和一个损失函数（criterion）。输出的模型会被转换为内存消耗较小的 AMP 模型。如果您的输入模型已经太大，无法放置在 GPU 中，请使用`dtype=torch.float16`实例化你的模型。或者请尝试更小的模型，或尝试更多的并行化训练技术！
+
+## 实例
+
+我们提供了一个 [运行实例](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/amp)
+展现如何在 Colossal-AI 使用 AMP。在该例程中，我们使用 Torch AMP, 但提供的配置文件也适用于所有 AMP 模式.
+
+### 步骤 1. 创建配置文件
+
+创建一个`config.py`文件并添加`fp16`配置.
+
+```python
+# in config.py
+from colossalai.amp import AMP_TYPE
+
+BATCH_SIZE = 128
+DROP_RATE = 0.1
+NUM_EPOCHS = 300
+
+fp16 = dict(
+    mode=AMP_TYPE.TORCH,
+)
+
+clip_grad_norm = 1.0
+```
+
+### 步骤 2. 在 train_with_engine.py 导入相关库
+
+创建`train_with_engine.py`并导入必要依赖. 请记得通过命令`pip install timm scipy`安装`scipy`和`timm`。
+
+```python
+import os
+import colossalai
+import torch
+from pathlib import Path
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.utils import get_dataloader
+from colossalai.trainer import Trainer, hooks
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+from timm.models import vit_base_patch16_224
+from torchvision import datasets, transforms
+
+```
+
+### 步骤 3. 初始化分布式环境
+
+我们需要初始化分布式环境。为了快速演示，我们使用`launch_from_torch`。你可以参考 [Launch Colossal-AI](../basics/launch_colossalai.md)
+使用其他初始化方法。
+
+```python
+# 初始化分布式设置
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+
+# launch from torch
+colossalai.launch_from_torch(config=args.config)
+
+```
+
+### 步骤 4. 创建训练组件
+
+构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 设置该环境变量，或将 `Path(os.environ['DATA'])`
+改为你机器上的路径。数据将会被自动下载到该路径。
+
+```python
+# build model
+    model = vit_base_patch16_224(drop_rate=0.1)
+
+    # build dataloader
+    train_dataset = datasets.Caltech101(
+        root=Path(os.environ['DATA']),
+        download=True,
+        transform=transforms.Compose([
+            transforms.Resize(256),
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            Gray2RGB(),
+            transforms.Normalize([0.5, 0.5, 0.5],
+                                 [0.5, 0.5, 0.5])
+        ]))
+
+    train_dataloader = get_dataloader(dataset=train_dataset,
+                                      shuffle=True,
+                                      batch_size=gpc.config.BATCH_SIZE,
+                                      num_workers=1,
+                                      pin_memory=True,
+                                      )
+
+    # build optimizer
+    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=0.1)
+
+    # build loss
+    criterion = torch.nn.CrossEntropyLoss()
+
+    # lr_scheduler
+    lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)
+```
+
+### 步骤 5. 插入 AMP
+
+调用 `colossalai.initialize` 将所有训练组件转为 FP16 模式.
+
+```python
+engine, train_dataloader, _, _ = colossalai.initialize(
+        model, optimizer, criterion, train_dataloader,
+    )
+```
+
+### 步骤 6. 使用 Engine 训练
+
+使用Engine构建一个普通的训练循环
+
+```python
+engine.train()
+for epoch in range(gpc.config.NUM_EPOCHS):
+    for img, label in train_dataloader:
+        img = img.cuda()
+        label = label.cuda()
+        engine.zero_grad()
+        output = engine(img)
+        loss = engine.criterion(output, label)
+        engine.backward(loss)
+        engine.step()
+        lr_scheduler.step()
+```
+
+### 步骤 7. 启动训练脚本
+
+使用下列命令启动训练脚本，你可以改变 `--nproc_per_node` 以使用不同数量的 GPU。
+
+```shell
+python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py --config config/config_AMP_torch.py
+```
diff --git a/docs/source/zh/features/nvme_offload.md b/docs/source/zh/features/nvme_offload.md
new file mode 100644
index 000000000000..0ced6031de63
--- /dev/null
+++ b/docs/source/zh/features/nvme_offload.md
@@ -0,0 +1,43 @@
+# NVMe offload
+
+作者: Hongxin Liu
+
+**前置教程:**
+- [基于Chunk内存管理的零冗余优化器 (ZeRO)](../features/zero_with_chunk.md)
+
+## 引言
+
+如果模型具有`N`个参数，在使用 Adam 时，优化器状态具有`8N`个参数。对于十亿规模的模型，优化器状态至少需要 32 GB 内存。 GPU显存限制了我们可以训练的模型规模，这称为GPU显存墙。如果我们将优化器状态 offload 到磁盘，我们可以突破 GPU 内存墙。
+
+我们实现了一个用户友好且高效的异步 Tensor I/O 库：[TensorNVMe](https://github.com/hpcaitech/TensorNVMe)。有了这个库，我们可以简单地实现 NVMe offload。
+
+> 该库与各种磁盘（HDD、SATA SSD 和 NVMe SSD）兼容。由于 HDD 或 SATA SSD 的 I/O 带宽较低，建议仅在 NVMe 磁盘上使用此库。
+
+在优化参数时，我们可以将优化过程分为三个阶段：读取、计算和 offload。我们以流水线的方式执行优化过程，这可以重叠计算和 I/O。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/08/16/CvRnowrsNyB4hza.jpg"/>
+<figcaption>优化过程</figcaption>
+</figure>
+
+
+## 使用
+
+首先，请确保您安装了 [TensorNVMe](https://github.com/hpcaitech/TensorNVMe):
+
+```shell
+pip install packaging
+pip install tensornvme
+```
+
+我们为 Adam ([CPUAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.cpu_adam.html) 和 [HybridAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.hybrid_adam.html)) 实现了优化器状态的 NVMe offload。
+
+```python
+from colossalai.nn.optimizer import CPUAdam, HybridAdam
+
+optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, nvme_offload_dir='./')
+```
+
+`nvme_offload_fraction` 是要 offload 到 NVMe 的优化器状态的比例。 `nvme_offload_dir` 是保存 NVMe offload 文件的目录。如果 `nvme_offload_dir` 为 `None`，将使用随机临时目录。
+
+它与 ColossalAI 中的所有并行方法兼容。
diff --git a/docs/source/zh/features/pipeline_parallel.md b/docs/source/zh/features/pipeline_parallel.md
new file mode 100644
index 000000000000..98096b1d7f93
--- /dev/null
+++ b/docs/source/zh/features/pipeline_parallel.md
@@ -0,0 +1,158 @@
+# 流水并行
+
+作者: Guangyang Lu, Hongxin Liu, Yongbin Li
+
+**前置教程**
+- [定义配置文件](../basics/define_your_config.md)
+- [在训练中使用Engine和Trainer](../basics/engine_trainer.md)
+- [并行配置](../basics/configure_parallelization.md)
+
+**示例代码**
+- [ColossalAI-Examples ResNet with pipeline](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/pipeline_parallel)
+
+**相关论文**
+- [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training](https://arxiv.org/abs/2110.14883)
+- [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473)
+- [GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism](https://arxiv.org/abs/1811.06965)
+
+## 快速预览
+
+在本教程中，你将学习如何使用流水并行。在 Colossal-AI 中, 我们使用 NVIDIA 推出的 1F1B 流水线。由于在本例中, 使用 ViT 和 ImageNet 太过庞大，因此我们使用 ResNet 和 CIFAR 为例.
+
+## 目录
+
+在本教程中，我们将介绍:
+
+1. 介绍 1F1B 流水线；
+2. 使用非交错和交错 schedule；
+3. 使用流水线训练 ResNet。
+
+## 认识 1F1B 流水线
+
+首先，我们将向您介绍 GPipe，以便您更好地了解。
+
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/OAucPF6mWYynUtV.png"/>
+<figcaption>图1: GPipe，来自论文 <a href="https://arxiv.org/pdf/2104.04473.pdf">Megatron-LM</a> 。</figcaption>
+</figure>
+
+正如你所看到的，对于 GPipe，只有当一个批次中所有 microbatches 的前向计算完成后，才会执行后向计算。
+
+一般来说，1F1B（一次前向传播和一次反向传播）比 GPipe （在内存或内存和时间方面）更有效率。1F1B 流水线有两个 schedule ，非交错式和交错式，图示如下。
+<figure style={{textAlign: "center"}}>
+<img src="https://s2.loli.net/2022/01/28/iJrVkp2HLcahjsT.png"/>
+<figcaption>图2: 图片来自论文 <a href="https://arxiv.org/pdf/2104.04473.pdf">Megatron-LM</a> 。上面的部分显示了默认的非交错 schedule，底部显示的是交错的 schedule。</figcaption>
+</figure>
+
+### 非交错 Schedule
+
+非交错式 schedule 可分为三个阶段。第一阶段是热身阶段，处理器进行不同数量的前向计算。在接下来的阶段，处理器进行一次前向计算，然后是一次后向计算。处理器将在最后一个阶段完成后向计算。
+
+这种模式比 GPipe 更节省内存。然而，它需要和 GPipe 一样的时间来完成一轮计算。
+
+### 交错 Schedule
+
+这个 schedule 要求**microbatches的数量是流水线阶段的整数倍**。
+
+在这个 schedule 中，每个设备可以对多个层的子集（称为模型块）进行计算，而不是一个连续层的集合。具体来看，之前设备1拥有层1-4，设备2拥有层5-8，以此类推；但现在设备1有层1,2,9,10，设备2有层3,4,11,12，以此类推。
+在该模式下，流水线上的每个设备都被分配到多个流水线阶段，每个流水线阶段的计算量较少。
+
+这种模式既节省内存又节省时间。
+
+## 使用schedule
+
+在 Colossal-AI 中, 我们提供非交错(`PipelineSchedule`) 和交错(`InterleavedPipelineSchedule`)schedule。
+
+你只需要在配置文件中，设置 `NUM_MICRO_BATCHES` 并在你想使用交错schedule的时候，设置 `NUM_CHUNKS`。 如果你确定性地知道每个管道阶段的输出张量的形状，而且形状都是一样的，你可以设置 `tensor_shape` 以进一步减少通信。否则，你可以忽略 `tensor_shape` , 形状将在管道阶段之间自动交换。 我们将会根据用户提供的配置文件，生成一个合适schedule来支持用户的流水并行训练。
+
+## 使用流水线训练 ResNet
+
+我们首先用Colossal PipelinableContext方式建立 `ResNet` 模型:
+```python
+import os
+from typing import Callable, List, Optional, Type, Union
+import torch
+import torch.nn as nn
+import colossalai
+import colossalai.nn as col_nn
+
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.trainer import Trainer, hooks
+from colossalai.utils import MultiTimer, get_dataloader
+from colossalai.context import ParallelMode
+from colossalai.pipeline.pipelinable import PipelinableContext
+
+from titans.dataloader.cifar10 import build_cifar
+from torchvision.models import resnet50
+from torchvision.models.resnet import BasicBlock, Bottleneck, conv1x1
+
+# Define some config
+BATCH_SIZE = 64
+NUM_EPOCHS = 2
+NUM_CHUNKS = 1
+CONFIG = dict(NUM_MICRO_BATCHES=4, parallel=dict(pipeline=2))
+
+# Train
+disable_existing_loggers()
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+colossalai.launch_from_torch(backend=args.backend, config=CONFIG)
+logger = get_dist_logger()
+pipelinable = PipelinableContext()
+
+# build model
+with pipelinable:
+    model = resnet50()
+```
+
+给定切分顺序，module直接给出name，部分函数需要手动添加。
+```python
+exec_seq = [
+    'conv1', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'avgpool',
+    (lambda x: torch.flatten(x, 1), "behind"), 'fc'
+]
+pipelinable.to_layer_list(exec_seq)
+```
+
+将模型切分成流水线阶段。
+```python
+model = pipelinable.partition(NUM_CHUNKS, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
+```
+
+我们使用`Trainer`训练`ResNet`:
+```python
+# build criterion
+criterion = nn.CrossEntropyLoss()
+
+# optimizer
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+# build dataloader
+root = os.environ.get('DATA', './data')
+train_dataloader, test_dataloader = build_cifar(BATCH_SIZE, root, padding=4, crop=32, resize=32)
+
+lr_scheduler = col_nn.lr_scheduler.LinearWarmupLR(optimizer, NUM_EPOCHS, warmup_steps=1)
+engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(model, optimizer, criterion,
+                                                                                train_dataloader, test_dataloader,
+                                                                                lr_scheduler)
+timer = MultiTimer()
+
+trainer = Trainer(engine=engine, timer=timer, logger=logger)
+
+hook_list = [
+    hooks.LossHook(),
+    hooks.AccuracyHook(col_nn.metric.Accuracy()),
+    hooks.LogMetricByEpochHook(logger),
+    hooks.LRSchedulerHook(lr_scheduler, by_epoch=True)
+]
+
+trainer.fit(train_dataloader=train_dataloader,
+            epochs=NUM_EPOCHS,
+            test_dataloader=test_dataloader,
+            test_interval=1,
+            hooks=hook_list,
+            display_progress=True)
+```
+
+我们使用 `2` 个流水段，并且 batch 将被切分为 `4` 个 micro batches。
diff --git a/docs/source/zh/features/zero_with_chunk.md b/docs/source/zh/features/zero_with_chunk.md
new file mode 100644
index 000000000000..13dd1cd20130
--- /dev/null
+++ b/docs/source/zh/features/zero_with_chunk.md
@@ -0,0 +1,261 @@
+# 基于Chunk内存管理的零冗余优化器 (ZeRO)
+
+作者: [Hongxin Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY)
+
+**前置教程:**
+- [定义配置文件](../basics/define_your_config.md)
+
+**示例代码**
+- [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt)
+
+**相关论文**
+
+- [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
+- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
+- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+- [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
+
+## 引言
+
+零冗余优化器 (ZeRO) 通过对三个模型状态（优化器状态、梯度和参数）进行划分而不是复制他们，消除了数据并行进程中的内存冗余。该方法与传统的数据并行相比，内存效率得到了极大的提高，而计算粒度和通信效率得到了保留。
+
+1. **分片优化器状态**: 优化器状态 (如 [Adam optimizer](https://arxiv.org/abs/1412.6980), 32位的权重,
+以及一二阶动量估计) 被划分到各个进程中, 因此每个进程只更新其分区。
+
+
+2. **分片梯度**: 在梯度在数据并行进程组内进行 reduction 后, 梯度张量也被划分，这样每个进程只存储与其划分的优化器状态对应的梯度。 注意, Colossal-AI 将梯度转换为 FP32 格式以参与更新参数。
+
+3. **分片参数**: 16位的模型参数被划分到一个数据并行组的进程中。
+
+4. **[Gemini](../advanced_tutorials/meet_gemini.md)**: 对于参数、梯度、优化器状态的动态异构内存空间管理器。
+
+此外，我们还将介绍基于Chunk内存管理的零冗余优化器。
+
+在使用零冗余优化器 (ZeRO)时，我们通过切分参数的方式对模型进行分布式存储，这种方法的优点是每个节点的内存负载是完全均衡的。但是这种方式有很多缺点。首先，通信时需要申请一块临时内存用来通信，通信完毕释放，这会导致存在内存碎片化的问题。其次，以Tensor为粒度进行通信，会导致网络带宽无法充分利用。通常来说传输的消息长度越长带宽利用率越高。
+
+利用ColossalAI v0.1.8引入的Chunk机制，我们可以提升ZeRO的性能。我们将运算顺序上连续的一组参数存入一个Chunk中（Chunk即一段连续的内存空间），每个Chunk的大小相同。以Chunk方式组织内存可以保证PCI-e和GPU-GPU之间网络带宽的高效利用，减小了通信次数，同时避免潜在的内存碎片。
+
+在v0.1.8之前，ZeRO在进行参数聚合时通信成本较高，如果一个参数在连续的几次计算中被使用多次，即会发生多次通信，效率较低。这种情况在使用Checkpoint时非常常见，参数在计算backward时会重计算一遍forward。这种情况下，ZeRO的效率便不高。
+
+以GPT为例，其Checkpoint会应用在每一个GPT Block上，每一个GPT Block包含一个Self-Attention层和MLP层。在计算Backward时，会依次计算Self-Attention层、MLP层的forward，然后依次计算MLP层、Self-Attention层的backward。如使用Chunk机制，我们将Self-Attention层和MLP层放在同一个Chunk中，在每个GPT Block的backward中便无需再通信。
+
+除此之外，由于小Tensor的通信、内存移动没法完全利用NVLINK、PCIE带宽，而且每次通信、内存移动都有kernel launch的开销。使用了Chunk之后可以把多次小Tensor的通信、内存移动变为一次大Tensor的通信、内存移动，既提高了带宽利用，也减小了kernel launch的开销。
+
+我们提供了轻量级的Chunk搜索机制，帮助用户自动找到内存碎片最小的Chunk尺寸。
+
+## 使用
+
+### GeminiDDP
+
+我们将运用`GeminiDDP`的方式来使用基于Chunk内存管理的ZeRO。这是我们新包装的 `torch.nn.Module`，它使用 ZeRO-DP 和 Gemini，其中ZeRO 用于并行，Gemini 用于内存管理。
+
+同样需要确保你的模型是在 `ColoInitContext` 的上下文中初始化的。
+
+```python
+with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+  model = gpt2_medium(checkpoint=True)
+```
+
+定义模型参数如下:
+
+```python
+chunk_manager = init_chunk_manager(model=module,
+                                   init_device=device,
+                                   hidden_dim=hidden_dim,
+                                   search_range_mb=search_range_mb,
+                                   min_chunk_size_mb=min_chunk_size_mb)
+gemini_manager = GeminiManager(placement_policy, chunk_manager)
+model = ZeroDDP(model, gemini_manager)
+```
+
+`hidden_dim`是DNN的隐藏维度。用户可以提供这个参数来加快搜索速度。如果用户在训练前不知道这个参数也没关系，我们将使用默认值 1024。`min_chunk_size_mb`是以兆字节为单位的最小块大小。如果参数的总大小仍然小于最小块大小，则所有参数将被压缩为一个小块。
+
+初始化优化器。
+```python
+optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+```
+
+训练
+```python
+optimizer.zero_grad()
+outputs = model(input_ids, attn_mask)
+loss = criterion(outputs, input_ids)
+optimizer.backward(loss)
+optimizer.step()
+```
+> ⚠️ 注意：请不要使用`loss.backward()`，规范写法是`optimizer.backward(loss)`。
+
+### 训练GPT
+
+在此例程中, 我们使用 `Hugging Face Transformers`，并以 `GPT2 Medium` 为例。你必须在运行该例程前安装 `transformers`。
+
+为了简单起见，我们在这里只使用随机生成的数据。
+
+首先我们只需要引入`Huggingface transformers` 的 `GPT2LMHeadModel`来定义我们的模型，不需要用户进行模型的定义与修改，方便用户使用。
+
+```python
+class GPTLMModel(nn.Module):
+
+    def __init__(self,
+                 hidden_size=768,
+                 num_layers=12,
+                 num_attention_heads=12,
+                 max_seq_len=1024,
+                 vocab_size=50257,
+                 checkpoint=False):
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.model = GPT2LMHeadModel(
+            GPT2Config(n_embd=hidden_size,
+                       n_layer=num_layers,
+                       n_head=num_attention_heads,
+                       n_positions=max_seq_len,
+                       n_ctx=max_seq_len,
+                       vocab_size=vocab_size))
+        if checkpoint:
+            self.model.gradient_checkpointing_enable()
+
+    def forward(self, input_ids, attention_mask):
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
+
+def gpt2_medium(checkpoint=False):
+    return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16, checkpoint=checkpoint)
+```
+
+定义损失函数:
+
+```python
+class GPTLMLoss(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+```
+
+定义张量并行和参数分片策略：
+
+```python
+def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
+    for mn, module in model.named_modules():
+        for pn, param in module.named_parameters(recurse=False):
+            if hasattr(param, 'visited'):
+                continue
+            param.set_dist_spec(ReplicaSpec())
+            if 'mlp.c_fc' in mn:
+                if 'weight' in pn or 'bias' in pn:
+                    split_param_col_tp1d(param, pg)
+                    param.compute_spec.set_output_replicate(False)
+                else:
+                    param.set_dist_spec(ReplicaSpec())
+            elif 'mlp.c_proj' in mn:
+                if 'weight' in pn:
+                    split_param_row_tp1d(param, pg)
+                else:
+                    param.set_dist_spec(ReplicaSpec())
+            elif 'wte' in mn or 'wpe' in mn:
+                split_param_col_tp1d(param, pg)
+            elif 'c_attn' in mn or 'c_proj' in mn:
+                split_param_col_tp1d(param, pg)
+            else:
+                param.set_dist_spec(ReplicaSpec())
+
+            param.visited = True
+def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
+    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    param.set_tensor_spec(*spec)
+
+
+def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(0, param, pg)
+
+
+def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(-1, param, pg)
+```
+
+定义一个使用 Gemini + ZeRO DDP 的模型：
+
+```python
+def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
+    cai_version = colossalai.__version__
+    if version.parse(cai_version) > version.parse("0.1.10"):
+        from colossalai.nn.parallel import GeminiDDP
+        model = GeminiDDP(model,
+                          device=get_current_device(),
+                          placement_policy=placememt_policy,
+                          pin_memory=True,
+                          search_range_mb=32)
+    elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
+        from colossalai.gemini import ChunkManager, GeminiManager
+        chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)
+        chunk_manager = ChunkManager(chunk_size,
+                                     pg,
+                                     enable_distributed_storage=True,
+                                     init_device=GeminiManager.get_default_device(placememt_policy))
+        gemini_manager = GeminiManager(placememt_policy, chunk_manager)
+        model = ZeroDDP(model, gemini_manager)
+    else:
+        raise NotImplementedError(f"CAI version {cai_version} is not supported")
+    return model
+```
+
+由于我们在这个例子中对GPT进行预训练，因此只使用了一个简单的语言模型损失函数。
+
+写一个获得随机输入的函数:
+
+```python
+def get_data(batch_size, seq_len, vocab_size):
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+    attention_mask = torch.ones_like(input_ids)
+    return input_ids, attention_mask
+```
+
+最后，我们可以定义我们的训练循环:
+
+```python
+def main():
+    args = parse_args()
+    BATCH_SIZE = 8
+    SEQ_LEN = 1024
+    VOCAB_SIZE = 50257
+    NUM_STEPS = 10
+    colossalai.launch_from_torch(config={})
+
+    # build criterion
+    criterion = GPTLMLoss()
+
+    torch.manual_seed(123)
+    default_pg = ProcessGroup(tp_degree=args.tp_degree)
+    default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
+    # build GPT model
+    with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+      model = gpt2_medium(checkpoint=True)
+    pg = default_pg
+    # Tensor Parallelism (TP)
+    tensor_parallelize(model, pg)
+    # Gemini + ZeRO DP, Note it must be used after TP
+    model = gemini_zero_dpp(model, pg, args.placement)
+    # build optimizer
+    optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+    numel = sum([p.numel() for p in model.parameters()])
+    get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN)
+    torch.cuda.synchronize()
+    model.train()
+    for n in range(NUM_STEPS):
+        # we just use randomly generated data here
+        input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
+        optimizer.zero_grad()
+        outputs = model(input_ids, attn_mask)
+        loss = criterion(outputs, input_ids)
+        optimizer.backward(loss)
+        optimizer.step()
+
+    torch.cuda.synchronize()
+```
+> ⚠️ 注意：如果你使用Gemini模块的话，请不要使用我们之前提到过的[梯度累加](../features/gradient_accumulation.md)。
+完整的例子代码可以在 [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt) 获得。
diff --git a/docs/source/zh/get_started/installation.md b/docs/source/zh/get_started/installation.md
new file mode 100755
index 000000000000..bcb2112bb2e8
--- /dev/null
+++ b/docs/source/zh/get_started/installation.md
@@ -0,0 +1,36 @@
+# 安装
+
+## 从PyPI上安装
+
+你可以使用以下命令直接从 PyPI 安装 Colossal-AI。
+
+```shell
+pip install colossalai
+```
+
+如果你想同时安装PyTorch扩展的话，可以添加`CUDA_EXT=1`。如果不添加的话，PyTorch扩展会在运行时自动安装。
+
+```shell
+CUDA_EXT=1 pip install colossalai
+```
+
+## 从源安装
+
+> 此文档将与版本库的主分支保持一致。如果您遇到任何问题，欢迎给我们提 issue :)
+
+```shell
+git clone https://github.com/hpcaitech/ColossalAI.git
+cd ColossalAI
+
+# install dependency
+pip install -r requirements/requirements.txt
+
+# install colossalai
+pip install .
+```
+
+如果您不想安装和启用 CUDA 内核融合（使用融合优化器时强制安装）：
+
+```shell
+NO_CUDA_EXT=1 pip install .
+```
diff --git a/docs/source/zh/get_started/reading_roadmap.md b/docs/source/zh/get_started/reading_roadmap.md
new file mode 100755
index 000000000000..8813386dc9c1
--- /dev/null
+++ b/docs/source/zh/get_started/reading_roadmap.md
@@ -0,0 +1,10 @@
+# 阅读指引
+
+Colossal-AI为您提供了一系列的并行训练组件。我们的目标是支持您开发分布式深度学习模型，就像您编写单GPU深度学习模型一样简单。ColossalAI提供了易于使用的API来帮助您启动您的训练过程。为了更好地了解ColossalAI的工作原理，我们建议您按照以下顺序阅读本文档。
+
+- 如果您不熟悉分布式系统，或者没有使用过Colossal-AI，您可以先浏览`概念`部分，了解我们要实现的目标同时掌握一些关于分布式训练的背景知识。
+- 接下来，您可以按照`基础教程`进行学习。该节将介绍关于如何使用Colossal-AI的细节。
+- 这时候，您就可以小试牛刀了！`功能` 部分将帮助您尝试如何使用Colossal-AI为您的模型训练进行加速。我们将为每个教程提供一个代码库。这些教程将涵盖Colossal-AI的基本用法，以实现简单的功能，如数据并行和混合精度训练。
+- 最后，如果您希望应用更高超的技术，比如，如何在GPT-3上运行混合并行，快来`高级教程`部分学习如何搭建您自己的模型吧！
+
+**我们始终欢迎社区的建议和讨论，如果您遇到任何问题，我们将非常愿意帮助您。您可以在GitHub 提 [issue](https://github.com/hpcaitech/ColossalAI/issues) ，或在[论坛](https://github.com/hpcaitech/ColossalAI/discussions)上创建一个讨论主题。**
diff --git a/docs/source/zh/get_started/run_demo.md b/docs/source/zh/get_started/run_demo.md
new file mode 100755
index 000000000000..edfc246c22d5
--- /dev/null
+++ b/docs/source/zh/get_started/run_demo.md
@@ -0,0 +1,28 @@
+# 快速演示
+
+Colossal-AI 是一个集成的大规模深度学习系统，具有高效的并行化技术。该系统可以通过应用并行化技术在具有多个 GPU 的分布式系统上加速模型训练。该系统也可以在只有一个 GPU 的系统上运行。以下是展示如何使用 Colossal-AI 的 Quick demos。
+
+## 单 GPU
+
+Colossal-AI 可以用在只有一个 GPU 的系统上训练深度学习模型，并达到 baseline 的性能。 我们提供了一个 [在CIFAR10数据集上训练ResNet](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet) 的例子，该例子只需要一个 GPU。
+您可以在 [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples) 中获取该例子。详细说明可以在其 `README.md` 中获取。
+
+## 多 GPU
+
+Colossal-AI 可用于在具有多个 GPU 的分布式系统上训练深度学习模型，并通过应用高效的并行化技术大幅加速训练过程。我们提供了多种并行化技术供您尝试。
+
+#### 1. 数据并行
+
+您可以使用与上述单 GPU 演示相同的 [ResNet例子](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet)。 通过设置 `--nproc_per_node` 为您机器上的 GPU 数量，您就能把数据并行应用在您的例子上了。
+
+#### 2. 混合并行
+
+混合并行包括数据、张量和流水线并行。在 Colossal-AI 中，我们支持不同类型的张量并行（即 1D、2D、2.5D 和 3D）。您可以通过简单地改变 `config.py` 中的配置在不同的张量并行之间切换。您可以参考 [GPT example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt)，更多细节可以在它的 `README.md` 中找到。
+
+#### 3. MoE并行
+
+我们提供了一个 [WideNet例子](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet) 来验证 MoE 的并行性。 WideNet 使用 Mixture of Experts（MoE）来实现更好的性能。更多的细节可以在我们的教程中获取：[教会您如何把Mixture of Experts整合到模型中](../advanced_tutorials/integrate_mixture_of_experts_into_your_model.md)。
+
+#### 4. 序列并行
+
+序列并行是为了解决NLP任务中的内存效率和序列长度限制问题。 我们在 [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples) 中提供了一个 [BERT例子](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/bert/sequene_parallel)。您可以按照 `README.md` 来执行代码。

From a4ae43f071913271f7d4b4945fa1bc64b5b41234 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 9 Feb 2023 16:38:49 +0800
Subject: [PATCH 282/503] [doc] added docusaurus-based version control (#2656)

---
 docs/sidebars.js   | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 docs/versions.json |  3 ++
 2 files changed, 76 insertions(+)
 create mode 100644 docs/sidebars.js
 create mode 100644 docs/versions.json

diff --git a/docs/sidebars.js b/docs/sidebars.js
new file mode 100644
index 000000000000..d6273b558767
--- /dev/null
+++ b/docs/sidebars.js
@@ -0,0 +1,73 @@
+module.exports = {
+  docs: [
+    {
+      type: 'category',
+      label: 'Get started',
+      collapsed: false,
+      items: [
+        'get_started/installation', 'get_started/run_demo',
+        'get_started/reading_roadmap'
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Concepts',
+      collapsed: false,
+      items: [
+        'concepts/distributed_training', 'concepts/paradigms_of_parallelism',
+        'concepts/colossalai_overview'
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Basics',
+      collapsed: false,
+      items: [
+        'basics/command_line_tool',
+        'basics/define_your_config',
+        'basics/launch_colossalai',
+        'basics/initialize_features',
+        'basics/engine_trainer',
+        'basics/configure_parallelization',
+        'basics/model_checkpoint',
+        'basics/colotensor_concept',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Features',
+      collapsed: false,
+      items: [
+        'features/mixed_precision_training', 'features/gradient_accumulation',
+        'features/gradient_clipping', 'features/gradient_handler',
+        'features/zero_with_chunk', {
+          type: 'category',
+          label: 'Tensor Parallel',
+          collapsed: true,
+          items: [
+            'features/1D_tensor_parallel',
+            'features/2D_tensor_parallel',
+            'features/2p5D_tensor_parallel',
+            'features/3D_tensor_parallel',
+          ],
+        },
+        'features/pipeline_parallel', 'features/nvme_offload'
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Advanced Tutorials',
+      collapsed: false,
+      items: [
+        'advanced_tutorials/train_vit_using_pipeline_parallelism',
+        'advanced_tutorials/train_vit_with_hybrid_parallelism',
+        'advanced_tutorials/train_gpt_using_hybrid_parallelism',
+        'advanced_tutorials/define_your_own_parallel_model',
+        'advanced_tutorials/add_your_parallel',
+        'advanced_tutorials/meet_gemini',
+        'advanced_tutorials/parallelize_your_training_like_Megatron',
+        'advanced_tutorials/integrate_mixture_of_experts_into_your_model'
+      ],
+    },
+  ]
+};
diff --git a/docs/versions.json b/docs/versions.json
new file mode 100644
index 000000000000..dde32982b798
--- /dev/null
+++ b/docs/versions.json
@@ -0,0 +1,3 @@
+[
+  "current"
+]

From cd4f02bed8f3dccd22ab49d67ba96a5147a48bc0 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 9 Feb 2023 17:06:29 +0800
Subject: [PATCH 283/503] [doc] fixed compatiblity with docusaurus (#2657)

---
 docs/source/en/Colossal-Auto/get_started/run_demo.md            | 2 +-
 .../{zh => zh-Hans}/Colossal-Auto/feature/auto_checkpoint.md    | 0
 .../source/{zh => zh-Hans}/Colossal-Auto/feature/device_mesh.md | 0
 .../{zh => zh-Hans}/Colossal-Auto/feature/shape_consistency.md  | 0
 docs/source/{zh => zh-Hans}/Colossal-Auto/feature/tracer.md     | 0
 .../{zh => zh-Hans}/Colossal-Auto/get_started/installation.md   | 0
 .../{zh => zh-Hans}/Colossal-Auto/get_started/introduction.md   | 0
 .../{zh => zh-Hans}/Colossal-Auto/get_started/run_demo.md       | 2 +-
 .../{zh => zh-Hans}/advanced_tutorials/add_your_parallel.md     | 0
 .../advanced_tutorials/define_your_own_parallel_model.md        | 0
 .../integrate_mixture_of_experts_into_your_model.md             | 0
 docs/source/{zh => zh-Hans}/advanced_tutorials/meet_gemini.md   | 0
 docs/source/{zh => zh-Hans}/advanced_tutorials/opt_service.md   | 0
 .../parallelize_your_training_like_Megatron.md                  | 0
 .../advanced_tutorials/train_gpt_using_hybrid_parallelism.md    | 0
 .../advanced_tutorials/train_vit_using_pipeline_parallelism.md  | 0
 .../advanced_tutorials/train_vit_with_hybrid_parallelism.md     | 0
 docs/source/{zh => zh-Hans}/basics/colotensor_concept.md        | 0
 docs/source/{zh => zh-Hans}/basics/command_line_tool.md         | 0
 docs/source/{zh => zh-Hans}/basics/configure_parallelization.md | 0
 docs/source/{zh => zh-Hans}/basics/define_your_config.md        | 0
 docs/source/{zh => zh-Hans}/basics/engine_trainer.md            | 0
 docs/source/{zh => zh-Hans}/basics/initialize_features.md       | 0
 docs/source/{zh => zh-Hans}/basics/launch_colossalai.md         | 0
 docs/source/{zh => zh-Hans}/basics/model_checkpoint.md          | 0
 docs/source/{zh => zh-Hans}/concepts/colossalai_overview.md     | 0
 docs/source/{zh => zh-Hans}/concepts/distributed_training.md    | 0
 .../source/{zh => zh-Hans}/concepts/paradigms_of_parallelism.md | 0
 docs/source/{zh => zh-Hans}/features/1D_tensor_parallel.md      | 0
 docs/source/{zh => zh-Hans}/features/2D_tensor_parallel.md      | 0
 docs/source/{zh => zh-Hans}/features/2p5D_tensor_parallel.md    | 0
 docs/source/{zh => zh-Hans}/features/3D_tensor_parallel.md      | 0
 docs/source/{zh => zh-Hans}/features/gradient_accumulation.md   | 0
 docs/source/{zh => zh-Hans}/features/gradient_clipping.md       | 0
 docs/source/{zh => zh-Hans}/features/gradient_handler.md        | 0
 .../source/{zh => zh-Hans}/features/mixed_precision_training.md | 0
 docs/source/{zh => zh-Hans}/features/nvme_offload.md            | 0
 docs/source/{zh => zh-Hans}/features/pipeline_parallel.md       | 0
 docs/source/{zh => zh-Hans}/features/zero_with_chunk.md         | 0
 docs/source/{zh => zh-Hans}/get_started/installation.md         | 0
 docs/source/{zh => zh-Hans}/get_started/reading_roadmap.md      | 0
 docs/source/{zh => zh-Hans}/get_started/run_demo.md             | 0
 42 files changed, 2 insertions(+), 2 deletions(-)
 rename docs/source/{zh => zh-Hans}/Colossal-Auto/feature/auto_checkpoint.md (100%)
 rename docs/source/{zh => zh-Hans}/Colossal-Auto/feature/device_mesh.md (100%)
 rename docs/source/{zh => zh-Hans}/Colossal-Auto/feature/shape_consistency.md (100%)
 rename docs/source/{zh => zh-Hans}/Colossal-Auto/feature/tracer.md (100%)
 rename docs/source/{zh => zh-Hans}/Colossal-Auto/get_started/installation.md (100%)
 rename docs/source/{zh => zh-Hans}/Colossal-Auto/get_started/introduction.md (100%)
 rename docs/source/{zh => zh-Hans}/Colossal-Auto/get_started/run_demo.md (94%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/add_your_parallel.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/define_your_own_parallel_model.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/meet_gemini.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/opt_service.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/parallelize_your_training_like_Megatron.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/train_gpt_using_hybrid_parallelism.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/train_vit_using_pipeline_parallelism.md (100%)
 rename docs/source/{zh => zh-Hans}/advanced_tutorials/train_vit_with_hybrid_parallelism.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/colotensor_concept.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/command_line_tool.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/configure_parallelization.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/define_your_config.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/engine_trainer.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/initialize_features.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/launch_colossalai.md (100%)
 rename docs/source/{zh => zh-Hans}/basics/model_checkpoint.md (100%)
 rename docs/source/{zh => zh-Hans}/concepts/colossalai_overview.md (100%)
 rename docs/source/{zh => zh-Hans}/concepts/distributed_training.md (100%)
 rename docs/source/{zh => zh-Hans}/concepts/paradigms_of_parallelism.md (100%)
 rename docs/source/{zh => zh-Hans}/features/1D_tensor_parallel.md (100%)
 rename docs/source/{zh => zh-Hans}/features/2D_tensor_parallel.md (100%)
 rename docs/source/{zh => zh-Hans}/features/2p5D_tensor_parallel.md (100%)
 rename docs/source/{zh => zh-Hans}/features/3D_tensor_parallel.md (100%)
 rename docs/source/{zh => zh-Hans}/features/gradient_accumulation.md (100%)
 rename docs/source/{zh => zh-Hans}/features/gradient_clipping.md (100%)
 rename docs/source/{zh => zh-Hans}/features/gradient_handler.md (100%)
 rename docs/source/{zh => zh-Hans}/features/mixed_precision_training.md (100%)
 rename docs/source/{zh => zh-Hans}/features/nvme_offload.md (100%)
 rename docs/source/{zh => zh-Hans}/features/pipeline_parallel.md (100%)
 rename docs/source/{zh => zh-Hans}/features/zero_with_chunk.md (100%)
 rename docs/source/{zh => zh-Hans}/get_started/installation.md (100%)
 rename docs/source/{zh => zh-Hans}/get_started/reading_roadmap.md (100%)
 rename docs/source/{zh => zh-Hans}/get_started/run_demo.md (100%)

diff --git a/docs/source/en/Colossal-Auto/get_started/run_demo.md b/docs/source/en/Colossal-Auto/get_started/run_demo.md
index 6918ef497d19..bcf88cafc786 100644
--- a/docs/source/en/Colossal-Auto/get_started/run_demo.md
+++ b/docs/source/en/Colossal-Auto/get_started/run_demo.md
@@ -9,7 +9,7 @@ Detailed instructions can be found in its `README.md`.
 
 ### 2. Integration with activation checkpoint
 
-Colossal-Auto's automatic search function for activation checkpointing finds the most efficient checkpoint within a given memory budget, rather than just aiming for maximum memory compression. To avoid a lengthy search process for an optimal activation checkpoint, Colossal-Auto has implemented a two-stage search process. This allows the system to find a feasible distributed training solution in a reasonable amount of time while still benefiting from activation checkpointing for memory management. The integration of activation checkpointing in Colossal-AI improves the efficiency and effectiveness of large model training. You can follow the [Resnet example](TBA).
+Colossal-Auto's automatic search function for activation checkpointing finds the most efficient checkpoint within a given memory budget, rather than just aiming for maximum memory compression. To avoid a lengthy search process for an optimal activation checkpoint, Colossal-Auto has implemented a two-stage search process. This allows the system to find a feasible distributed training solution in a reasonable amount of time while still benefiting from activation checkpointing for memory management. The integration of activation checkpointing in Colossal-AI improves the efficiency and effectiveness of large model training. You can follow the [Resnet example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/auto_parallel).
 Detailed instructions can be found in its `README.md`.
 
 <figure style={{textAlign: "center"}}>
diff --git a/docs/source/zh/Colossal-Auto/feature/auto_checkpoint.md b/docs/source/zh-Hans/Colossal-Auto/feature/auto_checkpoint.md
similarity index 100%
rename from docs/source/zh/Colossal-Auto/feature/auto_checkpoint.md
rename to docs/source/zh-Hans/Colossal-Auto/feature/auto_checkpoint.md
diff --git a/docs/source/zh/Colossal-Auto/feature/device_mesh.md b/docs/source/zh-Hans/Colossal-Auto/feature/device_mesh.md
similarity index 100%
rename from docs/source/zh/Colossal-Auto/feature/device_mesh.md
rename to docs/source/zh-Hans/Colossal-Auto/feature/device_mesh.md
diff --git a/docs/source/zh/Colossal-Auto/feature/shape_consistency.md b/docs/source/zh-Hans/Colossal-Auto/feature/shape_consistency.md
similarity index 100%
rename from docs/source/zh/Colossal-Auto/feature/shape_consistency.md
rename to docs/source/zh-Hans/Colossal-Auto/feature/shape_consistency.md
diff --git a/docs/source/zh/Colossal-Auto/feature/tracer.md b/docs/source/zh-Hans/Colossal-Auto/feature/tracer.md
similarity index 100%
rename from docs/source/zh/Colossal-Auto/feature/tracer.md
rename to docs/source/zh-Hans/Colossal-Auto/feature/tracer.md
diff --git a/docs/source/zh/Colossal-Auto/get_started/installation.md b/docs/source/zh-Hans/Colossal-Auto/get_started/installation.md
similarity index 100%
rename from docs/source/zh/Colossal-Auto/get_started/installation.md
rename to docs/source/zh-Hans/Colossal-Auto/get_started/installation.md
diff --git a/docs/source/zh/Colossal-Auto/get_started/introduction.md b/docs/source/zh-Hans/Colossal-Auto/get_started/introduction.md
similarity index 100%
rename from docs/source/zh/Colossal-Auto/get_started/introduction.md
rename to docs/source/zh-Hans/Colossal-Auto/get_started/introduction.md
diff --git a/docs/source/zh/Colossal-Auto/get_started/run_demo.md b/docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md
similarity index 94%
rename from docs/source/zh/Colossal-Auto/get_started/run_demo.md
rename to docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md
index cdeb227eb261..1050dcec6842 100644
--- a/docs/source/zh/Colossal-Auto/get_started/run_demo.md
+++ b/docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md
@@ -8,7 +8,7 @@ Colossal-Auto 可被用于为每一次操作寻找一个包含数据、张量（
 
 ### 2. 与 activation checkpoint 结合
 
-作为大模型训练中必不可少的显存压缩技术，Colossal-AI 也提供了对于 activation checkpoint 的自动搜索功能。相比于大部分将最大显存压缩作为目标的技术方案，Colossal-AI 的搜索目标是在显存预算以内，找到最快的 activation checkpoint 方案。同时，为了避免将 activation checkpoint 的搜索一起建模到 SPMD solver 中导致搜索时间爆炸，Colossal-AI 做了 2-stage search 的设计，因此可以在合理的时间内搜索到有效可行的分布式训练方案。 您可参考 [Resnet 示例](TBA)。
+作为大模型训练中必不可少的显存压缩技术，Colossal-AI 也提供了对于 activation checkpoint 的自动搜索功能。相比于大部分将最大显存压缩作为目标的技术方案，Colossal-AI 的搜索目标是在显存预算以内，找到最快的 activation checkpoint 方案。同时，为了避免将 activation checkpoint 的搜索一起建模到 SPMD solver 中导致搜索时间爆炸，Colossal-AI 做了 2-stage search 的设计，因此可以在合理的时间内搜索到有效可行的分布式训练方案。 您可参考 [Resnet 示例](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/auto_parallel)。
 详细的操作指引见其 `README.md`。
 
 <figure style={{textAlign: "center"}}>
diff --git a/docs/source/zh/advanced_tutorials/add_your_parallel.md b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/add_your_parallel.md
rename to docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
diff --git a/docs/source/zh/advanced_tutorials/define_your_own_parallel_model.md b/docs/source/zh-Hans/advanced_tutorials/define_your_own_parallel_model.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/define_your_own_parallel_model.md
rename to docs/source/zh-Hans/advanced_tutorials/define_your_own_parallel_model.md
diff --git a/docs/source/zh/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
rename to docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md
diff --git a/docs/source/zh/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/meet_gemini.md
rename to docs/source/zh-Hans/advanced_tutorials/meet_gemini.md
diff --git a/docs/source/zh/advanced_tutorials/opt_service.md b/docs/source/zh-Hans/advanced_tutorials/opt_service.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/opt_service.md
rename to docs/source/zh-Hans/advanced_tutorials/opt_service.md
diff --git a/docs/source/zh/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/parallelize_your_training_like_Megatron.md
rename to docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
diff --git a/docs/source/zh/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
rename to docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
diff --git a/docs/source/zh/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/train_vit_using_pipeline_parallelism.md
rename to docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
diff --git a/docs/source/zh/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
similarity index 100%
rename from docs/source/zh/advanced_tutorials/train_vit_with_hybrid_parallelism.md
rename to docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
diff --git a/docs/source/zh/basics/colotensor_concept.md b/docs/source/zh-Hans/basics/colotensor_concept.md
similarity index 100%
rename from docs/source/zh/basics/colotensor_concept.md
rename to docs/source/zh-Hans/basics/colotensor_concept.md
diff --git a/docs/source/zh/basics/command_line_tool.md b/docs/source/zh-Hans/basics/command_line_tool.md
similarity index 100%
rename from docs/source/zh/basics/command_line_tool.md
rename to docs/source/zh-Hans/basics/command_line_tool.md
diff --git a/docs/source/zh/basics/configure_parallelization.md b/docs/source/zh-Hans/basics/configure_parallelization.md
similarity index 100%
rename from docs/source/zh/basics/configure_parallelization.md
rename to docs/source/zh-Hans/basics/configure_parallelization.md
diff --git a/docs/source/zh/basics/define_your_config.md b/docs/source/zh-Hans/basics/define_your_config.md
similarity index 100%
rename from docs/source/zh/basics/define_your_config.md
rename to docs/source/zh-Hans/basics/define_your_config.md
diff --git a/docs/source/zh/basics/engine_trainer.md b/docs/source/zh-Hans/basics/engine_trainer.md
similarity index 100%
rename from docs/source/zh/basics/engine_trainer.md
rename to docs/source/zh-Hans/basics/engine_trainer.md
diff --git a/docs/source/zh/basics/initialize_features.md b/docs/source/zh-Hans/basics/initialize_features.md
similarity index 100%
rename from docs/source/zh/basics/initialize_features.md
rename to docs/source/zh-Hans/basics/initialize_features.md
diff --git a/docs/source/zh/basics/launch_colossalai.md b/docs/source/zh-Hans/basics/launch_colossalai.md
similarity index 100%
rename from docs/source/zh/basics/launch_colossalai.md
rename to docs/source/zh-Hans/basics/launch_colossalai.md
diff --git a/docs/source/zh/basics/model_checkpoint.md b/docs/source/zh-Hans/basics/model_checkpoint.md
similarity index 100%
rename from docs/source/zh/basics/model_checkpoint.md
rename to docs/source/zh-Hans/basics/model_checkpoint.md
diff --git a/docs/source/zh/concepts/colossalai_overview.md b/docs/source/zh-Hans/concepts/colossalai_overview.md
similarity index 100%
rename from docs/source/zh/concepts/colossalai_overview.md
rename to docs/source/zh-Hans/concepts/colossalai_overview.md
diff --git a/docs/source/zh/concepts/distributed_training.md b/docs/source/zh-Hans/concepts/distributed_training.md
similarity index 100%
rename from docs/source/zh/concepts/distributed_training.md
rename to docs/source/zh-Hans/concepts/distributed_training.md
diff --git a/docs/source/zh/concepts/paradigms_of_parallelism.md b/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
similarity index 100%
rename from docs/source/zh/concepts/paradigms_of_parallelism.md
rename to docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
diff --git a/docs/source/zh/features/1D_tensor_parallel.md b/docs/source/zh-Hans/features/1D_tensor_parallel.md
similarity index 100%
rename from docs/source/zh/features/1D_tensor_parallel.md
rename to docs/source/zh-Hans/features/1D_tensor_parallel.md
diff --git a/docs/source/zh/features/2D_tensor_parallel.md b/docs/source/zh-Hans/features/2D_tensor_parallel.md
similarity index 100%
rename from docs/source/zh/features/2D_tensor_parallel.md
rename to docs/source/zh-Hans/features/2D_tensor_parallel.md
diff --git a/docs/source/zh/features/2p5D_tensor_parallel.md b/docs/source/zh-Hans/features/2p5D_tensor_parallel.md
similarity index 100%
rename from docs/source/zh/features/2p5D_tensor_parallel.md
rename to docs/source/zh-Hans/features/2p5D_tensor_parallel.md
diff --git a/docs/source/zh/features/3D_tensor_parallel.md b/docs/source/zh-Hans/features/3D_tensor_parallel.md
similarity index 100%
rename from docs/source/zh/features/3D_tensor_parallel.md
rename to docs/source/zh-Hans/features/3D_tensor_parallel.md
diff --git a/docs/source/zh/features/gradient_accumulation.md b/docs/source/zh-Hans/features/gradient_accumulation.md
similarity index 100%
rename from docs/source/zh/features/gradient_accumulation.md
rename to docs/source/zh-Hans/features/gradient_accumulation.md
diff --git a/docs/source/zh/features/gradient_clipping.md b/docs/source/zh-Hans/features/gradient_clipping.md
similarity index 100%
rename from docs/source/zh/features/gradient_clipping.md
rename to docs/source/zh-Hans/features/gradient_clipping.md
diff --git a/docs/source/zh/features/gradient_handler.md b/docs/source/zh-Hans/features/gradient_handler.md
similarity index 100%
rename from docs/source/zh/features/gradient_handler.md
rename to docs/source/zh-Hans/features/gradient_handler.md
diff --git a/docs/source/zh/features/mixed_precision_training.md b/docs/source/zh-Hans/features/mixed_precision_training.md
similarity index 100%
rename from docs/source/zh/features/mixed_precision_training.md
rename to docs/source/zh-Hans/features/mixed_precision_training.md
diff --git a/docs/source/zh/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md
similarity index 100%
rename from docs/source/zh/features/nvme_offload.md
rename to docs/source/zh-Hans/features/nvme_offload.md
diff --git a/docs/source/zh/features/pipeline_parallel.md b/docs/source/zh-Hans/features/pipeline_parallel.md
similarity index 100%
rename from docs/source/zh/features/pipeline_parallel.md
rename to docs/source/zh-Hans/features/pipeline_parallel.md
diff --git a/docs/source/zh/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md
similarity index 100%
rename from docs/source/zh/features/zero_with_chunk.md
rename to docs/source/zh-Hans/features/zero_with_chunk.md
diff --git a/docs/source/zh/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md
similarity index 100%
rename from docs/source/zh/get_started/installation.md
rename to docs/source/zh-Hans/get_started/installation.md
diff --git a/docs/source/zh/get_started/reading_roadmap.md b/docs/source/zh-Hans/get_started/reading_roadmap.md
similarity index 100%
rename from docs/source/zh/get_started/reading_roadmap.md
rename to docs/source/zh-Hans/get_started/reading_roadmap.md
diff --git a/docs/source/zh/get_started/run_demo.md b/docs/source/zh-Hans/get_started/run_demo.md
similarity index 100%
rename from docs/source/zh/get_started/run_demo.md
rename to docs/source/zh-Hans/get_started/run_demo.md

From a255a38f7f7bb7dc185b752f76d7aea997fe5246 Mon Sep 17 00:00:00 2001
From: "Jiatong (Julius) Han" <59948448+JThh@users.noreply.github.com>
Date: Thu, 9 Feb 2023 20:43:55 +0800
Subject: [PATCH 284/503] [example] Polish README.md (#2658)

* [tutorial] polish readme.md

* [example] Update README.md
---
 examples/language/gpt/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index 7e6acb3d399b..3d5ce7c8807c 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -54,13 +54,13 @@ However, it requires some efforts to start if facing a new model structure.
 bash run_gemini.sh
 ```
 
-The `train_gpt_demo.py` provides three distributed plans, you can choose the plan you want in `run_gemini.sh`. The Colossal-AI leverages Tensor Parallel and Gemini + ZeRO DDP.
+The `train_gpt_demo.py` script provides three distributed plans (in addition to those already provided by PyTorch); you can choose the plan you want in `run_gemini.sh`. The CAI_Gemini plan leverages Tensor Parallel and Gemini + ZeRO DDP. For the differences between the plans, see the answer in [this issue comment](https://github.com/hpcaitech/ColossalAI/issues/2590#issuecomment-1418766581).
 
-- Colossal-AI
-- ZeRO1 (Colossal-AI)
-- ZeRO2 (Colossal-AI)
-- Pytorch DDP
-- Pytorch ZeRO
+- ZeRO1 (CAI_ZeRO1)
+- ZeRO2 (CAI_ZeRO2)
+- Gemini + ZeRO DDP (CAI_Gemini)
+- Pytorch DDP (Pytorch_DDP)
+- Pytorch ZeRO (Pytorch_ZeRO)
 
 ### Titans (Tensor Parallelism) + ZeRO + Pipeline Parallelism
 

From 94f87f9651ac78f235d53ef1e06be11874bd6121 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 10 Feb 2023 09:59:07 +0800
Subject: [PATCH 285/503] [workflow] fixed gpu memory check condition (#2659)

---
 .github/workflows/build_on_schedule.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 32b518ac5394..6afdf581e6ca 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -23,7 +23,7 @@ jobs:
           for i in $(seq 0 7);
           do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
-            [ "$gpu_used" -le "10000" ] && avai=false
+            [ "$gpu_used" -gt "10000" ] && avai=false
           done
 
           echo "GPU is available: $avai"

From b673e5f78bbe54f18daa9ba7ed068bb8cca5db5c Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 10 Feb 2023 11:01:24 +0800
Subject: [PATCH 286/503] [release] v0.2.2 (#2661)

---
 docs/versions.json | 2 +-
 version.txt        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/versions.json b/docs/versions.json
index dde32982b798..5961b1c980dc 100644
--- a/docs/versions.json
+++ b/docs/versions.json
@@ -1,3 +1,3 @@
 [
-  "current"
+  "v0.2.2"
 ]
diff --git a/version.txt b/version.txt
index 0c62199f16ac..ee1372d33a29 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.1
+0.2.2

From 0385b26ebf4a811ca70eafe8590ad5e0529c0595 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Fri, 10 Feb 2023 14:29:24 +0800
Subject: [PATCH 287/503] [autoparallel] Patch meta information of
 `torch.nn.LayerNorm` (#2647)

* [autoparallel] layernorm metainfo patch

* [autoparallel] polish test
---
 .../meta_profiler/meta_registry/norm.py       |  55 +++++++-
 .../node_handler/layer_norm_handler.py        |   4 +-
 .../test_metainfo/test_batchnorm_metainfo.py  |  60 --------
 .../test_metainfo/test_matmul_metainfo.py     |  41 +-----
 .../test_metainfo/test_norm_metainfo.py       | 131 ++++++++++++++++++
 .../test_tensor_shard/test_metainfo/utils.py  |  55 +++++++-
 6 files changed, 244 insertions(+), 102 deletions(-)
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_batchnorm_metainfo.py
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_norm_metainfo.py

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/norm.py b/colossalai/auto_parallel/meta_profiler/meta_registry/norm.py
index 9b34332db1b5..3a1db396e188 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/norm.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/norm.py
@@ -16,7 +16,7 @@
 
 from ..registry import meta_register
 
-__all__ = ['batchnormnd_meta_info']
+__all__ = ['batchnormnd_meta_info', 'layernorm_meta_info']
 
 
 @meta_register.register(torch.nn.BatchNorm1d)
@@ -101,3 +101,56 @@ def batchnormnd_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleIt
     fwd_out = [torch.zeros_like(output_tensor, device='meta')]
 
     return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+
+
+@meta_register.register(torch.nn.LayerNorm)
+def layernorm_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+    """LayerNorm meta information
+
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
+    """
+    # construct needed tensors
+    input_tensor = next(filter(lambda x: x.type == OperationDataType.ARG, args)).data
+    output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
+    weight_tensor = next(filter(lambda x: x.name == "weight", args)).data
+    bias_tensor = next(filter(lambda x: x.name == "bias", args)).data
+    running_mean = torch.rand(input_tensor.shape[0], 1, device='meta')
+    running_var = torch.rand(input_tensor.shape[0], 1, device='meta')
+
+    # construct args
+    fwd_in_args = [input_tensor, [input_tensor.shape[0]], weight_tensor]
+    fwd_out_args = [output_tensor]
+    bwd_in_args = [input_tensor, output_tensor, [input_tensor.shape[0]]]
+    bwd_out_args = [weight_tensor, bias_tensor]
+
+    # compute cost
+    fwd_compute_cost = flop_mapping[torch.ops.aten.native_layer_norm.default](fwd_in_args, fwd_out_args)
+    bwd_compute_cost = flop_mapping[torch.ops.aten.native_layer_norm_backward.default](bwd_in_args, bwd_out_args)
+    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
+
+    # memory cost
+    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
+    fwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, output_tensor, weight_tensor, bias_tensor]),
+                                 parameter=activation_size([weight_tensor, bias_tensor]),
+                                 temp=0,
+                                 buffer=activation_size([running_mean, running_var]))
+
+    bwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, weight_tensor, bias_tensor]),
+                                 parameter=activation_size([weight_tensor, bias_tensor]),
+                                 temp=activation_size([running_mean, running_var]),
+                                 buffer=activation_size([running_mean, running_var]))
+
+    total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
+                            parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter,
+                            temp=fwd_memory_cost.temp + bwd_memory_cost.temp,
+                            buffer=fwd_memory_cost.buffer + bwd_memory_cost.buffer)
+
+    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
+
+    # store fwd_in, fwd_buffer, fwd_out
+    fwd_in = [torch.zeros_like(input_tensor, device='meta')]
+    fwd_buffer = [torch.zeros_like(running_mean, device='meta'), torch.zeros_like(running_var, device='meta')]
+    fwd_out = [torch.zeros_like(output_tensor, device='meta')]
+
+    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/layer_norm_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/layer_norm_handler.py
index 132ac30daed8..452381169b74 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/layer_norm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/layer_norm_handler.py
@@ -3,7 +3,7 @@
 import torch
 
 from ..sharding_strategy import OperationData, OperationDataType
-from .node_handler import ModuleHandler
+from .node_handler import MetaInfoModuleHandler, ModuleHandler
 from .registry import operator_registry
 from .strategy import LayerNormGenerator, StrategyGenerator
 
@@ -11,7 +11,7 @@
 
 
 @operator_registry.register(torch.nn.LayerNorm)
-class LayerNormModuleHandler(ModuleHandler):
+class LayerNormModuleHandler(MetaInfoModuleHandler):
     """
     A LayerNormModuleHandler which deals with the sharding strategies for nn.LayerNorm module.
     """
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_batchnorm_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_batchnorm_metainfo.py
deleted file mode 100644
index 826c746668da..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_batchnorm_metainfo.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from functools import partial
-
-import pytest
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx import ColoGraphModule, ColoTracer
-from colossalai.initialize import launch
-from colossalai.logging import disable_existing_loggers
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
-from colossalai.utils import free_port
-from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy
-
-
-def _batchnorm_module_mem_test(rank, world_size, port):
-    """This function is for batchnorm memory test
-    Test and print real memory cost and estimated, this test will not be executed except with the tag AUTO_PARALLEL
-
-    Args:
-        rank: device rank
-        bias: indicate whether conv module need bias
-        world_size: number of devices
-        port: port for initializing process group
-    """
-    disable_existing_loggers()
-    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    model = nn.Sequential(nn.BatchNorm2d(128)).cuda()
-    input = torch.rand(4, 128, 64, 64).cuda()
-    input.requires_grad = True
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    # index of target node in computation graph
-    node_index = 1
-    # total number of target node strategies
-    strategy_number = 9
-    mem_test_for_node_strategy(rank=rank,
-                               model=model,
-                               device_mesh=device_mesh,
-                               node_index=node_index,
-                               strategy_number=strategy_number,
-                               input_args=[input],
-                               meta_arg_names=['input'])
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_batchnorm_meta_concrete_info_match():
-    world_size = 4
-    run_func_module = partial(_batchnorm_module_mem_test, world_size=world_size, port=free_port())
-    mp.spawn(run_func_module, nprocs=world_size)
-
-
-if __name__ == '__main__':
-    test_batchnorm_meta_concrete_info_match()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py
index 3fb9c3d85d64..fd29c63fb522 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_matmul_metainfo.py
@@ -21,7 +21,7 @@
 from colossalai.testing.pytest_wrapper import run_on_environment_flag
 from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
 from colossalai.utils import free_port
-from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy
+from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import print_results
 
 if torch.__version__ >= '1.12.0':
     from colossalai.auto_parallel.meta_profiler import MetaInfo, meta_register
@@ -102,43 +102,8 @@ def test_matmul_function_meta_info(tensor_shapes):
     compute_cost: TrainCycleItem
     memory_cost: TrainCycleItem
 
-    print("=====================")
-    print(f"input shapes: {tensor_shapes[0]}, {tensor_shapes[1]}")
-    print(f"output shapes: {output_tensor.shape}")
-
-    # estimated results
-    print("Estimated Results")
-
-    # compute cost
-    print("compute_cost:")
-    print(f"    fwd: {compute_cost.fwd}")
-    print(f"    bwd: {compute_cost.bwd}")
-
-    # memory cost
-    print("memory_cost:")
-    # fwd
-    print(f"    fwd activation: {memory_cost.fwd.activation / 1024} KB")
-    print(f"    fwd buffer: {memory_cost.fwd.buffer / 1024} KB")
-    print(f"    fwd temp: {memory_cost.fwd.temp / 1024} KB")
-    print(f"    fwd parameter: {memory_cost.fwd.parameter / 1024} KB")
-
-    # bwd
-    print(f"    bwd activation: {memory_cost.bwd.activation / 1024} KB")
-    print(f"    bwd buffer: {memory_cost.bwd.buffer / 1024} KB")
-    print(f"    bwd temp: {memory_cost.bwd.temp / 1024} KB")
-    print(f"    bwd parameter: {memory_cost.bwd.parameter / 1024} KB")
-
-    # actual results
-    print("Actual Results")
-
-    print("memory_cost:")
-    # fwd
-    print(f"    fwd allocated: {fwd_allocated / 1024} KB")
-    print(f"    fwd peak: {fwd_peak / 1024} KB")
-
-    # bwd
-    print(f"    bwd allocated: {bwd_allocated / 1024} KB")
-    print(f"    bwd peak: {bwd_peak / 1024} KB")
+    print_results([input_real_tensor, other_real_tensor], [output_real_tensor], compute_cost, memory_cost,
+                  fwd_allocated, fwd_peak, bwd_allocated, bwd_peak)
 
 
 if __name__ == '__main__':
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_norm_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_norm_metainfo.py
new file mode 100644
index 000000000000..9d3ab9c82670
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_norm_metainfo.py
@@ -0,0 +1,131 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
+    MemoryCost,
+    OperationData,
+    OperationDataType,
+    ShardingStrategy,
+    StrategiesVector,
+    TrainCycleItem,
+)
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx import ColoGraphModule, ColoTracer
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy, print_results
+
+if torch.__version__ >= '1.12.0':
+    from colossalai.auto_parallel.meta_profiler import MetaInfo, meta_register
+
+
+def _batchnorm_module_mem_test(rank, world_size, port):
+    """This function is for batchnorm memory test
+    Test and print real memory cost and estimated, this test will not be executed except with the tag AUTO_PARALLEL
+
+    Args:
+        rank: device rank
+        bias: indicate whether conv module need bias
+        world_size: number of devices
+        port: port for initializing process group
+    """
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    model = nn.Sequential(nn.BatchNorm2d(128)).cuda()
+    input = torch.rand(4, 128, 64, 64).cuda()
+    input.requires_grad = True
+    physical_mesh_id = torch.arange(0, 4)
+    mesh_shape = (2, 2)
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+
+    # index of target node in computation graph
+    node_index = 1
+    # total number of target node strategies
+    strategy_number = 9
+    mem_test_for_node_strategy(rank=rank,
+                               model=model,
+                               device_mesh=device_mesh,
+                               node_index=node_index,
+                               strategy_number=strategy_number,
+                               input_args=[input],
+                               meta_arg_names=['input'])
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_batchnorm_meta_concrete_info_match():
+    world_size = 4
+    run_func_module = partial(_batchnorm_module_mem_test, world_size=world_size, port=free_port())
+    mp.spawn(run_func_module, nprocs=world_size)
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='need pytorch 1.12.0 or higher for aten level operations')
+@parameterize('tensor_shape', [
+    [256, 1024],
+    [1024, 256],
+])
+def test_layernorm_meta_info(tensor_shape):
+    meta_func = meta_register.get(torch.nn.LayerNorm)
+
+    # construct input
+    input_tensor = torch.rand(*tensor_shape, device="meta")
+    output_tensor = torch.rand(*tensor_shape, device="meta")
+    weight_tensor = torch.rand(tensor_shape[1], device="meta")
+    bias_tensor = torch.rand(tensor_shape[1], device="meta")
+
+    # construct operation data
+    input_data = OperationData(name="input", type=OperationDataType.ARG, data=input_tensor)
+
+    output_data = OperationData(name="output", type=OperationDataType.OUTPUT, data=output_tensor)
+
+    weight_data = OperationData(name="weight", type=OperationDataType.PARAM, data=weight_tensor)
+
+    bias_data = OperationData(name="bias", type=OperationDataType.PARAM, data=bias_tensor)
+
+    # construct args and kwargs
+    args = [input_data, output_data, weight_data, bias_data]
+    kwargs = {'inplace': False}
+
+    # estimated results
+    compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out = meta_func(*args, **kwargs)
+
+    # actual results
+    input_real_tensor = torch.rand(*tensor_shape, device="cuda:0")
+
+    input_real_tensor.requires_grad = True
+
+    ln_module = torch.nn.LayerNorm(tensor_shape[1]).cuda()
+
+    # fwd
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    output_real_tensor = ln_module(input_real_tensor)
+    fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    # bwd
+    upstream_grad = torch.rand_like(output_real_tensor)
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    torch.autograd.backward(output_real_tensor, upstream_grad)
+    bwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    bwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    compute_cost: TrainCycleItem
+    memory_cost: TrainCycleItem
+
+    print_results([input_real_tensor], [output_real_tensor], compute_cost, memory_cost, fwd_allocated, fwd_peak,
+                  bwd_allocated, bwd_peak)
+
+
+if __name__ == '__main__':
+    test_batchnorm_meta_concrete_info_match()
+    test_layernorm_meta_info()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
index 17eb75fadef0..b8c01d35842e 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
@@ -7,7 +7,7 @@
 
 from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
 from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationDataType
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationDataType, TrainCycleItem
 from colossalai.auto_parallel.tensor_shard.solver import SolverOptions, StrategiesConstructor
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
@@ -126,3 +126,56 @@ def mem_test_for_node_strategy(rank: int,
                 f"backward temp: {metainfo.memory_cost.bwd.temp / 1024} kb, backward buffer: {metainfo.memory_cost.bwd.buffer / 1024} kb"
             )
             print("=======================")
+
+
+def print_results(input: List[torch.Tensor], output: List[torch.Tensor], compute_cost: TrainCycleItem,
+                  memory_cost: TrainCycleItem, fwd_allocated, fwd_peak, bwd_allocated, bwd_peak):
+    """Print the results of the meta information test.
+
+    Args:
+        input (List[torch.Tensor]): input tensors
+        output (List[torch.Tensor]): output tensors
+        compute_cost (TrainCycleItem): compute cost estimated by meta_func
+        memory_cost (TrainCycleItem): memory cost estimated by meta_func
+        fwd_allocated: real forward memory allocated
+        fwd_peak: real forward peak memory stats
+        bwd_allocated: real backward memory allocated
+        bwd_peak: real backward peak memory stats
+    """
+    print("=====================")
+    print(f"input shapes: {[tensor.shape for tensor in input]}")
+    print(f"output shapes: {[tensor.shape for tensor in output]}")
+
+    # estimated results
+    print("Estimated Results")
+
+    # compute cost
+    print("compute_cost:")
+    print(f"    fwd: {compute_cost.fwd}")
+    print(f"    bwd: {compute_cost.bwd}")
+
+    # memory cost
+    print("memory_cost:")
+    # fwd
+    print(f"    fwd activation: {memory_cost.fwd.activation / 1024} KB")
+    print(f"    fwd buffer: {memory_cost.fwd.buffer / 1024} KB")
+    print(f"    fwd temp: {memory_cost.fwd.temp / 1024} KB")
+    print(f"    fwd parameter: {memory_cost.fwd.parameter / 1024} KB")
+
+    # bwd
+    print(f"    bwd activation: {memory_cost.bwd.activation / 1024} KB")
+    print(f"    bwd buffer: {memory_cost.bwd.buffer / 1024} KB")
+    print(f"    bwd temp: {memory_cost.bwd.temp / 1024} KB")
+    print(f"    bwd parameter: {memory_cost.bwd.parameter / 1024} KB")
+
+    # actual results
+    print("Actual Results")
+
+    print("memory_cost:")
+    # fwd
+    print(f"    fwd allocated: {fwd_allocated / 1024} KB")
+    print(f"    fwd peak: {fwd_peak / 1024} KB")
+
+    # bwd
+    print(f"    bwd allocated: {bwd_allocated / 1024} KB")
+    print(f"    bwd peak: {bwd_peak / 1024} KB")

From 8de85051b37423aaea421593854024e19902bb10 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Fri, 10 Feb 2023 18:38:32 +0800
Subject: [PATCH 288/503] [Docs] layout converting management (#2665)

---
 .../feature/layout_converting_management.md         | 13 +++++++++++++
 .../en/Colossal-Auto/feature/shape_consistency.md   |  0
 .../feature/layout_converting_management.md         | 12 ++++++++++++
 .../Colossal-Auto/feature/shape_consistency.md      |  0
 4 files changed, 25 insertions(+)
 create mode 100644 docs/source/en/Colossal-Auto/feature/layout_converting_management.md
 delete mode 100644 docs/source/en/Colossal-Auto/feature/shape_consistency.md
 create mode 100644 docs/source/zh-Hans/Colossal-Auto/feature/layout_converting_management.md
 delete mode 100644 docs/source/zh-Hans/Colossal-Auto/feature/shape_consistency.md

diff --git a/docs/source/en/Colossal-Auto/feature/layout_converting_management.md b/docs/source/en/Colossal-Auto/feature/layout_converting_management.md
new file mode 100644
index 000000000000..2082a33d8a39
--- /dev/null
+++ b/docs/source/en/Colossal-Auto/feature/layout_converting_management.md
@@ -0,0 +1,13 @@
+When a tensor is required to have different sharding specs in upstream and downstream operators, we need to perform layout conversion, which is also called redistribution. There are currently two mainstream methods: enumeration conversion and dimension-by-dimension conversion. Enumeration conversion enumerates all possible situations and then looks up the corresponding conversion scheme in a table when a conversion is required. However, it has a big problem: as the dimension of the device mesh increases, the scale of the problem inflates so much that it cannot be solved by enumerating tables. Dimension-by-dimension conversion takes the sharding spec of an N-D tensor, X0X1...Xn-1, and converts it dimension by dimension from 0 to n-1, so that no matter how many dimensions the device mesh and the tensor have, a feasible sequence of conversion operations is generated with a single scan. The problem is that the conversion efficiency will be very poor.
+
+Therefore, we propose a novel algorithm that uses heuristic search to solve the sharding spec conversion problem. It can be described as follows:
+1. Generate all one-step transform sharding specs from the source spec.
+2. Among the one-step transform sharding specs, use the similarity function to select the sharding spec with the "least difference" as the next source sharding spec, and record it in the transform path. If one of the one-step transform sharding specs is the same as the target sharding spec, the algorithm ends.
+3. Repeat steps 1 and 2 until the algorithm ends.
+
+
+| Source/target sharding spec pairs |All gather | Shard | All to All | One step transform | Best sharding spec |Transform path|
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     | :-:                     |:-:                     |
+| $S_{01}RR, RS_{01}R$  | $S_0RR$       | -           | $S_0RS_1, S_0S_1R$             | $S_0RR, S_0RS_1, S_0S_1R$             | $S_0RR$ | $S_0RR$
+| $S_0RR, RS_{01}R$  | $RRR$       | $S_0S_1R, S_0RS_1$           | $RS_0R, RRS_0$             | $RRR$, $S_0S_1R$, $S_0RS_1$, $RS_0R$, $RRS_0$             | $RS_0R$ | $S_0RR$ -> $RS_0R$
+| $RS_0R, RS_{01}R$  | $RRR$       | $RS_{01}R, S_1S_0R, RS_0S_1$           | $S_0RR, RRS_0$             | $RRR$, $RS_{01}R$, $S_1S_0R$, $RS_0S_1$, $S_0RR$, $RRS_0$             | $RS_{01}R$ | $S_0RR$ -> $RS_0R$ -> $RS_{01}R$
diff --git a/docs/source/en/Colossal-Auto/feature/shape_consistency.md b/docs/source/en/Colossal-Auto/feature/shape_consistency.md
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/docs/source/zh-Hans/Colossal-Auto/feature/layout_converting_management.md b/docs/source/zh-Hans/Colossal-Auto/feature/layout_converting_management.md
new file mode 100644
index 000000000000..71bce57ea91b
--- /dev/null
+++ b/docs/source/zh-Hans/Colossal-Auto/feature/layout_converting_management.md
@@ -0,0 +1,12 @@
+当一个张量在上下游算子中被要求的sharding spec不同时，我们需要进行分布转换处理（Layout Conversion）。目前主流的方式有两种，打表转换和逐维度转换。打表转换就是将所有可能的情况枚举出来，然后在遇到需要转换的情况下，去表格中找到对应的转换方案。
+然而它有一个很大问题，就是随着设备块（Device Mesh）的维度增加，这个问题的规模急剧膨胀，以至于无法通过这种枚举打表的方式来解决。逐维度转换是对于一个N-d tensor的sharding spec，X0X1...Xn-1，我们让i从0到n-1逐维度地进行转换，这样不管设备块和张量的维度多少，我们都只需要一次扫描，就可以得到一个可行的转换操作序列，然而它的问题是这样的转换效率会很差。
+为了解决这个问题，我们提出一个新奇的想法，使用启发式搜索来解决sharding spec的转换问题，这个算法可以描述为：
+  1. 从source spec生成所有的one-step transform sharding specs
+  2. 在one-step transform sharding specs中，根据相似度函数，挑选一个“区别最小”的sharding spec作为后续的source sharding spec，并将该sharding spec记录在transform path中，如果one-step transform sharding spec中，有与target sharding spec相同的sharding spec，则算法结束。
+  3. 重复1，2直到算法结束
+
+| Source/target sharding spec pairs |All gather | Shard | All to All | One step transform | Best sharding spec |Transform path|
+| :-:         | :-:              | :-:                  | :-:                       | :-:                     | :-:                     |:-:                     |
+| $S_{01}RR, RS_{01}R$  | $S_0RR$       | -           | $S_0RS_1, S_0S_1R$             | $S_0RR, S_0RS_1, S_0S_1R$             | $S_0RR$ | $S_0RR$
+| $S_0RR, RS_{01}R$  | $RRR$       | $S_0S_1R, S_0RS_1$           | $RS_0R, RRS_0$             | $RRR$, $S_0S_1R$, $S_0RS_1$, $RS_0R$, $RRS_0$             | $RS_0R$ | $S_0RR$ -> $RS_0R$
+| $RS_0R, RS_{01}R$  | $RRR$       | $RS_{01}R, S_1S_0R, RS_0S_1$           | $S_0RR, RRS_0$             | $RRR$, $RS_{01}R$, $S_1S_0R$, $RS_0S_1$, $S_0RR$, $RRS_0$             | $RS_{01}R$ | $S_0RR$ -> $RS_0R$ -> $RS_{01}R$
diff --git a/docs/source/zh-Hans/Colossal-Auto/feature/shape_consistency.md b/docs/source/zh-Hans/Colossal-Auto/feature/shape_consistency.md
deleted file mode 100644
index e69de29bb2d1..000000000000

From 85bd29817ef479387d25e5d3fb3187df5b6b5051 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Fri, 10 Feb 2023 20:36:22 +0800
Subject: [PATCH 289/503] Update README-zh-Hans.md

---
 README-zh-Hans.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 34122db65d75..023b5a21c955 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -360,6 +360,6 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash
 }
 ```
 
-Colossal-AI 已被 [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/) 等顶级会议录取为官方教程。
+Colossal-AI 已被 [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/)等顶级会议录取为官方教程。
 
 <p align="right">(<a href="#top">返回顶端</a>)</p>

From 9ab14b20b5f2b31e5668482839876f32d56a7725 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Fri, 10 Feb 2023 20:43:34 +0800
Subject: [PATCH 290/503] [doc] add CVPR tutorial (#2666)

---
 README.md                         | 2 +-
 colossalai/nn/optimizer/README.md | 2 +-
 examples/tutorial/README.md       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7ec864c02cdb..6ad736a43cde 100644
--- a/README.md
+++ b/README.md
@@ -362,6 +362,6 @@ We leverage the power of [GitHub Actions](https://github.com/features/actions) t
 }
 ```
 
-Colossal-AI has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), etc.
+Colossal-AI has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), etc.
 
 <p align="right">(<a href="#top">back to top</a>)</p>
diff --git a/colossalai/nn/optimizer/README.md b/colossalai/nn/optimizer/README.md
index 268e37d57997..e2fc30bc5d4f 100644
--- a/colossalai/nn/optimizer/README.md
+++ b/colossalai/nn/optimizer/README.md
@@ -3,7 +3,7 @@
 ## Introduction
 
 Welcome to the large-scale deep learning optimization techniques of [Colossal-AI](https://github.com/hpcaitech/ColossalAI), 
-which has been accepted as official tutorials by top conference [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), etc.
+which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), etc.
 
 
 [Colossal-AI](https://github.com/hpcaitech/ColossalAI), a unified deep learning system for the big model era, integrates
diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
index 9de1cdfdc31d..1da77e831c23 100644
--- a/examples/tutorial/README.md
+++ b/examples/tutorial/README.md
@@ -2,7 +2,7 @@
 
 ## Introduction
 
-Welcome to the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) tutorial, which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), etc.
+Welcome to the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) tutorial, which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), etc.
 
 
 [Colossal-AI](https://github.com/hpcaitech/ColossalAI), a unified deep learning system for the big model era, integrates

From 81ea66d25d9dc10fcd4d7331e7a2274e849f0909 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 13 Feb 2023 09:51:25 +0800
Subject: [PATCH 291/503] [release] v0.2.3 (#2669)

* [release] v0.2.3

* polish code
---
 docs/sidebars.js   | 73 ------------------------------------------
 docs/sidebars.json | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 docs/versions.json |  2 +-
 version.txt        |  2 +-
 4 files changed, 81 insertions(+), 75 deletions(-)
 delete mode 100644 docs/sidebars.js
 create mode 100644 docs/sidebars.json

diff --git a/docs/sidebars.js b/docs/sidebars.js
deleted file mode 100644
index d6273b558767..000000000000
--- a/docs/sidebars.js
+++ /dev/null
@@ -1,73 +0,0 @@
-module.exports = {
-  docs: [
-    {
-      type: 'category',
-      label: 'Get started',
-      collapsed: false,
-      items: [
-        'get_started/installation', 'get_started/run_demo',
-        'get_started/reading_roadmap'
-      ],
-    },
-    {
-      type: 'category',
-      label: 'Concepts',
-      collapsed: false,
-      items: [
-        'concepts/distributed_training', 'concepts/paradigms_of_parallelism',
-        'concepts/colossalai_overview'
-      ],
-    },
-    {
-      type: 'category',
-      label: 'Basics',
-      collapsed: false,
-      items: [
-        'basics/command_line_tool',
-        'basics/define_your_config',
-        'basics/launch_colossalai',
-        'basics/initialize_features',
-        'basics/engine_trainer',
-        'basics/configure_parallelization',
-        'basics/model_checkpoint',
-        'basics/colotensor_concept',
-      ],
-    },
-    {
-      type: 'category',
-      label: 'Features',
-      collapsed: false,
-      items: [
-        'features/mixed_precision_training', 'features/gradient_accumulation',
-        'features/gradient_clipping', 'features/gradient_handler',
-        'features/zero_with_chunk', {
-          type: 'category',
-          label: 'Tensor Parallel',
-          collapsed: true,
-          items: [
-            'features/1D_tensor_parallel',
-            'features/2D_tensor_parallel',
-            'features/2p5D_tensor_parallel',
-            'features/3D_tensor_parallel',
-          ],
-        },
-        'features/pipeline_parallel', 'features/nvme_offload'
-      ],
-    },
-    {
-      type: 'category',
-      label: 'Advanced Tutorials',
-      collapsed: false,
-      items: [
-        'advanced_tutorials/train_vit_using_pipeline_parallelism',
-        'advanced_tutorials/train_vit_with_hybrid_parallelism',
-        'advanced_tutorials/train_gpt_using_hybrid_parallelism',
-        'advanced_tutorials/define_your_own_parallel_model',
-        'advanced_tutorials/add_your_parallel',
-        'advanced_tutorials/meet_gemini',
-        'advanced_tutorials/parallelize_your_training_like_Megatron',
-        'advanced_tutorials/integrate_mixture_of_experts_into_your_model'
-      ],
-    },
-  ]
-};
diff --git a/docs/sidebars.json b/docs/sidebars.json
new file mode 100644
index 000000000000..9e9ef89ba63f
--- /dev/null
+++ b/docs/sidebars.json
@@ -0,0 +1,79 @@
+{
+  "tutorialSidebar": [
+    {
+      "type": "category",
+      "label": "Get started",
+      "collapsed": true,
+      "items": [
+        "get_started/installation",
+        "get_started/run_demo",
+        "get_started/reading_roadmap"
+      ]
+    },
+    {
+      "type": "category",
+      "label": "Concepts",
+      "collapsed": true,
+      "items": [
+        "concepts/distributed_training",
+        "concepts/paradigms_of_parallelism",
+        "concepts/colossalai_overview"
+      ]
+    },
+    {
+      "type": "category",
+      "label": "Basics",
+      "collapsed": true,
+      "items": [
+        "basics/command_line_tool",
+        "basics/define_your_config",
+        "basics/launch_colossalai",
+        "basics/initialize_features",
+        "basics/engine_trainer",
+        "basics/configure_parallelization",
+        "basics/model_checkpoint",
+        "basics/colotensor_concept"
+      ]
+    },
+    {
+      "type": "category",
+      "label": "Features",
+      "collapsed": true,
+      "items": [
+        "features/mixed_precision_training",
+        "features/gradient_accumulation",
+        "features/gradient_clipping",
+        "features/gradient_handler",
+        "features/zero_with_chunk",
+        {
+          "type": "category",
+          "label": "Tensor Parallel",
+          "collapsed": true,
+          "items": [
+            "features/1D_tensor_parallel",
+            "features/2D_tensor_parallel",
+            "features/2p5D_tensor_parallel",
+            "features/3D_tensor_parallel"
+          ]
+        },
+        "features/pipeline_parallel",
+        "features/nvme_offload"
+      ]
+    },
+    {
+      "type": "category",
+      "label": "Advanced Tutorials",
+      "collapsed": true,
+      "items": [
+        "advanced_tutorials/train_vit_using_pipeline_parallelism",
+        "advanced_tutorials/train_vit_with_hybrid_parallelism",
+        "advanced_tutorials/train_gpt_using_hybrid_parallelism",
+        "advanced_tutorials/define_your_own_parallel_model",
+        "advanced_tutorials/add_your_parallel",
+        "advanced_tutorials/meet_gemini",
+        "advanced_tutorials/parallelize_your_training_like_Megatron",
+        "advanced_tutorials/integrate_mixture_of_experts_into_your_model"
+      ]
+    }
+  ]
+}
diff --git a/docs/versions.json b/docs/versions.json
index 5961b1c980dc..8497d72d91b0 100644
--- a/docs/versions.json
+++ b/docs/versions.json
@@ -1,3 +1,3 @@
 [
-  "v0.2.2"
+  "v0.2.3"
 ]
diff --git a/version.txt b/version.txt
index ee1372d33a29..7179039691ce 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.2
+0.2.3

From 6d606344336a9f2afed5a57407bd796ee25e027b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 13 Feb 2023 10:10:12 +0800
Subject: [PATCH 292/503] [doc] added documentation sidebar translation (#2670)

---
 .../en/sidebar_category_translation.json      | 26 +++++++++++++++++++
 .../zh-Hans/sidebar_category_translation.json | 26 +++++++++++++++++++
 docs/versions.json                            |  2 +-
 3 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/en/sidebar_category_translation.json
 create mode 100644 docs/source/zh-Hans/sidebar_category_translation.json

diff --git a/docs/source/en/sidebar_category_translation.json b/docs/source/en/sidebar_category_translation.json
new file mode 100644
index 000000000000..1a68348b02e1
--- /dev/null
+++ b/docs/source/en/sidebar_category_translation.json
@@ -0,0 +1,26 @@
+{
+  "sidebar.docs.category.Get started": {
+    "message": "Get started",
+    "description": "The label for category Get started in sidebar docs"
+  },
+  "sidebar.docs.category.Concepts": {
+    "message": "Concepts",
+    "description": "The label for category Concepts in sidebar docs"
+  },
+  "sidebar.docs.category.Basics": {
+    "message": "Basics",
+    "description": "The label for category Basics in sidebar docs"
+  },
+  "sidebar.docs.category.Features": {
+    "message": "Features",
+    "description": "The label for category Features in sidebar docs"
+  },
+  "sidebar.docs.category.Tensor Parallel": {
+    "message": "Tensor Parallel",
+    "description": "The label for category Tensor Parallel in sidebar docs"
+  },
+  "sidebar.docs.category.Advanced Tutorials": {
+    "message": "Advanced Tutorials",
+    "description": "The label for category Advanced Tutorials in sidebar docs"
+  }
+}
diff --git a/docs/source/zh-Hans/sidebar_category_translation.json b/docs/source/zh-Hans/sidebar_category_translation.json
new file mode 100644
index 000000000000..3cc8f4a51206
--- /dev/null
+++ b/docs/source/zh-Hans/sidebar_category_translation.json
@@ -0,0 +1,26 @@
+{
+  "sidebar.docs.category.Get started": {
+    "message": "快速开始",
+    "description": "The label for category Get started in sidebar docs"
+  },
+  "sidebar.docs.category.Concepts": {
+    "message": "概念",
+    "description": "The label for category Concepts in sidebar docs"
+  },
+  "sidebar.docs.category.Basics": {
+    "message": "基础",
+    "description": "The label for category Basics in sidebar docs"
+  },
+  "sidebar.docs.category.Features": {
+    "message": "功能",
+    "description": "The label for category Features in sidebar docs"
+  },
+  "sidebar.docs.category.Tensor Parallel": {
+    "message": "张量并行",
+    "description": "The label for category Tensor Parallel in sidebar docs"
+  },
+  "sidebar.docs.category.Advanced Tutorials": {
+    "message": "高级教程",
+    "description": "The label for category Advanced Tutorials in sidebar docs"
+  }
+}
diff --git a/docs/versions.json b/docs/versions.json
index 8497d72d91b0..dde32982b798 100644
--- a/docs/versions.json
+++ b/docs/versions.json
@@ -1,3 +1,3 @@
 [
-  "v0.2.3"
+  "current"
 ]

From 0966008839c13c4d4ac643aa6adffb3c1617dccd Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 13 Feb 2023 10:45:16 +0800
Subject: [PATCH 293/503] [doc] fixed the sidebar item key (#2672)

---
 .../en/sidebar_category_translation.json      | 24 +++++++++----------
 .../zh-Hans/sidebar_category_translation.json | 12 +++++-----
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/docs/source/en/sidebar_category_translation.json b/docs/source/en/sidebar_category_translation.json
index 1a68348b02e1..9cc320424e40 100644
--- a/docs/source/en/sidebar_category_translation.json
+++ b/docs/source/en/sidebar_category_translation.json
@@ -1,26 +1,26 @@
 {
-  "sidebar.docs.category.Get started": {
+  "sidebar.tutorialSidebar.category.Get started": {
     "message": "Get started",
-    "description": "The label for category Get started in sidebar docs"
+    "description": "The label for category Get started in sidebar tutorialSidebar"
   },
-  "sidebar.docs.category.Concepts": {
+  "sidebar.tutorialSidebar.category.Concepts": {
     "message": "Concepts",
-    "description": "The label for category Concepts in sidebar docs"
+    "description": "The label for category Concepts in sidebar tutorialSidebar"
   },
-  "sidebar.docs.category.Basics": {
+  "sidebar.tutorialSidebar.category.Basics": {
     "message": "Basics",
-    "description": "The label for category Basics in sidebar docs"
+    "description": "The label for category Basics in sidebar tutorialSidebar"
   },
-  "sidebar.docs.category.Features": {
+  "sidebar.tutorialSidebar.category.Features": {
     "message": "Features",
-    "description": "The label for category Features in sidebar docs"
+    "description": "The label for category Features in sidebar tutorialSidebar"
   },
-  "sidebar.docs.category.Tensor Parallel": {
+  "sidebar.tutorialSidebar.category.Tensor Parallel": {
     "message": "Tensor Parallel",
-    "description": "The label for category Tensor Parallel in sidebar docs"
+    "description": "The label for category Tensor Parallel in sidebar tutorialSidebar"
   },
-  "sidebar.docs.category.Advanced Tutorials": {
+  "sidebar.tutorialSidebar.category.Advanced Tutorials": {
     "message": "Advanced Tutorials",
-    "description": "The label for category Advanced Tutorials in sidebar docs"
+    "description": "The label for category Advanced Tutorials in sidebar tutorialSidebar"
   }
 }
diff --git a/docs/source/zh-Hans/sidebar_category_translation.json b/docs/source/zh-Hans/sidebar_category_translation.json
index 3cc8f4a51206..af4c460c448f 100644
--- a/docs/source/zh-Hans/sidebar_category_translation.json
+++ b/docs/source/zh-Hans/sidebar_category_translation.json
@@ -1,25 +1,25 @@
 {
-  "sidebar.docs.category.Get started": {
+  "sidebar.tutorialSidebar.category.Get started": {
     "message": "快速开始",
     "description": "The label for category Get started in sidebar docs"
   },
-  "sidebar.docs.category.Concepts": {
+  "sidebar.tutorialSidebar.category.Concepts": {
     "message": "概念",
     "description": "The label for category Concepts in sidebar docs"
   },
-  "sidebar.docs.category.Basics": {
+  "sidebar.tutorialSidebar.category.Basics": {
     "message": "基础",
     "description": "The label for category Basics in sidebar docs"
   },
-  "sidebar.docs.category.Features": {
+  "sidebar.dotutorialSidebarcs.category.Features": {
     "message": "功能",
     "description": "The label for category Features in sidebar docs"
   },
-  "sidebar.docs.category.Tensor Parallel": {
+  "sidebar.dtutorialSidebarocs.category.Tensor Parallel": {
     "message": "张量并行",
     "description": "The label for category Tensor Parallel in sidebar docs"
   },
-  "sidebar.docs.category.Advanced Tutorials": {
+  "sidebar.tutorialSidebar.category.Advanced Tutorials": {
     "message": "高级教程",
     "description": "The label for category Advanced Tutorials in sidebar docs"
   }

From 8213f89fd2726e5048266ab4c456531b013a68cb Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Mon, 13 Feb 2023 14:35:32 +0800
Subject: [PATCH 294/503] [gemini] add fake_release_chunk for keep-gathered
 chunk in the inference mode (#2671)

---
 colossalai/gemini/chunk/manager.py         |  8 ++++
 colossalai/nn/parallel/data_parallel.py    |  7 +++-
 tests/test_gemini/update/test_inference.py | 44 +++++++++++++++-------
 3 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/colossalai/gemini/chunk/manager.py b/colossalai/gemini/chunk/manager.py
index 07fb6c48b2d7..e73c59b251fb 100644
--- a/colossalai/gemini/chunk/manager.py
+++ b/colossalai/gemini/chunk/manager.py
@@ -140,6 +140,14 @@ def reduce_chunk(self, chunk: Chunk) -> bool:
         self.__add_memory_usage(chunk.memory_usage)
         return True
 
+    def fake_release_chunk(self, chunk: Chunk) -> None:
+        """Release gathered chunk in a fake mode.
+        This function is used for keep-gathered chunk in the inference mode.
+        """
+        assert chunk.keep_gathered
+        assert chunk.tensor_state_cnter[TensorState.HOLD] == chunk.num_tensors
+        self.__sub_accessed_chunk(chunk)
+
     def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data: torch.Tensor) -> None:
         """
         Copy data to the chunk.
diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index a313da59b056..8e0192c71313 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -257,8 +257,11 @@ def _post_forward(self):
         access_list = list(self.chunk_manager.accessed_chunks)
         # we need to scatter all accessed chunks and move them to their original places
         for chunk in access_list:
-            assert chunk.can_release
-            self.chunk_manager.release_chunk(chunk)
+            if chunk.keep_gathered:
+                self.chunk_manager.fake_release_chunk(chunk)
+            else:
+                assert chunk.can_release
+                self.chunk_manager.release_chunk(chunk)
             first_param = next(iter(chunk.tensors_info))
             self.chunk_manager.move_chunk(chunk, self.grads_device[first_param])
         assert self.chunk_manager.accessed_mem == 0
diff --git a/tests/test_gemini/update/test_inference.py b/tests/test_gemini/update/test_inference.py
index 443155865667..b057448ad378 100644
--- a/tests/test_gemini/update/test_inference.py
+++ b/tests/test_gemini/update/test_inference.py
@@ -1,4 +1,5 @@
 from functools import partial
+from typing import Callable
 
 import pytest
 import torch
@@ -13,7 +14,7 @@
 from colossalai.gemini.gemini_mgr import GeminiManager
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.nn.optimizer.zero_optimizer import ZeroOptimizer
-from colossalai.nn.parallel import ZeroDDP
+from colossalai.nn.parallel import ZeroDDP, zero_model_wrapper
 from colossalai.testing import parameterize, rerun_if_address_is_in_use
 from colossalai.utils import free_port
 from colossalai.utils.cuda import get_current_device
@@ -36,9 +37,35 @@ def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
         assert_close(value, temp_zero_value, rtol=1e-3, atol=4e-3)
 
 
+def multi_chunk_init(model: torch.nn.Module, placement_policy: str):
+    world_size = dist.get_world_size()
+    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict[world_size]['chunk_size'] = 5000
+    config_dict[world_size]['keep_gathered'] = False
+    if placement_policy != 'cuda':
+        init_device = torch.device('cpu')
+    else:
+        init_device = None
+    chunk_manager = ChunkManager(config_dict, init_device=init_device)
+    gemini_manager = GeminiManager(placement_policy, chunk_manager)
+    model = ZeroDDP(model, gemini_manager, pin_memory=True)
+    return model
+
+
+def single_chunk_init(model: torch.nn.Module, placement_policy: str):
+    gemini_config = dict(
+        device=get_current_device(),
+        placement_policy=placement_policy,
+        pin_memory=True,
+    )
+    model = zero_model_wrapper(model=model, zero_stage=3, gemini_config=gemini_config)
+    return model
+
+
 @parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
 @parameterize('model_name', ['gpt2'])
-def exam_inference(placement_policy, model_name: str):
+@parameterize('model_init_func', [single_chunk_init, multi_chunk_init])
+def exam_inference(placement_policy: str, model_name: str, model_init_func: Callable):
     set_seed(19360226)
     get_components_func = non_distributed_component_funcs.get_callable(model_name)
     model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
@@ -56,18 +83,7 @@ def exam_inference(placement_policy, model_name: str):
     for torch_p, p in zip(torch_model.parameters(), model.parameters()):
         p.data.copy_(torch_p.data)
 
-    world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
-    config_dict[world_size]['chunk_size'] = 5000
-    config_dict[world_size]['keep_gathered'] = False
-    if placement_policy != 'cuda':
-        init_device = torch.device('cpu')
-    else:
-        init_device = None
-    chunk_manager = ChunkManager(config_dict, init_device=init_device)
-    gemini_manager = GeminiManager(placement_policy, chunk_manager)
-    model = ZeroDDP(model, gemini_manager, pin_memory=True)
-
+    model = model_init_func(model, placement_policy)
     optimizer = HybridAdam(model.parameters(), lr=1e-3)
     zero_optim = ZeroOptimizer(optimizer, model, initial_scale=128)
 

From 327bc06278cee0939436dac4cc1a518f208b2626 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 13 Feb 2023 15:55:57 +0800
Subject: [PATCH 295/503] [workflow] added doc build test (#2675)

* [workflow] added doc build test

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 .github/workflows/check_doc_on_pr.yml | 45 +++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/.github/workflows/check_doc_on_pr.yml b/.github/workflows/check_doc_on_pr.yml
index 5b3c4f6fbc6d..6e42053ddc08 100644
--- a/.github/workflows/check_doc_on_pr.yml
+++ b/.github/workflows/check_doc_on_pr.yml
@@ -21,3 +21,48 @@ jobs:
           python-version: '3.8.14'
 
       - run: python .github/workflows/scripts/check_doc_i18n.py -d docs/source
+
+  check-doc-build:
+    name: Test if the docs can be built
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          path: './ColossalAI'
+          fetch-depth: 0
+
+      - uses: actions/checkout@v2
+        with:
+          path: './ColossalAI-Documentation'
+          repository: 'hpcaitech/ColossalAI-Documentation'
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.8.14'
+
+      - run: |
+          ls -la
+          ls -la ..
+
+      # we use the versions in the main branch as the guide for versions to display
+      # checkout will give your merged branch
+      # therefore, we need to make the merged branch as the main branch
+      - name: Make the merged branch main
+        run: |
+          cd ColossalAI
+          curBranch=$(git rev-parse --abbrev-ref HEAD)
+          git checkout main
+          git merge $curBranch             # fast-forward master up to the merge
+
+      - name: Build docs
+        run: |
+          cache_dir=ColossalAI-Documentation/doc-build/.cache
+          mkdir $cache_dir
+          mv ColossalAI $cache_dir
+          cd ColossalAI-Documentation
+          pip install -v ./doc-build
+          bash ./scripts/build.sh

From 40c916b1924097b154d611ef4a7177f8c5ebac76 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Mon, 13 Feb 2023 16:09:22 +0800
Subject: [PATCH 296/503] [autoparallel] Patch meta information of
 `torch.nn.functional.softmax` and `torch.nn.Softmax` (#2674)

* [autoparallel] softmax metainfo

* [autoparallel] softmax metainfo
---
 .../meta_profiler/meta_registry/activation.py | 50 ++++++++++++++++++
 .../test_metainfo/test_activation_metainfo.py | 51 ++++++++++++++++++-
 2 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py b/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
index 774457f7d3b6..c659cd9ac389 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
@@ -72,3 +72,53 @@ def relu_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, Lis
     fwd_out = [torch.zeros_like(output_tensor, device='meta')]
 
     return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+
+
+@meta_register.register(torch.nn.Softmax)
+@meta_register.register(torch.nn.functional.softmax)
+def softmax_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+    """torch.nn.Softmax metainfo generator
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
+    """
+    input_tensor = next(
+        filter(
+            lambda x:
+            (x.type == OperationDataType.ARG or x.type == OperationDataType.PARAM) and x.name != 'softmax_dim',
+            args)).data
+    output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
+    softmax_dim = next(filter(lambda x: x.name == 'softmax_dim', args)).data
+
+    # calculate cost
+
+    # calculate compute cost
+    fwd_compute_cost = flop_mapping[torch.ops.aten._softmax.default]([input_tensor], [output_tensor])
+    bwd_compute_cost = flop_mapping[torch.ops.aten._softmax_backward_data.default]([output_tensor], [input_tensor])
+
+    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
+
+    # calculate memory cost
+    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
+    fwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, output_tensor]),
+                                 parameter=0,
+                                 temp=0,
+                                 buffer=0)
+    bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor),
+                                 parameter=0,
+                                 temp=activation_size(input_tensor),
+                                 buffer=0)
+
+    # total cost is the sum of forward and backward cost
+    total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
+                            parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter,
+                            temp=fwd_memory_cost.temp + bwd_memory_cost.temp,
+                            buffer=fwd_memory_cost.buffer + bwd_memory_cost.buffer)
+
+    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
+
+    # store fwd_in, fwd_buffer, fwd_out
+    fwd_in = []
+    fwd_buffer = [torch.zeros_like(output_tensor, device='meta')]
+    fwd_out = [torch.zeros_like(output_tensor, device='meta')]
+
+    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py
index f468b1ab2113..b9b42f8c161d 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py
@@ -5,6 +5,8 @@
 import torch.multiprocessing as mp
 import torch.nn as nn
 
+from colossalai.auto_parallel.meta_profiler import meta_register
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx import ColoGraphModule, ColoTracer
 from colossalai.initialize import launch
@@ -12,7 +14,7 @@
 from colossalai.testing.pytest_wrapper import run_on_environment_flag
 from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
 from colossalai.utils import free_port
-from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy
+from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy, print_results
 
 
 def _ReLU_module_mem_test(rank, world_size, port):
@@ -57,5 +59,50 @@ def test_ReLU_meta_concrete_info_match():
     mp.spawn(run_func_module, nprocs=world_size)
 
 
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
+def test_sofmax_meta_info():
+    meta_func = meta_register.get(torch.nn.functional.softmax)
+    # construct meta tensors
+    input_tensor = torch.rand(256, 1024, device="meta")
+    output_tensor = torch.rand(256, 1024, device="meta")
+    softmax_dim = 0
+
+    # construct operation data
+    input_data = OperationData(name='input', type=OperationDataType.ARG, data=input_tensor)
+    output_data = OperationData(name='output', type=OperationDataType.OUTPUT, data=output_tensor)
+    softmax_dim_data = OperationData(name='softmax_dim', type=OperationDataType.ARG, data=softmax_dim)
+
+    # construct args and kwargs
+    args = [input_data, softmax_dim_data, output_data]
+    kwargs = {'inplace': False}
+
+    # estimated results
+    compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out = meta_func(*args, **kwargs)
+
+    # actual results
+    input_real_tensor = torch.rand(256, 1024, device="cuda")
+
+    input_real_tensor.requires_grad = True
+
+    # fwd
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    output_real_tensor = torch.nn.functional.softmax(input_real_tensor, dim=softmax_dim)
+    fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    # bwd
+    upstream_grad = torch.rand_like(output_real_tensor)
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    torch.autograd.backward(output_real_tensor, upstream_grad)
+    bwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    bwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    print_results([input_real_tensor], [output_real_tensor], compute_cost, memory_cost, fwd_allocated, fwd_peak,
+                  bwd_allocated, bwd_peak)
+
+
 if __name__ == '__main__':
-    test_ReLU_meta_concrete_info_match()
+    # test_ReLU_meta_concrete_info_match()
+    test_sofmax_meta_info()

From c44fd0c867939ecdd5089ae0fcc5879896c18151 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 13 Feb 2023 16:53:26 +0800
Subject: [PATCH 297/503] [workflow] added trigger to build doc upon release
 (#2678)

---
 .github/workflows/doc_build_after_merge.yml   | 28 +++++++++++++++++++
 ...heck_doc_on_pr.yml => doc_check_on_pr.yml} |  0
 2 files changed, 28 insertions(+)
 create mode 100644 .github/workflows/doc_build_after_merge.yml
 rename .github/workflows/{check_doc_on_pr.yml => doc_check_on_pr.yml} (100%)

diff --git a/.github/workflows/doc_build_after_merge.yml b/.github/workflows/doc_build_after_merge.yml
new file mode 100644
index 000000000000..dae3b70e1f4c
--- /dev/null
+++ b/.github/workflows/doc_build_after_merge.yml
@@ -0,0 +1,28 @@
+name: Build Documentation upon Release
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - 'version.txt'
+    types:
+      - closed
+
+jobs:
+  build-doc:
+    name: Trigger Documentation Build Workflow
+    if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI'
+    runs-on: ubuntu-latest
+    steps:
+      - name: trigger workflow in ColossalAI-Documentation
+        run: |
+          gh
+          curl \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${GH_TOKEN}"\
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/hpcaitech/ColossalAI-Documentation/actions/workflows/deploy.yml/dispatches \
+            -d '{"ref":"main"}'
+        env:
+          GH_TOKEN: ${{secrets.DOC_REPO_TOKEN}}
diff --git a/.github/workflows/check_doc_on_pr.yml b/.github/workflows/doc_check_on_pr.yml
similarity index 100%
rename from .github/workflows/check_doc_on_pr.yml
rename to .github/workflows/doc_check_on_pr.yml

From 5cd8cae0c9db7ae4bfb98e42ec9c7ab2e6ebd17f Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 13 Feb 2023 17:04:49 +0800
Subject: [PATCH 298/503] [workflow] fixed community report ranking (#2680)

---
 .../scripts/generate_leaderboard_and_send_to_lark.py        | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
index 36cdd9518486..16b8957c1d88 100644
--- a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
+++ b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
@@ -292,7 +292,13 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
     y = []
 
     if len(total_engagement_count) > 0:
+        ranking = []
         for name, count in total_engagement_count.items():
+            ranking.append((name, count))
+
+        ranking.sort(key=lambda x: x[1], reverse=True)
+
+        for name, count in ranking:
             x.append(count)
             y.append(name)
 

From f0aa191f51704806e65ac849da137069bb35a6d5 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Mon, 13 Feb 2023 17:53:15 +0800
Subject: [PATCH 299/503] [gemini] fix colo_init_context (#2683)

---
 colossalai/utils/model/colo_init_context.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/utils/model/colo_init_context.py b/colossalai/utils/model/colo_init_context.py
index ab354ea70320..87ae413a2a8a 100644
--- a/colossalai/utils/model/colo_init_context.py
+++ b/colossalai/utils/model/colo_init_context.py
@@ -32,7 +32,7 @@ def _convert_to_coloparam(param: torch.nn.Parameter,
                           default_pg: Optional[ProcessGroup] = None,
                           default_dist_spec: Optional[Any] = None) -> ColoParameter:
 
-    if isinstance(param, ColoParameter):
+    if type(param) is ColoParameter:
         return param
     # detaching tensor is necessary for optimizers.
     requires_grad = param.requires_grad
@@ -102,7 +102,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
         """
         name_list = []
         for name, param in _named_params_with_replica(module):
-            if isinstance(param, ColoTensor):
+            if type(param) is ColoParameter:
                 continue
 
             split = name.rfind('.')

From df4f020ee32c5b857d2a9806d5ec40d0b2064021 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Mon, 13 Feb 2023 18:00:16 +0800
Subject: [PATCH 300/503] [zero1&2] only append parameters with gradients
 (#2681)

---
 colossalai/zero/sharded_optim/low_level_optim.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index d174fc6ac138..89f5f9fadca4 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -131,7 +131,10 @@ def __init__(
         # partition these param groups for data parallel training
         # and add buffers to parameter store for future access
         for group_id, param_group in enumerate(self.optim.param_groups):
-            group_params = param_group['params']
+            group_params = list()
+            for param in param_group['params']:
+                if param.requires_grad:
+                    group_params.append(param)
 
             # add the fp16 params to fp16_param_groups for bookkeeping
             self._fp16_param_groups[group_id] = group_params

From 88416019e7d55dd4c28e29b9cdded60f71153964 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 13 Feb 2023 18:10:54 +0800
Subject: [PATCH 301/503] Automated submodule synchronization (#2648)

Co-authored-by: github-actions <github-actions@github.com>
---
 examples/tutorial/fastfold/FastFold | 2 +-
 inference                           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/tutorial/fastfold/FastFold b/examples/tutorial/fastfold/FastFold
index 95150c384b9b..0188361b6e2b 160000
--- a/examples/tutorial/fastfold/FastFold
+++ b/examples/tutorial/fastfold/FastFold
@@ -1 +1 @@
-Subproject commit 95150c384b9b6e776cad38dd91494e74115dc4ac
+Subproject commit 0188361b6e2b46bca61d37af5674eacf7ca9947f
diff --git a/inference b/inference
index 5d250f4af628..cde4c8f4e726 160000
--- a/inference
+++ b/inference
@@ -1 +1 @@
-Subproject commit 5d250f4af6283f65a701636628ffeef10447e650
+Subproject commit cde4c8f4e7269decb82b1b225ada278694e10f6a

From 46f20bac4109c29f7a346fa6f62ee8fb66799dc5 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Mon, 13 Feb 2023 23:05:29 +0800
Subject: [PATCH 302/503] [doc] update auto parallel paper link (#2686)

* [doc] update auto parallel paper link

* [doc] update auto parallel paper link
---
 README-zh-Hans.md | 2 +-
 README.md         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 023b5a21c955..4b0ba9c4213b 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -102,7 +102,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
   - 1维, [2维](https://arxiv.org/abs/2104.05343), [2.5维](https://arxiv.org/abs/2105.14500), [3维](https://arxiv.org/abs/2105.14450) 张量并行
   - [序列并行](https://arxiv.org/abs/2105.13120)
   - [零冗余优化器 (ZeRO)](https://arxiv.org/abs/1910.02054)
-  - [自动并行](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt)
+  - [自动并行](https://arxiv.org/abs/2302.02599)
 - 异构内存管理
   - [PatrickStar](https://arxiv.org/abs/2108.05818)
 - 使用友好
diff --git a/README.md b/README.md
index 6ad736a43cde..703e3f3bf9c6 100644
--- a/README.md
+++ b/README.md
@@ -104,7 +104,7 @@ distributed training and inference in a few lines.
   - 1D, [2D](https://arxiv.org/abs/2104.05343), [2.5D](https://arxiv.org/abs/2105.14500), [3D](https://arxiv.org/abs/2105.14450) Tensor Parallelism
   - [Sequence Parallelism](https://arxiv.org/abs/2105.13120)
   - [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054)
-  - [Auto-Parallelism](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt)
+  - [Auto-Parallelism](https://arxiv.org/abs/2302.02599)
 
 - Heterogeneous Memory Management
   - [PatrickStar](https://arxiv.org/abs/2108.05818)

From 1712da2800e8bc2b539583692668c13b267ed7af Mon Sep 17 00:00:00 2001
From: Shawn-Kong <xk39@berkeley.edu>
Date: Mon, 13 Feb 2023 19:55:23 -0800
Subject: [PATCH 303/503] [NFC] polish colossalai/gemini/gemini_context.py code
 style (#2690)

---
 colossalai/gemini/gemini_context.py | 96 ++++++++++++++---------------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/colossalai/gemini/gemini_context.py b/colossalai/gemini/gemini_context.py
index 98c8a914e5ca..9a7da6b80fba 100644
--- a/colossalai/gemini/gemini_context.py
+++ b/colossalai/gemini/gemini_context.py
@@ -1,48 +1,48 @@
-from enum import EnumMeta
-
-
-class GeminiMemoryManager(object):
-
-    def __init__(self, states_cls: EnumMeta):
-        super().__init__()
-        self.states_cls = states_cls
-        self._cnter = 0    # the counter of instances
-
-        self.total_mem = dict()
-        self.state_mem = dict()
-        self.state_mem['cpu'] = dict()
-        self.state_mem['cuda'] = dict()
-
-        self.reset()
-
-    @property
-    def total_number(self):
-        return self._cnter
-
-    def reset(self):
-        self._cnter = 0    # the counter of instances
-
-        self.total_mem['cpu'] = 0    # memory occupation of instances in cpu
-        self.total_mem['cuda'] = 0    # memory of occupation of instances in cuda
-
-        # memory conditions for all states
-        for state in self.states_cls:
-            self.state_mem['cpu'][state] = 0
-            self.state_mem['cuda'][state] = 0
-
-    def register_new_instance(self):
-        self._cnter += 1
-
-    def delete_instance(self):
-        self._cnter -= 1
-
-    def print_info(self):
-        print(f"Total number: {self.total_number}",
-              f"Total CPU memory occupation: {self.total_mem['cpu']}",
-              f"Total CUDA memory occupation: {self.total_mem['cuda']}\n",
-              sep='\n')
-
-        for state in self.states_cls:
-            print(f"{state}: CPU memory occupation: {self.state_mem['cpu'][state]}",
-                  f"{state}: CUDA memory occupation: {self.state_mem['cuda'][state]}\n",
-                  sep='\n')
+from enum import EnumMeta
+
+
+class GeminiMemoryManager(object):
+
+    def __init__(self, states_cls: EnumMeta):
+        super().__init__()
+        self.states_cls = states_cls
+        self._cnter = 0    # the counter of instances
+
+        self.total_mem = dict()
+        self.state_mem = dict()
+        self.state_mem['cpu'] = dict()
+        self.state_mem['cuda'] = dict()
+
+        self.reset()
+
+    @property
+    def total_number(self):
+        return self._cnter
+
+    def reset(self):
+        self._cnter = 0    # the counter of instances
+
+        self.total_mem['cpu'] = 0    # memory occupation of instances in cpu
+        self.total_mem['cuda'] = 0    # memory of occupation of instances in cuda
+
+        # memory conditions for all states
+        for state in self.states_cls:
+            self.state_mem['cpu'][state] = 0
+            self.state_mem['cuda'][state] = 0
+
+    def register_new_instance(self):
+        self._cnter += 1
+
+    def delete_instance(self):
+        self._cnter -= 1
+
+    def print_info(self):
+        print(f"Total number: {self.total_number}",
+              f"Total CPU memory occupation: {self.total_mem['cpu']}",
+              f"Total CUDA memory occupation: {self.total_mem['cuda']}\n",
+              sep='\n')
+
+        for state in self.states_cls:
+            print(f"{state}: CPU memory occupation: {self.state_mem['cpu'][state]}",
+                  f"{state}: CUDA memory occupation: {self.state_mem['cuda'][state]}\n",
+                  sep='\n')

From 56ff1921e9d3d31c30a9e7077b906f7a2bad2e66 Mon Sep 17 00:00:00 2001
From: LuGY <74758262+Gy-Lu@users.noreply.github.com>
Date: Tue, 14 Feb 2023 18:02:45 +0800
Subject: [PATCH 304/503] [NFC] polish colossalai/context/moe_context.py code
 style (#2693)

---
 colossalai/context/moe_context.py | 258 +++++++++++++++---------------
 1 file changed, 129 insertions(+), 129 deletions(-)

diff --git a/colossalai/context/moe_context.py b/colossalai/context/moe_context.py
index 0879f5fd2659..1d7a883b1552 100644
--- a/colossalai/context/moe_context.py
+++ b/colossalai/context/moe_context.py
@@ -1,129 +1,129 @@
-import torch
-import torch.distributed as dist
-
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.context.singleton_meta import SingletonMeta
-from colossalai.tensor import ProcessGroup
-
-from typing import Tuple
-
-
-def _check_sanity():
-    from colossalai.core import global_context as gpc
-    if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1:
-        raise NotImplementedError("Moe is not compatible with tensor or "
-                                  "pipeline parallel at present.")
-
-
-class MoeParallelInfo:
-    """Moe parallelism information, storing parallel sizes and groups.
-    """
-
-    def __init__(self, ep_size: int, dp_size: int):
-        _check_sanity()
-        self.ep_size = ep_size
-        self.dp_size = dp_size
-        self.pg = ProcessGroup(tp_degree=ep_size, dp_degree=dp_size)
-        self.ep_group = self.pg.tp_process_group()
-        self.dp_group = self.pg.dp_process_group()
-
-
-class MoeContext(metaclass=SingletonMeta):
-    """MoE parallel context manager. This class manages different
-    parallel groups in MoE context and MoE loss in training.
-    """
-
-    def __init__(self):
-        self.world_size = 1
-        # Users may want to set maximum expert parallel size smaller than the world size
-        # since very low bandwidth across nodes may constrain the performance of MoE
-        # When we have a maximum expert parallel size, we have a minimum data parallel size naturally
-        self.max_ep_size = 1
-        self.min_dp_size = 1
-        self.aux_loss = None
-        self.use_kernel_optim = True
-
-        self.has_setup = False
-        self._parallel_info_dict = dict()
-
-    @property
-    def parallel_info_dict(self):
-        return self._parallel_info_dict
-
-    @property
-    def is_initialized(self):
-        return self.has_setup
-
-    def setup(self, seed: int, use_kernel_optim: bool = True):
-        assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
-        _check_sanity()
-        assert torch.cuda.is_available(), "MoE requires to enable CUDA first"
-
-        self.world_size = dist.get_world_size()
-
-        from colossalai.core import global_context as gpc
-        self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
-        assert self.world_size % self.max_ep_size == 0, \
-            "Maximum epxert parallel size must be a factor of the number of GPUs"
-        self.min_dp_size = self.world_size // self.max_ep_size
-
-        # Enabling kernel optimization may raise error in some cases
-        # Users can close kernel optimization manually
-        self.use_kernel_optim = use_kernel_optim
-
-        from .random import moe_set_seed
-        moe_set_seed(seed)
-        self.has_setup = True
-
-    def get_info(self, num_experts: int) -> Tuple[int, MoeParallelInfo]:
-        """Calculate the Data Parallel Group and Expert Parallel Group.
-
-        Parameters
-        ----------
-        num_experts : int
-            The number experts
-
-        Returns
-        -------
-        int, MoeParallelInfo
-            number of local experts, the MoeParallelInfo of the current ep_size
-        """
-
-        gt_flag = num_experts % self.max_ep_size == 0    # check whether num_experts is greater
-        lt_flag = self.max_ep_size % num_experts == 0    # check whether num_experts is less
-
-        assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \
-                                   " is not a multiple of ep size or vice versa."
-
-        # If the number of experts is greater than maximum expert parallel size. a.k.a ep_size,
-        # there are multiple experts in each GPU and each GPU has different experts
-        # So it's data parallel size is 1
-        # Otherwise, there is only one expert in each GPU
-        # The data parallel size should be calculated
-        dp_size = 1 if gt_flag else self.max_ep_size // num_experts
-        ep_size = self.max_ep_size // dp_size
-
-        # Calculate the number of experts for each GPU
-        num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size
-
-        # Don't forget to multiply minimum data parallel size
-        dp_size *= self.min_dp_size
-        if not (ep_size in self.parallel_info_dict):
-            self.parallel_info_dict[ep_size] = MoeParallelInfo(ep_size, dp_size)
-
-        return num_local_experts, self.parallel_info_dict[ep_size]
-
-    def set_kernel_not_use(self):
-        self.use_kernel_optim = False
-
-    def reset_loss(self):
-        self.aux_loss = 0
-
-    def add_loss(self, loss):
-        self.aux_loss += loss
-
-    def get_loss(self):
-        return self.aux_loss
-
-
-MOE_CONTEXT = MoeContext()
+from typing import Tuple
+
+import torch
+import torch.distributed as dist
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.context.singleton_meta import SingletonMeta
+from colossalai.tensor import ProcessGroup
+
+
+def _check_sanity():
+    from colossalai.core import global_context as gpc
+    if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1:
+        raise NotImplementedError("Moe is not compatible with tensor or "
+                                  "pipeline parallel at present.")
+
+
+class MoeParallelInfo:
+    """Moe parallelism information, storing parallel sizes and groups.
+    """
+
+    def __init__(self, ep_size: int, dp_size: int):
+        _check_sanity()
+        self.ep_size = ep_size
+        self.dp_size = dp_size
+        self.pg = ProcessGroup(tp_degree=ep_size, dp_degree=dp_size)
+        self.ep_group = self.pg.tp_process_group()
+        self.dp_group = self.pg.dp_process_group()
+
+
+class MoeContext(metaclass=SingletonMeta):
+    """MoE parallel context manager. This class manages different
+    parallel groups in MoE context and MoE loss in training.
+    """
+
+    def __init__(self):
+        self.world_size = 1
+        # Users may want to set maximum expert parallel size smaller than the world size
+        # since very low bandwidth across nodes may constrain the performance of MoE
+        # When we have a maximum expert parallel size, we have a minimum data parallel size naturally
+        self.max_ep_size = 1
+        self.min_dp_size = 1
+        self.aux_loss = None
+        self.use_kernel_optim = True
+
+        self.has_setup = False
+        self._parallel_info_dict = dict()
+
+    @property
+    def parallel_info_dict(self):
+        return self._parallel_info_dict
+
+    @property
+    def is_initialized(self):
+        return self.has_setup
+
+    def setup(self, seed: int, use_kernel_optim: bool = True):
+        assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
+        _check_sanity()
+        assert torch.cuda.is_available(), "MoE requires to enable CUDA first"
+
+        self.world_size = dist.get_world_size()
+
+        from colossalai.core import global_context as gpc
+        self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
+        assert self.world_size % self.max_ep_size == 0, \
+            "Maximum epxert parallel size must be a factor of the number of GPUs"
+        self.min_dp_size = self.world_size // self.max_ep_size
+
+        # Enabling kernel optimization may raise error in some cases
+        # Users can close kernel optimization manually
+        self.use_kernel_optim = use_kernel_optim
+
+        from .random import moe_set_seed
+        moe_set_seed(seed)
+        self.has_setup = True
+
+    def get_info(self, num_experts: int) -> Tuple[int, MoeParallelInfo]:
+        """Calculate the Data Parallel Group and Expert Parallel Group.
+
+        Parameters
+        ----------
+        num_experts : int
+            The number experts
+
+        Returns
+        -------
+        int, MoeParallelInfo
+            number of local experts, the MoeParallelInfo of the current ep_size
+        """
+
+        gt_flag = num_experts % self.max_ep_size == 0    # check whether num_experts is greater
+        lt_flag = self.max_ep_size % num_experts == 0    # check whether num_experts is less
+
+        assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \
+                                   " is not a multiple of ep size or vice versa."
+
+        # If the number of experts is greater than maximum expert parallel size. a.k.a ep_size,
+        # there are multiple experts in each GPU and each GPU has different experts
+        # So it's data parallel size is 1
+        # Otherwise, there is only one expert in each GPU
+        # The data parallel size should be calculated
+        dp_size = 1 if gt_flag else self.max_ep_size // num_experts
+        ep_size = self.max_ep_size // dp_size
+
+        # Calculate the number of experts for each GPU
+        num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size
+
+        # Don't forget to multiply minimum data parallel size
+        dp_size *= self.min_dp_size
+        if not (ep_size in self.parallel_info_dict):
+            self.parallel_info_dict[ep_size] = MoeParallelInfo(ep_size, dp_size)
+
+        return num_local_experts, self.parallel_info_dict[ep_size]
+
+    def set_kernel_not_use(self):
+        self.use_kernel_optim = False
+
+    def reset_loss(self):
+        self.aux_loss = 0
+
+    def add_loss(self, loss):
+        self.aux_loss += loss
+
+    def get_loss(self):
+        return self.aux_loss
+
+
+MOE_CONTEXT = MoeContext()

From 534f68c83c948bf5f2c134ea59f0c19a67cdab19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E3=82=A2=E3=83=9E=E3=83=87=E3=82=A6=E3=82=B9?=
 <kurisusnowdeng@users.noreply.github.com>
Date: Tue, 14 Feb 2023 18:12:01 +0800
Subject: [PATCH 305/503] [NFC] polish pipeline process group code style
 (#2694)

---
 .../context/process_group_initializer/initializer_pipeline.py  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/context/process_group_initializer/initializer_pipeline.py b/colossalai/context/process_group_initializer/initializer_pipeline.py
index edd1a3706c68..0ddb52f63e22 100644
--- a/colossalai/context/process_group_initializer/initializer_pipeline.py
+++ b/colossalai/context/process_group_initializer/initializer_pipeline.py
@@ -4,8 +4,9 @@
 from torch import distributed as dist
 
 from colossalai.registry import DIST_GROUP_INITIALIZER
-from .process_group_initializer import ProcessGroupInitializer
+
 from ..parallel_mode import ParallelMode
+from .process_group_initializer import ProcessGroupInitializer
 
 
 @DIST_GROUP_INITIALIZER.register_module

From c3abdd085d3daa33d7a026b610651cc06fa3a246 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Tue, 14 Feb 2023 19:37:14 +0800
Subject: [PATCH 306/503] [release] update version (#2691)

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 7179039691ce..abd410582dea 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.3
+0.2.4

From 6427c406cf4c23564a09a55570c487222d70a552 Mon Sep 17 00:00:00 2001
From: Liu Ziming <38985202+MaruyamaAya@users.noreply.github.com>
Date: Tue, 14 Feb 2023 21:30:25 +0800
Subject: [PATCH 307/503] [NFC] polish
 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
 code style (#2695)

Co-authored-by: shenggan <csg19971016@gmail.com>
---
 .../deprecated/op_handler/strategy_generator.py       | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
index 4e39fcd8e82d..5f6cc69ba2dd 100644
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
@@ -1,6 +1,7 @@
-from dataclasses import dataclass
 from abc import ABC, abstractmethod
-from typing import List, Dict
+from dataclasses import dataclass
+from typing import Dict, List
+
 from colossalai.device.device_mesh import DeviceMesh
 
 __all__ = ['IntermediateStrategy', 'StrategyGenerator']
@@ -9,7 +10,7 @@
 @dataclass
 class IntermediateStrategy:
     """
-    IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is 
+    IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is
     to store the essential information regarding the tensor sharding and leave other meta information to OperatorHandler.
 
     Args:
@@ -24,7 +25,7 @@ class IntermediateStrategy:
 
 class StrategyGenerator(ABC):
     """
-    StrategyGenerator is used to generate the same group of sharding strategies. 
+    StrategyGenerator is used to generate the same group of sharding strategies.
     """
 
     def __init__(self, device_mesh: DeviceMesh):
@@ -39,7 +40,7 @@ def generate(self) -> List[IntermediateStrategy]:
     @abstractmethod
     def validate(self, *args, **kwargs) -> bool:
         """
-        Validate if the operands are of desired shape. 
+        Validate if the operands are of desired shape.
         If True, means this generator can be used for the current operation.
         """
         pass

From 1b34701027f4654bd5c543330b8969c0b001c68c Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Tue, 14 Feb 2023 22:17:25 +0800
Subject: [PATCH 308/503] [app] add chatgpt application (#2698)

---
 applications/ChatGPT/.gitignore               | 146 +++++++++++++
 applications/ChatGPT/LICENSE                  | 202 ++++++++++++++++++
 applications/ChatGPT/README.md                |  80 +++++++
 applications/ChatGPT/benchmarks/README.md     |  94 ++++++++
 .../ChatGPT/benchmarks/benchmark_gpt_dummy.py | 183 ++++++++++++++++
 .../ChatGPT/benchmarks/benchmark_gpt_dummy.sh |  45 ++++
 .../benchmarks/benchmark_opt_lora_dummy.py    | 178 +++++++++++++++
 applications/ChatGPT/chatgpt/__init__.py      |   0
 .../ChatGPT/chatgpt/dataset/__init__.py       |   3 +
 .../ChatGPT/chatgpt/dataset/reward_dataset.py |  52 +++++
 .../chatgpt/experience_maker/__init__.py      |   4 +
 .../ChatGPT/chatgpt/experience_maker/base.py  |  77 +++++++
 .../ChatGPT/chatgpt/experience_maker/naive.py |  36 ++++
 applications/ChatGPT/chatgpt/nn/__init__.py   |  18 ++
 applications/ChatGPT/chatgpt/nn/actor.py      |  62 ++++++
 .../ChatGPT/chatgpt/nn/bloom_actor.py         |  35 +++
 .../ChatGPT/chatgpt/nn/bloom_critic.py        |  37 ++++
 applications/ChatGPT/chatgpt/nn/bloom_rm.py   |  37 ++++
 applications/ChatGPT/chatgpt/nn/critic.py     |  47 ++++
 applications/ChatGPT/chatgpt/nn/generation.py | 137 ++++++++++++
 .../ChatGPT/chatgpt/nn/generation_utils.py    |  92 ++++++++
 applications/ChatGPT/chatgpt/nn/gpt_actor.py  |  31 +++
 applications/ChatGPT/chatgpt/nn/gpt_critic.py |  33 +++
 applications/ChatGPT/chatgpt/nn/gpt_rm.py     |  33 +++
 applications/ChatGPT/chatgpt/nn/lora.py       | 127 +++++++++++
 applications/ChatGPT/chatgpt/nn/loss.py       | 105 +++++++++
 applications/ChatGPT/chatgpt/nn/opt_actor.py  |  35 +++
 applications/ChatGPT/chatgpt/nn/opt_critic.py |  37 ++++
 applications/ChatGPT/chatgpt/nn/opt_rm.py     |  33 +++
 .../ChatGPT/chatgpt/nn/reward_model.py        |  41 ++++
 applications/ChatGPT/chatgpt/nn/utils.py      |  92 ++++++++
 .../ChatGPT/chatgpt/replay_buffer/__init__.py |   4 +
 .../ChatGPT/chatgpt/replay_buffer/base.py     |  43 ++++
 .../ChatGPT/chatgpt/replay_buffer/naive.py    |  57 +++++
 .../ChatGPT/chatgpt/replay_buffer/utils.py    |  73 +++++++
 .../ChatGPT/chatgpt/trainer/__init__.py       |   5 +
 applications/ChatGPT/chatgpt/trainer/base.py  | 162 ++++++++++++++
 .../chatgpt/trainer/callbacks/__init__.py     |   4 +
 .../ChatGPT/chatgpt/trainer/callbacks/base.py |  39 ++++
 .../callbacks/performance_evaluator.py        | 133 ++++++++++++
 applications/ChatGPT/chatgpt/trainer/ppo.py   | 104 +++++++++
 applications/ChatGPT/chatgpt/trainer/rm.py    |  77 +++++++
 .../chatgpt/trainer/strategies/__init__.py    |   6 +
 .../chatgpt/trainer/strategies/base.py        |  45 ++++
 .../chatgpt/trainer/strategies/colossalai.py  | 125 +++++++++++
 .../ChatGPT/chatgpt/trainer/strategies/ddp.py |  59 +++++
 .../chatgpt/trainer/strategies/naive.py       |  36 ++++
 applications/ChatGPT/chatgpt/trainer/utils.py |   5 +
 applications/ChatGPT/examples/README.md       | 105 +++++++++
 .../ChatGPT/examples/requirements.txt         |   1 +
 applications/ChatGPT/examples/test_ci.sh      |  27 +++
 applications/ChatGPT/examples/train_dummy.py  | 121 +++++++++++
 applications/ChatGPT/examples/train_dummy.sh  |  18 ++
 .../ChatGPT/examples/train_prompts.py         | 113 ++++++++++
 .../ChatGPT/examples/train_prompts.sh         |  18 ++
 .../ChatGPT/examples/train_reward_model.py    |  53 +++++
 applications/ChatGPT/examples/train_rm.sh     |  18 ++
 applications/ChatGPT/pytest.ini               |   6 +
 .../requirements/requirements-test.txt        |   1 +
 .../ChatGPT/requirements/requirements.txt     |   6 +
 applications/ChatGPT/setup.py                 |  42 ++++
 applications/ChatGPT/tests/__init__.py        |   0
 applications/ChatGPT/tests/test_data.py       | 117 ++++++++++
 applications/ChatGPT/version.txt              |   1 +
 64 files changed, 3756 insertions(+)
 create mode 100644 applications/ChatGPT/.gitignore
 create mode 100644 applications/ChatGPT/LICENSE
 create mode 100644 applications/ChatGPT/README.md
 create mode 100644 applications/ChatGPT/benchmarks/README.md
 create mode 100644 applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
 create mode 100755 applications/ChatGPT/benchmarks/benchmark_gpt_dummy.sh
 create mode 100644 applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
 create mode 100644 applications/ChatGPT/chatgpt/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/dataset/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/dataset/reward_dataset.py
 create mode 100644 applications/ChatGPT/chatgpt/experience_maker/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/experience_maker/base.py
 create mode 100644 applications/ChatGPT/chatgpt/experience_maker/naive.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/actor.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/bloom_actor.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/bloom_critic.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/bloom_rm.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/critic.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/generation.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/generation_utils.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/gpt_actor.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/gpt_critic.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/gpt_rm.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/lora.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/loss.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/opt_actor.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/opt_critic.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/opt_rm.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/reward_model.py
 create mode 100644 applications/ChatGPT/chatgpt/nn/utils.py
 create mode 100644 applications/ChatGPT/chatgpt/replay_buffer/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/replay_buffer/base.py
 create mode 100644 applications/ChatGPT/chatgpt/replay_buffer/naive.py
 create mode 100644 applications/ChatGPT/chatgpt/replay_buffer/utils.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/base.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/callbacks/base.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/callbacks/performance_evaluator.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/ppo.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/rm.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/strategies/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/strategies/base.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/strategies/naive.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/utils.py
 create mode 100644 applications/ChatGPT/examples/README.md
 create mode 100644 applications/ChatGPT/examples/requirements.txt
 create mode 100755 applications/ChatGPT/examples/test_ci.sh
 create mode 100644 applications/ChatGPT/examples/train_dummy.py
 create mode 100755 applications/ChatGPT/examples/train_dummy.sh
 create mode 100644 applications/ChatGPT/examples/train_prompts.py
 create mode 100755 applications/ChatGPT/examples/train_prompts.sh
 create mode 100644 applications/ChatGPT/examples/train_reward_model.py
 create mode 100755 applications/ChatGPT/examples/train_rm.sh
 create mode 100644 applications/ChatGPT/pytest.ini
 create mode 100644 applications/ChatGPT/requirements/requirements-test.txt
 create mode 100644 applications/ChatGPT/requirements/requirements.txt
 create mode 100644 applications/ChatGPT/setup.py
 create mode 100644 applications/ChatGPT/tests/__init__.py
 create mode 100644 applications/ChatGPT/tests/test_data.py
 create mode 100644 applications/ChatGPT/version.txt

diff --git a/applications/ChatGPT/.gitignore b/applications/ChatGPT/.gitignore
new file mode 100644
index 000000000000..40f3f6debeee
--- /dev/null
+++ b/applications/ChatGPT/.gitignore
@@ -0,0 +1,146 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/.build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDE
+.idea/
+.vscode/
+
+# macos
+*.DS_Store
+#data/
+
+docs/.build
+
+# pytorch checkpoint
+*.pt
+
+# ignore version.py generated by setup.py
+colossalai/version.py
diff --git a/applications/ChatGPT/LICENSE b/applications/ChatGPT/LICENSE
new file mode 100644
index 000000000000..0528c89ea9ec
--- /dev/null
+++ b/applications/ChatGPT/LICENSE
@@ -0,0 +1,202 @@
+Copyright 2021- HPC-AI Technology Inc. All rights reserved.
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2021- HPC-AI Technology Inc.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
new file mode 100644
index 000000000000..dce59ad4b834
--- /dev/null
+++ b/applications/ChatGPT/README.md
@@ -0,0 +1,80 @@
+# RLHF - ColossalAI
+
+Implementation of RLHF (Reinforcement Learning from Human Feedback) powered by ColossalAI. It supports distributed training and offloading, which makes it possible to fit extremely large models.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/chatgpt.png" width=700/>
+</p>
+
+## Training process (step 3)
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/experience.jpg" width=500/>
+</p>
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/train.jpg" width=500/>
+</p>
+
+
+## Install
+```shell
+pip install .
+```
+
+
+## Usage
+
+The main entrypoint is `Trainer`. We only support PPO trainer now. We support many training strategies:
+
+- NaiveStrategy: simplest strategy. Train on single GPU.
+- DDPStrategy: use `torch.nn.parallel.DistributedDataParallel`. Train on multi GPUs.
+- ColossalAIStrategy: use Gemini and Zero of ColossalAI. It eliminates model duplication on each GPU and supports offload. It's very useful when training large models on multi GPUs.
+
+Simplest usage:
+
+```python
+from chatgpt.trainer import PPOTrainer
+from chatgpt.trainer.strategies import ColossalAIStrategy
+
+strategy = ColossalAIStrategy()
+
+with strategy.model_init_context():
+  # init your model here
+  actor = Actor()
+  critic = Critic()
+
+trainer = PPOTrainer(strategy, actor, critic, ...)
+
+trainer.fit(dataset, ...)
+```
+
+For more details, see `examples/`.
+
+We also support training a reward model with real-world data. See `examples/train_reward_model.py`.
+
+## Todo
+
+- [x] implement PPO training
+- [x] implement training reward model
+- [x] support LoRA
+- [ ] implement PPO-ptx fine-tuning
+- [ ] integrate with Ray
+- [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL)
+
+## Citations
+
+```bibtex
+@article{Hu2021LoRALA,
+    title   = {LoRA: Low-Rank Adaptation of Large Language Models},
+    author  = {Edward J. Hu and Yelong Shen and Phillip Wallis and Zeyuan Allen-Zhu and Yuanzhi Li and Shean Wang and Weizhu Chen},
+    journal = {ArXiv},
+    year    = {2021},
+    volume  = {abs/2106.09685}
+}
+
+@article{ouyang2022training,
+  title={Training language models to follow instructions with human feedback},
+  author={Ouyang, Long and Wu, Jeff and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll L and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
+  journal={arXiv preprint arXiv:2203.02155},
+  year={2022}
+}
+```
diff --git a/applications/ChatGPT/benchmarks/README.md b/applications/ChatGPT/benchmarks/README.md
new file mode 100644
index 000000000000..f7212fc89908
--- /dev/null
+++ b/applications/ChatGPT/benchmarks/README.md
@@ -0,0 +1,94 @@
+# Benchmarks
+
+## Benchmark GPT on dummy prompt data
+
+We provide various GPT models (string in parentheses is the corresponding model name used in this script):
+
+- GPT2-S (s)
+- GPT2-M (m)
+- GPT2-L (l)
+- GPT2-XL (xl)
+- GPT2-4B (4b)
+- GPT2-6B (6b)
+- GPT2-8B (8b)
+- GPT2-10B (10b)
+- GPT2-12B (12b)
+- GPT2-15B (15b)
+- GPT2-18B (18b)
+- GPT2-20B (20b)
+- GPT2-24B (24b)
+- GPT2-28B (28b)
+- GPT2-32B (32b)
+- GPT2-36B (36b)
+- GPT2-40B (40b)
+- GPT3 (175b)
+
+We also provide various training strategies:
+
+- ddp: torch DDP
+- colossalai_gemini: ColossalAI GeminiDDP with `placement_policy="cuda"`, like zero3
+- colossalai_gemini_cpu: ColossalAI GeminiDDP with `placement_policy="cpu"`, like zero3-offload
+- colossalai_zero2: ColossalAI zero2
+- colossalai_zero2_cpu: ColossalAI zero2-offload
+- colossalai_zero1: ColossalAI zero1
+- colossalai_zero1_cpu: ColossalAI zero1-offload
+
+We only support `torchrun` to launch now. E.g.
+
+```shell
+# run GPT2-S on single-node single-GPU with min batch size
+torchrun --standalone --nproc_per_node 1 benchmark_gpt_dummy.py --model s --strategy ddp --experience_batch_size 1 --train_batch_size 1
+# run GPT2-XL on single-node 4-GPU
+torchrun --standalone --nproc_per_node 4 benchmark_gpt_dummy.py --model xl --strategy colossalai_zero2
+# run GPT3 on 8-node 8-GPU
+torchrun --nnodes 8 --nproc_per_node 8 \
+ --rdzv_id=$JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR \
+ benchmark_gpt_dummy.py --model 175b --strategy colossalai_gemini
+```
+
+> ⚠ Batch sizes in CLI args and the reported throughput/TFLOPS are all per-GPU values.
+
+In this benchmark, we assume the model architectures/sizes of actor and critic are the same for simplicity. But in practice, to reduce training cost, we may use a smaller critic.
+
+We also provide a simple shell script to run a set of benchmarks. It only supports benchmarking on a single node, but it is easy to run on multiple nodes by modifying the launch command in the script.
+
+Usage:
+
+```shell
+# run for GPUS=(1 2 4 8) x strategy=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu") x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
+./benchmark_gpt_dummy.sh
+# run for GPUS=2 x strategy=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu") x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
+./benchmark_gpt_dummy.sh 2
+# run for GPUS=2 x strategy=ddp x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
+./benchmark_gpt_dummy.sh 2 ddp
+# run for GPUS=2 x strategy=ddp x model=l x batch_size=(1 2 4 8 16 32 64 128 256)
+./benchmark_gpt_dummy.sh 2 ddp l
+```
+
+## Benchmark OPT with LoRA on dummy prompt data
+
+We provide various OPT models (string in parentheses is the corresponding model name used in this script):
+
+- OPT-125M (125m)
+- OPT-350M (350m)
+- OPT-700M (700m)
+- OPT-1.3B (1.3b)
+- OPT-2.7B (2.7b)
+- OPT-3.5B (3.5b)
+- OPT-5.5B (5.5b)
+- OPT-6.7B (6.7b)
+- OPT-10B (10b)
+- OPT-13B (13b)
+
+We only support `torchrun` to launch now. E.g.
+
+```shell
+# run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
+torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py --model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
+# run OPT-350M with lora_rank=4 on single-node 4-GPU
+torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py --model 350m --strategy colossalai_zero2 --lora_rank 4
+```
+
+> ⚠ Batch sizes in CLI args and the reported throughput/TFLOPS are all per-GPU values.
+
+In this benchmark, we assume the model architectures/sizes of actor and critic are the same for simplicity. But in practice, to reduce training cost, we may use a smaller critic.
diff --git a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
new file mode 100644
index 000000000000..8474f3ba7b7c
--- /dev/null
+++ b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
@@ -0,0 +1,183 @@
+import argparse
+from copy import deepcopy
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from chatgpt.nn import GPTActor, GPTCritic, RewardModel
+from chatgpt.nn.generation_utils import gpt_prepare_inputs_fn, update_model_kwargs_fn
+from chatgpt.trainer import PPOTrainer
+from chatgpt.trainer.callbacks import PerformanceEvaluator
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy
+from torch.optim import Adam
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+from colossalai.nn.optimizer import HybridAdam
+
+
+def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
+    numel = sum(p.numel() for p in model.parameters())
+    if isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3 and strategy.shard_init:
+        numel *= dist.get_world_size()
+    return numel
+
+
+def preprocess_batch(samples) -> dict:
+    input_ids = torch.stack(samples)
+    attention_mask = torch.ones_like(input_ids, dtype=torch.long)
+    return {'input_ids': input_ids, 'attention_mask': attention_mask}
+
+
+def print_rank_0(*args, **kwargs) -> None:
+    if dist.get_rank() == 0:
+        print(*args, **kwargs)
+
+
+def print_model_numel(model_dict: dict) -> None:
+    B = 1024**3
+    M = 1024**2
+    K = 1024
+    outputs = ''
+    for name, numel in model_dict.items():
+        outputs += f'{name}: '
+        if numel >= B:
+            outputs += f'{numel / B:.2f} B\n'
+        elif numel >= M:
+            outputs += f'{numel / M:.2f} M\n'
+        elif numel >= K:
+            outputs += f'{numel / K:.2f} K\n'
+        else:
+            outputs += f'{numel}\n'
+    print_rank_0(outputs)
+
+
+def get_gpt_config(model_name: str) -> GPT2Config:
+    model_map = {
+        's': GPT2Config(),
+        'm': GPT2Config(n_embd=1024, n_layer=24, n_head=16),
+        'l': GPT2Config(n_embd=1280, n_layer=36, n_head=20),
+        'xl': GPT2Config(n_embd=1600, n_layer=48, n_head=25),
+        '2b': GPT2Config(n_embd=2048, n_layer=40, n_head=16),
+        '4b': GPT2Config(n_embd=2304, n_layer=64, n_head=16),
+        '6b': GPT2Config(n_embd=4096, n_layer=30, n_head=16),
+        '8b': GPT2Config(n_embd=4096, n_layer=40, n_head=16),
+        '10b': GPT2Config(n_embd=4096, n_layer=50, n_head=16),
+        '12b': GPT2Config(n_embd=4096, n_layer=60, n_head=16),
+        '15b': GPT2Config(n_embd=4096, n_layer=78, n_head=16),
+        '18b': GPT2Config(n_embd=4096, n_layer=90, n_head=16),
+        '20b': GPT2Config(n_embd=8192, n_layer=25, n_head=16),
+        '24b': GPT2Config(n_embd=8192, n_layer=30, n_head=16),
+        '28b': GPT2Config(n_embd=8192, n_layer=35, n_head=16),
+        '32b': GPT2Config(n_embd=8192, n_layer=40, n_head=16),
+        '36b': GPT2Config(n_embd=8192, n_layer=45, n_head=16),
+        '40b': GPT2Config(n_embd=8192, n_layer=50, n_head=16),
+        '175b': GPT2Config(n_positions=2048, n_embd=12288, n_layer=96, n_head=96),
+    }
+    try:
+        return model_map[model_name]
+    except KeyError:
+        raise ValueError(f'Unknown model "{model_name}"')
+
+
+def main(args):
+    if args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+    elif args.strategy == 'colossalai_gemini_cpu':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2_cpu':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
+    elif args.strategy == 'colossalai_zero1':
+        strategy = ColossalAIStrategy(stage=1, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero1_cpu':
+        strategy = ColossalAIStrategy(stage=1, placement_policy='cpu')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    model_config = get_gpt_config(args.model)
+
+    with strategy.model_init_context():
+        actor = GPTActor(config=model_config).cuda()
+        critic = GPTCritic(config=model_config).cuda()
+
+        initial_model = deepcopy(actor).cuda()
+        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda()
+
+    actor_numel = get_model_numel(actor, strategy)
+    critic_numel = get_model_numel(critic, strategy)
+    initial_model_numel = get_model_numel(initial_model, strategy)
+    reward_model_numel = get_model_numel(reward_model, strategy)
+    print_model_numel({
+        'Actor': actor_numel,
+        'Critic': critic_numel,
+        'Initial model': initial_model_numel,
+        'Reward model': reward_model_numel
+    })
+    performance_evaluator = PerformanceEvaluator(actor_numel,
+                                                 critic_numel,
+                                                 initial_model_numel,
+                                                 reward_model_numel,
+                                                 enable_grad_checkpoint=False,
+                                                 ignore_episodes=1)
+
+    if args.strategy.startswith('colossalai'):
+        actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
+        critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
+    else:
+        actor_optim = Adam(actor.parameters(), lr=5e-6)
+        critic_optim = Adam(critic.parameters(), lr=5e-6)
+
+    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    tokenizer.pad_token = tokenizer.eos_token
+
+    trainer = PPOTrainer(strategy,
+                         actor,
+                         critic,
+                         reward_model,
+                         initial_model,
+                         actor_optim,
+                         critic_optim,
+                         max_epochs=args.max_epochs,
+                         train_batch_size=args.train_batch_size,
+                         experience_batch_size=args.experience_batch_size,
+                         tokenizer=preprocess_batch,
+                         max_length=512,
+                         do_sample=True,
+                         temperature=1.0,
+                         top_k=50,
+                         pad_token_id=tokenizer.pad_token_id,
+                         eos_token_id=tokenizer.eos_token_id,
+                         prepare_inputs_fn=gpt_prepare_inputs_fn,
+                         update_model_kwargs_fn=update_model_kwargs_fn,
+                         callbacks=[performance_evaluator])
+
+    random_prompts = torch.randint(tokenizer.vocab_size, (1000, 400), device=torch.cuda.current_device())
+    trainer.fit(random_prompts,
+                num_episodes=args.num_episodes,
+                max_timesteps=args.max_timesteps,
+                update_timesteps=args.update_timesteps)
+
+    print_rank_0(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', default='s')
+    parser.add_argument('--strategy',
+                        choices=[
+                            'ddp', 'colossalai_gemini', 'colossalai_gemini_cpu', 'colossalai_zero2',
+                            'colossalai_zero2_cpu', 'colossalai_zero1', 'colossalai_zero1_cpu'
+                        ],
+                        default='ddp')
+    parser.add_argument('--num_episodes', type=int, default=3)
+    parser.add_argument('--max_timesteps', type=int, default=8)
+    parser.add_argument('--update_timesteps', type=int, default=8)
+    parser.add_argument('--max_epochs', type=int, default=3)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    args = parser.parse_args()
+    main(args)
diff --git a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.sh b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.sh
new file mode 100755
index 000000000000..d70f8872570a
--- /dev/null
+++ b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Usage: $0 <?number-of-gpus> <?strategy> <?model>
+set -xu
+
+BASE=$(realpath $(dirname $0))
+
+
+PY_SCRIPT=${BASE}/benchmark_gpt_dummy.py
+export OMP_NUM_THREADS=8
+
+function tune_batch_size() {
+    # we found when experience batch size is equal to train batch size
+    # peak CUDA memory usage of making experience phase is less than or equal to that of training phase
+    # thus, experience batch size can be larger than or equal to train batch size
+    for bs in 1 2 4 8 16 32 64 128 256; do
+        torchrun --standalone --nproc_per_node $1 $PY_SCRIPT --model $2 --strategy $3 --experience_batch_size $bs --train_batch_size $bs || return 1
+    done
+}
+
+if [ $# -eq 0 ]; then
+    num_gpus=(1 2 4 8)
+else
+    num_gpus=($1)
+fi
+
+if [ $# -le 1 ]; then
+    strategies=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu")
+else
+    strategies=($2)
+fi
+
+if [ $# -le 2 ]; then
+    models=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b")
+else
+    models=($3)
+fi
+
+
+for num_gpu in ${num_gpus[@]}; do
+    for strategy in ${strategies[@]}; do
+        for model in ${models[@]}; do
+            tune_batch_size $num_gpu $model $strategy || break
+        done
+    done
+done
diff --git a/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
new file mode 100644
index 000000000000..accbc4155fb1
--- /dev/null
+++ b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
@@ -0,0 +1,178 @@
+import argparse
+from copy import deepcopy
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from chatgpt.nn import OPTActor, OPTCritic, RewardModel
+from chatgpt.nn.generation_utils import opt_prepare_inputs_fn, update_model_kwargs_fn
+from chatgpt.trainer import PPOTrainer
+from chatgpt.trainer.callbacks import PerformanceEvaluator
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy
+from torch.optim import Adam
+from transformers import AutoTokenizer
+from transformers.models.opt.configuration_opt import OPTConfig
+
+from colossalai.nn.optimizer import HybridAdam
+
+
+def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
+    numel = sum(p.numel() for p in model.parameters())
+    if isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3 and strategy.shard_init:
+        numel *= dist.get_world_size()
+    return numel
+
+
+def preprocess_batch(samples) -> dict:
+    input_ids = torch.stack(samples)
+    attention_mask = torch.ones_like(input_ids, dtype=torch.long)
+    return {'input_ids': input_ids, 'attention_mask': attention_mask}
+
+
+def print_rank_0(*args, **kwargs) -> None:
+    if dist.get_rank() == 0:
+        print(*args, **kwargs)
+
+
+def print_model_numel(model_dict: dict) -> None:
+    B = 1024**3
+    M = 1024**2
+    K = 1024
+    outputs = ''
+    for name, numel in model_dict.items():
+        outputs += f'{name}: '
+        if numel >= B:
+            outputs += f'{numel / B:.2f} B\n'
+        elif numel >= M:
+            outputs += f'{numel / M:.2f} M\n'
+        elif numel >= K:
+            outputs += f'{numel / K:.2f} K\n'
+        else:
+            outputs += f'{numel}\n'
+    print_rank_0(outputs)
+
+
+def get_gpt_config(model_name: str) -> OPTConfig:
+    model_map = {
+        '125m': OPTConfig.from_pretrained('facebook/opt-125m'),
+        '350m': OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
+        '700m': OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
+        '1.3b': OPTConfig.from_pretrained('facebook/opt-1.3b'),
+        '2.7b': OPTConfig.from_pretrained('facebook/opt-2.7b'),
+        '3.5b': OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
+        '5.5b': OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
+        '6.7b': OPTConfig.from_pretrained('facebook/opt-6.7b'),
+        '10b': OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
+        '13b': OPTConfig.from_pretrained('facebook/opt-13b'),
+    }
+    try:
+        return model_map[model_name]
+    except KeyError:
+        raise ValueError(f'Unknown model "{model_name}"')
+
+
+def main(args):
+    if args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+    elif args.strategy == 'colossalai_gemini_cpu':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2_cpu':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
+    elif args.strategy == 'colossalai_zero1':
+        strategy = ColossalAIStrategy(stage=1, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero1_cpu':
+        strategy = ColossalAIStrategy(stage=1, placement_policy='cpu')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    torch.cuda.set_per_process_memory_fraction(args.cuda_mem_frac)
+
+    model_config = get_gpt_config(args.model)
+
+    with strategy.model_init_context():
+        actor = OPTActor(config=model_config, lora_rank=args.lora_rank).cuda()
+        critic = OPTCritic(config=model_config, lora_rank=args.lora_rank).cuda()
+
+        initial_model = deepcopy(actor).cuda()
+        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda()
+
+    actor_numel = get_model_numel(actor, strategy)
+    critic_numel = get_model_numel(critic, strategy)
+    initial_model_numel = get_model_numel(initial_model, strategy)
+    reward_model_numel = get_model_numel(reward_model, strategy)
+    print_model_numel({
+        'Actor': actor_numel,
+        'Critic': critic_numel,
+        'Initial model': initial_model_numel,
+        'Reward model': reward_model_numel
+    })
+    performance_evaluator = PerformanceEvaluator(actor_numel,
+                                                 critic_numel,
+                                                 initial_model_numel,
+                                                 reward_model_numel,
+                                                 enable_grad_checkpoint=False,
+                                                 ignore_episodes=1)
+
+    if args.strategy.startswith('colossalai'):
+        actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
+        critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
+    else:
+        actor_optim = Adam(actor.parameters(), lr=5e-6)
+        critic_optim = Adam(critic.parameters(), lr=5e-6)
+
+    tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
+    tokenizer.pad_token = tokenizer.eos_token
+
+    trainer = PPOTrainer(strategy,
+                         actor,
+                         critic,
+                         reward_model,
+                         initial_model,
+                         actor_optim,
+                         critic_optim,
+                         max_epochs=args.max_epochs,
+                         train_batch_size=args.train_batch_size,
+                         experience_batch_size=args.experience_batch_size,
+                         tokenizer=preprocess_batch,
+                         max_length=512,
+                         do_sample=True,
+                         temperature=1.0,
+                         top_k=50,
+                         pad_token_id=tokenizer.pad_token_id,
+                         eos_token_id=tokenizer.eos_token_id,
+                         prepare_inputs_fn=opt_prepare_inputs_fn,
+                         update_model_kwargs_fn=update_model_kwargs_fn,
+                         callbacks=[performance_evaluator])
+
+    random_prompts = torch.randint(tokenizer.vocab_size, (1000, 400), device=torch.cuda.current_device())
+    trainer.fit(random_prompts,
+                num_episodes=args.num_episodes,
+                max_timesteps=args.max_timesteps,
+                update_timesteps=args.update_timesteps)
+
+    print_rank_0(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', default='125m')
+    parser.add_argument('--strategy',
+                        choices=[
+                            'ddp', 'colossalai_gemini', 'colossalai_gemini_cpu', 'colossalai_zero2',
+                            'colossalai_zero2_cpu', 'colossalai_zero1', 'colossalai_zero1_cpu'
+                        ],
+                        default='ddp')
+    parser.add_argument('--num_episodes', type=int, default=3)
+    parser.add_argument('--max_timesteps', type=int, default=8)
+    parser.add_argument('--update_timesteps', type=int, default=8)
+    parser.add_argument('--max_epochs', type=int, default=3)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=4)
+    parser.add_argument('--cuda_mem_frac', type=float, default=1.0)
+    args = parser.parse_args()
+    main(args)
diff --git a/applications/ChatGPT/chatgpt/__init__.py b/applications/ChatGPT/chatgpt/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ChatGPT/chatgpt/dataset/__init__.py b/applications/ChatGPT/chatgpt/dataset/__init__.py
new file mode 100644
index 000000000000..2f330ee67afe
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/dataset/__init__.py
@@ -0,0 +1,3 @@
+from .reward_dataset import RewardDataset
+
+__all__ = ['RewardDataset']
diff --git a/applications/ChatGPT/chatgpt/dataset/reward_dataset.py b/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
new file mode 100644
index 000000000000..14edcce30d19
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
@@ -0,0 +1,52 @@
+from typing import Callable
+
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+
+class RewardDataset(Dataset):
+    """
+    Dataset for reward model
+
+    Args:
+        dataset: dataset for reward model
+        tokenizer: tokenizer for reward model
+        max_length: max length of input
+    """
+
+    def __init__(self, dataset, tokenizer: Callable, max_length: int) -> None:
+        super().__init__()
+        self.chosen = []
+        self.reject = []
+        for data in tqdm(dataset):
+            prompt = data['prompt']
+
+            chosen = prompt + data['chosen'] + "<|endoftext|>"
+            chosen_token = tokenizer(chosen,
+                                     max_length=max_length,
+                                     padding="max_length",
+                                     truncation=True,
+                                     return_tensors="pt")
+            self.chosen.append({
+                "input_ids": chosen_token['input_ids'],
+                "attention_mask": chosen_token['attention_mask']
+            })
+
+            reject = prompt + data['rejected'] + "<|endoftext|>"
+            reject_token = tokenizer(reject,
+                                     max_length=max_length,
+                                     padding="max_length",
+                                     truncation=True,
+                                     return_tensors="pt")
+            self.reject.append({
+                "input_ids": reject_token['input_ids'],
+                "attention_mask": reject_token['attention_mask']
+            })
+
+    def __len__(self):
+        length = len(self.chosen)
+        return length
+
+    def __getitem__(self, idx):
+        return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
+            "input_ids"], self.reject[idx]["attention_mask"]
diff --git a/applications/ChatGPT/chatgpt/experience_maker/__init__.py b/applications/ChatGPT/chatgpt/experience_maker/__init__.py
new file mode 100644
index 000000000000..39ca7576b227
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/experience_maker/__init__.py
@@ -0,0 +1,4 @@
+from .base import Experience, ExperienceMaker
+from .naive import NaiveExperienceMaker
+
+__all__ = ['Experience', 'ExperienceMaker', 'NaiveExperienceMaker']
diff --git a/applications/ChatGPT/chatgpt/experience_maker/base.py b/applications/ChatGPT/chatgpt/experience_maker/base.py
new file mode 100644
index 000000000000..61895322cb31
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/experience_maker/base.py
@@ -0,0 +1,77 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from chatgpt.nn.actor import Actor
+
+
+@dataclass
+class Experience:
+    """Experience is a batch of data.
+    These data should have the same sequence length and number of actions.
+    Left padding for sequences is applied.
+
+    Shapes of each tensor:
+    sequences: (B, S)
+    action_log_probs: (B, A)
+    values: (B)
+    reward: (B)
+    advantages: (B)
+    attention_mask: (B, S)
+    action_mask: (B, A)
+
+    "A" is the number of actions.
+    """
+    sequences: torch.Tensor
+    action_log_probs: torch.Tensor
+    values: torch.Tensor
+    reward: torch.Tensor
+    advantages: torch.Tensor
+    attention_mask: Optional[torch.LongTensor]
+    action_mask: Optional[torch.BoolTensor]
+
+    @torch.no_grad()
+    def to_device(self, device: torch.device) -> None:
+        self.sequences = self.sequences.to(device)
+        self.action_log_probs = self.action_log_probs.to(device)
+        self.values = self.values.to(device)
+        self.reward = self.reward.to(device)
+        self.advantages = self.advantages.to(device)
+        if self.attention_mask is not None:
+            self.attention_mask = self.attention_mask.to(device)
+        if self.action_mask is not None:
+            self.action_mask = self.action_mask.to(device)
+
+    def pin_memory(self):
+        self.sequences = self.sequences.pin_memory()
+        self.action_log_probs = self.action_log_probs.pin_memory()
+        self.values = self.values.pin_memory()
+        self.reward = self.reward.pin_memory()
+        self.advantages = self.advantages.pin_memory()
+        if self.attention_mask is not None:
+            self.attention_mask = self.attention_mask.pin_memory()
+        if self.action_mask is not None:
+            self.action_mask = self.action_mask.pin_memory()
+        return self
+
+
+class ExperienceMaker(ABC):
+
+    def __init__(self,
+                 actor: Actor,
+                 critic: nn.Module,
+                 reward_model: nn.Module,
+                 initial_model: Actor,
+                 kl_coef: float = 0.1) -> None:
+        super().__init__()
+        self.actor = actor
+        self.critic = critic
+        self.reward_model = reward_model
+        self.initial_model = initial_model
+        self.kl_coef = kl_coef
+
+    @abstractmethod
+    def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
+        pass
diff --git a/applications/ChatGPT/chatgpt/experience_maker/naive.py b/applications/ChatGPT/chatgpt/experience_maker/naive.py
new file mode 100644
index 000000000000..f4fd2078c1eb
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/experience_maker/naive.py
@@ -0,0 +1,36 @@
+import torch
+from chatgpt.nn.utils import compute_reward, normalize
+
+from .base import Experience, ExperienceMaker
+
+
+class NaiveExperienceMaker(ExperienceMaker):
+    """
+    Naive experience maker.
+    """
+
+    @torch.no_grad()
+    def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
+        self.actor.eval()
+        self.critic.eval()
+        self.initial_model.eval()
+        self.reward_model.eval()
+
+        sequences, attention_mask, action_mask = self.actor.generate(input_ids,
+                                                                     return_action_mask=True,
+                                                                     **generate_kwargs)
+        num_actions = action_mask.size(1)
+
+        action_log_probs = self.actor(sequences, num_actions, attention_mask)
+        base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask)
+        value = self.critic(sequences, action_mask, attention_mask)
+        r = self.reward_model(sequences, attention_mask)
+
+        reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
+
+        advantage = reward - value
+        # TODO(ver217): maybe normalize adv
+        if advantage.ndim == 1:
+            advantage = advantage.unsqueeze(-1)
+
+        return Experience(sequences, action_log_probs, value, reward, advantage, attention_mask, action_mask)
diff --git a/applications/ChatGPT/chatgpt/nn/__init__.py b/applications/ChatGPT/chatgpt/nn/__init__.py
new file mode 100644
index 000000000000..c728d7df37d4
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/__init__.py
@@ -0,0 +1,18 @@
+from .actor import Actor
+from .bloom_actor import BLOOMActor
+from .bloom_critic import BLOOMCritic
+from .bloom_rm import BLOOMRM
+from .critic import Critic
+from .gpt_actor import GPTActor
+from .gpt_critic import GPTCritic
+from .gpt_rm import GPTRM
+from .loss import PairWiseLoss, PolicyLoss, PPOPtxActorLoss, ValueLoss
+from .opt_actor import OPTActor
+from .opt_critic import OPTCritic
+from .opt_rm import OPTRM
+from .reward_model import RewardModel
+
+__all__ = [
+    'Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'PPOPtxActorLoss', 'PairWiseLoss', 'GPTActor',
+    'GPTCritic', 'GPTRM', 'BLOOMActor', 'BLOOMCritic', 'BLOOMRM', 'OPTActor', 'OPTCritic', 'OPTRM'
+]
diff --git a/applications/ChatGPT/chatgpt/nn/actor.py b/applications/ChatGPT/chatgpt/nn/actor.py
new file mode 100644
index 000000000000..c4c0d579de58
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/actor.py
@@ -0,0 +1,62 @@
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .generation import generate
+from .lora import LoRAModule
+from .utils import log_probs_from_logits
+
+
+class Actor(LoRAModule):
+    """
+    Actor model base class.
+
+    Args:
+        model (nn.Module): Actor Model.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = 'none') -> None:
+        super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
+        self.model = model
+        self.convert_to_lora()
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        return_action_mask: bool = True,
+        **kwargs
+    ) -> Union[Tuple[torch.LongTensor, torch.LongTensor], Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]:
+        sequences = generate(self.model, input_ids, **kwargs)
+        attention_mask = None
+        pad_token_id = kwargs.get('pad_token_id', None)
+        if pad_token_id is not None:
+            attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
+        if not return_action_mask:
+            return sequences, attention_mask
+        input_len = input_ids.size(1)
+        eos_token_id = kwargs.get('eos_token_id', None)
+        if eos_token_id is None:
+            action_mask = torch.ones_like(sequences, dtype=torch.bool)
+        else:
+            # left padding may be applied, only mask action
+            action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
+            action_mask = F.pad(action_mask, (1 + input_len, -1), value=True)    # include eos token and input
+        action_mask[:, :input_len] = False
+        action_mask = action_mask[:, 1:]
+        return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len):]
+
+    def forward(self,
+                sequences: torch.LongTensor,
+                num_actions: int,
+                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Returns action log probs
+        """
+        output = self.model(sequences, attention_mask=attention_mask)
+        logits = output['logits']
+        log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
+        return log_probs[:, -num_actions:]
diff --git a/applications/ChatGPT/chatgpt/nn/bloom_actor.py b/applications/ChatGPT/chatgpt/nn/bloom_actor.py
new file mode 100644
index 000000000000..103536bc3940
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/bloom_actor.py
@@ -0,0 +1,35 @@
+from typing import Optional
+
+import torch
+from transformers import BloomConfig, BloomForCausalLM, BloomModel
+
+from .actor import Actor
+
+
+class BLOOMActor(Actor):
+    """
+    BLOOM Actor model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (BloomConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: str = None,
+                 config: Optional[BloomConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = BloomForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = BloomForCausalLM(config)
+        else:
+            model = BloomForCausalLM(BloomConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        super().__init__(model, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/bloom_critic.py b/applications/ChatGPT/chatgpt/nn/bloom_critic.py
new file mode 100644
index 000000000000..3b03471a3d1d
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/bloom_critic.py
@@ -0,0 +1,37 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import BloomConfig, BloomForCausalLM, BloomModel
+
+from .critic import Critic
+
+
+class BLOOMCritic(Critic):
+    """
+    BLOOM Critic model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (BloomConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: str = None,
+                 config: Optional[BloomConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = BloomModel.from_pretrained(pretrained)
+        elif config is not None:
+            model = BloomModel(config)
+        else:
+            model = BloomModel(BloomConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        value_head = nn.Linear(model.config.hidden_size, 1)
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/bloom_rm.py b/applications/ChatGPT/chatgpt/nn/bloom_rm.py
new file mode 100644
index 000000000000..0d4dd43fa07a
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/bloom_rm.py
@@ -0,0 +1,37 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import BloomConfig, BloomForCausalLM, BloomModel
+
+from .reward_model import RewardModel
+
+
+class BLOOMRM(RewardModel):
+    """
+    BLOOM Reward model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (BloomConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: str = None,
+                 config: Optional[BloomConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = BloomModel.from_pretrained(pretrained)
+        elif config is not None:
+            model = BloomModel(config)
+        else:
+            model = BloomModel(BloomConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        value_head = nn.Linear(model.config.hidden_size, 1)
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/critic.py b/applications/ChatGPT/chatgpt/nn/critic.py
new file mode 100644
index 000000000000..f3a1238540f9
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/critic.py
@@ -0,0 +1,47 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from .lora import LoRAModule
+from .utils import masked_mean
+
+
+class Critic(LoRAModule):
+    """
+    Critic model base class.
+
+    Args:
+        model (nn.Module): Critic model.
+        value_head (nn.Module): Value head to get value.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 model: nn.Module,
+                 value_head: nn.Module,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        # sub-modules are assigned before convert_to_lora() so that it can find them
+        super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
+        self.model = model
+        self.value_head = value_head
+        self.convert_to_lora()
+
+    def forward(self,
+                sequences: torch.LongTensor,
+                action_mask: Optional[torch.Tensor] = None,
+                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Return the mean per-position value of each sequence, limited to action positions if a mask is given."""
+        outputs = self.model(sequences, attention_mask=attention_mask)
+        # one scalar per position; the value at the final position is dropped
+        values = self.value_head(outputs['last_hidden_state']).squeeze(-1)[:, :-1]
+
+        if action_mask is None:
+            # mean over dim 1 already removes that dim; a further squeeze(1) fails on the resulting 1-D tensor
+            return values.mean(dim=1)
+
+        # keep only the trailing positions covered by the action mask
+        values = values[:, -action_mask.size(1):]
+        return masked_mean(values, action_mask, dim=1)
diff --git a/applications/ChatGPT/chatgpt/nn/generation.py b/applications/ChatGPT/chatgpt/nn/generation.py
new file mode 100644
index 000000000000..4ee797561f7f
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/generation.py
@@ -0,0 +1,137 @@
+from typing import Any, Callable, Optional
+
+import torch
+import torch.nn as nn
+
+try:
+    from transformers.generation_logits_process import (
+        LogitsProcessorList,
+        TemperatureLogitsWarper,
+        TopKLogitsWarper,
+        TopPLogitsWarper,
+    )
+except ImportError:
+    from transformers.generation import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper
+
+
+def prepare_logits_processor(top_k: Optional[int] = None,
+                             top_p: Optional[float] = None,
+                             temperature: Optional[float] = None) -> LogitsProcessorList:
+    processor_list = LogitsProcessorList()
+    if temperature is not None and temperature != 1.0:
+        processor_list.append(TemperatureLogitsWarper(temperature))
+    if top_k is not None and top_k != 0:
+        processor_list.append(TopKLogitsWarper(top_k))
+    if top_p is not None and top_p < 1.0:
+        processor_list.append(TopPLogitsWarper(top_p))
+    return processor_list
+
+
+def sample(model: nn.Module,
+           input_ids: torch.Tensor,
+           max_length: int,
+           early_stopping: bool = False,
+           eos_token_id: Optional[int] = None,
+           pad_token_id: Optional[int] = None,
+           top_k: Optional[int] = None,
+           top_p: Optional[float] = None,
+           temperature: Optional[float] = None,
+           prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+           update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+           **model_kwargs) -> torch.Tensor:
+    if input_ids.size(1) >= max_length:
+        return input_ids
+
+    logits_processor = prepare_logits_processor(top_k, top_p, temperature)
+    unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+
+    for _ in range(input_ids.size(1), max_length):
+        model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {
+            'input_ids': input_ids
+        }
+        outputs = model(**model_inputs)
+
+        next_token_logits = outputs['logits'][:, -1, :]
+        # pre-process distribution
+        next_token_logits = logits_processor(input_ids, next_token_logits)
+        # sample
+        probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
+        next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+
+        # finished sentences should have their next token be a padding token
+        if eos_token_id is not None:
+            if pad_token_id is None:
+                raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+        # update generated ids, model inputs for next step
+        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+        if update_model_kwargs_fn is not None:
+            model_kwargs = update_model_kwargs_fn(outputs, **model_kwargs)
+
+        # if eos_token was found in one sentence, set sentence to finished
+        if eos_token_id is not None:
+            unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
+
+        # stop when each sentence is finished if early_stopping=True
+        if early_stopping and unfinished_sequences.max() == 0:
+            break
+
+    return input_ids
+
+
+def generate(model: nn.Module,
+             input_ids: torch.Tensor,
+             max_length: int,
+             num_beams: int = 1,
+             do_sample: bool = True,
+             early_stopping: bool = False,
+             eos_token_id: Optional[int] = None,
+             pad_token_id: Optional[int] = None,
+             top_k: Optional[int] = None,
+             top_p: Optional[float] = None,
+             temperature: Optional[float] = None,
+             prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+             update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+             **model_kwargs) -> torch.Tensor:
+    """Generate token sequence. The returned sequence is input_ids + generated_tokens.
+
+    Args:
+        model (nn.Module): model
+        input_ids (torch.Tensor): input sequence
+        max_length (int): max length of the returned sequence
+        num_beams (int, optional): number of beams. Defaults to 1.
+        do_sample (bool, optional): whether to sample. Defaults to True.
+        early_stopping (bool, optional): if True, the sequence length may be smaller than max_length due to finding eos. Defaults to False.
+        eos_token_id (Optional[int], optional): end of sequence token id. Defaults to None.
+        pad_token_id (Optional[int], optional): pad token id. Defaults to None.
+        top_k (Optional[int], optional): the number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to None.
+        top_p (Optional[float], optional): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. Defaults to None.
+        temperature (Optional[float], optional): The value used to modulate the next token probabilities. Defaults to None.
+        prepare_inputs_fn (Optional[Callable[[torch.Tensor, Any], dict]], optional): Function to preprocess model inputs. Arguments of this function should be input_ids and model_kwargs. Defaults to None.
+        update_model_kwargs_fn (Optional[Callable[[dict, Any], dict]], optional): Function to update model_kwargs based on outputs. Arguments of this function should be outputs and model_kwargs. Defaults to None.
+    """
+    is_greedy_gen_mode = ((num_beams == 1) and do_sample is False)
+    is_sample_gen_mode = ((num_beams == 1) and do_sample is True)
+    is_beam_gen_mode = ((num_beams > 1) and do_sample is False)
+    if is_greedy_gen_mode:
+        # run greedy search
+        raise NotImplementedError
+    elif is_sample_gen_mode:
+        # run sample
+        return sample(model,
+                      input_ids,
+                      max_length,
+                      early_stopping=early_stopping,
+                      eos_token_id=eos_token_id,
+                      pad_token_id=pad_token_id,
+                      top_k=top_k,
+                      top_p=top_p,
+                      temperature=temperature,
+                      prepare_inputs_fn=prepare_inputs_fn,
+                      update_model_kwargs_fn=update_model_kwargs_fn,
+                      **model_kwargs)
+    elif is_beam_gen_mode:
+        raise NotImplementedError
+    else:
+        raise ValueError("Unsupported generation mode")
diff --git a/applications/ChatGPT/chatgpt/nn/generation_utils.py b/applications/ChatGPT/chatgpt/nn/generation_utils.py
new file mode 100644
index 000000000000..c7bc1b383fb9
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/generation_utils.py
@@ -0,0 +1,92 @@
+from typing import Optional
+
+import torch
+
+
+def gpt_prepare_inputs_fn(input_ids: torch.Tensor, past: Optional[torch.Tensor] = None, **kwargs) -> dict:
+    token_type_ids = kwargs.get("token_type_ids", None)
+    # when a past cache is supplied, only the last token of input_ids (and token_type_ids) is fed to the model
+    if past:
+        input_ids = input_ids[:, -1].unsqueeze(-1)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
+
+    attention_mask = kwargs.get("attention_mask", None)
+    position_ids = kwargs.get("position_ids", None)
+
+    if attention_mask is not None and position_ids is None:
+        # create position_ids on the fly for batch generation
+        position_ids = attention_mask.long().cumsum(-1) - 1
+        position_ids.masked_fill_(attention_mask == 0, 1)
+        if past:
+            position_ids = position_ids[:, -1].unsqueeze(-1)
+    else:
+        position_ids = None    # NOTE(review): caller-supplied position_ids are discarded here; confirm intended
+    return {
+        "input_ids": input_ids,
+        "past_key_values": past,
+        "use_cache": kwargs.get("use_cache"),
+        "position_ids": position_ids,
+        "attention_mask": attention_mask,
+        "token_type_ids": token_type_ids,
+    }
+
+
+def update_model_kwargs_fn(outputs: dict, **model_kwargs) -> dict:
+    if "past_key_values" in outputs:
+        model_kwargs["past"] = outputs["past_key_values"]
+    else:
+        model_kwargs["past"] = None
+
+    # update token_type_ids with last value
+    if "token_type_ids" in model_kwargs:
+        token_type_ids = model_kwargs["token_type_ids"]
+        model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+    # update attention mask
+    if "attention_mask" in model_kwargs:
+        attention_mask = model_kwargs["attention_mask"]
+        model_kwargs["attention_mask"] = torch.cat(
+            [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+
+    return model_kwargs
+
+
+def opt_prepare_inputs_fn(input_ids: torch.Tensor,
+                          past: Optional[torch.Tensor] = None,
+                          attention_mask: Optional[torch.Tensor] = None,
+                          use_cache: Optional[bool] = None,
+                          **kwargs) -> dict:
+    # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+    if attention_mask is None:
+        attention_mask = input_ids.new_ones(input_ids.shape)
+
+    if past:
+        input_ids = input_ids[:, -1:]
+    # first step, decoder_cached_states are empty
+    return {
+        "input_ids": input_ids,    # encoder_outputs is defined. input_ids not needed
+        "attention_mask": attention_mask,
+        "past_key_values": past,
+        "use_cache": use_cache,
+    }
+
+
+def bloom_prepare_inputs_fn(input_ids: torch.Tensor,
+                            past: Optional[torch.Tensor] = None,
+                            attention_mask: Optional[torch.Tensor] = None,
+                            use_cache: Optional[bool] = None,
+                            **kwargs) -> dict:
+    # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+    if attention_mask is None:
+        attention_mask = input_ids.new_ones(input_ids.shape)
+
+    if past:
+        input_ids = input_ids[:, -1:]
+    # first step, decoder_cached_states are empty
+    return {
+        "input_ids": input_ids,    # encoder_outputs is defined. input_ids not needed
+        "attention_mask": attention_mask,
+        "past_key_values": past,
+        "use_cache": use_cache,
+    }
diff --git a/applications/ChatGPT/chatgpt/nn/gpt_actor.py b/applications/ChatGPT/chatgpt/nn/gpt_actor.py
new file mode 100644
index 000000000000..491182ffa405
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/gpt_actor.py
@@ -0,0 +1,31 @@
+from typing import Optional
+
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+from .actor import Actor
+
+
+class GPTActor(Actor):
+    """
+    GPT Actor model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (GPT2Config): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[GPT2Config] = None,
+                 checkpoint: bool = False) -> None:
+        if pretrained is not None:
+            model = GPT2LMHeadModel.from_pretrained(pretrained)
+        elif config is not None:
+            model = GPT2LMHeadModel(config)
+        else:
+            model = GPT2LMHeadModel(GPT2Config())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        super().__init__(model)
diff --git a/applications/ChatGPT/chatgpt/nn/gpt_critic.py b/applications/ChatGPT/chatgpt/nn/gpt_critic.py
new file mode 100644
index 000000000000..b0a001f4aff5
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/gpt_critic.py
@@ -0,0 +1,33 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2Model
+
+from .critic import Critic
+
+
+class GPTCritic(Critic):
+    """
+    GPT Critic model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (GPT2Config): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[GPT2Config] = None,
+                 checkpoint: bool = False) -> None:
+        if pretrained is not None:
+            model = GPT2Model.from_pretrained(pretrained)
+        elif config is not None:
+            model = GPT2Model(config)
+        else:
+            model = GPT2Model(GPT2Config())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        value_head = nn.Linear(model.config.n_embd, 1)
+        super().__init__(model, value_head)
diff --git a/applications/ChatGPT/chatgpt/nn/gpt_rm.py b/applications/ChatGPT/chatgpt/nn/gpt_rm.py
new file mode 100644
index 000000000000..c6c41a45a684
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/gpt_rm.py
@@ -0,0 +1,33 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2Model
+
+from .reward_model import RewardModel
+
+
+class GPTRM(RewardModel):
+    """
+    GPT Reward model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (GPT2Config): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[GPT2Config] = None,
+                 checkpoint: bool = False) -> None:
+        if pretrained is not None:
+            model = GPT2Model.from_pretrained(pretrained)
+        elif config is not None:
+            model = GPT2Model(config)
+        else:
+            model = GPT2Model(GPT2Config())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        value_head = nn.Linear(model.config.n_embd, 1)
+        super().__init__(model, value_head)
diff --git a/applications/ChatGPT/chatgpt/nn/lora.py b/applications/ChatGPT/chatgpt/nn/lora.py
new file mode 100644
index 000000000000..46a43ec91681
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/lora.py
@@ -0,0 +1,127 @@
+import math
+from typing import Optional
+
+import loralib as lora
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class LoraLinear(lora.LoRALayer, nn.Module):
+    """Replace in-place ops to out-of-place ops to fit gemini. Convert a torch.nn.Linear to LoraLinear.
+    """
+
+    def __init__(
+        self,
+        weight: nn.Parameter,
+        bias: Optional[nn.Parameter],
+        r: int = 0,
+        lora_alpha: int = 1,
+        lora_dropout: float = 0.,
+        fan_in_fan_out: bool = False,    # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
+        merge_weights: bool = True,
+    ):
+        nn.Module.__init__(self)
+        lora.LoRALayer.__init__(self,
+                                r=r,
+                                lora_alpha=lora_alpha,
+                                lora_dropout=lora_dropout,
+                                merge_weights=merge_weights)
+        self.weight = weight
+        self.bias = bias
+
+        out_features, in_features = weight.shape
+        self.in_features = in_features
+        self.out_features = out_features
+
+        self.fan_in_fan_out = fan_in_fan_out
+        # Actual trainable parameters
+        if r > 0:
+            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
+            self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
+            self.scaling = self.lora_alpha / self.r
+            # Freezing the pre-trained weight matrix
+            self.weight.requires_grad = False
+        self.reset_parameters()
+        if fan_in_fan_out:
+            self.weight.data = self.weight.data.T
+
+    def reset_parameters(self):
+        if hasattr(self, 'lora_A'):
+            # initialize A the same way as the default for nn.Linear and B to zero
+            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
+            nn.init.zeros_(self.lora_B)
+
+    def train(self, mode: bool = True):
+
+        def T(w):
+            return w.T if self.fan_in_fan_out else w
+
+        nn.Module.train(self, mode)
+        if self.merge_weights and self.merged:
+            # Make sure that the weights are not merged
+            if self.r > 0:
+                self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
+            self.merged = False
+
+    def eval(self):
+
+        def T(w):
+            return w.T if self.fan_in_fan_out else w
+
+        nn.Module.eval(self)
+        if self.merge_weights and not self.merged:
+            # Merge the weights and mark it
+            if self.r > 0:
+                self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
+            self.merged = True
+
+    def forward(self, x: torch.Tensor):
+
+        def T(w):
+            return w.T if self.fan_in_fan_out else w
+
+        if self.r > 0 and not self.merged:
+            result = F.linear(x, T(self.weight), bias=self.bias)
+            if self.r > 0:
+                result = result + (self.lora_dropout(x) @ self.lora_A.t() @ self.lora_B.t()) * self.scaling
+            return result
+        else:
+            return F.linear(x, T(self.weight), bias=self.bias)
+
+
+def lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
+    """Build a LoraLinear of rank `lora_rank` from the weight and bias of `linear`."""
+    assert lora_rank <= linear.in_features, f'LoRA rank ({lora_rank}) must be less than or equal to in features ({linear.in_features})'
+    return LoraLinear(linear.weight, linear.bias, r=lora_rank, merge_weights=False)
+
+
+def convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
+    for child_name, submodule in module.named_children():    # direct children; recursion covers deeper levels
+        if not isinstance(submodule, nn.Linear):
+            convert_to_lora_recursively(submodule, lora_rank)
+        else:
+            setattr(module, child_name, lora_linear_wrapper(submodule, lora_rank))
+
+
+class LoRAModule(nn.Module):
+    """A LoRA module base class. All derived classes should call `convert_to_lora()` at the bottom of `__init__()`.
+    When `lora_rank` is positive, this class converts all torch.nn.Linear layers to LoraLinear layers.
+
+    Args:
+        lora_rank (int, optional): LoRA rank. 0 means LoRA is not applied. Defaults to 0.
+        lora_train_bias (str, optional): Which biases LoRA trains.
+            'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers.
+            Defaults to 'none'.
+    """
+
+    def __init__(self, lora_rank: int = 0, lora_train_bias: str = 'none') -> None:
+        super().__init__()
+        self.lora_rank = lora_rank
+        self.lora_train_bias = lora_train_bias
+
+    def convert_to_lora(self) -> None:
+        if self.lora_rank <= 0:
+            return
+        convert_to_lora_recursively(self, self.lora_rank)
+        lora.mark_only_lora_as_trainable(self, self.lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/loss.py b/applications/ChatGPT/chatgpt/nn/loss.py
new file mode 100644
index 000000000000..0ebcfea061b0
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/loss.py
@@ -0,0 +1,105 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from .utils import masked_mean
+
+
+class GPTLMLoss(nn.Module):
+    """
+    GPT Language Model Loss
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.loss = nn.CrossEntropyLoss()
+
+    def forward(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+
+class PolicyLoss(nn.Module):
+    """
+    Policy Loss for PPO
+    """
+
+    def __init__(self, clip_eps: float = 0.2) -> None:
+        super().__init__()
+        self.clip_eps = clip_eps
+
+    def forward(self,
+                log_probs: torch.Tensor,
+                old_log_probs: torch.Tensor,
+                advantages: torch.Tensor,
+                action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        ratio = (log_probs - old_log_probs).exp()
+        surr1 = ratio * advantages
+        surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages
+        loss = -torch.min(surr1, surr2)
+        if action_mask is not None:
+            loss = masked_mean(loss, action_mask)
+        loss = loss.mean()
+        return loss
+
+
+class ValueLoss(nn.Module):
+    """
+    Value Loss for PPO
+    """
+
+    def __init__(self, clip_eps: float = 0.4) -> None:
+        super().__init__()
+        self.clip_eps = clip_eps
+
+    def forward(self,
+                values: torch.Tensor,
+                old_values: torch.Tensor,
+                reward: torch.Tensor,
+                action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        values_clipped = old_values + (values - old_values).clamp(-self.clip_eps, self.clip_eps)
+        surr1 = (values_clipped - reward)**2
+        surr2 = (values - reward)**2
+        loss = torch.max(surr1, surr2)
+        loss = loss.mean()
+        return loss
+
+
+class PPOPtxActorLoss(nn.Module):
+    """
+    To Do:
+
+    PPO-ptx Actor Loss
+    """
+
+    def __init__(self, policy_clip_eps: float = 0.2, pretrain_coef: float = 0.0, pretrain_loss_fn=GPTLMLoss()) -> None:
+        super().__init__()
+        self.pretrain_coef = pretrain_coef
+        self.policy_loss_fn = PolicyLoss(clip_eps=policy_clip_eps)
+        self.pretrain_loss_fn = pretrain_loss_fn
+
+    def forward(self,
+                log_probs: torch.Tensor,
+                old_log_probs: torch.Tensor,
+                advantages: torch.Tensor,
+                lm_logits: torch.Tensor,
+                lm_input_ids: torch.Tensor,
+                action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        policy_loss = self.policy_loss_fn(log_probs, old_log_probs, advantages, action_mask=action_mask)
+        lm_loss = self.pretrain_loss_fn(lm_logits, lm_input_ids)
+        return policy_loss + self.pretrain_coef * lm_loss
+
+
+class PairWiseLoss(nn.Module):
+    """
+    Pairwise Loss for Reward Model: mean of -log(sigmoid(chosen_reward - reject_reward)).
+    """
+
+    def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
+        # logsigmoid stays finite where log(sigmoid(x)) underflows to -inf for very negative margins.
+        log_probs = nn.functional.logsigmoid(chosen_reward - reject_reward)
+        loss = -log_probs.mean()
+        return loss
diff --git a/applications/ChatGPT/chatgpt/nn/opt_actor.py b/applications/ChatGPT/chatgpt/nn/opt_actor.py
new file mode 100644
index 000000000000..ff2bf7c00bea
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/opt_actor.py
@@ -0,0 +1,35 @@
+from typing import Optional
+
+from transformers.models.opt.configuration_opt import OPTConfig
+from transformers.models.opt.modeling_opt import OPTForCausalLM
+
+from .actor import Actor
+
+
+class OPTActor(Actor):
+    """
+    OPT Actor model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (OPTConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): Rank of the low-rank approximation.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[OPTConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = OPTForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = OPTForCausalLM(config)
+        else:
+            model = OPTForCausalLM(OPTConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        super().__init__(model, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/opt_critic.py b/applications/ChatGPT/chatgpt/nn/opt_critic.py
new file mode 100644
index 000000000000..9c9cb873f38a
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/opt_critic.py
@@ -0,0 +1,37 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers.models.opt.configuration_opt import OPTConfig
+from transformers.models.opt.modeling_opt import OPTModel
+
+from .critic import Critic
+
+
+class OPTCritic(Critic):
+    """
+    OPT Critic model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (OPTConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): Rank of the low-rank approximation.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[OPTConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = OPTModel.from_pretrained(pretrained)
+        elif config is not None:
+            model = OPTModel(config)
+        else:
+            model = OPTModel(OPTConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        value_head = nn.Linear(model.config.hidden_size, 1)
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/opt_rm.py b/applications/ChatGPT/chatgpt/nn/opt_rm.py
new file mode 100644
index 000000000000..150f832e0c35
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/opt_rm.py
@@ -0,0 +1,33 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers.models.opt.configuration_opt import OPTConfig
+from transformers.models.opt.modeling_opt import OPTModel
+
+from .reward_model import RewardModel
+
+
+class OPTRM(RewardModel):
+    """
+    OPT Reward model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (OPTConfig): Model config.
+        lora_rank (int): Rank of the low-rank approximation.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[OPTConfig] = None,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = OPTModel.from_pretrained(pretrained)
+        elif config is not None:
+            model = OPTModel(config)
+        else:
+            model = OPTModel(OPTConfig())
+        value_head = nn.Linear(model.config.hidden_size, 1)
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/reward_model.py b/applications/ChatGPT/chatgpt/nn/reward_model.py
new file mode 100644
index 000000000000..5108f61a6186
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/reward_model.py
@@ -0,0 +1,41 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from .lora import LoRAModule
+
+
+class RewardModel(LoRAModule):
+    """
+    Reward model base class.
+
+    Args:
+        model (nn.Module): Reward model.
+        value_head (nn.Module): Value head to get reward score.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 model: nn.Module,
+                 value_head: Optional[nn.Module] = None,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
+        self.model = model
+        if value_head is not None:
+            if value_head.out_features != 1:
+                raise ValueError("The value head of reward model's output dim should be 1!")
+            self.value_head = value_head
+
+        else:
+            self.value_head = nn.Linear(model.config.n_embd, 1)
+        self.convert_to_lora()
+
+    def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        outputs = self.model(sequences, attention_mask=attention_mask)
+        last_hidden_states = outputs['last_hidden_state']
+        values = self.value_head(last_hidden_states)[:, :-1]
+        value = values.mean(dim=1).squeeze(1)    # ensure shape is (B)
+        return value
diff --git a/applications/ChatGPT/chatgpt/nn/utils.py b/applications/ChatGPT/chatgpt/nn/utils.py
new file mode 100644
index 000000000000..0ff13181fcd2
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/nn/utils.py
@@ -0,0 +1,92 @@
+from typing import Optional, Union
+
+import loralib as lora
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def compute_approx_kl(log_probs: torch.Tensor,
+                      log_probs_base: torch.Tensor,
+                      action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    Compute the approximate KL divergence between two distributions.
+    Schulman blog: http://joschu.net/blog/kl-approx.html
+
+    Args:
+        log_probs: Log probabilities of the new distribution.
+        log_probs_base: Log probabilities of the base distribution.
+        action_mask: Mask for actions.
+    """
+
+    log_ratio = log_probs - log_probs_base
+    approx_kl = (log_ratio.exp() - 1) - log_ratio
+    if action_mask is not None:
+        approx_kl = masked_mean(approx_kl, action_mask, dim=1)
+        return approx_kl
+    approx_kl = approx_kl.mean(dim=1)
+    return approx_kl
+
+
+def compute_reward(r: Union[torch.Tensor, float],
+                   kl_coef: float,
+                   log_probs: torch.Tensor,
+                   log_probs_base: torch.Tensor,
+                   action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    if kl_coef <= 0.0:
+        return r
+    kl = compute_approx_kl(log_probs, log_probs_base, action_mask=action_mask)
+    reward = r - kl_coef * kl
+    return reward
+
+
+def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    log_probs = F.log_softmax(logits, dim=-1)
+    log_probs_labels = log_probs.gather(dim=-1, index=labels.unsqueeze(-1))
+    return log_probs_labels.squeeze(-1)
+
+
+def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor:
+    tensor = tensor * mask
+    tensor = tensor.sum(dim=dim)
+    mask_sum = mask.sum(dim=dim)
+    mean = tensor / (mask_sum + 1e-8)
+    return mean
+
+
+def masked_normalize(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1, eps: float = 1e-8) -> torch.Tensor:
+    tensor = tensor * mask    # zero out masked positions before taking statistics
+    mean = masked_mean(tensor, mask, dim=dim).unsqueeze(dim)    # restore the reduced dim so it broadcasts against `tensor`
+    mean_centered = tensor - mean
+    var = masked_mean(mean_centered**2, mask, dim=dim).unsqueeze(dim)    # masked variance, same broadcastable shape as `mean`
+    return mean_centered * var.clamp(min=eps).rsqrt()    # clamp at eps avoids scaling by 1/sqrt(0)
+
+
+def normalize(tensor: torch.Tensor, dim: int = 0, eps: float = 1e-8) -> torch.Tensor:
+    mean = tensor.mean(dim, keepdim=True)    # keepdim so the statistic broadcasts back along `dim` for any `dim`
+    mean_centered = tensor - mean
+    var = (mean_centered**2).mean(dim, keepdim=True)    # biased (population) variance along `dim`
+    norm = mean_centered * var.clamp(min=eps).rsqrt()    # clamp at eps avoids scaling by 1/sqrt(0)
+    return norm
+
+
+def convert_to_lora(model: nn.Module,
+                    input_size: int,
+                    output_size: int,
+                    lora_rank: int = 16,
+                    lora_alpha: int = 1,
+                    lora_dropout: float = 0.,
+                    fan_in_fan_out: bool = False,
+                    merge_weights: bool = True):
+    """Replace every nn.Linear child inside `model`, in place, with a new lora.Linear(input_size, output_size); weights are not copied."""
+    if lora_rank > min(input_size, output_size):
+        raise ValueError(f"LoRA rank {lora_rank} must be less or equal than {min(input_size, output_size)}")
+    lora_kwargs = dict(r=lora_rank,
+                       lora_alpha=lora_alpha,
+                       lora_dropout=lora_dropout,
+                       fan_in_fan_out=fan_in_fan_out,
+                       merge_weights=merge_weights)
+    for parent in list(model.modules()):    # snapshot first: swapping children while walking would revisit the new layers
+        for name, child in list(parent.named_children()):
+            if isinstance(child, nn.Linear):
+                setattr(parent, name, lora.Linear(input_size, output_size, **lora_kwargs))    # replace on the parent, not on the child
diff --git a/applications/ChatGPT/chatgpt/replay_buffer/__init__.py b/applications/ChatGPT/chatgpt/replay_buffer/__init__.py
new file mode 100644
index 000000000000..1ebf60382913
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/replay_buffer/__init__.py
@@ -0,0 +1,4 @@
+from .base import ReplayBuffer
+from .naive import NaiveReplayBuffer
+
+__all__ = ['ReplayBuffer', 'NaiveReplayBuffer']
diff --git a/applications/ChatGPT/chatgpt/replay_buffer/base.py b/applications/ChatGPT/chatgpt/replay_buffer/base.py
new file mode 100644
index 000000000000..5036b09045c4
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/replay_buffer/base.py
@@ -0,0 +1,43 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from chatgpt.experience_maker.base import Experience
+
+
+class ReplayBuffer(ABC):
+    """Replay buffer base class. It stores experience.
+
+     Args:
+         sample_batch_size (int): Batch size when sampling.
+         limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
+    """
+
+    def __init__(self, sample_batch_size: int, limit: int = 0) -> None:
+        super().__init__()
+        self.sample_batch_size = sample_batch_size
+        # limit <= 0 means unlimited
+        self.limit = limit
+
+    @abstractmethod
+    def append(self, experience: Experience) -> None:
+        pass
+
+    @abstractmethod
+    def clear(self) -> None:
+        pass
+
+    @abstractmethod
+    def sample(self) -> Experience:
+        pass
+
+    @abstractmethod
+    def __len__(self) -> int:
+        pass
+
+    @abstractmethod
+    def __getitem__(self, idx: int) -> Any:
+        pass
+
+    @abstractmethod
+    def collate_fn(self, batch: Any) -> Experience:
+        pass
diff --git a/applications/ChatGPT/chatgpt/replay_buffer/naive.py b/applications/ChatGPT/chatgpt/replay_buffer/naive.py
new file mode 100644
index 000000000000..3fc53da65bff
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/replay_buffer/naive.py
@@ -0,0 +1,57 @@
+import random
+from typing import List
+
+import torch
+from chatgpt.experience_maker.base import Experience
+
+from .base import ReplayBuffer
+from .utils import BufferItem, make_experience_batch, split_experience_batch
+
+
+class NaiveReplayBuffer(ReplayBuffer):
+    """Naive replay buffer class. It stores experience.
+
+     Args:
+         sample_batch_size (int): Batch size when sampling.
+         limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
+         cpu_offload (bool, optional): Whether to offload experience to cpu when sampling. Defaults to True.
+    """
+
+    def __init__(self, sample_batch_size: int, limit: int = 0, cpu_offload: bool = True) -> None:
+        super().__init__(sample_batch_size, limit)
+        self.cpu_offload = cpu_offload
+        self.target_device = torch.device(f'cuda:{torch.cuda.current_device()}')
+        # TODO(ver217): add prefetch
+        self.items: List[BufferItem] = []
+
+    @torch.no_grad()
+    def append(self, experience: Experience) -> None:
+        if self.cpu_offload:
+            experience.to_device(torch.device('cpu'))
+        items = split_experience_batch(experience)
+        self.items.extend(items)
+        if self.limit > 0:
+            samples_to_remove = len(self.items) - self.limit
+            if samples_to_remove > 0:
+                self.items = self.items[samples_to_remove:]
+
+    def clear(self) -> None:
+        self.items.clear()
+
+    @torch.no_grad()
+    def sample(self) -> Experience:
+        items = random.sample(self.items, self.sample_batch_size)
+        experience = make_experience_batch(items)
+        if self.cpu_offload:
+            experience.to_device(self.target_device)
+        return experience
+
+    def __len__(self) -> int:
+        return len(self.items)
+
+    def __getitem__(self, idx: int) -> BufferItem:
+        return self.items[idx]
+
+    def collate_fn(self, batch) -> Experience:
+        experience = make_experience_batch(batch)
+        return experience
diff --git a/applications/ChatGPT/chatgpt/replay_buffer/utils.py b/applications/ChatGPT/chatgpt/replay_buffer/utils.py
new file mode 100644
index 000000000000..752f16704771
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/replay_buffer/utils.py
@@ -0,0 +1,73 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+import torch
+import torch.nn.functional as F
+from chatgpt.experience_maker.base import Experience
+
+
+@dataclass
+class BufferItem:
+    """BufferItem is an item of experience data.
+
+    Shapes of each tensor:
+    sequences: (S)
+    action_log_probs: (A)
+    values: (1)
+    reward: (1)
+    advantages: (1)
+    attention_mask: (S)
+    action_mask: (A)
+
+    "A" is the number of actions.
+    """
+    sequences: torch.Tensor
+    action_log_probs: torch.Tensor
+    values: torch.Tensor
+    reward: torch.Tensor
+    advantages: torch.Tensor
+    attention_mask: Optional[torch.LongTensor]
+    action_mask: Optional[torch.BoolTensor]
+
+
+def split_experience_batch(experience: Experience) -> List[BufferItem]:
+    batch_size = experience.sequences.size(0)
+    batch_kwargs = [{} for _ in range(batch_size)]
+    keys = ('sequences', 'action_log_probs', 'values', 'reward', 'advantages', 'attention_mask', 'action_mask')
+    for key in keys:
+        value = getattr(experience, key)
+        if isinstance(value, torch.Tensor):
+            vals = torch.unbind(value)
+        else:
+            # None
+            vals = [value for _ in range(batch_size)]
+        assert batch_size == len(vals)
+        for i, v in enumerate(vals):
+            batch_kwargs[i][key] = v
+    items = [BufferItem(**kwargs) for kwargs in batch_kwargs]
+    return items
+
+
+def zero_pad_sequences(sequences: List[torch.Tensor], side: str = 'left') -> torch.Tensor:
+    assert side in ('left', 'right')
+    max_len = max(seq.size(0) for seq in sequences)
+    padded_sequences = []
+    for seq in sequences:
+        pad_len = max_len - seq.size(0)
+        padding = (pad_len, 0) if side == 'left' else (0, pad_len)
+        padded_sequences.append(F.pad(seq, padding))
+    return torch.stack(padded_sequences, dim=0)
+
+
+def make_experience_batch(items: List[BufferItem]) -> Experience:
+    kwargs = {}    # field name -> batched value, passed to Experience(**kwargs)
+    keys = ('sequences', 'action_log_probs', 'values', 'reward', 'advantages', 'attention_mask', 'action_mask')
+    for key in keys:
+        vals = [getattr(item, key) for item in items]
+        if vals and all(v is None for v in vals):    # split_experience_batch stores absent optional fields as None
+            kwargs[key] = None
+        elif key in ('action_log_probs', 'action_mask'):    # zero-pad to the longest item (left side by default)
+            kwargs[key] = zero_pad_sequences(vals)
+        else:
+            kwargs[key] = torch.stack(vals, dim=0)    # stack per-item tensors along a new batch dim
+    return Experience(**kwargs)
diff --git a/applications/ChatGPT/chatgpt/trainer/__init__.py b/applications/ChatGPT/chatgpt/trainer/__init__.py
new file mode 100644
index 000000000000..c47c76347ee5
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/__init__.py
@@ -0,0 +1,5 @@
+from .base import Trainer
+from .ppo import PPOTrainer
+from .rm import RewardModelTrainer
+
+__all__ = ['Trainer', 'PPOTrainer', 'RewardModelTrainer']
diff --git a/applications/ChatGPT/chatgpt/trainer/base.py b/applications/ChatGPT/chatgpt/trainer/base.py
new file mode 100644
index 000000000000..42547af78cfb
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/base.py
@@ -0,0 +1,162 @@
+import random
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from chatgpt.experience_maker import Experience, ExperienceMaker
+from chatgpt.replay_buffer import ReplayBuffer
+from torch import Tensor
+from torch.utils.data import DistributedSampler
+from tqdm import tqdm
+
+from .callbacks import Callback
+from .strategies import Strategy
+from .utils import is_rank_0
+
+
+class Trainer(ABC):
+    """
+        Base class for rlhf trainers.
+
+    Args:
+        strategy (Strategy):the strategy to use for training
+        experience_maker (ExperienceMaker): the experience maker used to produce experience that fills the replay buffer
+        replay_buffer (ReplayBuffer): the replay buffer to use for training
+        experience_batch_size (int, defaults to 8): the batch size to use for experience generation
+        max_epochs (int, defaults to 1): the number of epochs of training process
+        tokenizer (Callable, optional): the tokenizer to use for tokenizing the input
+        sample_replay_buffer (bool, defaults to False): whether to sample from replay buffer
+        dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
+        callbacks (List[Callback], defaults to []): the callbacks to call during training process
+        generate_kwargs (dict, optional): the kwargs to use while model generating
+    """
+
+    def __init__(self,
+                 strategy: Strategy,
+                 experience_maker: ExperienceMaker,
+                 replay_buffer: ReplayBuffer,
+                 experience_batch_size: int = 8,
+                 max_epochs: int = 1,
+                 tokenizer: Optional[Callable[[Any], dict]] = None,
+                 sample_replay_buffer: bool = False,
+                 dataloader_pin_memory: bool = True,
+                 callbacks: Optional[List[Callback]] = None,
+                 **generate_kwargs) -> None:
+        super().__init__()
+        self.strategy = strategy
+        self.experience_maker = experience_maker
+        self.replay_buffer = replay_buffer
+        self.experience_batch_size = experience_batch_size    # number of prompts sampled per experience-making step
+        self.max_epochs = max_epochs
+        self.tokenizer = tokenizer
+        self.generate_kwargs = generate_kwargs    # forwarded to experience_maker.make_experience()
+        self.sample_replay_buffer = sample_replay_buffer
+        self.dataloader_pin_memory = dataloader_pin_memory
+        self.callbacks = callbacks if callbacks is not None else []    # new list per trainer; avoids a shared mutable default
+
+    @abstractmethod
+    def training_step(self, experience: Experience) -> Dict[str, Any]:
+        pass
+
+    def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience:
+        if isinstance(inputs, Tensor):
+            return self.experience_maker.make_experience(inputs, **self.generate_kwargs)
+        elif isinstance(inputs, dict):
+            return self.experience_maker.make_experience(**inputs, **self.generate_kwargs)
+        else:
+            raise ValueError(f'Unsupported input type "{type(inputs)}"')
+
+    def _sample_prompts(self, prompts) -> list:
+        indices = list(range(len(prompts)))
+        sampled_indices = random.sample(indices, self.experience_batch_size)
+        return [prompts[i] for i in sampled_indices]
+
+    def _learn(self):
+        # replay buffer may be empty at first, we should rebuild at each training
+        if not self.sample_replay_buffer:
+            dataloader = self.strategy.setup_dataloader(self.replay_buffer, self.dataloader_pin_memory)
+            device = torch.cuda.current_device()
+        if self.sample_replay_buffer:
+            pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0())
+            for _ in pbar:
+                experience = self.replay_buffer.sample()
+                metrics = self.training_step(experience)
+                pbar.set_postfix(metrics)
+        else:
+            for epoch in range(self.max_epochs):
+                self._on_learn_epoch_start(epoch)
+                if isinstance(dataloader.sampler, DistributedSampler):
+                    dataloader.sampler.set_epoch(epoch)
+                pbar = tqdm(dataloader, desc=f'Train epoch [{epoch+1}/{self.max_epochs}]', disable=not is_rank_0())
+                for experience in pbar:
+                    self._on_learn_batch_start()
+                    experience.to_device(device)
+                    metrics = self.training_step(experience)
+                    self._on_learn_batch_end(metrics, experience)
+                    pbar.set_postfix(metrics)
+                self._on_learn_epoch_end(epoch)
+
+    def fit(self, prompts, num_episodes: int = 50000, max_timesteps: int = 500, update_timesteps: int = 5000) -> None:
+        time = 0
+        self._on_fit_start()
+        for episode in range(num_episodes):
+            self._on_episode_start(episode)
+            for timestep in tqdm(range(max_timesteps),
+                                 desc=f'Episode [{episode+1}/{num_episodes}]',
+                                 disable=not is_rank_0()):
+                time += 1
+                rand_prompts = self._sample_prompts(prompts)
+                if self.tokenizer is not None:
+                    inputs = self.tokenizer(rand_prompts)
+                else:
+                    inputs = rand_prompts
+                self._on_make_experience_start()
+                experience = self._make_experience(inputs)
+                self._on_make_experience_end(experience)
+                self.replay_buffer.append(experience)
+                if time % update_timesteps == 0:
+                    self._learn()
+                    self.replay_buffer.clear()
+            self._on_episode_end(episode)
+        self._on_fit_end()
+
+    # TODO(ver217): maybe simplify these code using context
+    def _on_fit_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_fit_start()
+
+    def _on_fit_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_fit_end()
+
+    def _on_episode_start(self, episode: int) -> None:
+        for callback in self.callbacks:
+            callback.on_episode_start(episode)
+
+    def _on_episode_end(self, episode: int) -> None:
+        for callback in self.callbacks:
+            callback.on_episode_end(episode)
+
+    def _on_make_experience_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_start()
+
+    def _on_make_experience_end(self, experience: Experience) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_end(experience)
+
+    def _on_learn_epoch_start(self, epoch: int) -> None:
+        for callback in self.callbacks:
+            callback.on_learn_epoch_start(epoch)
+
+    def _on_learn_epoch_end(self, epoch: int) -> None:
+        for callback in self.callbacks:
+            callback.on_learn_epoch_end(epoch)
+
+    def _on_learn_batch_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_learn_batch_start()
+
+    def _on_learn_batch_end(self, metrics: dict, experience: Experience) -> None:
+        for callback in self.callbacks:
+            callback.on_learn_batch_end(metrics, experience)
diff --git a/applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py b/applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py
new file mode 100644
index 000000000000..79ea9ffcdf61
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py
@@ -0,0 +1,4 @@
+from .base import Callback
+from .performance_evaluator import PerformanceEvaluator
+
+__all__ = ['Callback', 'PerformanceEvaluator']
diff --git a/applications/ChatGPT/chatgpt/trainer/callbacks/base.py b/applications/ChatGPT/chatgpt/trainer/callbacks/base.py
new file mode 100644
index 000000000000..0b01345f7872
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/callbacks/base.py
@@ -0,0 +1,39 @@
+from abc import ABC
+
+from chatgpt.experience_maker import Experience
+
+
+class Callback(ABC):
+    """
+        Base callback class. It defines the interface for callbacks.
+    """
+
+    def on_fit_start(self) -> None:
+        pass
+
+    def on_fit_end(self) -> None:
+        pass
+
+    def on_episode_start(self, episode: int) -> None:
+        pass
+
+    def on_episode_end(self, episode: int) -> None:
+        pass
+
+    def on_make_experience_start(self) -> None:
+        pass
+
+    def on_make_experience_end(self, experience: Experience) -> None:
+        pass
+
+    def on_learn_epoch_start(self, epoch: int) -> None:
+        pass
+
+    def on_learn_epoch_end(self, epoch: int) -> None:
+        pass
+
+    def on_learn_batch_start(self) -> None:
+        pass
+
+    def on_learn_batch_end(self, metrics: dict, experience: Experience) -> None:
+        pass
diff --git a/applications/ChatGPT/chatgpt/trainer/callbacks/performance_evaluator.py b/applications/ChatGPT/chatgpt/trainer/callbacks/performance_evaluator.py
new file mode 100644
index 000000000000..faa38af1b84e
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/callbacks/performance_evaluator.py
@@ -0,0 +1,133 @@
+from time import time
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from chatgpt.experience_maker import Experience
+
+from .base import Callback
+
+
+def get_world_size() -> int:
+    # a run without an initialized torch.distributed process group counts as a single process
+    initialized = dist.is_initialized()
+    return dist.get_world_size() if initialized else 1
+
+
+def print_rank_0(*args, **kwargs) -> None:
+    if not (dist.is_initialized() and dist.get_rank() != 0):    # only non-zero ranks stay silent
+        print(*args, **kwargs)
+
+
+@torch.no_grad()
+def all_reduce_mean(x: float, world_size: int) -> float:
+    """Average the scalar ``x`` over ``world_size`` ranks; ``x`` is returned untouched when ``world_size`` is 1."""
+    if world_size == 1:
+        return x
+    reduced = torch.tensor([x], device=torch.cuda.current_device())
+    dist.all_reduce(reduced)    # in-place collective using the default reduce op
+    return (reduced / world_size).item()
+
+
+class PerformanceEvaluator(Callback):
+    """
+        Callback to evaluate the performance of the model (throughput and TFLOPS of both training phases).
+    Args:
+        actor_num_params: The number of parameters of the actor model.
+        critic_num_params: The number of parameters of the critic model.
+        initial_model_num_params: The number of parameters of the initial model.
+        reward_model_num_params: The number of parameters of the reward model.
+        enable_grad_checkpoint: Whether to enable gradient checkpointing.
+        ignore_episodes: The number of episodes to ignore when calculating the performance.
+    """
+
+    def __init__(self,
+                 actor_num_params: int,
+                 critic_num_params: int,
+                 initial_model_num_params: int,
+                 reward_model_num_params: int,
+                 enable_grad_checkpoint: bool = False,
+                 ignore_episodes: int = 0) -> None:
+        super().__init__()
+        self.actor_num_params = actor_num_params
+        self.critic_num_params = critic_num_params
+        self.initial_model_num_params = initial_model_num_params
+        self.reward_model_num_params = reward_model_num_params
+        self.enable_grad_checkpoint = enable_grad_checkpoint
+        self.ignore_episodes = ignore_episodes
+        self.world_size = get_world_size()
+        self.disable: bool = False    # True while the current episode is excluded from the statistics
+        # running totals for the experience-making phase
+        self.make_experience_start_time: Optional[float] = None
+        self.make_experience_duration: float = 0.
+        self.make_experience_num_samples: int = 0
+        self.make_experience_flop: int = 0
+        self.learn_start_time: Optional[float] = None    # same bookkeeping for the learning phase
+        self.learn_duration: float = 0.
+        self.learn_num_samples: int = 0
+        self.learn_flop: int = 0
+
+    def on_episode_start(self, episode: int) -> None:
+        self.disable = episode < self.ignore_episodes if self.ignore_episodes > 0 else False
+
+    def on_make_experience_start(self) -> None:
+        # start timing unless the current episode is being ignored
+        if not self.disable:
+            self.make_experience_start_time = time()
+
+    def on_make_experience_end(self, experience: Experience) -> None:
+        """Accumulate the elapsed time, sample count and estimated FLOPs of one experience-making step."""
+        if self.disable:
+            return
+        self.make_experience_duration += time() - self.make_experience_start_time
+
+        batch_size, seq_len = experience.sequences.shape
+        self.make_experience_num_samples += batch_size
+
+        # actor generate: total_seq_len is the arithmetic series input_len + (input_len + 1) + ... + (seq_len - 1),
+        # presumably the context length seen at each of the num_actions generation steps
+        num_actions = experience.action_mask.size(1)
+        input_len = seq_len - num_actions
+        total_seq_len = (input_len + seq_len - 1) * num_actions / 2
+        self.make_experience_flop += self.actor_num_params * batch_size * total_seq_len * 2
+
+        # forward passes over the full sequence, added in this order:
+        # actor, critic, initial model, reward model
+        forward_num_params = (self.actor_num_params, self.critic_num_params, self.initial_model_num_params,
+                              self.reward_model_num_params)
+        for num_params in forward_num_params:
+            self.make_experience_flop += num_params * batch_size * seq_len * 2
+
+    def on_learn_batch_start(self) -> None:
+        # start timing unless the current episode is being ignored
+        if not self.disable:
+            self.learn_start_time = time()
+
+    def on_learn_batch_end(self, metrics: dict, experience: Experience) -> None:
+        """Accumulate the elapsed time, sample count and estimated FLOPs of one learning step."""
+        if self.disable:
+            return
+        self.learn_duration += time() - self.learn_start_time
+        batch_size, seq_len = experience.sequences.shape
+        self.learn_num_samples += batch_size
+
+        # 3 means forward(1) + backward(2); gradient checkpointing adds 1, presumably for the recomputed forward
+        num_passes = 3 + int(self.enable_grad_checkpoint)
+        # actor forward-backward, then critic forward-backward
+        for num_params in (self.actor_num_params, self.critic_num_params):
+            self.learn_flop += num_params * batch_size * seq_len * 2 * num_passes
+
+    def on_fit_end(self) -> None:
+        """Report the throughput and TFLOPS of both phases through ``print_rank_0``."""
+        eps = 1e-12    # guards the divisions below against a zero duration
+        make_experience_secs = all_reduce_mean(self.make_experience_duration, self.world_size) + eps
+        learn_secs = all_reduce_mean(self.learn_duration, self.world_size) + eps
+        avg_make_experience_throughput = self.make_experience_num_samples / make_experience_secs
+        avg_make_experience_tflops = self.make_experience_flop / 1e12 / make_experience_secs
+        avg_learn_throughput = self.learn_num_samples / learn_secs
+        avg_learn_tflops = self.learn_flop / 1e12 / learn_secs
+
+        print_rank_0(
+            f'Making experience throughput: {avg_make_experience_throughput:.3f} samples/sec, TFLOPS: {avg_make_experience_tflops:.3f}'
+        )
+        print_rank_0(f'Learning throughput: {avg_learn_throughput:.3f} samples/sec, TFLOPS: {avg_learn_tflops:.3f}')
diff --git a/applications/ChatGPT/chatgpt/trainer/ppo.py b/applications/ChatGPT/chatgpt/trainer/ppo.py
new file mode 100644
index 000000000000..85beb223e33a
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/ppo.py
@@ -0,0 +1,104 @@
+from typing import Any, Callable, Dict, List, Optional
+
+import torch.nn as nn
+from chatgpt.experience_maker import Experience, NaiveExperienceMaker
+from chatgpt.nn import Actor, Critic, PolicyLoss, ValueLoss
+from chatgpt.replay_buffer import NaiveReplayBuffer
+from torch.optim import Optimizer
+
+from .base import Trainer
+from .callbacks import Callback
+from .strategies import Strategy
+
+
+class PPOTrainer(Trainer):
+    """
+        Trainer for PPO algorithm.
+
+    Args:
+        strategy (Strategy): the strategy to use for training
+        actor (Actor): the actor model in ppo algorithm
+        critic (Critic): the critic model in ppo algorithm
+        reward_model (nn.Module): the reward model in rlhf algorithm to make reward of sentences
+        initial_model (Actor): the initial model in rlhf algorithm to generate reference logits to limit the update of actor
+        actor_optim (Optimizer): the optimizer to use for actor model
+        critic_optim (Optimizer): the optimizer to use for critic model
+        kl_coef (float, defaults to 0.1): the coefficient of kl divergence loss
+        train_batch_size (int, defaults to 8): the batch size to use for training
+        buffer_limit (int, defaults to 0): the max_size limitation of replay buffer
+        buffer_cpu_offload (bool, defaults to True): whether to offload replay buffer to cpu
+        eps_clip (float, defaults to 0.2): the clip coefficient of policy loss
+        value_clip (float, defaults to 0.4): the clip coefficient of value loss
+        experience_batch_size (int, defaults to 8): the batch size to use for experience generation
+        max_epochs (int, defaults to 1): the number of epochs of training process
+        tokenizer (Callable, optional): the tokenizer to use for tokenizing the input
+        sample_replay_buffer (bool, defaults to False): whether to sample from replay buffer
+        dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
+        callbacks (List[Callback], optional): the callbacks to call during training process; None means no callbacks
+        generate_kwargs (dict, optional): the kwargs to use while model generating
+    """
+
+    def __init__(self,
+                 strategy: Strategy,
+                 actor: Actor,
+                 critic: Critic,
+                 reward_model: nn.Module,
+                 initial_model: Actor,
+                 actor_optim: Optimizer,
+                 critic_optim: Optimizer,
+                 kl_coef: float = 0.1,
+                 train_batch_size: int = 8,
+                 buffer_limit: int = 0,
+                 buffer_cpu_offload: bool = True,
+                 eps_clip: float = 0.2,
+                 value_clip: float = 0.4,
+                 experience_batch_size: int = 8,
+                 max_epochs: int = 1,
+                 tokenizer: Optional[Callable[[Any], dict]] = None,
+                 sample_replay_buffer: bool = False,
+                 dataloader_pin_memory: bool = True,
+                 callbacks: Optional[List[Callback]] = None,
+                 **generate_kwargs) -> None:
+        # build a fresh list per call instead of sharing one mutable default list between trainers
+        callbacks = [] if callbacks is None else callbacks
+        actor = Actor(strategy.setup_model(actor.model))    # re-wrap the strategy-prepared inner model
+        critic = strategy.setup_model(critic)
+        reward_model = strategy.setup_model(reward_model)
+        initial_model = Actor(strategy.setup_model(initial_model.model))
+        experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef)
+        replay_buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
+        super().__init__(strategy, experience_maker, replay_buffer, experience_batch_size, max_epochs, tokenizer,
+                         sample_replay_buffer, dataloader_pin_memory, callbacks, **generate_kwargs)
+        self.actor = actor
+        self.critic = critic
+        self.actor_loss_fn = PolicyLoss(eps_clip)
+        self.critic_loss_fn = ValueLoss(value_clip)
+        self.actor_optim = strategy.setup_optimizer(actor_optim, self.actor.model)
+        self.critic_optim = strategy.setup_optimizer(critic_optim, self.critic)
+
+    def training_step(self, experience: Experience) -> Dict[str, float]:
+        """Run one PPO update of the actor, then of the critic, on ``experience`` and return both scalar losses."""
+        self.actor.train()
+        self.critic.train()
+        num_actions = experience.action_mask.size(1)
+        action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask)
+        actor_loss = self.actor_loss_fn(action_log_probs,
+                                        experience.action_log_probs,
+                                        experience.advantages,
+                                        action_mask=experience.action_mask)
+        self.strategy.backward(actor_loss, self.actor, self.actor_optim)
+        self.strategy.optimizer_step(self.actor_optim)
+        self.actor_optim.zero_grad()
+
+        values = self.critic(experience.sequences,
+                             action_mask=experience.action_mask,
+                             attention_mask=experience.attention_mask)
+        critic_loss = self.critic_loss_fn(values,
+                                          experience.values,
+                                          experience.reward,
+                                          action_mask=experience.action_mask)
+        self.strategy.backward(critic_loss, self.critic, self.critic_optim)
+        self.strategy.optimizer_step(self.critic_optim)
+        self.critic_optim.zero_grad()
+
+        return {'actor_loss': actor_loss.item(), 'critic_loss': critic_loss.item()}
diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
new file mode 100644
index 000000000000..c24289502830
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -0,0 +1,77 @@
+from abc import ABC
+
+import loralib as lora
+from chatgpt.dataset import RewardDataset
+from chatgpt.nn import PairWiseLoss
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+
+class RewardModelTrainer(ABC):
+    """
+        Trainer to use while training reward model.
+
+    Args:
+        model (torch.nn.Module): the model to train
+        train_dataset (RewardDataset): the dataset to use for training
+        eval_dataset (RewardDataset): the dataset to use for evaluation
+        batch_size (int, defaults to 1): the batch size while training
+        num_epochs (int, defaults to 2): the number of epochs to train
+        optim_kwargs (dict, defaults to {'lr':1e-4}): the kwargs to use while initializing optimizer
+    """
+
+    def __init__(self,
+                 model,
+                 train_dataset: RewardDataset,
+                 eval_dataset: RewardDataset,
+                 batch_size: int = 1,
+                 num_epochs: int = 2,
+                 optim_kwargs: dict = {'lr': 1e-4}) -> None:
+        super().__init__()
+        self.model = model
+        self.train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
+        self.eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)
+        self.loss_fn = PairWiseLoss()
+        self.optimizer = Adam(self.model.parameters(), **optim_kwargs)    # the default dict is only read, never mutated
+        self.epochs = num_epochs
+
+    def fit(self, use_lora):
+        """Train for ``self.epochs`` epochs, evaluating after each one; LoRA-only training is on when ``use_lora > 0``."""
+        epoch_bar = tqdm(range(self.epochs), desc='Train epoch')
+        for epoch in range(self.epochs):
+            step_bar = tqdm(range(len(self.train_dataloader)), desc='Train step of epoch %d' % epoch)
+            # train
+            if use_lora > 0:
+                print("Using Lora")
+                lora.mark_only_lora_as_trainable(self.model)
+            self.model.train()    # always needed: the eval pass of the previous epoch left the model in eval mode
+            for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
+                chosen_ids = chosen_ids.squeeze(1).cuda()
+                c_mask = c_mask.squeeze(1).cuda()
+                reject_ids = reject_ids.squeeze(1).cuda()
+                r_mask = r_mask.squeeze(1).cuda()
+                chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
+                reject_reward = self.model(reject_ids, attention_mask=r_mask)
+                loss = self.loss_fn(chosen_reward, reject_reward)
+                loss.backward()
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+                step_bar.update()
+                step_bar.set_postfix({'loss': loss.item()})
+
+            # eval
+            self.model.eval()
+            dist = 0.    # reset once per epoch (not once per batch) so the sum covers the whole eval set
+            for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
+                chosen_ids = chosen_ids.squeeze(1).cuda()
+                c_mask = c_mask.squeeze(1).cuda()
+                reject_ids = reject_ids.squeeze(1).cuda()
+                r_mask = r_mask.squeeze(1).cuda()
+                chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
+                reject_reward = self.model(reject_ids, attention_mask=r_mask)
+                dist += (chosen_reward - reject_reward).mean().item()    # float per batch: any batch size, no graph kept
+            dist_mean = dist / max(len(self.eval_dataloader), 1)    # max() avoids dividing by zero on an empty eval set
+            epoch_bar.update()
+            step_bar.set_postfix({'loss': loss.item(), 'dist_mean': dist_mean})
+            step_bar.close()
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/__init__.py b/applications/ChatGPT/chatgpt/trainer/strategies/__init__.py
new file mode 100644
index 000000000000..f258c9b8a873
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/__init__.py
@@ -0,0 +1,6 @@
+from .base import Strategy
+from .colossalai import ColossalAIStrategy
+from .ddp import DDPStrategy
+from .naive import NaiveStrategy
+
+__all__ = ['Strategy', 'NaiveStrategy', 'DDPStrategy', 'ColossalAIStrategy']
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/base.py b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
new file mode 100644
index 000000000000..3a2923b8c678
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
@@ -0,0 +1,45 @@
+from abc import ABC, abstractmethod
+from contextlib import nullcontext
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from chatgpt.replay_buffer import ReplayBuffer
+from torch.utils.data import DataLoader
+
+
+class Strategy(ABC):
+    """
+        Base class for training strategies. Construction immediately runs ``setup_distributed``.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.setup_distributed()    # subclasses plug their process-group / launcher setup in here
+
+    @abstractmethod
+    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimizer, **kwargs) -> None:
+        """Backpropagate ``loss``; how ``model`` and ``optimizer`` take part is up to the subclass."""
+
+    @abstractmethod
+    def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
+        """Apply one update step with ``optimizer``."""
+
+    @abstractmethod
+    def setup_distributed(self) -> None:
+        """Prepare the (possibly distributed) runtime; invoked from ``__init__``."""
+
+    @abstractmethod
+    def setup_model(self, model: nn.Module) -> nn.Module:
+        """Return ``model`` prepared for this strategy."""
+
+    @abstractmethod
+    def setup_optimizer(self, optimizer: optim.Optimizer, model: nn.Module) -> optim.Optimizer:
+        """Return ``optimizer`` prepared for this strategy and ``model``."""
+
+    @abstractmethod
+    def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
+        """Return a ``DataLoader`` over ``replay_buffer``."""
+
+    def model_init_context(self):
+        return nullcontext()    # no special context by default; subclasses may override
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
new file mode 100644
index 000000000000..665bfa913d00
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
@@ -0,0 +1,125 @@
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.optim as optim
+
+import colossalai
+from colossalai.nn.optimizer import CPUAdam, HybridAdam
+from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
+from colossalai.tensor import ProcessGroup, ShardSpec
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+
+from .ddp import DDPStrategy
+
+
+class ColossalAIStrategy(DDPStrategy):
+    """
+        The strategy for training with ColossalAI.
+
+    Args:
+        stage(int): The stage to use in ZeRO. Choose in (1, 2, 3)
+        seed(int): The seed for the random number generator.
+        shard_init(bool): Whether to shard the model parameters during initialization. Only for ZeRO-3.
+        placement_policy(str): The placement policy for gemini. Choose in ('cpu', 'cuda')
+                          If it is 'cpu', parameters, gradients and optimizer states will be offloaded to CPU,
+                          If it is 'cuda', they will not be offloaded, which means max CUDA memory will be used. It is the fastest.
+        pin_memory(bool): Whether to pin the memory for the data loader. Only for ZeRO-3.
+        force_outputs_fp32(bool): Whether to force the outputs to be fp32. Only for ZeRO-3.
+        search_range_mb(int): The search range in MB for the chunk size. Only for ZeRO-3.
+        hidden_dim(optional, int): The hidden dimension for the gemini. Only for ZeRO-3.
+        min_chunk_size_mb(float): The minimum chunk size in MB. Only for ZeRO-3.
+        gpu_margin_mem_ratio(float): The margin memory ratio for the GPU. Only for ZeRO-3.
+        reduce_bucket_size(int): The reduce bucket size in bytes. Only for ZeRO-1 and ZeRO-2.
+        overlap_communication(bool): Whether to overlap communication and computation. Only for ZeRO-1 and ZeRO-2.
+        initial_scale(float): The initial scale for the optimizer.
+        growth_factor(float): The growth factor for the optimizer.
+        backoff_factor(float): The backoff factor for the optimizer.
+        growth_interval(int): The growth interval for the optimizer.
+        hysteresis(int): The hysteresis for the optimizer.
+        min_scale(float): The minimum scale for the optimizer.
+        max_scale(float): The maximum scale for the optimizer.
+        max_norm(float): The maximum norm for the optimizer.
+        norm_type(float): The norm type for the optimizer.
+
+    """
+
+    def __init__(
+            self,
+            stage: int = 3,
+            seed: int = 42,
+            shard_init: bool = True,    # only for stage 3
+            placement_policy: str = 'cuda',
+            pin_memory: bool = True,    # only for stage 3
+            force_outputs_fp32: bool = False,    # only for stage 3
+            search_range_mb: int = 32,    # only for stage 3
+            hidden_dim: Optional[int] = None,    # only for stage 3
+            min_chunk_size_mb: float = 32,    # only for stage 3
+            gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
+            reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
+            overlap_communication: bool = True,    # only for stage 1&2
+            initial_scale: float = 2**16,
+            growth_factor: float = 2,
+            backoff_factor: float = 0.5,
+            growth_interval: int = 1000,
+            hysteresis: int = 2,
+            min_scale: float = 1,
+            max_scale: float = 2**32,
+            max_norm: float = 0.0,
+            norm_type: float = 2.0) -> None:
+        super().__init__(seed)    # also triggers setup_distributed() through the base-class constructor
+        assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
+        self.stage = stage
+        self.shard_init = shard_init
+        self.optim_kwargs = dict(initial_scale=initial_scale,
+                                 growth_factor=growth_factor,
+                                 backoff_factor=backoff_factor,
+                                 growth_interval=growth_interval,
+                                 hysteresis=hysteresis,
+                                 min_scale=min_scale,
+                                 max_scale=max_scale,
+                                 max_norm=max_norm,
+                                 norm_type=norm_type)
+        if stage != 3:
+            self.zero_optim_config = dict(reduce_bucket_size=reduce_bucket_size,
+                                          overlap_communication=overlap_communication,
+                                          cpu_offload=(placement_policy == 'cpu'))
+        else:
+            self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio)
+        self.gemini_config = dict(device=get_current_device(),
+                                  placement_policy=placement_policy,
+                                  pin_memory=pin_memory,
+                                  force_outputs_fp32=force_outputs_fp32,
+                                  strict_ddp_mode=shard_init,
+                                  search_range_mb=search_range_mb,
+                                  hidden_dim=hidden_dim,
+                                  min_chunk_size_mb=min_chunk_size_mb)
+
+    def setup_distributed(self) -> None:
+        colossalai.launch_from_torch({}, seed=self.seed)    # empty config dict, seed taken from the constructor
+
+    def model_init_context(self):
+        if self.stage != 3:
+            return super().model_init_context()
+        world_size = dist.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        return ColoInitContext(device=get_current_device(),
+                               dtype=torch.half,
+                               default_pg=shard_pg,
+                               default_dist_spec=default_dist_spec)
+
+    def setup_model(self, model: nn.Module) -> nn.Module:
+        return zero_model_wrapper(model, zero_stage=self.stage, gemini_config=self.gemini_config)
+
+    def setup_optimizer(self, optimizer: optim.Optimizer, model: nn.Module) -> optim.Optimizer:
+        assert isinstance(optimizer, (CPUAdam, HybridAdam)), f'Unsupported optimizer {type(optimizer)}'
+        return zero_optim_wrapper(model, optimizer, optim_config=self.zero_optim_config, **self.optim_kwargs)
+
+    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimizer, **kwargs) -> None:
+        optimizer.backward(loss)    # backward goes through the wrapped optimizer, not loss.backward()
+
+    def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
+        optimizer.step()
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
new file mode 100644
index 000000000000..b636515b443e
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
@@ -0,0 +1,59 @@
+import os
+import random
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from chatgpt.replay_buffer import ReplayBuffer
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data import DataLoader, DistributedSampler
+
+from .naive import NaiveStrategy
+
+
+class DDPStrategy(NaiveStrategy):
+    """
+        Strategy for distributed training using torch.distributed.
+    """
+
+    def __init__(self, seed: int = 42) -> None:
+        self.seed = seed    # must exist before super().__init__(), which ends up calling setup_distributed()
+        super().__init__()
+
+    def setup_distributed(self) -> None:
+        """Create the NCCL process group from the launcher's environment variables, then seed and select the GPU."""
+        env = os.environ
+        try:
+            rank, local_rank = int(env['RANK']), int(env['LOCAL_RANK'])
+            world_size = int(env['WORLD_SIZE'])
+            host, port = env['MASTER_ADDR'], int(env['MASTER_PORT'])
+        except KeyError as e:
+            raise RuntimeError(
+                f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
+            )
+        dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
+        self.set_seed(self.seed)
+        torch.cuda.set_device(local_rank)
+
+    def set_seed(self, seed: int) -> None:
+        """Seed the ``random``, NumPy and PyTorch generators with ``seed``."""
+        for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+            seed_fn(seed)
+
+    def setup_model(self, model: nn.Module) -> nn.Module:
+        """Wrap ``model`` in DistributedDataParallel on the current CUDA device."""
+        return DDP(model, device_ids=[torch.cuda.current_device()])
+
+    def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
+        """Build a DataLoader over ``replay_buffer`` that draws its indices from a shuffling DistributedSampler."""
+        shard_sampler = DistributedSampler(
+            replay_buffer,
+            num_replicas=dist.get_world_size(), rank=dist.get_rank(),
+            shuffle=True, seed=self.seed, drop_last=True,
+        )
+        return DataLoader(replay_buffer,
+                          batch_size=replay_buffer.sample_batch_size,
+                          sampler=shard_sampler,
+                          pin_memory=pin_memory,
+                          collate_fn=replay_buffer.collate_fn)
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/naive.py b/applications/ChatGPT/chatgpt/trainer/strategies/naive.py
new file mode 100644
index 000000000000..1bb472ae657e
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/naive.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from chatgpt.replay_buffer import ReplayBuffer
+from torch.utils.data import DataLoader
+
+from .base import Strategy
+
+
+class NaiveStrategy(Strategy):
+    """
+        Strategy for single GPU. No parallelism is used, so most hooks pass their argument straight through.
+    """
+
+    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimizer, **kwargs) -> None:
+        loss.backward()    # plain autograd; model and optimizer are not involved
+
+    def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
+        optimizer.step()
+
+    def setup_distributed(self) -> None:
+        """Nothing to set up when no parallelism is used."""
+
+    def setup_model(self, model: nn.Module) -> nn.Module:
+        return model    # used as-is
+
+    def setup_optimizer(self, optimizer: optim.Optimizer, model: nn.Module) -> optim.Optimizer:
+        return optimizer    # used as-is
+
+    def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
+        loader_kwargs = dict(batch_size=replay_buffer.sample_batch_size,
+                             shuffle=True,
+                             drop_last=True,
+                             pin_memory=pin_memory,
+                             collate_fn=replay_buffer.collate_fn)
+        return DataLoader(replay_buffer, **loader_kwargs)
diff --git a/applications/ChatGPT/chatgpt/trainer/utils.py b/applications/ChatGPT/chatgpt/trainer/utils.py
new file mode 100644
index 000000000000..6c9f7f085f8c
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/utils.py
@@ -0,0 +1,5 @@
+import torch.distributed as dist
+
+
+def is_rank_0() -> bool:
+    return dist.get_rank() == 0 if dist.is_initialized() else True    # non-distributed runs count as rank 0
diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
new file mode 100644
index 000000000000..5f9d8698d616
--- /dev/null
+++ b/applications/ChatGPT/examples/README.md
@@ -0,0 +1,105 @@
+# Examples
+
+## Install requirements
+
+```shell
+pip install -r requirements.txt
+```
+
+## Train with dummy prompt data
+
+This script supports 3 strategies:
+
+- naive
+- ddp
+- colossalai
+
+It uses randomly generated prompt data.
+
+The naive strategy only supports single-GPU training:
+
+```shell
+python train_dummy.py --strategy naive
+# display cli help
+python train_dummy.py -h
+```
+
+The DDP strategy and the ColossalAI strategy support multi-GPU training:
+
+```shell
+# run DDP on 2 GPUs
+torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy ddp
+# run ColossalAI on 2 GPUs
+torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy colossalai
+```
+
+## Train with real prompt data
+
+We use [awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts) as example dataset. It is a small dataset with hundreds of prompts.
+
+You should download `prompts.csv` first.
+
+This script also supports 3 strategies.
+
+```shell
+# display cli help
+python train_prompts.py -h
+# run naive on 1 GPU
+python train_prompts.py prompts.csv --strategy naive
+# run DDP on 2 GPUs
+torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy ddp
+# run ColossalAI on 2 GPUs
+torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai
+```
+
+## Train the reward model
+We use [rm-static](https://huggingface.co/datasets/Dahoas/rm-static) as the dataset to train our reward model. It is a dataset of chosen and rejected responses to the same prompt.
+
+You can download the dataset from huggingface automatically.
+
+Use the following commands to train your reward model.
+
+```shell
+# Naive reward model training
+python train_reward_model.py --pretrain <your model path>
+# if to use LoRA
+python train_reward_model.py --pretrain <your model path> --lora_rank 16
+```
+
+## Supported Models
+
+### GPT
+- [ ]  GPT2-S (s)
+- [ ]  GPT2-M (m)
+- [ ]  GPT2-L (l)
+- [ ]  GPT2-XL (xl)
+- [ ]  GPT2-4B (4b)
+- [ ]  GPT2-6B (6b)
+- [ ]  GPT2-8B (8b)
+- [ ]  GPT2-10B (10b)
+- [ ]  GPT2-12B (12b)
+- [ ]  GPT2-15B (15b)
+- [ ]  GPT2-18B (18b)
+- [ ]  GPT2-20B (20b)
+- [ ]  GPT2-24B (24b)
+- [ ]  GPT2-28B (28b)
+- [ ]  GPT2-32B (32b)
+- [ ]  GPT2-36B (36b)
+- [ ]  GPT2-40B (40b)
+- [ ]  GPT3 (175b)
+
+### BLOOM
+- [x] [BLOOM-560m](https://huggingface.co/bigscience/bloom-560m)
+- [x] [BLOOM-1b1](https://huggingface.co/bigscience/bloom-1b1)
+- [ ] [BLOOM-3b](https://huggingface.co/bigscience/bloom-3b)
+- [ ] [BLOOM-7b](https://huggingface.co/bigscience/bloomz-7b1)
+- [ ] BLOOM-175b
+
+### OPT
+- [x] [OPT-125M](https://huggingface.co/facebook/opt-125m)
+- [x] [OPT-350M](https://huggingface.co/facebook/opt-350m)
+- [ ] [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b)
+- [ ] [OPT-2.7B](https://huggingface.co/facebook/opt-2.7b)
+- [ ] [OPT-6.7B](https://huggingface.co/facebook/opt-6.7b)
+- [ ] [OPT-13B](https://huggingface.co/facebook/opt-13b)
+- [ ] [OPT-30B](https://huggingface.co/facebook/opt-30b)
diff --git a/applications/ChatGPT/examples/requirements.txt b/applications/ChatGPT/examples/requirements.txt
new file mode 100644
index 000000000000..6c5dac292486
--- /dev/null
+++ b/applications/ChatGPT/examples/requirements.txt
@@ -0,0 +1 @@
+pandas>=1.4.1
diff --git a/applications/ChatGPT/examples/test_ci.sh b/applications/ChatGPT/examples/test_ci.sh
new file mode 100755
index 000000000000..c4a5ead1d1d3
--- /dev/null
+++ b/applications/ChatGPT/examples/test_ci.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+set -xue
+
+if [ -z "${PROMPT_PATH:-}" ]; then  # ':-' default keeps 'set -u' from aborting before the message when the variable is unset
+    echo "Please set \$PROMPT_PATH to the path to prompts csv."
+    exit 1
+fi
+
+BASE=$(realpath $(dirname $0))
+
+export OMP_NUM_THREADS=8
+
+# install requirements
+pip install -r ${BASE}/requirements.txt
+
+# train dummy
+python ${BASE}/train_dummy.py --strategy naive --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --train_batch_size 2
+for strategy in ddp colossalai_gemini colossalai_zero2; do
+    torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py --strategy ${strategy} --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --train_batch_size 2
+done
+
+# train prompts
+python ${BASE}/train_prompts.py $PROMPT_PATH --strategy naive --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3
+for strategy in ddp colossalai_gemini colossalai_zero2; do
+    torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH --strategy ${strategy} --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --train_batch_size 2
+done
diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
new file mode 100644
index 000000000000..313be2c3b841
--- /dev/null
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -0,0 +1,121 @@
+import argparse
+from copy import deepcopy
+
+import torch
+from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
+from chatgpt.nn.generation_utils import (
+    bloom_prepare_inputs_fn,
+    gpt_prepare_inputs_fn,
+    opt_prepare_inputs_fn,
+    update_model_kwargs_fn,
+)
+from chatgpt.trainer import PPOTrainer
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from torch.optim import Adam
+from transformers import AutoTokenizer, BloomTokenizerFast
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+from colossalai.nn.optimizer import HybridAdam
+
+
+def preprocess_batch(samples):
+    input_ids = torch.stack(samples)
+    attention_mask = torch.ones_like(input_ids, dtype=torch.long)
+    return {'input_ids': input_ids, 'attention_mask': attention_mask}
+
+
+def main(args):
+    # configure strategy
+    if args.strategy == 'naive':
+        strategy = NaiveStrategy()
+    elif args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    # configure model
+    with strategy.model_init_context():
+        if args.model == 'gpt2':
+            actor = GPTActor().cuda()
+            critic = GPTCritic().cuda()
+        elif args.model == 'bloom':
+            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+            critic = BLOOMCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        elif args.model == 'opt':
+            actor = OPTActor().cuda()
+            critic = OPTCritic().cuda()
+        else:
+            raise ValueError(f'Unsupported model "{args.model}"')
+
+        initial_model = deepcopy(actor).cuda()
+        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda()
+
+    # configure optimizer
+    if args.strategy.startswith('colossalai'):
+        actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
+        critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
+    else:
+        actor_optim = Adam(actor.parameters(), lr=5e-6)
+        critic_optim = Adam(critic.parameters(), lr=5e-6)
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.eos_token
+        prepare_inputs_fn = gpt_prepare_inputs_fn
+    elif args.model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
+        tokenizer.pad_token = tokenizer.eos_token
+        prepare_inputs_fn = bloom_prepare_inputs_fn
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+        prepare_inputs_fn = opt_prepare_inputs_fn
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    # configure trainer
+    trainer = PPOTrainer(strategy,
+                         actor,
+                         critic,
+                         reward_model,
+                         initial_model,
+                         actor_optim,
+                         critic_optim,
+                         max_epochs=args.max_epochs,
+                         train_batch_size=args.train_batch_size,
+                         tokenizer=preprocess_batch,
+                         max_length=128,
+                         do_sample=True,
+                         temperature=1.0,
+                         top_k=50,
+                         pad_token_id=tokenizer.pad_token_id,
+                         eos_token_id=tokenizer.eos_token_id,
+                         prepare_inputs_fn=prepare_inputs_fn,
+                         update_model_kwargs_fn=update_model_kwargs_fn)
+
+    random_prompts = torch.randint(tokenizer.vocab_size, (1000, 64), device=torch.cuda.current_device())
+    trainer.fit(random_prompts,
+                num_episodes=args.num_episodes,
+                max_timesteps=args.max_timesteps,
+                update_timesteps=args.update_timesteps)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='naive')
+    parser.add_argument('--model', type=str, default='gpt2', choices=['gpt2', 'bloom', 'opt'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--num_episodes', type=int, default=50)
+    parser.add_argument('--max_timesteps', type=int, default=10)
+    parser.add_argument('--update_timesteps', type=int, default=10)
+    parser.add_argument('--max_epochs', type=int, default=5)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    args = parser.parse_args()
+    main(args)
diff --git a/applications/ChatGPT/examples/train_dummy.sh b/applications/ChatGPT/examples/train_dummy.sh
new file mode 100755
index 000000000000..559d338ee021
--- /dev/null
+++ b/applications/ChatGPT/examples/train_dummy.sh
@@ -0,0 +1,18 @@
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+        | tail -n +2 \
+        | nl -v 0 \
+        | tee /dev/tty \
+        | sort -g -k 2 \
+        | awk '{print $1}' \
+        | head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 1
+
+python train_dummy.py --model bloom --pretrain '/data2/users/lczht/bloom-560m' --lora_rank 16
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
new file mode 100644
index 000000000000..994b10fe0734
--- /dev/null
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -0,0 +1,113 @@
+import argparse
+from copy import deepcopy
+
+import pandas as pd
+from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
+from chatgpt.nn.generation_utils import gpt_prepare_inputs_fn, update_model_kwargs_fn
+from chatgpt.trainer import PPOTrainer
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from torch.optim import Adam
+from transformers import AutoTokenizer, BloomTokenizerFast
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+from colossalai.nn.optimizer import HybridAdam
+
+
+def main(args):
+    # configure strategy
+    if args.strategy == 'naive':
+        strategy = NaiveStrategy()
+    elif args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    # configure model
+    with strategy.model_init_context():
+        if args.model == 'gpt2':
+            actor = GPTActor().cuda()
+            critic = GPTCritic().cuda()
+        elif args.model == 'bloom':
+            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+            critic = BLOOMCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        elif args.model == 'opt':
+            actor = OPTActor(lora_rank=args.lora_rank).cuda()
+            critic = OPTCritic(lora_rank=args.lora_rank).cuda()
+        else:
+            raise ValueError(f'Unsupported model "{args.model}"')
+
+        initial_model = deepcopy(actor)
+        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda()
+
+    # configure optimizer
+    if args.strategy.startswith('colossalai'):
+        actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
+        critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
+    else:
+        actor_optim = Adam(actor.parameters(), lr=5e-6)
+        critic_optim = Adam(critic.parameters(), lr=5e-6)
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    dataset = pd.read_csv(args.prompt_path)['prompt']
+
+    def tokenize_fn(texts):
+        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding=True, truncation=True)
+        return {k: v.cuda() for k, v in batch.items()}
+
+    # configure trainer
+    trainer = PPOTrainer(strategy,
+                         actor,
+                         critic,
+                         reward_model,
+                         initial_model,
+                         actor_optim,
+                         critic_optim,
+                         max_epochs=args.max_epochs,
+                         train_batch_size=args.train_batch_size,
+                         tokenizer=tokenize_fn,
+                         max_length=128,
+                         do_sample=True,
+                         temperature=1.0,
+                         top_k=50,
+                         pad_token_id=tokenizer.pad_token_id,
+                         eos_token_id=tokenizer.eos_token_id,
+                         prepare_inputs_fn=gpt_prepare_inputs_fn,
+                         update_model_kwargs_fn=update_model_kwargs_fn)
+
+    trainer.fit(dataset,
+                num_episodes=args.num_episodes,
+                max_timesteps=args.max_timesteps,
+                update_timesteps=args.update_timesteps)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('prompt_path')
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='naive')
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--num_episodes', type=int, default=10)
+    parser.add_argument('--max_timesteps', type=int, default=10)
+    parser.add_argument('--update_timesteps', type=int, default=10)
+    parser.add_argument('--max_epochs', type=int, default=5)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    args = parser.parse_args()
+    main(args)
diff --git a/applications/ChatGPT/examples/train_prompts.sh b/applications/ChatGPT/examples/train_prompts.sh
new file mode 100755
index 000000000000..0b82d3f1cd5e
--- /dev/null
+++ b/applications/ChatGPT/examples/train_prompts.sh
@@ -0,0 +1,18 @@
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+        | tail -n +2 \
+        | nl -v 0 \
+        | tee /dev/tty \
+        | sort -g -k 2 \
+        | awk '{print $1}' \
+        | head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 1
+
+python train_prompts.py prompts.csv --pretrain '/data2/users/lczht/bloom-560m' --lora_rank 16
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
new file mode 100644
index 000000000000..fd78a2ac6325
--- /dev/null
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -0,0 +1,53 @@
+import argparse
+
+import loralib as lora
+import torch
+from chatgpt.dataset import RewardDataset
+from chatgpt.nn import BLOOMRM
+from chatgpt.trainer import RewardModelTrainer
+from datasets import load_dataset
+from transformers import BloomTokenizerFast
+
+
+def train(args):
+    tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
+    tokenizer.pad_token = tokenizer.eos_token
+    model = BLOOMRM(pretrained=args.pretrain)
+
+    model.cuda()
+
+    max_len = 1024
+
+    # prepare for data and dataset
+    data = load_dataset(args.dataset)
+    train_data = data["train"]
+    eval_data = data['test']
+    train_dataset = RewardDataset(train_data, tokenizer, max_len)
+    eval_dataset = RewardDataset(eval_data, tokenizer, max_len)
+
+    # batch_size here is expected to be C(k,2), where k is the number of responses per prompt.
+    # Limited by the format of the 'Dahoas/rm-static' dataset, it is best to use a batch_size of 1.
+    trainer = RewardModelTrainer(model=model,
+                                 train_dataset=train_dataset,
+                                 eval_dataset=eval_dataset,
+                                 batch_size=args.batch_size,
+                                 num_epochs=args.max_epochs)
+
+    trainer.fit(use_lora=args.lora_rank)
+
+    if args.lora_rank > 0:
+        torch.save({'model_state_dict': lora.lora_state_dict(trainer.model)}, args.save_path)
+    else:
+        torch.save(trainer.model, args.save_path)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--dataset', type=str, default='Dahoas/rm-static')
+    parser.add_argument('--save_path', type=str, default='rm_ckpt.pth')
+    parser.add_argument('--max_epochs', type=int, default=2)
+    parser.add_argument('--batch_size', type=int, default=1)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    args = parser.parse_args()
+    train(args)
diff --git a/applications/ChatGPT/examples/train_rm.sh b/applications/ChatGPT/examples/train_rm.sh
new file mode 100755
index 000000000000..bf46d7e43ff2
--- /dev/null
+++ b/applications/ChatGPT/examples/train_rm.sh
@@ -0,0 +1,18 @@
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+        | tail -n +2 \
+        | nl -v 0 \
+        | tee /dev/tty \
+        | sort -g -k 2 \
+        | awk '{print $1}' \
+        | head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 1
+
+python train_reward_model.py --pretrain '/data2/users/lczht/bloom-560m' --lora_rank 16
diff --git a/applications/ChatGPT/pytest.ini b/applications/ChatGPT/pytest.ini
new file mode 100644
index 000000000000..01e5cd217c5d
--- /dev/null
+++ b/applications/ChatGPT/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+markers =
+    cpu: tests which can run on CPU
+    gpu: tests which require a single GPU
+    dist: tests which are run in a multi-GPU or multi-machine environment
+    experiment: tests for experimental features
diff --git a/applications/ChatGPT/requirements/requirements-test.txt b/applications/ChatGPT/requirements/requirements-test.txt
new file mode 100644
index 000000000000..e079f8a6038d
--- /dev/null
+++ b/applications/ChatGPT/requirements/requirements-test.txt
@@ -0,0 +1 @@
+pytest
diff --git a/applications/ChatGPT/requirements/requirements.txt b/applications/ChatGPT/requirements/requirements.txt
new file mode 100644
index 000000000000..87f6a52cc0e2
--- /dev/null
+++ b/applications/ChatGPT/requirements/requirements.txt
@@ -0,0 +1,6 @@
+transformers>=4.20.1
+tqdm
+datasets
+loralib
+colossalai>=0.2.4
+torch
diff --git a/applications/ChatGPT/setup.py b/applications/ChatGPT/setup.py
new file mode 100644
index 000000000000..f9607190ac62
--- /dev/null
+++ b/applications/ChatGPT/setup.py
@@ -0,0 +1,42 @@
+from setuptools import find_packages, setup
+
+
+def fetch_requirements(path):
+    with open(path, 'r') as fd:
+        return [r.strip() for r in fd.readlines()]
+
+
+def fetch_readme():
+    with open('README.md', encoding='utf-8') as f:
+        return f.read()
+
+
+def fetch_version():
+    with open('version.txt', 'r') as f:
+        return f.read().strip()
+
+
+setup(
+    name='chatgpt',
+    version=fetch_version(),
+    packages=find_packages(exclude=(
+        'tests',
+        'benchmarks',
+        'requirements',
+        '*.egg-info',
+    )),
+    description='A RLFH implementation (ChatGPT) powered by ColossalAI',
+    long_description=fetch_readme(),
+    long_description_content_type='text/markdown',
+    license='Apache Software License 2.0',
+    url='https://github.com/hpcaitech/ChatGPT',
+    install_requires=fetch_requirements('requirements/requirements.txt'),
+    python_requires='>=3.6',
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'License :: OSI Approved :: Apache Software License',
+        'Environment :: GPU :: NVIDIA CUDA',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+        'Topic :: System :: Distributed Computing',
+    ],
+)
diff --git a/applications/ChatGPT/tests/__init__.py b/applications/ChatGPT/tests/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ChatGPT/tests/test_data.py b/applications/ChatGPT/tests/test_data.py
new file mode 100644
index 000000000000..b0a9433c2e4f
--- /dev/null
+++ b/applications/ChatGPT/tests/test_data.py
@@ -0,0 +1,117 @@
+import os
+from copy import deepcopy
+from functools import partial
+
+import pytest
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from chatgpt.experience_maker import NaiveExperienceMaker
+from chatgpt.nn import GPTActor, GPTCritic, RewardModel
+from chatgpt.replay_buffer import NaiveReplayBuffer
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy
+
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.utils import free_port
+
+
+def get_data(batch_size: int, seq_len: int = 10) -> dict:
+    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
+    attention_mask = torch.ones_like(input_ids)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def gather_and_equal(tensor: torch.Tensor) -> bool:
+    world_size = dist.get_world_size()
+    outputs = [torch.empty_like(tensor) for _ in range(world_size)]
+    dist.all_gather(outputs, tensor.contiguous())
+    for t in outputs[1:]:
+        if not torch.equal(outputs[0], t):
+            return False
+    return True
+
+
+def run_test_data(strategy):
+    EXPERINCE_BATCH_SIZE = 4
+    SAMPLE_BATCH_SIZE = 2
+
+    if strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif strategy == 'colossalai':
+        strategy = ColossalAIStrategy(placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{strategy}"')
+
+    actor = GPTActor().cuda()
+    critic = GPTCritic().cuda()
+
+    initial_model = deepcopy(actor)
+    reward_model = RewardModel(deepcopy(critic.model)).cuda()
+
+    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model)
+    replay_buffer = NaiveReplayBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
+
+    # experience of all ranks should be the same
+    for _ in range(2):
+        data = get_data(EXPERINCE_BATCH_SIZE)
+        assert gather_and_equal(data['input_ids'])
+        assert gather_and_equal(data['attention_mask'])
+        experience = experience_maker.make_experience(**data,
+                                                      do_sample=True,
+                                                      max_length=16,
+                                                      eos_token_id=50256,
+                                                      pad_token_id=50256)
+        assert gather_and_equal(experience.sequences)
+        assert gather_and_equal(experience.action_log_probs)
+        assert gather_and_equal(experience.values)
+        assert gather_and_equal(experience.reward)
+        assert gather_and_equal(experience.advantages)
+        assert gather_and_equal(experience.action_mask)
+        assert gather_and_equal(experience.attention_mask)
+        replay_buffer.append(experience)
+
+    # replay buffer's data should be the same
+    buffer_size = torch.tensor([len(replay_buffer)], device='cuda')
+    assert gather_and_equal(buffer_size)
+    for item in replay_buffer.items:
+        assert gather_and_equal(item.sequences)
+        assert gather_and_equal(item.action_log_probs)
+        assert gather_and_equal(item.values)
+        assert gather_and_equal(item.reward)
+        assert gather_and_equal(item.advantages)
+        assert gather_and_equal(item.action_mask)
+        assert gather_and_equal(item.attention_mask)
+
+    # dataloader of each rank should have the same size and different batch
+    dataloader = strategy.setup_dataloader(replay_buffer)
+    dataloader_size = torch.tensor([len(dataloader)], device='cuda')
+    assert gather_and_equal(dataloader_size)
+    for experience in dataloader:
+        assert not gather_and_equal(experience.sequences)
+        assert not gather_and_equal(experience.action_log_probs)
+        assert not gather_and_equal(experience.values)
+        assert not gather_and_equal(experience.reward)
+        assert not gather_and_equal(experience.advantages)
+        # action mask and attention mask may be same
+
+
+def run_dist(rank, world_size, port, strategy):
+    os.environ['RANK'] = str(rank)
+    os.environ['LOCAL_RANK'] = str(rank)
+    os.environ['WORLD_SIZE'] = str(world_size)
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = str(port)
+    run_test_data(strategy)
+
+
+@pytest.mark.dist
+@pytest.mark.parametrize('world_size', [2])
+@pytest.mark.parametrize('strategy', ['ddp', 'colossalai'])
+@rerun_if_address_is_in_use()
+def test_data(world_size, strategy):
+    run_func = partial(run_dist, world_size=world_size, port=free_port(), strategy=strategy)
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_data(2, 'colossalai')
diff --git a/applications/ChatGPT/version.txt b/applications/ChatGPT/version.txt
new file mode 100644
index 000000000000..6e8bf73aa550
--- /dev/null
+++ b/applications/ChatGPT/version.txt
@@ -0,0 +1 @@
+0.1.0

From 8408c852a69036de73984f981b81874223ae8835 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Tue, 14 Feb 2023 22:48:15 +0800
Subject: [PATCH 309/503] [app] fix ChatGPT requirements (#2704)

---
 applications/ChatGPT/{requirements => }/requirements-test.txt | 0
 applications/ChatGPT/{requirements => }/requirements.txt      | 0
 applications/ChatGPT/setup.py                                 | 3 +--
 3 files changed, 1 insertion(+), 2 deletions(-)
 rename applications/ChatGPT/{requirements => }/requirements-test.txt (100%)
 rename applications/ChatGPT/{requirements => }/requirements.txt (100%)

diff --git a/applications/ChatGPT/requirements/requirements-test.txt b/applications/ChatGPT/requirements-test.txt
similarity index 100%
rename from applications/ChatGPT/requirements/requirements-test.txt
rename to applications/ChatGPT/requirements-test.txt
diff --git a/applications/ChatGPT/requirements/requirements.txt b/applications/ChatGPT/requirements.txt
similarity index 100%
rename from applications/ChatGPT/requirements/requirements.txt
rename to applications/ChatGPT/requirements.txt
diff --git a/applications/ChatGPT/setup.py b/applications/ChatGPT/setup.py
index f9607190ac62..deec10e0c841 100644
--- a/applications/ChatGPT/setup.py
+++ b/applications/ChatGPT/setup.py
@@ -22,7 +22,6 @@ def fetch_version():
     packages=find_packages(exclude=(
         'tests',
         'benchmarks',
-        'requirements',
         '*.egg-info',
     )),
     description='A RLFH implementation (ChatGPT) powered by ColossalAI',
@@ -30,7 +29,7 @@ def fetch_version():
     long_description_content_type='text/markdown',
     license='Apache Software License 2.0',
     url='https://github.com/hpcaitech/ChatGPT',
-    install_requires=fetch_requirements('requirements/requirements.txt'),
+    install_requires=fetch_requirements('requirements.txt'),
     python_requires='>=3.6',
     classifiers=[
         'Programming Language :: Python :: 3',

From 6a8cd687e3bc7da424390a508734976287f4260f Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Tue, 14 Feb 2023 22:48:30 +0800
Subject: [PATCH 310/503] [doc] add ChatGPT (#2703)

---
 README-zh-Hans.md | 28 ++++++++++++++++++++++++++--
 README.md         | 29 +++++++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 4b0ba9c4213b..e16db47f9886 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -3,7 +3,7 @@
 
    [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)
 
-   Colossal-AI: 一个面向大模型时代的通用深度学习系统
+   Colossal-AI: 让AI大模型更低成本、方便易用、高效扩展
 
    <h3> <a href="https://arxiv.org/abs/2110.14883"> 论文 </a> |
    <a href="https://www.colossalai.org/"> 文档 </a> |
@@ -23,10 +23,10 @@
 </div>
 
 ## 新闻
+* [2023/02] [Open source solution replicates ChatGPT training process! Ready to go with only 1.6GB GPU memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
 * [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
 * [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
 * [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
-* [2022/10] [Embedding Training With 1% GPU Memory and 100 Times Less Budget for Super-Large Recommendation Model](https://www.hpc-ai.tech/blog/embedding-training-with-1-gpu-memory-and-10-times-less-budget-an-open-source-solution-for)
 * [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
 
 
@@ -64,6 +64,7 @@
 <li>
    <a href="#Colossal-AI-in-the-Real-World">Colossal-AI 成功案例</a>
    <ul>
+     <li><a href="#ChatGPT">ChatGPT: 低成本复现ChatGPT完整流程</a></li>
      <li><a href="#AIGC">AIGC: 加速 Stable Diffusion</a></li>
      <li><a href="#生物医药">生物医药: 加速AlphaFold蛋白质结构预测</a></li>
    </ul>
@@ -209,6 +210,29 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 
 ## Colossal-AI 成功案例
+### ChatGPT
+低成本复现[ChatGPT](https://openai.com/blog/chatgpt/)完整流程 [[代码]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/ChatGPT) [[博客]](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
+<p id="ChatGPT_scaling" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
+</p>
+
+- 最高可提升单机训练速度7.73倍，单卡推理速度1.42倍
+
+<p id="ChatGPT-1GPU" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=800/>
+</p>
+
+- 单卡模型容量最多提升10.3倍
+- 最小demo训练流程最低仅需1.62GB显存 (任意消费级GPU)
+
+<p id="inference" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=800/>
+</p>
+
+- 提升单卡的微调模型容量3.7倍
+- 同时保持高速运行
+
+<p align="right">(<a href="#top">返回顶端</a>)</p>
 
 ### AIGC
 加速AIGC(AI内容生成)模型，如[Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) 和 [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion)
diff --git a/README.md b/README.md
index 703e3f3bf9c6..e4ffca890918 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
    [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)
 
-   Colossal-AI: A Unified Deep Learning System for Big Model Era
+   Colossal-AI: Make big AI models cheaper, easier, and scalable
 
    <h3> <a href="https://arxiv.org/abs/2110.14883"> Paper </a> |
    <a href="https://www.colossalai.org/"> Documentation </a> |
@@ -24,10 +24,10 @@
 </div>
 
 ## Latest News
+* [2023/02] [Open source solution replicates ChatGPT training process! Ready to go with only 1.6GB GPU memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
 * [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
 * [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
 * [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
-* [2022/10] [Embedding Training With 1% GPU Memory and 100 Times Less Budget for Super-Large Recommendation Model](https://www.hpc-ai.tech/blog/embedding-training-with-1-gpu-memory-and-10-times-less-budget-an-open-source-solution-for)
 * [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
 
 ## Table of Contents
@@ -64,6 +64,7 @@
    <li>
    <a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
    <ul>
+     <li><a href="#ChatGPT">ChatGPT: Low-cost ChatGPT Equivalent Implementation Process</a></li>
      <li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
      <li><a href="#Biomedicine">Biomedicine: Acceleration of AlphaFold Protein Structure</a></li>
    </ul>
@@ -211,6 +212,30 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 <p align="right">(<a href="#top">back to top</a>)</p>
 
 ## Colossal-AI in the Real World
+### ChatGPT
+A low-cost [ChatGPT](https://openai.com/blog/chatgpt/) equivalent implementation process. [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/ChatGPT) [[blog]](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
+<p id="ChatGPT_scaling" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
+</p>
+
+- Up to 7.73 times faster for single server training and 1.42 times faster for single-GPU inference
+
+<p id="ChatGPT-1GPU" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=800/>
+</p>
+
+- Up to 10.3x growth in model capacity on one GPU
+- A mini demo training process requires only 1.62GB of GPU memory (any consumer-grade GPU)
+
+<p id="inference" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=800/>
+</p>
+
+- Increase the capacity of the fine-tuning model by up to 3.7 times on a single GPU
+- Maintain a sufficiently high running speed
+
+<p align="right">(<a href="#top">back to top</a>)</p>
+
 
 ### AIGC
 Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) and [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion).

From 71deddc87f8d8d108fc5198708873b81aefa11b0 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Tue, 14 Feb 2023 22:56:15 +0800
Subject: [PATCH 311/503] [doc] resize figure (#2705)

* [doc] resize figure

* [doc] resize figure
---
 README-zh-Hans.md | 4 ++--
 README.md         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index e16db47f9886..18623d67a920 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -219,14 +219,14 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 - 最高可提升单机训练速度7.73倍，单卡推理速度1.42倍
 
 <p id="ChatGPT-1GPU" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=800/>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=450/>
 </p>
 
 - 单卡模型容量最多提升10.3倍
 - 最小demo训练流程最低仅需1.62GB显存 (任意消费级GPU)
 
 <p id="inference" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=800/>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=600/>
 </p>
 
 - 提升单卡的微调模型容量3.7倍
diff --git a/README.md b/README.md
index e4ffca890918..20a5f2606ca5 100644
--- a/README.md
+++ b/README.md
@@ -221,14 +221,14 @@ A low-cost [ChatGPT](https://openai.com/blog/chatgpt/) equivalent implementation
 - Up to 7.73 times faster for single server training and 1.42 times faster for single-GPU inference
 
 <p id="ChatGPT-1GPU" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=800/>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=450/>
 </p>
 
 - Up to 10.3x growth in model capacity on one GPU
 - A mini demo training process requires only 1.62GB of GPU memory (any consumer-grade GPU)
 
 <p id="inference" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=800/>
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=600/>
 </p>
 
 - Increase the capacity of the fine-tuning model by up to 3.7 times on a single GPU

From 94f000515b3f5700934072c39890f45cf419eebc Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Tue, 14 Feb 2023 23:07:30 +0800
Subject: [PATCH 312/503] [doc] add Quick Preview (#2706)

---
 applications/ChatGPT/README.md | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
index dce59ad4b834..43085f3abfa6 100644
--- a/applications/ChatGPT/README.md
+++ b/applications/ChatGPT/README.md
@@ -1,6 +1,6 @@
-# RLHF - ColossalAI
+# RLHF - Colossal-AI
 
-Implementation of RLHF (Reinforcement Learning with Human Feedback) powered by ColossalAI. It supports distributed training and offloading, which can fit extremly large models.
+Implementation of RLHF (Reinforcement Learning with Human Feedback) powered by Colossal-AI. It supports distributed training and offloading, which can fit extremely large models. More details can be found in the [blog](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt).
 
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/chatgpt.png" width=700/>
@@ -60,6 +60,27 @@ We also support training reward model with true-world data. See `examples/train_
 - [ ] integrate with Ray
 - [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL)
 
+## Quick Preview
+<p id="ChatGPT_scaling" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
+</p>
+
+- Up to 7.73 times faster for single server training and 1.42 times faster for single-GPU inference
+
+<p id="ChatGPT-1GPU" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=450/>
+</p>
+
+- Up to 10.3x growth in model capacity on one GPU
+- A mini demo training process requires only 1.62GB of GPU memory (any consumer-grade GPU)
+
+<p id="inference" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=600/>
+</p>
+
+- Increase the capacity of the fine-tuning model by up to 3.7 times on a single GPU
+- Maintain a sufficiently high running speed
+
 ## Citations
 
 ```bibtex

From d701ef81b1ef59b7e3d09106a9482b9d9e19b2dd Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 15 Feb 2023 09:39:44 +0800
Subject: [PATCH 313/503] Automated submodule synchronization (#2707)

Co-authored-by: github-actions <github-actions@github.com>
---
 examples/tutorial/fastfold/FastFold | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tutorial/fastfold/FastFold b/examples/tutorial/fastfold/FastFold
index 0188361b6e2b..f05e712982ae 160000
--- a/examples/tutorial/fastfold/FastFold
+++ b/examples/tutorial/fastfold/FastFold
@@ -1 +1 @@
-Subproject commit 0188361b6e2b46bca61d37af5674eacf7ca9947f
+Subproject commit f05e712982aeba6a32a9b3d1ee4dee6492426cec

From 4ac8bfb07285a417dba3d302e477d6e57b0b6d5f Mon Sep 17 00:00:00 2001
From: CZYCW <czyczf@163.com>
Date: Wed, 15 Feb 2023 09:40:08 +0800
Subject: [PATCH 314/503] [NFC] polish
 colossalai/engine/gradient_handler/utils.py code style (#2708)

---
 colossalai/engine/gradient_handler/utils.py | 59 +++++++++++----------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/colossalai/engine/gradient_handler/utils.py b/colossalai/engine/gradient_handler/utils.py
index e92044b47279..fca5f2ec9da9 100644
--- a/colossalai/engine/gradient_handler/utils.py
+++ b/colossalai/engine/gradient_handler/utils.py
@@ -1,29 +1,30 @@
-import torch.distributed as dist
-import torch.nn as nn
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-from typing import Iterable
-
-
-def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None):
-    # get communication world size
-    comm_size = dist.get_world_size(group)
-    # bucketize and all-reduce
-    buckets = {}
-    # Pack the buckets.
-    for param in param_list:
-        if param.requires_grad and param.grad is not None:
-            tp = param.data.type()
-            if tp not in buckets:
-                buckets[tp] = []
-            buckets[tp].append(param)
-
-    # For each bucket, all-reduce and copy all-reduced grads.
-    for tp in buckets:
-        bucket = buckets[tp]
-        grads = [param.grad.data for param in bucket]
-        coalesced = _flatten_dense_tensors(grads)
-        coalesced /= comm_size
-
-        dist.all_reduce(coalesced, group=group)
-        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
-            buf.copy_(synced)
+from typing import Iterable
+
+import torch.distributed as dist
+import torch.nn as nn
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+
+def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None):
+    # get communication world size
+    comm_size = dist.get_world_size(group)
+    # bucketize and all-reduce
+    buckets = {}
+    # Pack the buckets.
+    for param in param_list:
+        if param.requires_grad and param.grad is not None:
+            tp = param.data.type()
+            if tp not in buckets:
+                buckets[tp] = []
+            buckets[tp].append(param)
+
+    # For each bucket, all-reduce and copy all-reduced grads.
+    for tp in buckets:
+        bucket = buckets[tp]
+        grads = [param.grad.data for param in bucket]
+        coalesced = _flatten_dense_tensors(grads)
+        coalesced /= comm_size
+
+        dist.all_reduce(coalesced, group=group)
+        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+            buf.copy_(synced)

From 7fa6be49d2ac1eae2eda60f150597f0d3998ddf7 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 15 Feb 2023 09:43:29 +0800
Subject: [PATCH 315/503] [autoparallel] test compatibility for gemini and auto
 parallel (#2700)

---
 .../passes/runtime_preparation_pass.py        |  10 +-
 .../test_compatibility_with_ddp.py            |  98 ++++++++++++++++
 .../test_compatibility_with_gemini.py         | 108 ++++++++++++++++++
 3 files changed, 212 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py

diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index 897602ce1d24..ecf3f1f18de5 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -377,8 +377,9 @@ def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, o
                     # TODO: build a ColoParamter class to manager the distributed parameters
                     # we could use .data here, because all the operations just happen before the real training
                     # loop, so we don't need to track these operations in the autograd graph.
-                    param.data = shape_consistency_manager.apply_for_autoparallel_runtime(
-                        param.data, param.sharding_spec, target_sharding_spec).detach().clone()
+                    param = torch.nn.Parameter(
+                        shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec,
+                                                                                 target_sharding_spec).detach().clone())
 
                 setattr(target_module, name, param)
                 comm_actions = node.best_strategy.communication_actions
@@ -432,8 +433,9 @@ def hook_fn(grad):
                 # TODO: build a ColoParamter class to manager the distributed parameters
                 # we could use .data here, because all the operations just happen before the real training
                 # loop, so we don't need to track these operations in the autograd graph.
-                target.data = shape_consistency_manager.apply_for_autoparallel_runtime(
-                    target.data, target.sharding_spec, target_sharding_spec).detach().clone()
+                target = torch.nn.Parameter(
+                    shape_consistency_manager.apply_for_autoparallel_runtime(target.data, target.sharding_spec,
+                                                                             target_sharding_spec).detach().clone())
 
             assert hasattr(target_module, atoms[-1])
             setattr(target_module, atoms[-1], target)
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py
new file mode 100644
index 000000000000..365981f105f0
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py
@@ -0,0 +1,98 @@
+import copy
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+from colossalai.auto_parallel.tensor_shard.initialize import initialize_model
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing import assert_close, rerun_if_address_is_in_use
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.utils import free_port
+
+
+class MLP(torch.nn.Module):
+
+    def __init__(self, in_features):
+        super().__init__()
+        self.linear_1 = torch.nn.Linear(in_features, 4 * in_features, bias=False)
+        self.linear_2 = torch.nn.Linear(4 * in_features, in_features, bias=False)
+
+    def forward(self, x):
+        x = self.linear_1(x)
+        x = self.linear_2(x)
+
+        return x
+
+
+def check_compatibility_with_ddp(rank, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    model = MLP(4).cuda()
+    input = torch.rand(4, 4).cuda()
+    output_compare = model(input)
+    loss_compare = output_compare.sum()
+    loss_compare.backward()
+    grad_compare = copy.deepcopy(model.linear_1.weight.grad)
+
+    physical_mesh_id = torch.arange(0, 4)
+    mesh_shape = (2, 2)
+    # [[0, 1]
+    #  [2, 3]]
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+    meta_args = {'x': torch.rand(4, 4).to('meta')}
+    gm, solution = initialize_model(model,
+                                    meta_args=meta_args,
+                                    device_mesh=device_mesh,
+                                    return_solution=True,
+                                    solver_preference='tp',
+                                    shard_option='shard_last_axis')
+
+    msg = '| TP strategy combination chosen by auto-parallel solver |'
+    msg_length = len(msg)
+    if rank == 0:
+        print('=' * msg_length)
+        print(msg)
+        print('=' * msg_length)
+        for strategy in solution:
+            print(strategy)
+        print('=' * msg_length)
+
+    dp_process_group = None
+    for (ranks, process_group_handle) in device_mesh.process_groups_dict[0]:
+        if rank in ranks:
+            dp_process_group = process_group_handle
+    assert dp_process_group is not None
+    gm = DDP(gm, process_group=dp_process_group)
+    output = gm(input)
+
+    assert_close(output, output_compare)
+    print(f'output on rank{rank} is correct')
+    loss = output.sum()
+
+    loss.backward()
+
+    if rank in (0, 2):
+        assert_close(gm.module.module.linear_1.weight.grad, grad_compare.narrow(0, 0, 8))
+
+    if rank in (1, 3):
+        assert_close(gm.module.module.linear_1.weight.grad, grad_compare.narrow(0, 8, 8))
+
+    print(f'gradient on rank{rank} is correct')
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_compatibility_with_ddp():
+    world_size = 4
+    run_func = partial(check_compatibility_with_ddp, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_compatibility_with_ddp()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
new file mode 100644
index 000000000000..b4080c54599a
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
@@ -0,0 +1,108 @@
+import copy
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+from colossalai.auto_parallel.tensor_shard.initialize import initialize_model
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
+from colossalai.tensor.process_group import ProcessGroup
+from colossalai.testing import assert_close, rerun_if_address_is_in_use
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.utils import free_port, get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext, post_process_colo_init_ctx
+
+
+class MLP(torch.nn.Module):
+
+    def __init__(self, in_features):
+        super().__init__()
+        self.linear_1 = torch.nn.Linear(in_features, 4 * in_features, bias=False)
+        self.linear_2 = torch.nn.Linear(4 * in_features, in_features, bias=False)
+
+    def forward(self, x):
+        x = self.linear_1(x)
+        x = self.linear_2(x)
+
+        return x
+
+
+def check_auto_parallel_with_gemini(rank, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    model = MLP(4).half().cuda()
+
+    input = torch.rand(4, 4).half().cuda()
+    output_compare = model(input)
+    loss_compare = output_compare.sum()
+    loss_compare.backward()
+    grad_compare = copy.deepcopy(model.linear_1.weight.grad)
+
+    physical_mesh_id = torch.arange(0, 4)
+    mesh_shape = (2, 2)
+    # [[0, 1]
+    #  [2, 3]]
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+    meta_args = {'x': torch.rand(4, 4).half().to('meta')}
+    gm, solution = initialize_model(model,
+                                    meta_args=meta_args,
+                                    device_mesh=device_mesh,
+                                    return_solution=True,
+                                    solver_preference='tp',
+                                    shard_option='shard_last_axis')
+
+    if rank == 0:
+        msg = '| TP strategy combination chosen by auto-parallel solver |'
+        msg_length = len(msg)
+        print('=' * msg_length)
+        print(msg)
+        print('=' * msg_length)
+        for strategy in solution:
+            print(strategy)
+        print('=' * msg_length)
+
+    dp_process_group = ProcessGroup(rank=rank, ranks=[0, 1, 2, 3], tp_degree=2, dp_degree=2)
+    gemini_config = dict(strict_ddp_mode=False,
+                         device=get_current_device(),
+                         placement_policy='cpu',
+                         pin_memory=True,
+                         search_range_mb=128)
+
+    post_process_colo_init_ctx(gm, device=get_current_device(), default_pg=dp_process_group)
+    gm = zero_model_wrapper(gm, zero_stage=3, gemini_config=gemini_config)
+    optimizer = HybridAdam(gm.parameters(), betas=(0, 0))
+    optimizer = zero_optim_wrapper(gm, optimizer, initial_scale=1)
+    output = gm(input)
+    assert_close(output, output_compare)
+    print(f'output on rank{rank} is correct')
+    loss = output.sum()
+    optimizer.zero_grad()
+    optimizer.backward(loss)
+    optimizer.step()
+
+    if rank in (0, 2):
+        assert_close(list(optimizer.optim.state.values())[0]['exp_avg'].half(), grad_compare.narrow(0, 0, 8).flatten())
+
+    if rank in (1, 3):
+        assert_close(list(optimizer.optim.state.values())[0]['exp_avg'].half(), grad_compare.narrow(0, 8, 8).flatten())
+
+    print(f'gradient on rank{rank} is correct')
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_auto_parallel_with_gemini():
+    world_size = 4
+    run_func = partial(check_auto_parallel_with_gemini, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_auto_parallel_with_gemini()

From 0b2a738393bbc4b37efd5dfe9e60f33587357749 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 15 Feb 2023 09:54:32 +0800
Subject: [PATCH 316/503] [autoparallel] remove deprecated codes (#2664)

---
 .../tensor_shard/deprecated/__init__.py       |   6 -
 .../tensor_shard/deprecated/_utils.py         | 142 ----
 .../tensor_shard/deprecated/constants.py      |  83 --
 .../tensor_shard/deprecated/cost_graph.py     | 172 ----
 .../tensor_shard/deprecated/graph_analysis.py | 163 ----
 .../deprecated/op_handler/__init__.py         |  15 -
 .../op_handler/batch_norm_handler.py          | 492 ------------
 .../deprecated/op_handler/bcast_op_handler.py | 552 -------------
 .../deprecated/op_handler/conv_handler.py     | 609 --------------
 .../deprecated/op_handler/dot_handler.py      | 756 ------------------
 .../op_handler/embedding_handler.py           | 179 -----
 .../op_handler/layer_norm_handler.py          | 241 ------
 .../deprecated/op_handler/operator_handler.py | 149 ----
 .../deprecated/op_handler/reshape_handler.py  |  89 ---
 .../op_handler/strategy_generator.py          |  45 --
 .../op_handler/unary_elementwise_handler.py   |  88 --
 .../deprecated/op_handler/where_handler.py    | 186 -----
 .../tensor_shard/deprecated/options.py        |  11 -
 .../deprecated/sharding_strategy.py           |  91 ---
 .../tensor_shard/deprecated/solver.py         | 469 -----------
 .../deprecated/strategies_constructor.py      | 426 ----------
 .../test_deprecated_cost_graph.py             |  96 ---
 .../test_deprecated_batch_norm_handler.py     | 118 ---
 .../test_deprecated_bcast_handler.py          |  75 --
 .../test_deprecated_bcast_matmul.py           |  54 --
 .../test_deprecated_conv_handler.py           |  90 ---
 .../test_deprecated_dot_handler.py            |  83 --
 .../test_deprecated_layer_norm_handler.py     |  70 --
 .../test_deprecated_reshape_handler.py        |  59 --
 .../test_deprecated_where_handler.py          |  66 --
 .../test_deprecated_shape_consistency_pass.py |  86 --
 .../test_deprecated/test_deprecated_solver.py |  79 --
 .../test_deprecated_solver_with_gpt.py        |  81 --
 .../test_deprecated_solver_with_mlp.py        |  94 ---
 .../test_deprecated_strategies_constructor.py | 103 ---
 35 files changed, 6118 deletions(-)
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/__init__.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/_utils.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/constants.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/__init__.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/bcast_op_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/unary_elementwise_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/options.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/sharding_strategy.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/solver.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/deprecated/strategies_constructor.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_cost_graph.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_batch_norm_handler.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_handler.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_matmul.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_conv_handler.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_dot_handler.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_layer_norm_handler.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_reshape_handler.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_where_handler.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_shape_consistency_pass.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_gpt.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_mlp.py
 delete mode 100644 tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_strategies_constructor.py

diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/__init__.py b/colossalai/auto_parallel/tensor_shard/deprecated/__init__.py
deleted file mode 100644
index bd47f2adf3d6..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from .cost_graph import CostGraph
-from .graph_analysis import GraphAnalyser
-from .options import SolverOptions
-from .sharding_strategy import ShardingStrategy, StrategiesVector
-from .solver import Solver
-from .strategies_constructor import StrategiesConstructor
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py b/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py
deleted file mode 100644
index d6af7ad57154..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import functools
-import operator
-import warnings
-from functools import reduce
-from typing import Dict, List, Optional, Union
-
-import torch
-from torch.fx.node import Node
-
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .constants import INFINITY_COST
-
-
-def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh,
-                           dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
-    """
-    Generate the sharding spec of the tensor based on the given dim_partition_dict.
-
-
-    Args:
-        input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
-        device_mesh (DeviceMesh): a DeviceMesh object which contains the meta information about the cluster.
-        dim_partition_dict (Dict[int, List[int]]): a dictionary to specify the sharding specs, the key is the tensor dimension and the value is the mesh dimension for sharding.
-    """
-
-    if isinstance(input_, Node):
-        assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data'
-        meta_tensor = input_._meta_data
-        assert meta_tensor is not None, "The given node's _meta_data attribute is None"
-        shape = meta_tensor.shape
-    elif isinstance(input_, torch.Tensor):
-        shape = input_.shape
-    else:
-        raise TypeError(
-            f'We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected.'
-        )
-    for dim_index, sharding_index_list in dim_partition_dict.items():
-        sharding_list = [device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
-        sharding_size = reduce(operator.mul, sharding_list, 1)
-        assert shape[
-            dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
-
-    sharding_spec = ShardingSpec(device_mesh=device_mesh, entire_shape=shape, dim_partition_dict=dim_partition_dict)
-    return sharding_spec
-
-
-def generate_resharding_costs(nodes: List[Node],
-                              sharding_specs: List[ShardingSpec],
-                              count_backward: Optional[bool] = True,
-                              dtype: Optional[torch.dtype] = None,
-                              index=None):
-    '''
-    Compute the resharding costs with this specific strategy.
-
-    Argument:
-        nodes (List[Node]): a list of nodes
-        sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes.
-        count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
-        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
-    '''
-    # The resharding_cost of weight is counted due to sharing weight cases.
-    resharding_costs = {}
-    size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-    # shape consistency manager is a singleton class
-    shape_consistency_manager = ShapeConsistencyManager()
-
-    for input_node, input_spec in zip(nodes, sharding_specs):
-        resharding_costs[input_node] = []
-        for strategy in input_node.strategies_vector:
-            input_sharding_spec = strategy.output_sharding_spec
-            if not isinstance(input_sharding_spec, ShardingSpec):
-                assert isinstance(input_sharding_spec, list), 'only ShardingSpec or List[ShardingSpec] is expected.'
-                input_sharding_spec = input_sharding_spec[index]
-            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-            try:
-                # compute the resharding cost
-                _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
-                    input_sharding_spec, input_spec)
-
-                # we need multiply the size of elem dtype to get correct communication cost
-                resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                resharding_cost = INFINITY_COST
-            resharding_costs[input_node].append(resharding_cost)
-    return resharding_costs
-
-
-def ignore_sharding_exception(func):
-    """
-    A function wrapper which executes the function with a specified seed.
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        try:
-            rst = func(*args, **kwargs)
-            return rst
-        except AssertionError as e:
-            warnings.warn(f'{e}')
-
-    return wrapper
-
-
-def enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size):
-    dim_partition_list = []
-    # enumerate all the 2D sharding cases
-    for i in range(dim_size):
-        for j in range(i + 1, dim_size):
-            dim_partition_dict_0 = {i: [mesh_dim_0], j: [mesh_dim_1]}
-            dim_partition_dict_1 = {i: [mesh_dim_1], j: [mesh_dim_0]}
-            dim_partition_list.append(dim_partition_dict_0)
-            dim_partition_list.append(dim_partition_dict_1)
-    for i in range(dim_size):
-        dim_partition_dict_flatten = {i: [mesh_dim_0, mesh_dim_1]}
-        dim_partition_list.append(dim_partition_dict_flatten)
-
-    return dim_partition_list
-
-
-def enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size):
-    dim_partition_list = []
-    # enumerate all the 1D sharding cases
-    for i in range(dim_size):
-        dim_partition_dict_0 = {i: [mesh_dim_0]}
-        dim_partition_list.append(dim_partition_dict_0)
-
-    return dim_partition_list
-
-
-def generate_sharding_size(dim_partition_dict, device_mesh):
-    total_sharding_size = 1
-    for mesh_dim_list in dim_partition_dict.values():
-        mesh_dim_sharding_size = [device_mesh.shape[mesh_dim] for mesh_dim in mesh_dim_list]
-        sharding_size = reduce(operator.mul, mesh_dim_sharding_size)
-        total_sharding_size *= sharding_size
-
-    return total_sharding_size
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/constants.py b/colossalai/auto_parallel/tensor_shard/deprecated/constants.py
deleted file mode 100644
index 91c20d343487..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/constants.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import torch
-import operator
-
-__all__ = [
-    'ELEMENTWISE_MODULE_OP', 'ELEMENTWISE_FUNC_OP', 'RESHAPE_FUNC_OP', 'CONV_MODULE_OP', 'CONV_FUNC_OP',
-    'LINEAR_MODULE_OP', 'LINEAR_FUNC_OP', 'BATCHNORM_MODULE_OP', 'POOL_MODULE_OP', 'NON_PARAM_FUNC_OP', 'BCAST_FUNC_OP',
-    'EMBEDDING_MODULE_OP', 'LAYERNORM_MODULE_OP', 'ELEMENTWISE_METHOD_OP', 'RESHAPE_METHOD_OP', 'INFINITY_COST'
-]
-
-ELEMENTWISE_MODULE_OP = [torch.nn.Dropout, torch.nn.ReLU]
-ELEMENTWISE_FUNC_OP = [
-    torch.abs,
-    torch.cos,
-    torch.exp,
-    operator.neg,
-    torch.multiply,
-    torch.nn.functional.relu,
-    torch.nn.functional.dropout,
-    # softmax should not be here
-    torch.nn.functional.softmax
-]
-ELEMENTWISE_METHOD_OP = [
-    torch.Tensor.to,
-    torch.Tensor.type,
-    # TODO: contiguous maybe need some extra processes.
-    torch.Tensor.contiguous
-]
-RESHAPE_FUNC_OP = [torch.flatten, torch.reshape]
-RESHAPE_METHOD_OP = [
-    torch.Tensor.view,
-    torch.Tensor.unsqueeze,
-    torch.Tensor.split,
-    torch.Tensor.permute,
-    torch.Tensor.transpose,
-]
-BCAST_FUNC_OP = [
-    torch.add, torch.sub, torch.mul, torch.div, torch.floor_divide, torch.true_divide, operator.add, operator.sub,
-    operator.mul, operator.floordiv, operator.truediv, torch.matmul, torch.where, operator.pow, torch.pow, torch.tanh
-]
-CONV_MODULE_OP = [
-    torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d, torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d,
-    torch.nn.ConvTranspose3d
-]
-CONV_FUNC_OP = [
-    torch.conv1d, torch.conv2d, torch.conv3d, torch.conv_transpose1d, torch.conv_transpose2d, torch.conv_transpose3d
-]
-EMBEDDING_MODULE_OP = [torch.nn.modules.sparse.Embedding]
-LINEAR_MODULE_OP = [torch.nn.Linear]
-LINEAR_FUNC_OP = [torch.nn.functional.linear, torch.matmul, torch.bmm]
-BATCHNORM_MODULE_OP = [torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d, torch.nn.SyncBatchNorm]
-LAYERNORM_MODULE_OP = [torch.nn.LayerNorm]
-POOL_MODULE_OP = [torch.nn.MaxPool1d, torch.nn.MaxPool2d, torch.nn.MaxPool3d, torch.nn.AdaptiveAvgPool2d]
-NON_PARAM_FUNC_OP = [
-    torch.flatten,
-    torch.reshape,
-    torch.abs,
-    torch.cos,
-    torch.exp,
-    operator.neg,
-    torch.multiply,
-    torch.nn.functional.relu,
-    torch.nn.functional.dropout,
-    torch.flatten,
-    torch.where,
-    operator.pow,
-    torch.pow,
-    torch.tanh,
-    torch.add,
-    torch.sub,
-    torch.mul,
-    torch.div,
-    torch.floor_divide,
-    torch.true_divide,
-    operator.add,
-    operator.sub,
-    operator.mul,
-    operator.floordiv,
-    operator.truediv,
-    # softmax should not be here
-    torch.nn.functional.softmax
-]
-
-INFINITY_COST = 1e13
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
deleted file mode 100644
index 239d02115d0e..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
+++ /dev/null
@@ -1,172 +0,0 @@
-from typing import List
-import math
-from torch.fx.node import Node
-from .constants import INFINITY_COST
-
-
-class CostGraph:
-    '''
-    A graph data structure to simplify the edge cost graph. It has two main functions:
-    1. To feed the quadratic resharding costs into solver, we need to linearize it. We build edge_cost in
-    CostGraph, and it stored every combinations of strategies for a src-dst node pair in an 1D list.
-    2. To reduce the searching space, we merge computationally-trivial operators, such as 
-    element-wise operators, transpose, and reduction, into their following nodes. The merging infomation will
-    be given by the StrategiesVector depending on the type of target node and following nodes.
-
-    Argument:
-        leaf_strategies(List[StrategiesVector]): It stores StrategiesVector of every nodes on the graph.
-        simplify(bool, optional): The generated cost graph will be simplified if it is true. (default to True)
-    '''
-
-    def __init__(self, leaf_strategies, simplify=True):
-        self.leaf_strategies = leaf_strategies
-        self.nodes = [strategies_vector.node for strategies_vector in self.leaf_strategies]
-        # stores number of strategies in each node
-        self.node_lens = {strategies_vector.node: len(strategies_vector) for strategies_vector in self.leaf_strategies}
-        # extra_node_costs will store the extra costs introduced by merging nodes
-        self.extra_node_costs = {}
-        self.following_dict = {}
-        self.simplify = simplify
-        self._build_cost_graph()
-
-    def _remove_invalid_node(self, node, attr_name):
-        remove_list = []
-        target_node_list = getattr(node, attr_name, [])
-        for target_node in target_node_list:
-            if target_node not in self.nodes:
-                remove_list.append(target_node)
-        for element in remove_list:
-            target_node_list.remove(element)
-
-    def _build_cost_graph(self):
-        '''
-        This method will generate edge_cost for adjacent node pair. Additionally, 'parents' and 'children' attribute will be
-        set to node.
-        '''
-        self.edge_costs = {}
-        if self.simplify:
-            self.merge_pair = []
-        for strategies_vector in self.leaf_strategies:
-            # build edge_cost
-            dst_node = strategies_vector.node
-            for src_node in strategies_vector.predecessor_nodes:
-                if src_node not in self.nodes:
-                    continue
-                node_pair = (src_node, dst_node)
-                # src_index = strategies_vector.predecessor_nodes.index(src_node)
-                edge_cost = {}
-                for i in range(len(strategies_vector)):
-                    for j in range(len(src_node.strategies_vector)):
-                        edge_cost[(j, i)] = strategies_vector[i].resharding_costs[src_node][j]
-                self.edge_costs[node_pair] = edge_cost
-            # add parents and children attribute to node
-            setattr(dst_node, 'parents', strategies_vector.predecessor_nodes)
-            setattr(dst_node, 'children', strategies_vector.successor_nodes)
-            self._remove_invalid_node(dst_node, 'parents')
-            self._remove_invalid_node(dst_node, 'children')
-
-            if self.simplify and strategies_vector.check_merge():
-                for followed_node in strategies_vector.predecessor_nodes:
-                    self.merge_pair.append((followed_node, dst_node))
-
-    def get_edge_cost(self, src_node, dst_node):
-        return self.edge_costs[(src_node, dst_node)]
-
-    def merge_node(self, src_node, dst_node):
-        '''
-        To merge dst_node into src_node, we need to do it in following steps:
-        
-        1. For each strategy in dst_node, we need to pick an appropriate strategy
-        of src_node to merge, it is important because the logical resharding costs 
-        between the parents node of src_node and merged node depend on the src_node 
-        strategies dispatching. For example, for the graph 0->1->2, after merging node 1
-        into node 2, edge_costs[(node 0, node 2)][(0, 0)] = edge_costs[(node 0, node 1)][(0, x)]
-        x represents the picking strategy of node 1 merged into node 2 strategy 0.
-        
-        2. We need to accumulate the extra costs introduced by merging nodes, the extra costs
-        contains two parts, one is resharding costs between src_node strategy and dst_node strategy,
-        another is the origin extra costs in src_node strategy.
-
-        3. Build connections between new node pairs, and remove the src_node after all consumer nodes
-        detached from it.
-
-        Argument:
-            src_node(Node): The node will be merged into dst_node.
-            dst_node(Node): The node to integrate src_node.
-        '''
-        src_node_index = dst_node.parents.index(src_node)
-        # build merge_map
-        merge_map = {}
-        for src_index, strategy in enumerate(src_node.strategies_vector):
-            min_cost = INFINITY_COST
-            lowest_cost_index = -1
-            for dst_index, dst_strategy in enumerate(dst_node.strategies_vector):
-                resharding_cost = dst_strategy.resharding_costs[src_node][src_index]
-                if resharding_cost <= min_cost:
-                    min_cost = resharding_cost
-                    lowest_cost_index = dst_index
-            merge_map[src_index] = lowest_cost_index
-
-        # extra_node_cost for src node
-        self.extra_node_costs[src_node] = [0.0] * self.node_lens[src_node]
-        for src_index, strategy in enumerate(src_node.strategies_vector):
-            target_strate_index = merge_map[src_index]
-            target_strategy = dst_node.strategies_vector[target_strate_index]
-            self.extra_node_costs[src_node][src_index] += target_strategy.resharding_costs[src_node][src_index]
-            if dst_node in self.extra_node_costs:
-                self.extra_node_costs[src_node][src_index] += self.extra_node_costs[dst_node][target_strate_index]
-
-        # add new node pair to cost graph
-        for child_node in dst_node.children:
-            new_node_pair = (src_node, child_node)
-            old_node_pair = (dst_node, child_node)
-            if new_node_pair in self.edge_costs:
-                continue
-            edge_cost = {}
-            for i in range(self.node_lens[src_node]):
-                for j in range(self.node_lens[child_node]):
-                    dst_strate_index = merge_map[i]
-                    # dst_strategy = dst_node.strategies_vector[dst_strate_index]
-                    edge_cost[(i, j)] = self.edge_costs[old_node_pair][(dst_strate_index, j)]
-            if new_node_pair not in self.edge_costs:
-                self.edge_costs[new_node_pair] = edge_cost
-            else:
-                # we should accumulate the resharding costs if args of child node contain
-                # both src node and dst node.
-                for index_pair, resharding_cost in self.edge_costs[new_node_pair]:
-                    self.edge_costs[new_node_pair][index_pair] += edge_cost[index_pair]
-
-        # connect src node and children of dst node
-        dst_node.parents.remove(src_node)
-        src_node.children.remove(dst_node)
-        self.edge_costs.pop((src_node, dst_node))
-        for child_node in dst_node.children:
-            if child_node not in src_node.children:
-                src_node.children.append(child_node)
-            if src_node not in child_node.parents:
-                child_node.parents.append(src_node)
-            # remove dst node from cost graph when dst node has no producer.
-            if len(dst_node.parents) == 0:
-                child_node.parents.remove(dst_node)
-                node_pair = (dst_node, child_node)
-                self.edge_costs.pop(node_pair)
-        if len(dst_node.parents) == 0:
-            self.following_dict[dst_node] = src_node
-            dst_node.children = []
-
-    def _reindexing_src(self, src):
-        if src not in self.following_dict:
-            return src
-        return self._reindexing_src(self.following_dict[src])
-
-    def simplify_graph(self):
-        if not self.simplify:
-            return
-        self.merge_pair.reverse()
-        for (src_node, dst_node) in self.merge_pair:
-            self.merge_node(src_node, dst_node)
-        self.merge_pair.reverse()
-        reindexing_following_dict = {}
-        for dst, src in self.following_dict.items():
-            reindexing_following_dict[dst] = self._reindexing_src(src)
-        self.following_dict = reindexing_following_dict
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
deleted file mode 100644
index 831e7eadd179..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
+++ /dev/null
@@ -1,163 +0,0 @@
-from dataclasses import dataclass
-from torch.fx.node import Node
-from torch.fx.graph import Graph
-from torch.fx.graph_module import GraphModule
-from collections import OrderedDict as ODict
-from typing import List, OrderedDict, Union, Any
-from colossalai.fx.passes.utils import get_node_module
-
-__all__ = ['LiveVariable', 'LiveVariableVector', 'LiveStage', 'GraphAnalyser']
-
-
-@dataclass
-class LiveVariable:
-    """
-    LiveVariable is a data structure to store the meta information of a variable for liveness analysis.
-    """
-    name: str
-    node: Node
-    is_inplace: bool
-
-
-class LiveVariableVector(list):
-    """
-    LiveVariableVector is a data structure to store the list of LiveVariable objects.
-    """
-
-    def exists(self, name) -> bool:
-        """
-        Check if a variable has already existed in the current list by name.
-        """
-        for var in self:
-            if name == var.name:
-                return True
-        return False
-
-    def get(self, name) -> LiveVariable:
-        for var in self:
-            if name == var.name:
-                return var
-        raise KeyError(f"Variable {name} is not found")
-
-    def copy(self) -> "LiveVariableVector":
-        """
-        Create a copy of this vector
-        """
-        vector = LiveVariableVector()
-        for var in self:
-            vector.append(var)
-        return vector
-
-
-@dataclass
-class LiveStage:
-    """
-    LiveStage is a data structure to record the living variables at this current node.
-    """
-    name: str
-    node: Node
-    all_live_vars: LiveVariableVector
-    unique_live_vars: LiveVariableVector
-
-
-class GraphAnalyser:
-
-    def __init__(self, gm: GraphModule):
-        self._gm = gm
-        self._graph = gm.graph
-
-    @property
-    def gm(self) -> GraphModule:
-        """
-        Return the GraphModule object associated with this analyser.
-        """
-        return self._gm
-
-    @property
-    def graph(self) -> Graph:
-        """
-        Return the Graph object associated with this analyser.
-        """
-        return self._graph
-
-    def liveness_analysis(self) -> List[LiveStage]:
-        """
-        Analyse the graph to obtain the variable liveness information. This function returns
-        an ordered dictionary where the key is the compute stage ID and the value is a LivenessStage object.
-        """
-        compute_nodes = self.graph.nodes
-        liveness_list = []
-
-        # checked: record all variables created since the first stage
-        # all: record the live variables only exist until the current stage.
-        #       this can be different from the `checked list`` as some varialbes may be destroyed prior to this stage.
-        # unique: record the unique live variables only exist until the current stage.
-        #       this is different from `all list` as some variables are duplicated.
-        checked_variables = LiveVariableVector()
-        all_live_variables = LiveVariableVector()
-        unique_live_vars = LiveVariableVector()
-
-        for idx, node in enumerate(compute_nodes):
-            #############################
-            # find new living variables #
-            #############################
-            # detect whether the current op is an in-place op
-            # if it is an in-place op, we would deem it as a duplciate var
-            is_inplace = False
-            if node.op == 'call_function':
-                # check if this is an inplace op such as torch.nn.functional.relu(x, inplace=True)
-                if node.kwargs.get('inplace', False):
-                    is_inplace = True
-            elif node.op == 'call_module':
-                # to check if this is an inplace op such as torch.nn.Relu(inplace=True)
-                module = get_node_module(node)
-                if getattr(module, 'inplace', False):
-                    is_inplace = True
-
-            # add the output var
-            meta = getattr(node, '_meta_data', None)
-            live_var = LiveVariable(name=node.name, node=node, is_inplace=is_inplace)
-            if not is_inplace:
-                unique_live_vars.append(live_var)
-            checked_variables.append(live_var)
-            all_live_variables.append(live_var)
-
-            # check if any input is not checked yet
-            for arg in node.args:
-                if not isinstance(arg, Node):
-                    continue
-                arg_name = arg.name
-                if not checked_variables.exists(arg_name):
-                    live_var_from_arg = LiveVariable(name=arg_name, node=node, is_inplace=False)
-                    all_live_variables.append(live_var_from_arg)
-                    checked_variables.append(live_var_from_arg)
-                    unique_live_vars.append(live_var_from_arg)
-
-            # TODO: add the logic to remove live variables
-            # this should be completed if we are able to trace the backward compute graph
-
-            # add this stage to liveness dict
-            stage = LiveStage(name=node.name,
-                              node=node,
-                              all_live_vars=all_live_variables.copy(),
-                              unique_live_vars=unique_live_vars.copy())
-            # if a LiveStage is covered by another LiveStage, we just keep the larger one.
-            replace = False
-            for index, prev_stage in enumerate(liveness_list):
-                all_covered = True
-                for ele in prev_stage.unique_live_vars:
-                    if ele not in stage.unique_live_vars:
-                        all_covered = False
-                        break
-                if all_covered:
-                    replace = True
-                    break
-            if replace:
-                liveness_list[index] = stage
-            else:
-                liveness_list.append(stage)
-
-        return liveness_list
-
-    def get_alias_set(self):
-        pass
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/__init__.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/__init__.py
deleted file mode 100644
index 723e1bcf95ed..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from .batch_norm_handler import BatchNormHandler
-from .bcast_op_handler import BcastOpHandler
-from .conv_handler import ConvHandler
-from .dot_handler import DotHandler
-from .embedding_handler import EmbeddingHandler
-from .layer_norm_handler import LayerNormHandler
-from .operator_handler import OperatorHandler
-from .reshape_handler import ReshapeHandler
-from .unary_elementwise_handler import UnaryElementwiseHandler
-from .where_handler import WhereHandler
-
-__all__ = [
-    'OperatorHandler', 'DotHandler', 'ConvHandler', 'BatchNormHandler', 'ReshapeHandler', 'BcastOpHandler',
-    'UnaryElementwiseHandler', 'EmbeddingHandler', 'WhereHandler', 'LayerNormHandler'
-]
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
deleted file mode 100644
index 519436270828..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
+++ /dev/null
@@ -1,492 +0,0 @@
-import operator
-from functools import reduce
-
-import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['BatchNormHandler']
-
-
-class BatchNormHandler(OperatorHandler):
-    """
-    A OperatorHandler which deals with the sharding strategies of normalization.
-
-    To keep the math consistency, there are two way to do BatchNorm if the input
-    shards on batch dimension:
-    1. We gather the input partitions through batch dimension, then do the normal BatchNorm.
-    2. We do the SyncBatchNorm on the each input partition seperately, the SyncBN op will help
-       us to keep the computing correctness.
-    In this handler, both methods will be considered.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.weight = self.module_named_parameters['weight']
-        self.bias = self.module_named_parameters['bias']
-        self.output_data = self.node._meta_data
-        self._sanity_check()
-
-    def _sanity_check(self):
-        '''
-        In sanity check, we need make sure the input data having correct dimension size.
-        For BatchNorm1d, the dim of input data should be 3([N, C, L]).
-        For BatchNorm2d, the dim of input data should be 4([N, C, H, W]).
-        For BatchNorm3d, the dim of input data should be 5([N, C, H, W, D]).
-        '''
-        assert self.input_data.dim() in (3, 4,
-                                         5), f'We suppose the dim of input fed into conv op should in range of [3, 5].'
-
-    def _generate_compute_cost(self, bs, channel_in):
-        '''
-        Compute the computation cost per device with this specific strategy.
-
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-
-        Argument:
-            bs(int): Batch size of the input data.
-            channel_in(int): The channel dimension of input data.
-
-        Return:
-            compute_cost(float): Computation cost per device with this specific strategy
-        '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-        # TODO: a constant coefficient need to be added.
-        # 1D: (L) * N * Cin
-        # 2D: (H * W) * N  * Cin
-        # 3D: (H * W  * D) * N  * Cin
-
-        input_size = self.input_data.shape[2:]
-        input_size_product = reduce(operator.mul, input_size, 1)
-        forward_compute_cost = input_size_product * bs * channel_in
-        backward_activation_compute_cost = input_size_product * bs * channel_in
-        backward_weight_compute_cost = input_size_product * bs * channel_in
-        backward_compute_cost = backward_activation_compute_cost + backward_weight_compute_cost
-        compute_cost = forward_compute_cost + backward_compute_cost
-        return compute_cost
-
-    def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_activation, sharding_size_weight):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            sharding_size_forward(int): The forward activation will be divided
-                into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will 
-                be divided into sharding_size_backward_activation number partions.
-            sharding_size_weight(int): The backward weight will be divided
-                into sharding_size_weight number partions.
-
-        Return:
-            memory_cost(Tuple[float]): Memory cost per device with this 
-                specific strategy, the first element of this tuple is forward
-                memory cost, and the second element of this tuple is backward
-                memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per 
-                device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation 
-                per device with this specific strategy.
-        '''
-        # compute the memory cost of this strategy
-        dtype = self.input_data.dtype
-        numel_output = self.output_data.numel()
-        numel_input = numel_output
-        numel_weight = self.weight.numel()
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # forward memory_cost
-        memory_cost_forward_activation = numel_output * size_per_elem_bytes / sharding_size_forward
-        memory_cost_forward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_forward = memory_cost_forward_activation + memory_cost_forward_weight
-
-        # backward memory_cost
-        memory_cost_backward_activation = numel_input * size_per_elem_bytes / sharding_size_backward_activation
-        memory_cost_backward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_backward = memory_cost_backward_activation + memory_cost_backward_weight
-
-        # memory_cost pair
-        memory_cost = (memory_cost_forward, memory_cost_backward)
-
-        return memory_cost, memory_cost_forward_activation, memory_cost_backward_activation
-
-    @ignore_sharding_exception
-    def split_input_channel(self, mesh_dim_0, mesh_dim_1):
-        name = f'RS{mesh_dim_0} = RS{mesh_dim_0} x S{mesh_dim_0}'
-
-        dim_partition_dict_for_input = {1: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {1: [mesh_dim_0]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1] // self.device_mesh.shape[mesh_dim_0]
-        compute_cost = self._generate_compute_cost(bs, channel_in)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0]
-        memory_cost, _, _ = self._generate_memory_cost(sharding_size_forward, sharding_size_backward_activation,
-                                                       sharding_size_weight)
-
-        # This strategy do not need to do all_reduce operation
-        communication_cost = 0
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-        # shard the output batch dimension to get all possible sharding strategy from this basic strategy
-        new_name = f'S{mesh_dim_1}S{mesh_dim_0} = RS{mesh_dim_0} x S{mesh_dim_0}'
-
-        dim_partition_dict_for_output = {0: [mesh_dim_1], 1: [mesh_dim_0]}
-        new_sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-        # the computation cost is all the same
-        new_compute_cost = compute_cost
-
-        # the memory cost need to be recomputed
-        # compute the memroy cost of new strategy
-        new_sharding_size_forward = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0]
-        new_memory_cost, _, memory_cost_backward_activation = self._generate_memory_cost(
-            new_sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # the communication cost need to count the sharding cost into this strategy
-        # compute the communication cost of new strategy
-        origin_communication_cost = communication_cost
-        tiny_shard_cost = 10
-        new_forward_communication_cost = tiny_shard_cost
-        # we need to all gather the batch dimension for the basic strategy
-        new_backward_communication_cost = self.device_mesh.all_gather_cost(memory_cost_backward_activation, mesh_dim_1)
-        new_communication_cost = origin_communication_cost + new_forward_communication_cost + new_backward_communication_cost
-
-        sharding_strategies = ShardingStrategy(new_name,
-                                               output_sharding_spec=new_sharding_spec_for_output,
-                                               compute_cost=new_compute_cost,
-                                               communication_cost=new_communication_cost,
-                                               memory_cost=new_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_channel_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'RS{mesh_dim_0}{mesh_dim_1} = RS{mesh_dim_0}{mesh_dim_1} x S{mesh_dim_0}{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {1: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {1: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1] // (self.device_mesh.shape[mesh_dim_0] *
-                                                  self.device_mesh.shape[mesh_dim_1])
-        compute_cost = self._generate_compute_cost(bs, channel_in)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        memory_cost, _, _ = self._generate_memory_cost(sharding_size_forward, sharding_size_backward_activation,
-                                                       sharding_size_weight)
-
-        # This strategy do not need to do all_reduce operation
-        communication_cost = 0
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def non_split(self, mesh_dim_0, mesh_dim_1):
-        name = f'RR = RR x R'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = 1
-        sharding_size_backward_activation = 1
-        sharding_size_weight = 1
-        memory_cost, _, _ = self._generate_memory_cost(sharding_size_forward, sharding_size_backward_activation,
-                                                       sharding_size_weight)
-
-        # This strategy do not need to do all_reduce operation
-        communication_cost = 0
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-        def _construct_batch_sharding_strategies(mesh_dim_list, new_name):
-            dim_partition_dict_for_output = {0: mesh_dim_list}
-            new_sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-            # the computation cost is all the same
-            new_compute_cost = compute_cost
-
-            # the memory cost need to be recomputed
-            new_sharding_size_input = 1
-            for mesh_dim in mesh_dim_list:
-                new_sharding_size_input = new_sharding_size_input * self.device_mesh.shape[mesh_dim]
-            new_memory_cost, _, memory_cost_backward_activation = self._generate_memory_cost(
-                new_sharding_size_input, sharding_size_backward_activation, sharding_size_weight)
-
-            # the communication cost need to count the sharding cost into this strategy
-            origin_communication_cost = communication_cost
-            tiny_shard_cost = 10
-            new_forward_communication_cost = tiny_shard_cost
-            if len(mesh_dim_list) == 1:
-                new_backward_communication_cost = self.device_mesh.all_gather_cost(memory_cost_backward_activation,
-                                                                                   mesh_dim_list[0])
-            else:
-                new_backward_communication_cost = self.device_mesh.flatten_device_mesh.all_gather_cost(
-                    memory_cost_backward_activation, 0)
-            new_communication_cost = origin_communication_cost + new_forward_communication_cost + new_backward_communication_cost
-
-            new_sharding_strategy = ShardingStrategy(new_name,
-                                                     output_sharding_spec=new_sharding_spec_for_output,
-                                                     compute_cost=new_compute_cost,
-                                                     communication_cost=new_communication_cost,
-                                                     memory_cost=new_memory_cost,
-                                                     resharding_costs=resharding_costs,
-                                                     input_shardings=(sharding_spec_for_input,
-                                                                      sharding_spec_for_weight))
-
-            return new_sharding_strategy
-
-        # shard the output batch dimension to get all possible sharding strategy from this basic strategy
-        # shard on mesh_dim_0
-        new_name = f'S{mesh_dim_0}R = RR x R'
-        mesh_dim_list = [mesh_dim_0]
-        new_sharding_strategy = _construct_batch_sharding_strategies(mesh_dim_list, new_name)
-        self.strategies_vector.append(new_sharding_strategy)
-
-        # shard on mesh_dim_1
-        new_name = f'S{mesh_dim_1}R = RR x R'
-        mesh_dim_list = [mesh_dim_1]
-        new_sharding_strategy = _construct_batch_sharding_strategies(mesh_dim_list, new_name)
-        self.strategies_vector.append(new_sharding_strategy)
-
-        # shard on mesh_dim_0, mesh_dim_1
-        new_name = f'S{mesh_dim_0}{mesh_dim_1}R = RR x R'
-        mesh_dim_list = [mesh_dim_0, mesh_dim_1]
-        new_sharding_strategy = _construct_batch_sharding_strategies(mesh_dim_list, new_name)
-        self.strategies_vector.append(new_sharding_strategy)
-
-    @ignore_sharding_exception
-    def split_input_batch(self, mesh_dim_0):
-        name = f'S{mesh_dim_0}R = S{mesh_dim_0}R x R WITH SYNC_BN'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0] // self.device_mesh.shape[mesh_dim_0]
-        channel_in = self.input_data.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_weight = 1
-        memory_cost, memory_cost_forward_activation, _ = self._generate_memory_cost(sharding_size_forward,
-                                                                                    sharding_size_backward_activation,
-                                                                                    sharding_size_weight)
-
-        # the all reduce communication will happen during the sync bn computing.
-        communication_cost = self.device_mesh.all_reduce_cost(memory_cost_forward_activation, mesh_dim_0)
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_batch_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}{mesh_dim_1}R = S{mesh_dim_0}{mesh_dim_1}R x R WITH SYNC_BN'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0] // (self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1])
-        channel_in = self.input_data.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_weight = 1
-        memory_cost, memory_cost_forward_activation, _ = self._generate_memory_cost(sharding_size_forward,
-                                                                                    sharding_size_backward_activation,
-                                                                                    sharding_size_weight)
-
-        # the all reduce communication will happen during the sync bn computing.
-        communication_cost = self.device_mesh.flatten_device_mesh.all_reduce_cost(memory_cost_forward_activation, 0)
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_both_dim(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}S{mesh_dim_1} = S{mesh_dim_0}S{mesh_dim_1} x S{mesh_dim_1} WITH SYNC_BN'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0] // self.device_mesh.shape[mesh_dim_0]
-        channel_in = self.input_data.shape[1] // self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(bs, channel_in)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_1]
-        memory_cost, memory_cost_forward_activation, _ = self._generate_memory_cost(sharding_size_forward,
-                                                                                    sharding_size_backward_activation,
-                                                                                    sharding_size_weight)
-
-        # the all reduce communication will happen during the sync bn computing.
-        communication_cost = self.device_mesh.all_reduce_cost(memory_cost_forward_activation, mesh_dim_0)
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        '''
-        Generate every possible strategies for a BatchNorm node, and record all strategies into the strategies_vector.
-
-        Example:
-            norm_handler = BatchNormHandler(node,  strategies_vector,
-                                               self.shape_consistency_manager)
-            norm_handler.register_strategy()
-            for strategy in norm_handler.strategies_vector:
-                print(f'{strategy.name}, computation_cost: {strategy.compute_cost}, memory_cost: {strategy.memory_cost}')
-        
-        Output:
-            RS0 = RS0 x S0, computation_cost: 131072, memory_cost: 524288.0
-            RS1 = RS1 x S1, computation_cost: 131072, memory_cost: 524288.0
-            RR = RR x R, computation_cost: 262144, memory_cost: 1048576
-            RS01 = RS01 x S01, computation_cost: 65536, memory_cost: 262144.0
-        '''
-
-        # RS = RS x S and strategies based on it, such as
-        # SS = RS x S
-        self.split_input_channel(0, 1)
-        self.split_input_channel(1, 0)
-
-        # RR = RR x R and strategies based on it, such as
-        # SR = SR x R
-        self.non_split(0, 1)
-
-        # RS01 = RS01 x S01
-        self.split_input_channel_1d(0, 1)
-
-        # SR = SR x R WITH SYNC_BN
-        self.split_input_batch(0)
-        self.split_input_batch(1)
-
-        # SS = SS x S WITH SYNC_BN
-        self.split_input_both_dim(0, 1)
-        self.split_input_both_dim(1, 0)
-
-        # S01R = S01R x R WITH SYNC_BN
-        self.split_input_batch_1d(0, 1)
-
-        return self.strategies_vector
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/bcast_op_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/bcast_op_handler.py
deleted file mode 100644
index 6ac6dce76675..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/bcast_op_handler.py
+++ /dev/null
@@ -1,552 +0,0 @@
-import operator
-import warnings
-from copy import deepcopy
-from functools import reduce
-from typing import Dict, List
-
-import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import (enumerate_all_possible_1d_sharding,
-                                                                     enumerate_all_possible_2d_sharding,
-                                                                     ignore_sharding_exception)
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['BcastOpHandler']
-
-
-class BcastOpHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of broadcast operators(such as operator.add).
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        assert len(self.predecessor_node) == 2
-        self.lhs_data = self.predecessor_node[0]._meta_data
-        self.rhs_data = self.predecessor_node[1]._meta_data
-        self.lhs = self.predecessor_node[0]
-        self.rhs = self.predecessor_node[1]
-        self.output_data = self.node._meta_data
-
-    def _generate_sharding_spec(self, input_: torch.Tensor, dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
-        shape = list(input_.shape)
-
-        # padding the shape to the same length as output_data
-        while len(shape) < self.output_data.dim():
-            shape.insert(0, 1)
-        shape = torch.Size(shape)
-
-        # if the sharding happens on a size one dimension, we should record it as R.
-        processed_dim_partition_dict = deepcopy(dim_partition_dict)
-        for dim_index, _ in dim_partition_dict.items():
-            if shape[dim_index] == 1:
-                processed_dim_partition_dict.pop(dim_index)
-        for dim_index, sharding_index_list in processed_dim_partition_dict.items():
-            sharding_list = [self.device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
-            sharding_size = reduce(operator.mul, sharding_list, 1)
-            assert shape[
-                dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
-        sharding_spec = ShardingSpec(device_mesh=self.device_mesh,
-                                     entire_shape=shape,
-                                     dim_partition_dict=processed_dim_partition_dict)
-
-        return sharding_spec
-
-    def _generate_compute_cost(self, total_sharding_size):
-        lhs_matrix_shape = self.lhs_data.shape[-2:]
-        rhs_matrix_shape = self.rhs_data.shape[-2:]
-        batch_dimensions_shape = self.output_data.shape[:-2]
-        batch_dimensions_product = reduce(operator.mul, batch_dimensions_shape, 1)
-        compute_cost = reduce(
-            operator.mul, lhs_matrix_shape) * rhs_matrix_shape[0] * batch_dimensions_product * 2 / total_sharding_size
-        return compute_cost
-
-    def _generate_resharding_costs(self, sharding_specs):
-        # The resharding_cost of weight is counted due to sharing weight cases.
-        dtype = self.node._meta_data.dtype
-        nodes = self.predecessor_node
-        resharding_costs = {}
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # shape consistency manager is a singleton class
-        shape_consistency_manager = ShapeConsistencyManager()
-
-        for input_node, input_spec in zip(nodes, sharding_specs):
-            resharding_costs[input_node] = []
-            for strategy in input_node.strategies_vector:
-                input_sharding_spec = strategy.output_sharding_spec
-                assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-                # if the input shape is smaller than the target input, we will fill the input to the same length as target.
-                # Then, use the padded input sharding spec to compute the resharding cost.
-                if len(input_sharding_spec.entire_shape) < len(input_spec.entire_shape):
-                    new_entire_shape = list(input_sharding_spec.entire_shape)
-                    while len(new_entire_shape) < len(input_spec.entire_shape):
-                        new_entire_shape.insert(0, 1)
-                    new_entire_shape = torch.Size(new_entire_shape)
-                    new_device_mesh = input_sharding_spec.device_mesh
-                    new_dim_partition_dict = input_sharding_spec.dim_partition_dict
-                    input_sharding_spec = ShardingSpec(device_mesh=new_device_mesh,
-                                                       entire_shape=new_entire_shape,
-                                                       dim_partition_dict=new_dim_partition_dict)
-
-                # compute the resharding cost
-                _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
-                    input_sharding_spec, input_spec)
-
-                # we need multiply the size of elem dtype to get correct communication cost
-                resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
-                resharding_costs[input_node].append(resharding_cost)
-
-        return resharding_costs
-
-    def _convert_partition_dict_to_sharding_spec(self, dim_partition_list):
-
-        sharding_spec_list = []
-        check_duplicated_list = []
-        for output_dim_partition_dict in dim_partition_list:
-            try:
-                output_sharding_spec = self._generate_sharding_spec(self.output_data, output_dim_partition_dict)
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                break
-            sharding_seq = output_sharding_spec.sharding_sequence
-            if sharding_seq not in check_duplicated_list:
-                check_duplicated_list.append(sharding_seq)
-                sharding_spec_list.append(output_sharding_spec)
-
-        return sharding_spec_list
-
-    def _enumerate_all_possible_output(self, mesh_dim_0, mesh_dim_1):
-        # use mesh_dim_0, mesh_dim_1 instead of constant 0, 1 in here for N-D device mesh scaliablity.
-
-        output_dim_partition_list = []
-        dim_size = self.output_data.dim()
-        # enumerate all the 2D sharding cases
-        sharding_list_2d = enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size)
-        output_dim_partition_list.extend(sharding_list_2d)
-
-        # enumerate all the 1D sharding cases
-        sharding_list_1d_on_dim_0 = enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size)
-        output_dim_partition_list.extend(sharding_list_1d_on_dim_0)
-        sharding_list_1d_on_dim_1 = enumerate_all_possible_1d_sharding(mesh_dim_1, dim_size)
-        output_dim_partition_list.extend(sharding_list_1d_on_dim_1)
-
-        # add empty dict for fully replicated case
-        output_dim_partition_list.append({})
-        output_sharding_spec_list = self._convert_partition_dict_to_sharding_spec(output_dim_partition_list)
-
-        return output_sharding_spec_list
-
-    @ignore_sharding_exception
-    def _register_strategy(self, output_sharding_spec):
-        dim_partition_dict_for_input = output_sharding_spec.dim_partition_dict
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_input)
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_input)
-
-        name = f'{output_sharding_spec.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-        dim_partition_dict_for_output = output_sharding_spec.dim_partition_dict
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the computation cost of this strategy
-        sharding_dims = []
-        for mesh_dims in dim_partition_dict_for_output.values():
-            for mesh_dim in mesh_dims:
-                sharding_dims.append(self.device_mesh.shape[mesh_dim])
-        sharding_size = reduce(operator.mul, sharding_dims, 1)
-        memory_cost = self.output_data.numel() / sharding_size
-        compute_cost = memory_cost
-        communication_cost = 0
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=output_sharding_spec,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    ##############################################
-    #used to generate strategies for torch.matmul#
-    ##############################################
-    @ignore_sharding_exception
-    def _registry_no_split_strategies_for_matmul(self, dim_partition_dict_for_batch_dim):
-        # this dim partition dict only describes the batch dimensions, but in this scenario,
-        # matrix dimensions are fully replicated, so it do not need extra process.
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_batch_dim)
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_batch_dim)
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_batch_dim)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the memory cost of this strategy
-        batch_sharding_dims = []
-        for mesh_dims in dim_partition_dict_for_batch_dim.values():
-            for mesh_dim in mesh_dims:
-                batch_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-        batch_sharding_size = reduce(operator.mul, batch_sharding_dims, 1)
-        # in this case, total_sharding_size is equal to the batch sharding size
-        memory_cost = self.output_data.numel() / batch_sharding_size
-
-        # compute the computation cost of this strategy
-        compute_cost = self._generate_compute_cost(batch_sharding_size)
-
-        # in this case, no communication takes place.
-        # TODO: add all-reduce cost if lhs or rhs is type of Parameters.
-        communication_cost = 0
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def _split_dim_i(self, dim_partition_dict_for_batch_dim, mesh_dim_on_matrix):
-        # A batched matrix multiplication can be viewed as [b, i, k] x [b, k, j] -> [b, i, j]
-        # this dim partition dict describe the batch dimensions, so we should append the matrix dimension sharding info on it.
-        # In this scenario, matrix dimensions will be sharded on 'i' dimension.
-
-        # in this case, the matrix dimensions of lhs is sharded on 'i' dimension.
-        dim_partition_dict_for_lhs = deepcopy(dim_partition_dict_for_batch_dim)
-        dim_partition_dict_for_lhs.update({-2: mesh_dim_on_matrix})
-
-        # in this case, the matrix dimensions of rhs is fully replicated.
-        dim_partition_dict_for_rhs = deepcopy(dim_partition_dict_for_batch_dim)
-
-        # in this case, the matrix dimensions of output is sharded on 'i' dimension.
-
-        dim_partition_dict_for_output = deepcopy(dim_partition_dict_for_batch_dim)
-        dim_partition_dict_for_output.update({-2: mesh_dim_on_matrix})
-
-        # generate sharding specs
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_rhs)
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the memory cost of this strategy
-        total_sharding_dims = []
-
-        # append batch sharding dims
-        for mesh_dims in dim_partition_dict_for_batch_dim.values():
-            for mesh_dim in mesh_dims:
-                total_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-
-        # append the sharding dims on matrix dimension
-        for mesh_dim in mesh_dim_on_matrix:
-            total_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-        total_sharding_size = reduce(operator.mul, total_sharding_dims, 1)
-
-        # in this case, output_data uses all the sharding dims.
-        memory_cost = self.output_data.numel() / total_sharding_size
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # TODO: add all-reduce cost if lhs or rhs is type of Parameters.
-        communication_cost = 0
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def _split_dim_k(self, dim_partition_dict_for_batch_dim, mesh_dim_on_matrix):
-        # A batched matrix multiplication can be viewed as [b, i, k] x [b, k, j] -> [b, i, j]
-        # this dim partition dict describe the batch dimensions, so we should append the matrix dimension sharding info on it.
-        # In this scenario, matrix dimensions will be sharded on 'k' dimension.
-
-        # in this case, the matrix dimensions of lhs is sharded on 'k' dimension.
-        dim_partition_dict_for_lhs = deepcopy(dim_partition_dict_for_batch_dim)
-        dim_partition_dict_for_lhs.update({-1: mesh_dim_on_matrix})
-
-        # in this case, the matrix dimensions of rhs is sharded on 'k' dimension.
-        dim_partition_dict_for_rhs = deepcopy(dim_partition_dict_for_batch_dim)
-        dim_partition_dict_for_rhs.update({-2: mesh_dim_on_matrix})
-
-        # in this case, the matrix dimensions of output is fully replicated.
-        dim_partition_dict_for_output = deepcopy(dim_partition_dict_for_batch_dim)
-
-        # generate sharding specs
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_rhs)
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the memory cost of this strategy
-        total_sharding_dims = []
-        batch_sharding_dims = []
-        # append batch sharding dims
-        for mesh_dims in dim_partition_dict_for_batch_dim.values():
-            for mesh_dim in mesh_dims:
-                total_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-                batch_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-
-        # append the sharding dims on matrix dimension
-        for mesh_dim in mesh_dim_on_matrix:
-            total_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-        batch_sharding_size = reduce(operator.mul, batch_sharding_dims, 1)
-        total_sharding_size = reduce(operator.mul, total_sharding_dims, 1)
-
-        # in this case, output_data is fully replicated on matrix dimensions.
-        memory_cost = self.output_data.numel() / batch_sharding_size
-
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # TODO: add all-reduce cost if lhs or rhs is type of Parameters.
-        # The communication takes place during forward activation computation.
-        if len(mesh_dim_on_matrix) == 1:
-            communication_cost = self.device_mesh.all_reduce_cost(memory_cost, mesh_dim_on_matrix[0])
-        else:
-            communication_cost = self.device_mesh.flatten_device_mesh.all_reduce_cost(memory_cost, 0)
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def _split_dim_j(self, dim_partition_dict_for_batch_dim, mesh_dim_on_matrix):
-        # A batched matrix multiplication can be viewed as [b, i, k] x [b, k, j] -> [b, i, j]
-        # this dim partition dict describe the batch dimensions, so we should append the matrix dimension sharding info on it.
-        # In this scenario, matrix dimensions will be is sharded on 'j' dimension.
-
-        # in this case, the matrix dimensions of lhs is fully replicated.
-        dim_partition_dict_for_lhs = deepcopy(dim_partition_dict_for_batch_dim)
-
-        # in this case, the matrix dimensions of rhs is sharded on 'j' dimension.
-        dim_partition_dict_for_rhs = deepcopy(dim_partition_dict_for_batch_dim)
-        dim_partition_dict_for_rhs.update({-1: mesh_dim_on_matrix})
-
-        # in this case, the matrix dimensions of output is sharded on 'j' dimension.
-        dim_partition_dict_for_output = deepcopy(dim_partition_dict_for_batch_dim)
-        dim_partition_dict_for_output.update({-1: mesh_dim_on_matrix})
-
-        # generate sharding specs
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_rhs)
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the memory cost of this strategy
-        total_sharding_dims = []
-
-        # append batch sharding dims
-        for mesh_dims in dim_partition_dict_for_batch_dim.values():
-            for mesh_dim in mesh_dims:
-                total_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-
-        # append the sharding dims on matrix dimension
-        for mesh_dim in mesh_dim_on_matrix:
-            total_sharding_dims.append(self.device_mesh.shape[mesh_dim])
-        total_sharding_size = reduce(operator.mul, total_sharding_dims, 1)
-
-        # in this case, output_data uses all the sharding dims.
-        memory_cost = self.output_data.numel() / total_sharding_size
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # TODO: add all-reduce cost if lhs or rhs is type of Parameters.
-        # The communication takes place during backward activation computation.
-        if len(mesh_dim_on_matrix) == 1:
-            communication_cost = self.device_mesh.all_reduce_cost(memory_cost, mesh_dim_on_matrix[0])
-        else:
-            communication_cost = self.device_mesh.flatten_device_mesh.all_reduce_cost(memory_cost, 0)
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    def _registry_1d_strategies_for_matmul(self, dim_partition_dict, mesh_dim_list):
-        self._split_dim_i(dim_partition_dict, mesh_dim_list)
-        self._split_dim_k(dim_partition_dict, mesh_dim_list)
-        self._split_dim_j(dim_partition_dict, mesh_dim_list)
-
-    @ignore_sharding_exception
-    def _split_lhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
-        dim_partition_dict_for_lhs = {-2: [mesh_dim_0], -1: [mesh_dim_1]}
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
-
-        dim_partition_dict_for_rhs = {-2: [mesh_dim_1]}
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_rhs)
-
-        dim_partition_dict_for_output = {-2: [mesh_dim_0]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the memory cost of this strategy
-        total_sharding_size = reduce(operator.mul, self.device_mesh.shape, 1)
-        output_sharding_size = reduce(operator.mul, self.output_data.shape, 1)
-        # in this case, output_data uses all the sharding dims.
-        memory_cost = self.output_data.numel() / output_sharding_size
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # TODO: add all-reduce cost if lhs or rhs is type of Parameters.
-        # The communication takes place during forward activation computation.
-        communication_cost = self.device_mesh.all_reduce_cost(memory_cost, mesh_dim_1)
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def _split_rhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
-        dim_partition_dict_for_lhs = {-1: [mesh_dim_0]}
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
-
-        dim_partition_dict_for_rhs = {-2: [mesh_dim_0], -1: [mesh_dim_1]}
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_rhs)
-
-        dim_partition_dict_for_output = {-1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the memory cost of this strategy
-        total_sharding_size = reduce(operator.mul, self.device_mesh.shape, 1)
-        output_sharding_size = reduce(operator.mul, self.output_data.shape, 1)
-        # in this case, output_data uses all the sharding dims.
-        memory_cost = self.output_data.numel() / output_sharding_size
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # TODO: add all-reduce cost if lhs or rhs is type of Parameters.
-        # The communication takes place during forward and backward activation computation.
-        communication_cost_forward_activation = self.device_mesh.all_reduce_cost(memory_cost, mesh_dim_0)
-        communication_cost_backward_activation = self.device_mesh.all_reduce_cost(memory_cost, mesh_dim_1)
-        communication_cost = communication_cost_backward_activation + communication_cost_forward_activation
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def _split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):
-        dim_partition_dict_for_lhs = {-2: [mesh_dim_0]}
-        sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
-
-        dim_partition_dict_for_rhs = {-1: [mesh_dim_1]}
-        sharding_spec_for_rhs = self._generate_sharding_spec(self.rhs_data, dim_partition_dict_for_rhs)
-
-        dim_partition_dict_for_output = {-2: [mesh_dim_0], -1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_lhs.sharding_sequence} x {sharding_spec_for_rhs.sharding_sequence}'
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_lhs, sharding_spec_for_rhs])
-
-        # compute the memory cost of this strategy
-        total_sharding_size = reduce(operator.mul, self.device_mesh.shape, 1)
-        output_sharding_size = reduce(operator.mul, self.output_data.shape, 1)
-        # in this case, output_data uses all the sharding dims.
-        memory_cost = self.output_data.numel() / output_sharding_size
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # TODO: add all-reduce cost if lhs or rhs is type of Parameters.
-        # The communication takes place during backward activation computation.
-        communication_cost = self.device_mesh.all_reduce_cost(memory_cost, mesh_dim_1)
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_lhs, sharding_spec_for_rhs))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    def _registry_2d_strategies_for_matmul(self):
-        self._split_lhs_space_both_contract(0, 1)
-        self._split_lhs_space_both_contract(1, 0)
-        self._split_rhs_space_both_contract(0, 1)
-        self._split_rhs_space_both_contract(1, 0)
-        self._split_lhs_space_rhs_space(0, 1)
-        self._split_lhs_space_rhs_space(1, 0)
-
-    def register_strategy(self) -> StrategiesVector:
-        MESH_DIM_LIST = [0, 1]
-        if self.node.target != torch.matmul:
-            output_sharding_specs = self._enumerate_all_possible_output(MESH_DIM_LIST[0], MESH_DIM_LIST[1])
-            for output_sharding_spec in output_sharding_specs:
-                self._register_strategy(output_sharding_spec)
-        else:
-            # we only care about the non-computing dimensions,
-            # therefore, we omit the last two dimensions.
-            dim_size = self.output_data.dim() - 2
-
-            # Both device mesh axises are uesd on batch dimensions
-            dim_partition_dicts_2d = enumerate_all_possible_2d_sharding(MESH_DIM_LIST[0], MESH_DIM_LIST[1], dim_size)
-            for dim_partition_dict in dim_partition_dicts_2d:
-                self._registry_no_split_strategies_for_matmul(dim_partition_dict)
-
-            # Only one device mesh axis is uesd on batch dimensions
-            for mesh_dim_index in [0, 1]:
-                dim_partition_dicts_1d = enumerate_all_possible_1d_sharding(MESH_DIM_LIST[mesh_dim_index], dim_size)
-                for dim_partition_dict in dim_partition_dicts_1d:
-                    self._registry_no_split_strategies_for_matmul(dim_partition_dict)
-                    self._registry_1d_strategies_for_matmul(dim_partition_dict, [MESH_DIM_LIST[mesh_dim_index - 1]])
-
-            # No device mesh axis is uesd on batch dimensions
-            dim_partition_dict_on_batch_dim = {}
-            self._registry_no_split_strategies_for_matmul(dim_partition_dict_on_batch_dim)
-            self._registry_1d_strategies_for_matmul(dim_partition_dict_on_batch_dim, MESH_DIM_LIST)
-            self._registry_2d_strategies_for_matmul()
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py
deleted file mode 100644
index d8952040dffe..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py
+++ /dev/null
@@ -1,609 +0,0 @@
-import operator
-import warnings
-from functools import reduce
-
-import torch
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['ConvHandler']
-
-
-class ConvHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of Convolution.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.weight = self.module_named_parameters['weight']
-        self.output_data = self.node._meta_data
-        self._sanity_check()
-
-    def _sanity_check(self):
-        '''
-        In sanity check, we need make sure the input data having correct dimension size.
-        For Conv1d, the dim of input data should be 3([N, C, L]).
-        For Conv2d, the dim of input data should be 4([N, C, H, W]).
-        For Conv3d, the dim of input data should be 5([N, C, H, W, D]).
-        '''
-        assert self.input_data.dim() in (3, 4,
-                                         5), f'We suppose the dim of input fed into conv op should in range of [3, 5].'
-
-    def _generate_compute_cost(self, bs, channel_in, channel_out):
-        '''
-        Compute the computation cost per device with this specific strategy.
-
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-
-        Argument:
-            bs(int): Batch size of the input data.
-            channel_in(int): The channel dimension of input data.
-            channel_out(int): The out channel of the conv weight.
-
-        Return:
-            compute_cost(float): Computation cost per device with this specific strategy
-        '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-        # 1D: (L) * N * Cout * Cin * kernel
-        # 2D: (H * W) * N * Cout * Cin * kernel
-        # 3D: (H * W  * D) * N * Cout * Cin * kernel
-        output_size = self.output_data.shape[2:]
-        output_size_product = reduce(operator.mul, output_size, 1)
-        input_size = self.input_data.shape[2:]
-        input_size_product = reduce(operator.mul, input_size, 1)
-        kernel_size = self.weight.shape[2:]
-        kernel_size_product = reduce(operator.mul, kernel_size, 1)
-        forward_compute_cost = output_size_product * bs * channel_in * channel_out * kernel_size_product
-        backward_activation_cost = input_size_product * bs * channel_in * channel_out * kernel_size_product
-        backward_weight_cost = output_size_product * bs * channel_in * channel_out * kernel_size_product
-        compute_cost = forward_compute_cost + backward_activation_cost + backward_weight_cost
-        return compute_cost
-
-    def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_activation, sharding_size_weight):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            sharding_size_forward(int): The forward activation will be divided
-                into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will
-                be divided into sharding_size_backward_activation number partions.
-            sharding_size_weight(int): The backward weight will be divided
-                into sharding_size_weight number partions.
-
-        Return:
-            memory_cost(Tuple[float]): Memory cost per device with this
-                specific strategy, the first element of this tuple is forward
-                memory cost, and the second element of this tuple is backward
-                memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per
-                device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation
-                per device with this specific strategy.
-        '''
-        # compute the memory cost of this strategy
-        dtype = self.input_data.dtype
-        numel_output = self.output_data.numel()
-        numel_input = self.input_data.numel()
-        numel_weight = self.weight.numel()
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # forward memory_cost
-        memory_cost_forward_activation = numel_output * size_per_elem_bytes / sharding_size_forward
-        memory_cost_forward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_forward = memory_cost_forward_activation + memory_cost_forward_weight
-
-        # backward memory_cost
-        memory_cost_backward_activation = numel_input * size_per_elem_bytes / sharding_size_backward_activation
-        memory_cost_backward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_backward = memory_cost_backward_activation + memory_cost_backward_weight
-
-        # memory_cost pair
-        memory_cost = (memory_cost_forward, memory_cost_backward)
-
-        return memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, memory_cost_backward_weight
-
-    @ignore_sharding_exception
-    def split_input_batch_weight_out_channel(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}S{mesh_dim_1} = S{mesh_dim_0}R x RS{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {1: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0] // self.device_mesh.shape[mesh_dim_0]
-        channel_in = self.input_data.shape[1]
-        channel_out = self.weight.shape[1] // self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_1]
-        memory_cost, _, memory_cost_backward_activation, memory_cost_backward_weight = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # This strategy do not need to do all_reduce operation during forward
-        communication_cost_forward = 0
-        # compute the backward communication cost to all reduce the input activation grad
-        communication_cost_backward_activation = self.device_mesh.all_reduce_cost(memory_cost_backward_activation,
-                                                                                  mesh_dim_1)
-        # compute the backward communication cost to all reduce the weight due to data parallel
-        communication_cost_backward_weight = self.device_mesh.all_reduce_cost(memory_cost_backward_weight, mesh_dim_0)
-        # total communication cost
-        communication_cost = communication_cost_forward + communication_cost_backward_activation + communication_cost_backward_weight
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_batch(self, mesh_dim_0):
-        name = f'S{mesh_dim_0}R = S{mesh_dim_0}R x RR'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0] // self.device_mesh.shape[mesh_dim_0]
-        channel_in = self.input_data.shape[1]
-        channel_out = self.weight.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_weight = 1
-        memory_cost, _, _, memory_cost_backward_weight = self._generate_memory_cost(sharding_size_forward,
-                                                                                    sharding_size_backward_activation,
-                                                                                    sharding_size_weight)
-
-        # This strategy do not need to do all_reduce operation in forward phase.
-        communication_cost_forward = 0
-        # compute the backward communication cost to all reduce the weight due to data parallel
-        communication_cost_backward_weight = self.device_mesh.all_reduce_cost(memory_cost_backward_weight, mesh_dim_0)
-        # compute the total cost
-        communication_cost = communication_cost_forward + communication_cost_backward_weight
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_both_dim_weight_in_channel(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}R = S{mesh_dim_0}S{mesh_dim_1} x S{mesh_dim_1}R'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0] // self.device_mesh.shape[mesh_dim_0]
-        channel_in = self.input_data.shape[1] // self.device_mesh.shape[mesh_dim_1]
-        channel_out = self.weight.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_1]
-        memory_cost, memory_cost_forward_activation, _, memory_cost_backward_weight = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # compute the communication cost of this strategy during forward phase
-        communication_cost_forward = self.device_mesh.all_reduce_cost(memory_cost_forward_activation, mesh_dim_1)
-        # This strategy do not need to do all_reduce operation to compute the input activation grad
-        communication_cost_backward_activation = 0
-        # compute the backward communication cost to all reduce the weight due to data parallel
-        communication_cost_backward_weight = self.device_mesh.all_reduce_cost(memory_cost_backward_weight, mesh_dim_0)
-        # compute total cost
-        communication_cost = communication_cost_forward + communication_cost_backward_activation + communication_cost_backward_weight
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_in_channel_weight_both_channel(self, mesh_dim_0, mesh_dim_1):
-        name = f'RS{mesh_dim_1} = RS{mesh_dim_0} x S{mesh_dim_0}S{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {1: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1] // self.device_mesh.shape[mesh_dim_0]
-        channel_out = self.weight.shape[1] // self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, _ = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # compute the communication cost of this strategy during forward phase
-        communication_cost_forward = self.device_mesh.all_reduce_cost(memory_cost_forward_activation, mesh_dim_0)
-        # compute the communication cost of this strategy during backward phase
-        communication_cost_backward = self.device_mesh.all_reduce_cost(memory_cost_backward_activation, mesh_dim_1)
-        communication_cost = communication_cost_forward + communication_cost_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_in_channel_weight_in_channel(self, mesh_dim_0):
-        name = f'RR = RS{mesh_dim_0} x S{mesh_dim_0}R'
-
-        dim_partition_dict_for_input = {1: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1] // self.device_mesh.shape[mesh_dim_0]
-        channel_out = self.weight.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = 1
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0]
-        memory_cost, memory_cost_forward_activation, _, _ = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # compute the communication cost of this strategy during forward phase
-        communication_cost_forward = self.device_mesh.all_reduce_cost(memory_cost_forward_activation, mesh_dim_0)
-        # This strategy do NOT need all_reduce during forward phase
-        communication_cost_backward = 0
-        communication_cost = communication_cost_forward + communication_cost_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_weight_out_channel(self, mesh_dim_0):
-        name = f'RS{mesh_dim_0} = RR x RS{mesh_dim_0}'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {1: [mesh_dim_0]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {1: [mesh_dim_0]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1]
-        channel_out = self.weight.shape[1] // self.device_mesh.shape[mesh_dim_0]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0]
-        sharding_size_backward_activation = 1
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0]
-        memory_cost, _, memory_cost_backward_activation, _ = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # This strategy do not need to do all_reduce during forward phase
-        communication_cost_forward = 0
-        # compute the communication cost of this strategy during backward phase
-        communication_cost_backward = self.device_mesh.all_reduce_cost(memory_cost_backward_activation, mesh_dim_0)
-        communication_cost = communication_cost_forward + communication_cost_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def non_split(self):
-        name = f'RR = RR x RR'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1]
-        channel_out = self.weight.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = 1
-        sharding_size_backward_activation = 1
-        sharding_size_weight = 1
-        memory_cost, _, _, _ = self._generate_memory_cost(sharding_size_forward, sharding_size_backward_activation,
-                                                          sharding_size_weight)
-
-        # This strategy do not need to do all_reduce in both forward and backward phase
-        communication_cost = 0
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_1d_parallel_on_input_batch(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}{mesh_dim_1}R = S{mesh_dim_0}{mesh_dim_1}R x RR'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0] // (self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1])
-        channel_in = self.input_data.shape[1]
-        channel_out = self.weight.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.mesh_shape[mesh_dim_0] * self.device_mesh.mesh_shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.mesh_shape[mesh_dim_0] * self.device_mesh.mesh_shape[
-            mesh_dim_1]
-        sharding_size_weight = 1
-        memory_cost, _, _, memory_cost_backward_weight = self._generate_memory_cost(sharding_size_forward,
-                                                                                    sharding_size_backward_activation,
-                                                                                    sharding_size_weight)
-
-        # This strategy do not need to do all_reduce in forward phase
-        communication_cost_forward = 0
-        # compute the backward communication cost to all reduce the weight due to data parallel
-        communication_cost_backward_weight = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-            memory_cost_backward_weight, 0)
-        # compute the total communication cost
-        communication_cost = communication_cost_backward_weight + communication_cost_forward
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_1d_parallel_on_in_channel(self, mesh_dim_0, mesh_dim_1):
-        name = f'RR = RS{mesh_dim_0}{mesh_dim_1} x S{mesh_dim_0}{mesh_dim_1}R'
-
-        dim_partition_dict_for_input = {1: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        bs = self.input_data.shape[0]
-        channel_in = self.input_data.shape[1] // (self.device_mesh.shape[mesh_dim_0] *
-                                                  self.device_mesh.shape[mesh_dim_1])
-        channel_out = self.weight.shape[1]
-        compute_cost = self._generate_compute_cost(bs, channel_in, channel_out)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = 1
-        sharding_size_backward_activation = self.device_mesh.mesh_shape[mesh_dim_0] * self.device_mesh.mesh_shape[
-            mesh_dim_1]
-        sharding_size_weight = self.device_mesh.mesh_shape[mesh_dim_0] * self.device_mesh.mesh_shape[mesh_dim_1]
-        memory_cost, memory_cost_forward_activation, _, _ = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # compute communication cost during forward phase
-        communication_cost_forward = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-            memory_cost_forward_activation, 0)
-        # This strategy do NOT need do all_reduce during backward phase
-        communication_cost_backward = 0
-        communication_cost = communication_cost_forward + communication_cost_backward
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        '''
-        Generate every possible strategies for a Conv node, and record all strategies into the strategies_vector.
-
-        Example:
-            physical_mesh_id = torch.arange(0, 4)
-            mesh_shape = (2, 2)
-            # [[0, 1]
-            #  [2, 3]]
-            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-            shape_consistency_manager = ShapeConsistencyManager()
-
-            model = ConvModel(16, 32)
-            input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-            # graph():
-            #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-            #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-            #     %conv : [#users=1] = call_module[target=conv](args = (%mul,), kwargs = {})
-            #     return conv
-            graph = tracer.trace(root=model, meta_args=input_sample)
-            gm = GraphModule(model, graph, model.__class__.__name__)
-            gm.recompile()
-            # [x, mul, conv, output]
-            nodes = [node for node in gm.graph.nodes]
-
-            # strategies_for_input = [[R, R, R, R], [R, S0, R, R], [R, S1, R, R], [S0, R, R, R], [S0, S1, R, R], [S1, R, R, R], [S1, S0, R, R]]
-            strategies_vector_for_input = StrategiesVector(node=nodes[0], in_nodes=[nodes[1], 2], strategies=strategies_for_input)
-            setattr(nodes[1], 'strategies_vector', strategies_vector_for_input)
-
-            strategies_vector = StrategiesVector(node=nodes[2], in_nodes=[nodes[1], ])
-            conv_handler = ConvHandler(input_node=nodes[1], input_index=0, weight=dict(gm.named_modules())[nodes[2].name].weight, output_node=nodes[2],
-                                    device_mesh=device_mesh, strategies_vector=strategies_vector, shape_consistency_manager=shape_consistency_manager)
-            conv_handler.register_strategy_into_strategies_vector()
-            for strategy in conv_handler.strategies_vector:
-                print(f'{strategy.name}: compute_cost is {strategy.compute_cost}, communication_cost is {strategy.communication_cost}, memory_cost is {strategy.memory_cost}, resharding_costs is {strategy.resharding_costs}')
-
-        Output:
-            S0S1 = S0R x RS1: compute_cost is 8856576, communication_cost is 0, memory_cost is 492032.0, resharding_costs is {mul: [0, 32769.001, 131074.2, 0, 32769.1, 131074.2, 98307.201]}
-            S1S0 = S1R x RS0: compute_cost is 8856576, communication_cost is 0, memory_cost is 492032.0, resharding_costs is {mul: [0, 131074.2, 32769.001, 131074.2, 98307.201, 0, 32769.1]}
-            S0R = S0R x RR: compute_cost is 17713152, communication_cost is 0, memory_cost is 984064.0, resharding_costs is {mul: [0, 32769.001, 131074.2, 0, 32769.1, 131074.2, 98307.201]}
-            S1R = S1R x RR: compute_cost is 17713152, communication_cost is 0, memory_cost is 984064.0, resharding_costs is {mul: [0, 131074.2, 32769.001, 131074.2, 98307.201, 0, 32769.1]}
-            S0R = S0S1 x S1R: compute_cost is 8856576, communication_cost is 984065.01, memory_cost is 984064.0, resharding_costs is {mul: [0, 65538.002, 0, 0, 0, 65538.002, 196614.402]}
-            S1R = S1S0 x S0R: compute_cost is 8856576, communication_cost is 984065.01, memory_cost is 984064.0, resharding_costs is {mul: [0, 0, 65538.002, 65538.002, 196614.402, 0, 0]}
-            RS1 = RS0 x S0S1: compute_cost is 8856576, communication_cost is 984065.01, memory_cost is 984064.0, resharding_costs is {mul: [0, 0, 131074.2, 32769.001, 98307.201, 131074.2, 32769.1]}
-            RS0 = RS1 x S1S0: compute_cost is 8856576, communication_cost is 984065.01, memory_cost is 984064.0, resharding_costs is {mul: [0, 131074.2, 0, 131074.2, 32769.1, 32769.001, 98307.201]}
-            RR = RS0 x S0R: compute_cost is 17713152, communication_cost is 1968129.01, memory_cost is 1968128, resharding_costs is {mul: [0, 0, 131074.2, 32769.001, 98307.201, 131074.2, 32769.1]}
-            RR = RS1 x S1R: compute_cost is 17713152, communication_cost is 1968129.01, memory_cost is 1968128, resharding_costs is {mul: [0, 131074.2, 0, 131074.2, 32769.1, 32769.001, 98307.201]}
-            RS0 = RR x RS0: compute_cost is 17713152, communication_cost is 0, memory_cost is 984064.0, resharding_costs is {mul: [0, 65537.1, 65537.1, 65537.1, 131075.30000000002, 65537.1, 131075.30000000002]}
-            RS1 = RR x RS1: compute_cost is 17713152, communication_cost is 0, memory_cost is 984064.0, resharding_costs is {mul: [0, 65537.1, 65537.1, 65537.1, 131075.30000000002, 65537.1, 131075.30000000002]}
-            RR = RR x RR: compute_cost is 35426304, communication_cost is 0, memory_cost is 1968128, resharding_costs is {mul: [0, 65537.1, 65537.1, 65537.1, 131075.30000000002, 65537.1, 131075.30000000002]}
-            S01R = S01R x RR: compute_cost is 8856576, communication_cost is 0, memory_cost is 492032.0, resharding_costs is {mul: [0, 65538.002, 262148.4, 0, 16385.001, 262148.4, 196614.402]}
-            RR = RS01 x S01R: compute_cost is 8856576, communication_cost is 0, memory_cost is 1968128, resharding_costs is {mul: [0, 0, 262148.4, 65538.002, 196614.402, 262148.4, 65538.2]}
-        '''
-        # SS = SR x RS
-        self.split_input_batch_weight_out_channel(0, 1)
-        self.split_input_batch_weight_out_channel(1, 0)
-
-        # SR = SR x RR
-        self.split_input_batch(0)
-        self.split_input_batch(1)
-
-        # SR = SS x SR
-        self.split_input_both_dim_weight_in_channel(0, 1)
-        self.split_input_both_dim_weight_in_channel(1, 0)
-
-        # RS = RS x SS
-        self.split_input_in_channel_weight_both_channel(0, 1)
-        self.split_input_in_channel_weight_both_channel(1, 0)
-
-        # RR = RS x SR
-        self.split_input_in_channel_weight_in_channel(0)
-        self.split_input_in_channel_weight_in_channel(1)
-
-        # RS = RR x RS
-        self.split_weight_out_channel(0)
-        self.split_weight_out_channel(1)
-
-        # RR= RR x RR
-        self.non_split()
-
-        # S01R = S01R x RR
-        self.split_1d_parallel_on_input_batch(0, 1)
-
-        # RR = RS01 x S01R
-        self.split_1d_parallel_on_in_channel(0, 1)
-
-        return self.strategies_vector
-
-
-CONV_STRATEGIES_LIST = [
-    'S0S1 = S0R x RS1', 'S1S0 = S1R x RS0', 'S0R = S0R x RR', 'S1R = S1R x RR', 'S0R = S0S1 x S1R', 'S1R = S1S0 x S0R',
-    'RS1 = RS0 x S0S1', 'RS0 = RS1 x S1S0', 'RR = RS0 x S0R', 'RR = RS1 x S1R', 'RS0 = RR x RS0', 'RS1 = RR x RS1',
-    'RR = RR x RR', 'S01R = S01R x RR', 'RR = RS01 x S01R'
-]
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py
deleted file mode 100644
index 1f2281cc4172..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py
+++ /dev/null
@@ -1,756 +0,0 @@
-import operator
-from enum import Enum
-from functools import reduce
-from typing import List
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-
-from ..constants import LINEAR_FUNC_OP, LINEAR_MODULE_OP
-from .operator_handler import OperatorHandler
-from .strategy_generator import IntermediateStrategy, StrategyGenerator
-
-__all__ = ['DotHandler']
-
-
-class DotProductStrategyGenerator(StrategyGenerator):
-    """
-    DotProductStrategyGenerator is used to generate the sharding strategies for two 1D tensors in dot product computation.
-    This is created for torch.matmul where two tensors are 1D tensors. As torch.matmul does not include a bias argument, so we
-    do not consider bias here.
-    """
-
-    def validate(self, input, other):
-        assert input.dim() == 1 and other.dim() == 1
-
-    def no_split(self):
-        name = f'R = R dot R'
-        dim_partition_dict = {"input": {}, "other": {}, "output": {}}
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_one_dim(self, mesh_dim):
-        name = f'S{mesh_dim} = S{mesh_dim} dot S{mesh_dim}'
-        dim_partition_dict = {"input": {0: [mesh_dim]}, "other": {0: [mesh_dim]}, "output": {}}
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict, all_reduce_axis=[mesh_dim])
-
-    def generate(self) -> List[IntermediateStrategy]:
-        strategy_list = []
-
-        # do not split dimensions for dot product
-        # R = R dot R
-        strategy_list.append(self.no_split())
-
-        # split two tensors in the same dimensions
-        # S = S dot S
-        strategy_list.append(self.split_one_dim(0))
-        strategy_list.append(self.split_one_dim(1))
-
-        return strategy_list
-
-
-class MatVecStrategyGenerator(StrategyGenerator):
-
-    def validate(self, input, other) -> bool:
-        assert input.dim() > 1 and other.dim() == 1
-
-    def no_split(self):
-        name = "R = R x R"
-        dim_partition_dict = {"input": {}, "other": {}, "output": {}}
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_input_batch(self, mesh_dim):
-        name = f'S{mesh_dim}R = S{mesh_dim}R x R'
-        dim_partition_dict = {"input": {0: [mesh_dim]}, "other": {}, "output": {0: [mesh_dim]}}
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def generate(self) -> List[IntermediateStrategy]:
-        strategy_list = []
-
-        # no split
-        strategy_list.append(self.no_split())
-
-        # split the batch dim for the first tensor only
-        strategy_list.append(self.split_input_batch(0))
-        strategy_list.append(self.split_input_batch(1))
-
-        return strategy_list
-
-
-class MatMulStrategyGenerator(StrategyGenerator):
-    """
-    MatMulStrategyGenerator is used to generate the sharding strategies when the second tensor is
-    a 2D tensor. This is used for nn.Linear, F.linear, torch.matmul and torch.addmm.
-
-    A matmul can be formulated as [n, p] x [p, q] = [n, q]
-
-    Args:
-        is_linear (bool): whether this generator is used for nn.Linear and F.linear.
-            This will incur extra transformation of the dim partitioning as the weight is transposed.
-    """
-
-    def __init__(self, is_linear: bool, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.is_linear = is_linear
-
-        # as the weight for the linear module is transposed, we can compute
-        # the correponding dimension indexfor convenience
-        if is_linear:
-            self.dim_q = 0
-            self.dim_p = 1
-        else:
-            self.dim_q = 1
-            self.dim_p = 0
-
-    def validate(self, input, other, bias) -> bool:
-        # make sure the second tensor is a 2D tensor
-        assert input.dim() > 0 and other.dim() == 2
-
-        # make sure bias is of the same dimension
-        if self.is_linear:
-            assert bias is None or bias.shape[-1] == other.shape[0]
-        else:
-            assert bias is None or bias.shape[-1] == other.shape[1]
-
-    def split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):
-        # handle case SS = SR x RS
-        name = f'S{mesh_dim_0}S{mesh_dim_1} = S{mesh_dim_0}R x RS{mesh_dim_1}'
-
-        dim_partition_dict = {
-            "input": {
-                0: [mesh_dim_0]
-            },
-            "other": {
-                self.dim_q: [mesh_dim_1]
-            },
-            "bias": {
-                -1: [mesh_dim_1]
-            },
-            "output": {
-                0: [mesh_dim_0],
-                -1: [mesh_dim_1]
-            },
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_lhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
-        # handle the case SR = SS x SR
-        name = f'S{mesh_dim_0}R = S{mesh_dim_0}S{mesh_dim_1} x S{mesh_dim_1}R'
-        dim_partition_dict = {
-            "input": {
-                0: [mesh_dim_0],
-                -1: [mesh_dim_1]
-            },
-            "other": {
-                self.dim_p: [mesh_dim_1]
-            },
-            "bias": {},
-            "output": {
-                0: [mesh_dim_0]
-            },
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict, all_reduce_axis=[mesh_dim_1])
-
-    def split_rhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
-        name = f'RS{mesh_dim_1} = RS{mesh_dim_0} x S{mesh_dim_0}S{mesh_dim_1}'
-        dim_partition_dict = {
-            "input": {
-                -1: [mesh_dim_0]
-            },
-            "other": {
-                self.dim_p: [mesh_dim_0],
-                self.dim_q: [mesh_dim_1]
-            },
-            "bias": {
-                -1: [mesh_dim_1]
-            },
-            "output": {
-                -1: [mesh_dim_1]
-            },
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def recompute_split_both_contract(self, mesh_dim):
-        name = f'RR = RS{mesh_dim} x S{mesh_dim}R'
-        dim_partition_dict = {
-            "input": {
-                -1: [mesh_dim]
-            },
-            "other": {
-                self.dim_p: [mesh_dim]
-            },
-            "bias": {},
-            "output": {},
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict, all_reduce_axis=[mesh_dim])
-
-    def split_rhs_space_only(self, mesh_dim):
-        name = f'RS{mesh_dim} = RR x RS{mesh_dim}'
-        dim_partition_dict = {
-            "input": {},
-            "other": {
-                self.dim_q: [mesh_dim]
-            },
-            "bias": {
-                -1: [mesh_dim]
-            },
-            "output": {
-                -1: [mesh_dim]
-            },
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict, all_reduce_axis=[mesh_dim])
-
-    def split_lhs_1st_dim_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}{mesh_dim_1}R = S{mesh_dim_0}{mesh_dim_1}R x RR'
-        dim_partition_dict = {
-            "input": {
-                0: [mesh_dim_0, mesh_dim_1]
-            },
-            "other": {},
-            "bias": {},
-            "output": {
-                0: [mesh_dim_0, mesh_dim_1]
-            },
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_lhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'RR = RS{mesh_dim_0}{mesh_dim_1} x S{mesh_dim_0}{mesh_dim_1}R'
-        dim_partition_dict = {
-            "input": {
-                -1: [mesh_dim_0, mesh_dim_1]
-            },
-            "other": {
-                self.dim_p: [mesh_dim_0, mesh_dim_1]
-            },
-            "bias": {},
-            "output": {},
-        }
-        return IntermediateStrategy(name=name,
-                                    dim_partition_dict=dim_partition_dict,
-                                    all_reduce_axis=[mesh_dim_0, mesh_dim_1])
-
-    def split_rhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'RS{mesh_dim_0}{mesh_dim_1} = RR x RS{mesh_dim_0}{mesh_dim_1}'
-
-        dim_partition_dict = {
-            "input": {},
-            "other": {
-                self.dim_q: [mesh_dim_0, mesh_dim_1]
-            },
-            "bias": {
-                -1: [mesh_dim_0, mesh_dim_1]
-            },
-            "output": {
-                -1: [mesh_dim_0, mesh_dim_1]
-            },
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-
-class BatchedMatMulStrategyGenerator(StrategyGenerator):
-    """
-    Generate sharding strategies for the batched matrix multiplication.
-
-    A batched matrix multiplication can be viewed as
-    [b, i, k] x [b, k, j] -> [b, i, j]
-    """
-
-    def __init__(self, is_torch_bmm: bool, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.is_torch_bmm = is_torch_bmm
-
-    def validate(self, input, other, bias) -> bool:
-        if self.is_torch_bmm:
-            assert input.shape == other.shape
-            assert input.dim() > 2
-            assert other.shape[-1] == bias.shape[0]
-        else:
-            # TODO: validate these inputs are broadcastable
-            pass
-
-    def split_one_batch_dim(self):
-        if 1 in self.device_mesh.mesh_shape:
-            mesh_dim = self.device_mesh.mesh_shape.index(1)
-            name = f'Sb{mesh_dim} = Sb{mesh_dim} x Sb{mesh_dim}'
-            dim_partition_dict = {
-                "input": {
-                    0: [mesh_dim]
-                },
-                "other": {
-                    0: [mesh_dim]
-                },
-                "bias": {},
-                "output": {
-                    0: [mesh_dim]
-                }
-            }
-            return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-        else:
-            return None
-
-    def split_two_batch_dim(self, mesh_dim_0, mesh_dim_1):
-        name = f'Sb{mesh_dim_0}{mesh_dim_1} = Sb{mesh_dim_0}{mesh_dim_1} x Sb{mesh_dim_0}{mesh_dim_1}'
-        dim_partition_dict = {
-            "input": {
-                0: [mesh_dim_0, mesh_dim_1]
-            },
-            "other": {
-                0: [mesh_dim_0, mesh_dim_1]
-            },
-            "bias": {},
-            "output": {
-                0: [mesh_dim_0, mesh_dim_1]
-            }
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_one_batch_dim(self, mesh_dim):
-        name = f'Sb{mesh_dim} = Sb{mesh_dim} x Sb{mesh_dim}'
-        dim_partition_dict = {"input": {0: [mesh_dim]}, "other": {0: [mesh_dim]}, "bias": {}, "output": {0: [mesh_dim]}}
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_batch_dim_lhs_space(self, mesh_dim_0, mesh_dim_1):
-        name = f'Sb{mesh_dim_0}Si{mesh_dim_1} = Sb{mesh_dim_0}Si{mesh_dim_1} x Sb{mesh_dim_0}'
-        dim_partition_dict = {
-            "input": {
-                0: [mesh_dim_0],
-                -2: [mesh_dim_1]
-            },
-            "other": {
-                0: [mesh_dim_0]
-            },
-            "bias": {},
-            "output": {
-                0: mesh_dim_0,
-                -2: [mesh_dim_1]
-            }
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_batch_dim_rhs_space(self, mesh_dim_0, mesh_dim_1):
-        name = f'Sb{mesh_dim_0}Sj{mesh_dim_1} = Sb{mesh_dim_0}R x Sb{mesh_dim_0}Sj{mesh_dim_1}'
-        dim_partition_dict = {
-            "input": {
-                0: [mesh_dim_0]
-            },
-            "other": {
-                0: [mesh_dim_0],
-                -1: [mesh_dim_1]
-            },
-            "bias": {
-                -1: [mesh_dim_1]
-            },
-            "output": {
-                0: [mesh_dim_0],
-                -1: [mesh_dim_1]
-            }
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict)
-
-    def split_batch_dim_both_contract(self, mesh_dim_0, mesh_dim_1):
-        name = f'Sb{mesh_dim_0}R = Sb{mesh_dim_0}Sk{mesh_dim_1} x Sb{mesh_dim_0}Sk{mesh_dim_1}'
-        dim_partition_dict = {
-            "input": {
-                0: [mesh_dim_0],
-                -1: [mesh_dim_1]
-            },
-            "other": {
-                0: [mesh_dim_0],
-                -2: [mesh_dim_1]
-            },
-            "bias": {},
-            "output": {
-                0: [mesh_dim_0],
-                -2: [mesh_dim_1]
-            }
-        }
-        return IntermediateStrategy(name=name, dim_partition_dict=dim_partition_dict, all_reduce_axis=[mesh_dim_1])
-
-    def generate(self) -> List[IntermediateStrategy]:
-        strategy_list = []
-
-        # split only the batch dimension
-        # Sb = Sb x Sb
-        # can be None as it is only for 1D device mesh
-        strategy = self.split_one_batch_dim()
-        if strategy:
-            strategy_list.append(strategy)
-
-        # split batch dim of two inputs and the i dim of the first tensor
-        # SbSi = SbSi x Sb
-        strategy_list.append(self.split_batch_dim_lhs_space(0, 1))
-        strategy_list.append(self.split_batch_dim_lhs_space(1, 0))
-
-        # split batch dim of two inputs and the j of the second tensor
-        # SbSj = Sb x SbSj
-        strategy_list.append(self.split_batch_dim_rhs_space(0, 1))
-        strategy_list.append(self.split_batch_dim_rhs_space(1, 0))
-
-        # split batch dim of two inputs and the k dim of two inputs
-        # Sb = SbSk x SbSk, need to all-reduce by k dim
-        strategy_list.append(self.split_batch_dim_both_contract(0, 1))
-        strategy_list.append(self.split_batch_dim_both_contract(1, 0))
-
-        # split two batch dim
-        strategy_list.append(self.split_two_batch_dim(0, 1))
-        strategy_list.append(self.split_two_batch_dim(1, 0))
-
-        return strategy_list
-
-
-class DotHandler(OperatorHandler):
-    """
-    A OperatorHandler which deals with the sharding strategies for nn.Linear and F.linear.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.weight = self.module_named_parameters['weight']
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, input_shape, weight_shape, total_sharding_size):
-        # TODO: consider bias addition
-        compute_cost = reduce(operator.mul, input_shape) * weight_shape[0] * 2 // total_sharding_size
-        return compute_cost
-
-    @ignore_sharding_exception
-    def split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):
-        # handle case SS = SR x RS
-        name = f'S{mesh_dim_0}S{mesh_dim_1} = S{mesh_dim_0}R x RS{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        # linear layer weight is transposed during init
-        dim_partition_dict_for_weight = {0: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_input)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute computation cost
-        total_sharding_size = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-
-        # compute the communication cost
-        communication_cost_activation_backward = self.device_mesh.all_reduce_cost(activation_memory_cost, mesh_dim_1)
-        communication_cost_weight_backward = self.device_mesh.all_reduce_cost(weight_memory_cost, mesh_dim_0)
-        communication_cost = communication_cost_activation_backward + communication_cost_weight_backward
-
-        # create and register strategy
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_lhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
-        # handle the case SR = SS x SR
-        name = f'S{mesh_dim_0}R = S{mesh_dim_0}S{mesh_dim_1} x S{mesh_dim_1}R'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        # since weight of the linear layer is transposed
-        # the actual dim to be sharded is 1
-        dim_partition_dict_for_weight = {1: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-
-        # compute the communication cost of this strategy
-        communication_cost_activation_forward = self.device_mesh.all_reduce_cost(activation_memory_cost, mesh_dim_1)
-        communication_cost_grad_backward = self.device_mesh.all_reduce_cost(weight_memory_cost, mesh_dim_0)
-        communication_cost = communication_cost_activation_forward + communication_cost_grad_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_rhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
-        name = f'RS{mesh_dim_1} = RS{mesh_dim_0} x S{mesh_dim_0}S{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {1: [mesh_dim_0]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_input)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-
-        # compute the communication cost of this strategy
-        communication_cost_activation_forward = self.device_mesh.all_reduce_cost(activation_memory_cost, mesh_dim_0)
-        communication_cost_activation_backward = self.device_mesh.all_reduce_cost(input_grad_memory_cost, mesh_dim_1)
-        communication_cost = communication_cost_activation_backward + communication_cost_activation_forward
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def recompute_split_both_contract(self, mesh_dim):
-        name = f'RR = RS{mesh_dim} x S{mesh_dim}R'
-
-        dim_partition_dict_for_input = {1: [mesh_dim]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {1: [mesh_dim]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[mesh_dim]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-
-        # compute the communication cost of this strategy
-        communication_cost = self.device_mesh.all_reduce_cost(activation_memory_cost, mesh_dim)
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_rhs_space_only(self, mesh_dim):
-        name = f'RS{mesh_dim} = RR x RS{mesh_dim}'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {1: [mesh_dim]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[mesh_dim]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-
-        # compute the communication cost of this strategy
-        communication_cost_activation_backward = self.device_mesh.all_reduce_cost(input_grad_memory_cost, mesh_dim)
-        communication_cost = communication_cost_activation_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_lhs_1st_dim_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}{mesh_dim_1}R = S{mesh_dim_0}{mesh_dim_1}R x RR'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-
-        # compute the communication cost of this strategy
-        communication_cost_weight_backward = self.device_mesh.flatten_device_mesh.all_reduce_cost(weight_memory_cost, 0)
-        communication_cost = communication_cost_weight_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_lhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'RR = RS{mesh_dim_0}{mesh_dim_1} x S{mesh_dim_0}{mesh_dim_1}R'
-
-        dim_partition_dict_for_input = {1: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-
-        # compute the communication cost of this strategy
-        communication_cost_forward_activation = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-            activation_memory_cost, 0)
-        communication_cost = communication_cost_forward_activation
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_rhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
-        name = f'RS{mesh_dim_0}{mesh_dim_1} = RR x RS{mesh_dim_0}{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {1: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {1: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        compute_cost = self._generate_compute_cost(self.input_data.shape, self.weight.shape, total_sharding_size)
-
-        # compute the memory cost of this strategy
-        toatl_memory_cost, activation_memory_cost, weight_memory_cost, input_grad_memory_cost = self._generate_memory_cost(
-            dim_partition_dict_for_output, dim_partition_dict_for_weight, dim_partition_dict_for_input)
-        # compute the communication cost of this strategy
-        communication_cost_activation_backward = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-            input_grad_memory_cost, 0)
-        communication_cost = communication_cost_activation_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=toatl_memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        '''
-        Generate every possible strategies for a linear node, and record all strategies into the strategies_vector.
-
-        Output:
-
-        '''
-        # SS = SR x RS
-        self.split_lhs_space_rhs_space(0, 1)
-        self.split_lhs_space_rhs_space(1, 0)
-
-        # SR = SS x SR
-        self.split_lhs_space_both_contract(0, 1)
-        self.split_lhs_space_both_contract(1, 0)
-
-        # RS = RS x SS
-        self.split_rhs_space_both_contract(0, 1)
-        self.split_rhs_space_both_contract(1, 0)
-
-        # RR= RS x SR
-        self.recompute_split_both_contract(0)
-        self.recompute_split_both_contract(1)
-
-        # RS = RR x RS
-        self.split_rhs_space_only(0)
-        self.split_rhs_space_only(1)
-
-        # S01R = S01R x RR
-        self.split_lhs_1st_dim_1d(0, 1)
-
-        # RR = RS01 x S01R
-        self.split_lhs_2nd_dim_1d(0, 1)
-
-        # RS01 = RR x RS01
-        self.split_rhs_2nd_dim_1d(0, 1)
-
-        return self.strategies_vector
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
deleted file mode 100644
index d01a487ad673..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
+++ /dev/null
@@ -1,179 +0,0 @@
-import operator
-import warnings
-from copy import deepcopy
-from functools import reduce
-from typing import Dict, List
-
-import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['EmbeddingHandler']
-
-
-class EmbeddingHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of Embedding operators(such as nn.embedding).
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.weight = self.module_named_parameters['weight']
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, total_sharding_size):
-        input_shape = self.input_data.shape
-        weight_shape = self.weight.shape
-        input_shape_product = reduce(operator.mul, input_shape, 1)
-        weight_shape_product = reduce(operator.mul, weight_shape, 1)
-        compute_cost = input_shape_product * weight_shape_product * 2 / total_sharding_size
-        return compute_cost
-
-    def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_activation, sharding_size_weight):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            sharding_size_forward(int): The forward activation will be divided
-                into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will 
-                be divided into sharding_size_backward_activation number partions.
-            sharding_size_weight(int): The backward weight will be divided
-                into sharding_size_weight number partions.
-
-        Return:
-            memory_cost(Tuple[float]): Memory cost per device with this 
-                specific strategy, the first element of this tuple is forward
-                memory cost, and the second element of this tuple is backward
-                memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per 
-                device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation 
-                per device with this specific strategy.
-        '''
-        # compute the memory cost of this strategy
-        dtype = self.input_data.dtype
-        numel_output = self.output_data.numel()
-        numel_input = self.input_data.numel()
-        numel_weight = self.weight.numel()
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # forward memory_cost
-        memory_cost_forward_activation = numel_output * size_per_elem_bytes / sharding_size_forward
-        memory_cost_forward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_forward = memory_cost_forward_activation + memory_cost_forward_weight
-
-        # backward memory_cost
-        memory_cost_backward_activation = numel_input * size_per_elem_bytes / sharding_size_backward_activation
-        memory_cost_backward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_backward = memory_cost_backward_activation + memory_cost_backward_weight
-
-        # memory_cost pair
-        memory_cost = (memory_cost_forward, memory_cost_backward)
-
-        return memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, memory_cost_backward_weight
-
-    @ignore_sharding_exception
-    def split_weight_both_dim(self, mesh_dim_0, mesh_dim_1):
-        name = f'RRS{mesh_dim_1} = RR x S{mesh_dim_0}S{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {2: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[0] * self.device_mesh.shape[1]
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = 1
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, _ = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # compute the communication cost of this strategy during forward phase
-        communication_cost_forward = self.device_mesh.all_reduce_cost(memory_cost_forward_activation, mesh_dim_0)
-        # compute the communication cost of this strategy during backward phase
-        communication_cost_backward = self.device_mesh.all_reduce_cost(memory_cost_backward_activation, mesh_dim_1)
-        communication_cost = communication_cost_forward + communication_cost_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_both_dim(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}S{mesh_dim_1}R = S{mesh_dim_0}S{mesh_dim_1} x RR'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[0] * self.device_mesh.shape[1]
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_weight = 1
-        memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, memory_cost_backward_weight = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # This strategy do not need to do all_reduce during forward phase
-        communication_cost_forward = 0
-        # compute the communication cost of this strategy during backward phase
-        communication_cost_backward_activation = 0
-        communication_cost_backward_weight = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-            memory_cost_backward_weight, 0)
-        communication_cost_backward = communication_cost_backward_activation + communication_cost_backward_weight
-        communication_cost = communication_cost_forward + communication_cost_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        '''
-        Generate every possible strategies for a Conv node, and record all strategies into the strategies_vector.
-        '''
-        # RRS = RR x SS
-        self.split_weight_both_dim(0, 1)
-        self.split_weight_both_dim(1, 0)
-
-        # SSR = SS x RR
-        self.split_input_both_dim(0, 1)
-        self.split_input_both_dim(1, 0)
-
-        return self.strategies_vector
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py
deleted file mode 100644
index 8062d0f4babf..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py
+++ /dev/null
@@ -1,241 +0,0 @@
-import operator
-from functools import reduce
-
-import torch
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import (
-    enumerate_all_possible_1d_sharding,
-    enumerate_all_possible_2d_sharding,
-    generate_sharding_size,
-    ignore_sharding_exception,
-)
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['LayerNormHandler']
-
-
-class LayerNormHandler(OperatorHandler):
-    """
-    A OperatorHandler which deals with the sharding strategies of normalization.
-
-    Note: To keep the math consistency, LayerNorm do not allow shards on hidden dimension.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.weight = self.module_named_parameters['weight']
-        self.bias = self.module_named_parameters['bias']
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, total_sharding_size):
-        '''
-        Compute the computation cost per device with this specific strategy.
-
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-
-        Argument:
-            bs(int): Batch size of the input data.
-            channel_in(int): The channel dimension of input data.
-
-        Return:
-            compute_cost(float): Computation cost per device with this specific strategy
-        '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-        # TODO: a constant coefficient need to be added.
-
-        norm_kernel_size = self.weight.shape
-        # in LayerNorm context, batch dimensions mean all the dimensions do not join the normalization.
-        input_batch_shape = self.input_data.shape[:-len(norm_kernel_size)]
-        input_batch_product = reduce(operator.mul, input_batch_shape, 1)
-        norm_kernel_product = reduce(operator.mul, norm_kernel_size, 1)
-        forward_compute_cost = input_batch_product * norm_kernel_product / total_sharding_size
-        backward_activation_compute_cost = input_batch_product * norm_kernel_product / total_sharding_size
-        # To compute gradient of on norm kernel element requires input_batch_product times computation, so
-        # the total cost is input_batch_product * norm_kernel_product
-        backward_weight_compute_cost = input_batch_product * norm_kernel_product / total_sharding_size
-        backward_compute_cost = backward_activation_compute_cost + backward_weight_compute_cost
-        compute_cost = forward_compute_cost + backward_compute_cost
-        return compute_cost
-
-    def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_activation, sharding_size_weight):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            sharding_size_forward(int): The forward activation will be divided
-                into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will
-                be divided into sharding_size_backward_activation number partions.
-            sharding_size_weight(int): The backward weight will be divided
-                into sharding_size_weight number partions.
-
-        Return:
-            memory_cost(Tuple[float]): Memory cost per device with this
-                specific strategy, the first element of this tuple is forward
-                memory cost, and the second element of this tuple is backward
-                memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per
-                device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation
-                per device with this specific strategy.
-        '''
-        # compute the memory cost of this strategy
-        dtype = self.input_data.dtype
-        numel_output = self.output_data.numel()
-        # this operation will not change the shape of input
-        numel_input = numel_output
-        numel_weight = self.weight.numel()
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # forward memory_cost
-        memory_cost_forward_activation = numel_output * size_per_elem_bytes / sharding_size_forward
-        memory_cost_forward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_forward = memory_cost_forward_activation + memory_cost_forward_weight
-
-        # backward memory_cost
-        memory_cost_backward_activation = numel_input * size_per_elem_bytes / sharding_size_backward_activation
-        memory_cost_backward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_backward = memory_cost_backward_activation + memory_cost_backward_weight
-
-        # memory_cost pair
-        memory_cost = (memory_cost_forward, memory_cost_backward)
-
-        return memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, memory_cost_backward_weight
-
-    def _generate_strategy_with_dim_partition(self, dim_partition):
-        dim_partition_dict_for_input = dim_partition
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = dim_partition
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_input.sharding_sequence} x {sharding_spec_for_weight.sharding_sequence}'
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        total_sharding_size = generate_sharding_size(dim_partition, self.device_mesh)
-        # compute the computation cost of this strategy
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = generate_sharding_size(dim_partition_dict_for_input, self.device_mesh)
-        sharding_size_backward_activation = generate_sharding_size(dim_partition_dict_for_output, self.device_mesh)
-        sharding_size_weight = generate_sharding_size(dim_partition_dict_for_weight, self.device_mesh)
-        memory_cost, _, _, memory_cost_backward_weight = self._generate_memory_cost(sharding_size_forward,
-                                                                                    sharding_size_backward_activation,
-                                                                                    sharding_size_weight)
-
-        total_mesh_dim_list = []
-        for mesh_dim_list in dim_partition.values():
-            total_mesh_dim_list.extend(mesh_dim_list)
-
-        # This strategy do not need to do all_reduce operation for activation
-        communication_cost_forward_activation = 0
-        communication_cost_backward_activation = 0
-        if len(total_mesh_dim_list) == 1:
-            communication_cost_backward_weight = self.device_mesh.all_reduce_cost(memory_cost_backward_weight,
-                                                                                  total_mesh_dim_list[0])
-        else:
-            assert len(total_mesh_dim_list) == 2, f'temporally we just support 2d device mesh.'
-            communication_cost_backward_weight = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-                memory_cost_backward_weight, 0)
-        communication_cost = communication_cost_forward_activation + communication_cost_backward_activation + communication_cost_backward_weight
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_batch_single_mesh_dim(self, mesh_dim_0):
-        batch_dimension_length = self.input_data.dim() - self.weight.dim()
-        dim_partition_list = enumerate_all_possible_1d_sharding(mesh_dim_0, batch_dimension_length)
-        for dim_partition in dim_partition_list:
-            self._generate_strategy_with_dim_partition(dim_partition)
-
-    @ignore_sharding_exception
-    def split_input_batch_both_mesh_dim(self, mesh_dim_0, mesh_dim_1):
-        batch_dimension_length = self.input_data.dim() - self.weight.dim()
-        dim_partition_list = enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, batch_dimension_length)
-        for dim_partition in dim_partition_list:
-            self._generate_strategy_with_dim_partition(dim_partition)
-
-    @ignore_sharding_exception
-    def non_split(self):
-        name = f'RR = RR x R'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        total_sharding_size = 1
-        # compute the computation cost of this strategy
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = 1
-        sharding_size_backward_activation = 1
-        sharding_size_weight = 1
-        memory_cost, _, _, _ = self._generate_memory_cost(sharding_size_forward, sharding_size_backward_activation,
-                                                          sharding_size_weight)
-
-        # This strategy do not need to do all_reduce operation
-        communication_cost = 0
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        '''
-        Generate every possible strategies for a BatchNorm node, and record all strategies into the strategies_vector.
-
-        Example:
-            norm_handler = BatchNormHandler(node,  strategies_vector,
-                                               self.shape_consistency_manager)
-            norm_handler.register_strategy()
-            for strategy in norm_handler.strategies_vector:
-                print(f'{strategy.name}, computation_cost: {strategy.compute_cost}, memory_cost: {strategy.memory_cost}')
-
-        Output:
-            RS0 = RS0 x S0, computation_cost: 131072, memory_cost: 524288.0
-            RS1 = RS1 x S1, computation_cost: 131072, memory_cost: 524288.0
-            RR = RR x R, computation_cost: 262144, memory_cost: 1048576
-            RS01 = RS01 x S01, computation_cost: 65536, memory_cost: 262144.0
-        '''
-
-        # SR = SR x R with single mesh dim on batch dimensions
-        self.split_input_batch_single_mesh_dim(0)
-        self.split_input_batch_single_mesh_dim(1)
-
-        # SR = SR x R with both mesh dims on batch dimensions
-        self.split_input_batch_both_mesh_dim(0, 1)
-
-        # RR = RR x R
-        self.non_split()
-
-        return self.strategies_vector
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py
deleted file mode 100644
index b120cc16b04b..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py
+++ /dev/null
@@ -1,149 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Dict, List
-from webbrowser import Opera
-
-import torch
-import torch.nn as nn
-from torch.fx.node import Node
-
-from colossalai.auto_parallel.tensor_shard.deprecated.constants import *
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .._utils import generate_resharding_costs, generate_sharding_spec
-from ..sharding_strategy import StrategiesVector
-
-__all__ = ['OperatorHandler']
-
-
-class OperatorHandler(ABC):
-    '''
-    The OperatorHandler is an abstract class used to generate every possible strategies for an operator node.
-
-    Args:
-        node (Node): the input node in node argument list.
-        device_mesh (DeviceMesh): A logical view of a physical mesh.
-        strategies_vector (StrategiesVector): all the strategies generated in this handler will be recorded into the strategies_vector.
-        handle_backward (Optional[bool]): whether to consider the backward pass. The default value is True. False can be used for inference.
-    '''
-
-    def __init__(self,
-                 node: Node,
-                 device_mesh: DeviceMesh,
-                 strategies_vector: StrategiesVector,
-                 handle_backward: bool = True):
-        self.node = node
-        self.predecessor_node = list(node._input_nodes.keys())
-        self.successor_node = list(node.users.keys())
-        self.device_mesh = device_mesh
-        self.strategies_vector = strategies_vector
-        self.handle_backward = handle_backward
-
-        # find the module and its parameters associated with this node
-        # this can be used to compute the compute/communication/sharding cost
-        if self.node.op == 'call_module':
-            module = node.graph.owning_module.get_submodule(node.target)
-            named_parameters = list(module.named_parameters(recurse=False))
-            # convert named parameters from list to dict
-            named_parameters = {k: v for k, v in named_parameters}
-        elif self.node.op == 'call_function' and self.node.target not in NON_PARAM_FUNC_OP:
-            module = None
-            parameters = list(self.node.args)[1]
-            if isinstance(parameters, Node):
-                named_parameters = {'weight': parameters._meta_data}
-            else:
-                named_parameters = {}
-        else:
-            module = None
-            named_parameters = None
-        self.module = module
-        self.module_named_parameters = named_parameters
-
-    @abstractmethod
-    def register_strategy(self) -> StrategiesVector:
-        """
-        Register
-        """
-        pass
-
-    def _generate_memory_cost(self, dim_partition_dict_for_output, dim_partition_dict_for_weight,
-                              sharding_spec_for_input):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            dim_partition_dict_for_output(List[int]): The key is the dimension of output to be sharded,
-                and the value of the key decribe which logical axis will be sharded in that dimension.
-            dim_partition_dict_for_weight(List[int]): The key is the dimension of weight to be sharded,
-                and the value of the key decribe which logical axis will be sharded in that dimension.
-        Return:
-            total_memory_cost(float): total memory cost per device with this specific strategy
-            activation_cost(float): the memory cost of activation per device with this specific strategy
-            weight_memory_cost(float): the memory cost of weight per device with this specific strategy
-        '''
-        # compute the size of one element with specific dtype
-        dtype = self.input_data.dtype
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # compute the memory cost of activation
-        activation_numel = self.output_data.numel()
-        output_mesh_dims = []
-        for sharding_dim, mesh_dims in dim_partition_dict_for_output.items():
-            output_mesh_dims.extend(mesh_dims)
-        activation_sharding_size = 1
-        for mesh_dim in output_mesh_dims:
-            activation_sharding_size *= self.device_mesh.shape[mesh_dim]
-        activation_memory_cost = activation_numel / activation_sharding_size * size_per_elem_bytes
-
-        # compute the memory cost of weight
-        weight_numel = self.weight.numel()
-        weight_sharding_size = 1
-        weight_mesh_dims = []
-        for sharding_dim, mesh_dims in dim_partition_dict_for_weight.items():
-            weight_mesh_dims.extend(mesh_dims)
-        for mesh_dim in weight_mesh_dims:
-            weight_sharding_size *= self.device_mesh.shape[mesh_dim]
-        weight_memory_cost = weight_numel / weight_sharding_size * size_per_elem_bytes
-
-        # compute the memory cost of input grad
-        input_grad_numel = self.input_data.numel()
-        input_grad_sharding_size = 1
-        input_grad_mesh_dims = []
-        for sharding_dim, mesh_dims in sharding_spec_for_input.items():
-            input_grad_mesh_dims.extend(mesh_dims)
-        for mesh_dim in input_grad_mesh_dims:
-            input_grad_sharding_size *= self.device_mesh.shape[mesh_dim]
-        input_grad_memory_cost = input_grad_numel / input_grad_sharding_size * size_per_elem_bytes
-
-        memory_cost_forward = activation_memory_cost + weight_memory_cost
-        memory_cost_backward = input_grad_memory_cost + weight_memory_cost
-
-        return (memory_cost_forward,
-                memory_cost_backward), activation_memory_cost, weight_memory_cost, input_grad_memory_cost
-
-    def _generate_resharding_costs(self, sharding_specs):
-        # The resharding_cost of weight is counted due to sharing weight cases.
-        if hasattr(self.node._meta_data, 'dtype'):
-            dtype = self.node._meta_data.dtype
-        else:
-            assert isinstance(self.node._meta_data,
-                              tuple), f'Only torch.Tensor, torch.fx.Node and tuple of torch.Tensor is expected'
-            dtype = self.node._meta_data[0].dtype
-
-        nodes = self.predecessor_node
-        return generate_resharding_costs(nodes=nodes,
-                                         sharding_specs=sharding_specs,
-                                         count_backward=self.handle_backward,
-                                         dtype=dtype)
-
-    def _generate_sharding_spec(self, input_: torch.Tensor, dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
-        return generate_sharding_spec(input_=input_,
-                                      device_mesh=self.device_mesh,
-                                      dim_partition_dict=dim_partition_dict)
-
-    @abstractmethod
-    def _generate_compute_cost(self, *args, **kwargs):
-        """
-        Compute the flops involved in the node.
-        """
-        pass
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py
deleted file mode 100644
index d4ccc8a9c323..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import colorsys
-import math
-import warnings
-from copy import deepcopy
-
-import torch
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from ..constants import INFINITY_COST
-from .operator_handler import OperatorHandler
-
-
-class ReshapeHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of Reshape Operator, such as torch.reshape, torch.flatten, etc.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, *args, **kwargs):
-        return super()._generate_compute_cost(*args, **kwargs)
-
-    @ignore_sharding_exception
-    def register_strategy(self):
-        # TODO: add strategies with more output sharding specs other than only fully replicated.
-        input_node = self.strategies_vector.predecessor_nodes[0]
-        # For reshape function, to keep the computing correctness we keep the sharding
-        # spec of input is fully replicated. In addition, we will keep the output in
-        # replica status and let the successor node choose the way to resharding the
-        # output node. Therefore, the different strategies of input node with same
-        # output sharding spec will generate same strategy for reshape function.
-        sharding_spec_checklist = []
-        for strategy in input_node.strategies_vector:
-            # It looks a little bit confusing, the input of the processing node
-            # is the output of the input_node.
-            input_sharding_spec = strategy.output_sharding_spec
-            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-            if input_sharding_spec in sharding_spec_checklist:
-                continue
-            sharding_spec_checklist.append(input_sharding_spec)
-            dim_partition_dict_for_output = {}
-            if isinstance(self.output_data, tuple):
-                dim_partition_dict_for_output = [{} for _ in range(len(self.output_data))]
-            try:
-                if isinstance(self.output_data, tuple):
-                    output_sharding_spec = []
-                    for output, dim_partition_dict in zip(self.output_data, dim_partition_dict_for_output):
-                        output_sharding_spec.append(self._generate_sharding_spec(output, dim_partition_dict))
-                else:
-                    output_sharding_spec = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                continue
-            name = f'{input_sharding_spec.sharding_sequence} -> FULLY REPLICATED'
-            # TODO: use meta_info_prop to profile memory cost and compute cost
-            compute_cost = 0
-            # consider node._meta_data is in type of tuple
-            memory_cost = 0
-
-            # compute the communication cost, in reshape op, the communication happens during casting the input sharding spec to fully replicating.
-            dim_partition_dict_for_replicate_input = {}
-            replicate_input_sharding_spec = self._generate_sharding_spec(self.input_data,
-                                                                         dim_partition_dict_for_replicate_input)
-            # shape consistency manager is a singleton class
-            shape_consistency_manager = ShapeConsistencyManager()
-            _, _, communication_cost = shape_consistency_manager.shape_consistency(input_sharding_spec,
-                                                                                   replicate_input_sharding_spec)
-            communication_cost = communication_cost["total"]
-
-            # generate resharding cost
-            resharding_costs = self._generate_resharding_costs([input_sharding_spec])
-
-            # to prevent the resharding happening, set their resharding cost to inf.
-            resharding_costs[input_node] = [0 if cost == 0 else INFINITY_COST for cost in resharding_costs[input_node]]
-            sharding_strategy = ShardingStrategy(name,
-                                                 output_sharding_spec,
-                                                 compute_cost=compute_cost,
-                                                 communication_cost=communication_cost,
-                                                 memory_cost=memory_cost,
-                                                 resharding_costs=resharding_costs,
-                                                 input_shardings=[input_sharding_spec])
-            self.strategies_vector.append(sharding_strategy)
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
deleted file mode 100644
index 4e39fcd8e82d..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from dataclasses import dataclass
-from abc import ABC, abstractmethod
-from typing import List, Dict
-from colossalai.device.device_mesh import DeviceMesh
-
-__all__ = ['IntermediateStrategy', 'StrategyGenerator']
-
-
-@dataclass
-class IntermediateStrategy:
-    """
-    IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is 
-    to store the essential information regarding the tensor sharding and leave other meta information to OperatorHandler.
-
-    Args:
-        name (str): name of the sharding strategy.
-        dim_partition_dict (Dict[Dict]): stores the tensor to dim partition dict mapping.
-        all_reduce_dims (List[int]): stores the dimensions which require an all-reduce operation.
-    """
-    name: str
-    dim_partition_dict: Dict[str, Dict[int, List[int]]]
-    all_reduce_axis: List[int] = None
-
-
-class StrategyGenerator(ABC):
-    """
-    StrategyGenerator is used to generate the same group of sharding strategies. 
-    """
-
-    def __init__(self, device_mesh: DeviceMesh):
-        self.device_mesh = device_mesh
-
-    @abstractmethod
-    def generate(self) -> List[IntermediateStrategy]:
-        """
-        """
-        pass
-
-    @abstractmethod
-    def validate(self, *args, **kwargs) -> bool:
-        """
-        Validate if the operands are of desired shape. 
-        If True, means this generator can be used for the current operation.
-        """
-        pass
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/unary_elementwise_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/unary_elementwise_handler.py
deleted file mode 100644
index c929d2fade98..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/unary_elementwise_handler.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import math
-import operator
-import warnings
-from copy import deepcopy
-from functools import reduce
-from typing import Dict, List
-
-import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.constants import \
-    INFINITY_COST
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['UnaryElementwiseHandler']
-
-
-class UnaryElementwiseHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of UnaryElementwiseOp.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if self.node.op == 'call_module':
-            target = self.node.target
-            submod = self.node.graph.owning_module.get_submodule(target)
-            submod_type = type(submod)
-            if submod_type == torch.nn.Dropout:
-                print(f'predecessor nodes of dropout node are {self.predecessor_node}')
-        input_nodes_len = 0
-        for check_node in self.predecessor_node:
-            if isinstance(check_node._meta_data, torch.Tensor):
-                input_nodes_len += 1
-        assert input_nodes_len == 1, f'Temporally, we just support single input element-wise op, node name is {self.node}, node args is {self.node.args}.'
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.input_node = self.predecessor_node[0]
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, *args, **kwargs):
-        return super()._generate_compute_cost(*args, **kwargs)
-
-    @ignore_sharding_exception
-    def register_strategy(self):
-        # TODO: integrate element-wise func and module together
-        # create sharding strategy for element-wise function
-
-        # For element-wise function, we keep the sharding spec of output node same as
-        # the input. Therefore, the different strategies of input node with same
-        # output sharding spec will generate same strategy for element-wise function.
-
-        for index, strategy in enumerate(self.input_node.strategies_vector):
-            # It looks a little bit confusing, the input of the processing node
-            # is the output of the input_node.
-            input_sharding_spec = strategy.output_sharding_spec
-            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-
-            dim_partition_dict = deepcopy(input_sharding_spec.dim_partition_dict)
-            try:
-                output_sharding_spec = self._generate_sharding_spec(self.output_data, dim_partition_dict)
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                continue
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{input_sharding_spec.sharding_sequence} -> {output_sharding_spec.sharding_sequence}_{index}'
-            # TODO: use meta_info_prop to profile memory cost and compute cost
-            compute_cost = self.output_data.numel()
-            memory_cost = 0
-
-            resharding_costs = self._generate_resharding_costs([input_sharding_spec])
-
-            # to prevent the resharding happening, set their resharding cost to inf.
-            resharding_costs[self.input_node] = [
-                0 if cost == 0 else INFINITY_COST for cost in resharding_costs[self.input_node]
-            ]
-            sharding_strategy = ShardingStrategy(name,
-                                                 output_sharding_spec,
-                                                 compute_cost=compute_cost,
-                                                 memory_cost=memory_cost,
-                                                 resharding_costs=resharding_costs,
-                                                 input_shardings=[input_sharding_spec])
-            self.strategies_vector.append(sharding_strategy)
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
deleted file mode 100644
index 6991e913d463..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import operator
-import warnings
-from copy import deepcopy
-from functools import reduce
-from typing import Dict, List
-
-import torch
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import (enumerate_all_possible_1d_sharding,
-                                                                     enumerate_all_possible_2d_sharding,
-                                                                     ignore_sharding_exception)
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['WhereHandler']
-
-
-class WhereHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of torch.where.
-    """
-
-    def __init__(self, *args, **kwargs):
-        # TODO: x or y could be scalar
-        super().__init__(*args, **kwargs)
-        assert len(self.predecessor_node) == 3
-        self.condition_data = self.predecessor_node[0]._meta_data
-        self.x_data = self.predecessor_node[1]._meta_data
-        self.y_data = self.predecessor_node[2]._meta_data
-        self.condition = self.predecessor_node[0]
-        self.x = self.predecessor_node[1]
-        self.y = self.predecessor_node[2]
-        self.output_data = self.node._meta_data
-
-    def _generate_sharding_spec(self, input_: torch.Tensor, dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
-        shape = list(input_.shape)
-
-        # padding the shape to the same length as output_data
-        while len(shape) < self.output_data.dim():
-            shape.insert(0, 1)
-        shape = torch.Size(shape)
-
-        # if the sharding happens on a size one dimension, we should record it as R.
-        processed_dim_partition_dict = deepcopy(dim_partition_dict)
-        for dim_index, _ in dim_partition_dict.items():
-            if shape[dim_index] == 1:
-                processed_dim_partition_dict.pop(dim_index)
-        for dim_index, sharding_index_list in processed_dim_partition_dict.items():
-            sharding_list = [self.device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
-            sharding_size = reduce(operator.mul, sharding_list, 1)
-            assert shape[
-                dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
-        sharding_spec = ShardingSpec(device_mesh=self.device_mesh,
-                                     entire_shape=shape,
-                                     dim_partition_dict=processed_dim_partition_dict)
-
-        return sharding_spec
-
-    def _generate_compute_cost(self, total_sharding_size):
-        lhs_matrix_shape = self.lhs_data.shape[-2:]
-        rhs_matrix_shape = self.rhs_data.shape[-2:]
-        batch_dimensions_shape = self.output_data.shape[:-2]
-        batch_dimensions_product = reduce(operator.mul, batch_dimensions_shape, 1)
-        compute_cost = reduce(
-            operator.mul, lhs_matrix_shape) * rhs_matrix_shape[0] * batch_dimensions_product * 2 / total_sharding_size
-        return compute_cost
-
-    def _generate_resharding_costs(self, sharding_specs):
-        # The resharding_cost of weight is counted due to sharing weight cases.
-        dtype = self.node._meta_data.dtype
-        nodes = self.predecessor_node
-        resharding_costs = {}
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # shape consistency manager is a singleton class
-        shape_consistency_manager = ShapeConsistencyManager()
-
-        for input_node, input_spec in zip(nodes, sharding_specs):
-            resharding_costs[input_node] = []
-            for strategy in input_node.strategies_vector:
-                input_sharding_spec = strategy.output_sharding_spec
-                assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-                # if the input shape is smaller than the target input, we will fill the input to the same length as target.
-                # Then, use the padded input sharding spec to compute the resharding cost.
-                if len(input_sharding_spec.entire_shape) < len(input_spec.entire_shape):
-                    new_entire_shape = list(input_sharding_spec.entire_shape)
-                    while len(new_entire_shape) < len(input_spec.entire_shape):
-                        new_entire_shape.insert(0, 1)
-                    new_entire_shape = torch.Size(new_entire_shape)
-                    new_device_mesh = input_sharding_spec.device_mesh
-                    new_dim_partition_dict = input_sharding_spec.dim_partition_dict
-                    input_sharding_spec = ShardingSpec(device_mesh=new_device_mesh,
-                                                       entire_shape=new_entire_shape,
-                                                       dim_partition_dict=new_dim_partition_dict)
-
-                # compute the resharding cost
-                _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
-                    input_sharding_spec, input_spec)
-                total_resharding_cost = total_resharding_cost['total']
-                # we need multiply the size of elem dtype to get correct communication cost
-                resharding_cost = total_resharding_cost * size_per_elem_bytes
-                resharding_costs[input_node].append(resharding_cost)
-
-        return resharding_costs
-
-    def _convert_partition_dict_to_sharding_spec(self, dim_partition_list):
-
-        sharding_spec_list = []
-        check_duplicated_list = []
-        for output_dim_partition_dict in dim_partition_list:
-            try:
-                output_sharding_spec = self._generate_sharding_spec(self.output_data, output_dim_partition_dict)
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                break
-            sharding_seq = output_sharding_spec.sharding_sequence
-            if sharding_seq not in check_duplicated_list:
-                check_duplicated_list.append(sharding_seq)
-                sharding_spec_list.append(output_sharding_spec)
-
-        return sharding_spec_list
-
-    def _enumerate_all_possible_output(self, mesh_dim_0, mesh_dim_1):
-        # use mesh_dim_0, mesh_dim_1 instead of constant 0, 1 in here for N-D device mesh scaliablity.
-
-        output_dim_partition_list = []
-        dim_size = self.output_data.dim()
-        # enumerate all the 2D sharding cases
-        sharding_list_2d = enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size)
-        output_dim_partition_list.extend(sharding_list_2d)
-
-        # enumerate all the 1D sharding cases
-        sharding_list_1d_on_dim_0 = enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size)
-        output_dim_partition_list.extend(sharding_list_1d_on_dim_0)
-        sharding_list_1d_on_dim_1 = enumerate_all_possible_1d_sharding(mesh_dim_1, dim_size)
-        output_dim_partition_list.extend(sharding_list_1d_on_dim_1)
-
-        # add empty dict for fully replicated case
-        output_dim_partition_list.append({})
-        output_sharding_spec_list = self._convert_partition_dict_to_sharding_spec(output_dim_partition_list)
-
-        return output_sharding_spec_list
-
-    @ignore_sharding_exception
-    def _register_strategy(self, output_sharding_spec):
-        dim_partition_dict_for_input = output_sharding_spec.dim_partition_dict
-        sharding_spec_for_condition = self._generate_sharding_spec(self.condition_data, dim_partition_dict_for_input)
-        sharding_spec_for_x = self._generate_sharding_spec(self.x_data, dim_partition_dict_for_input)
-        sharding_spec_for_y = self._generate_sharding_spec(self.y_data, dim_partition_dict_for_input)
-
-        name = f'{output_sharding_spec.sharding_sequence} = {sharding_spec_for_condition.sharding_sequence} x {sharding_spec_for_x.sharding_sequence} x {sharding_spec_for_y.sharding_sequence}'
-        dim_partition_dict_for_output = output_sharding_spec.dim_partition_dict
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs(
-            [sharding_spec_for_condition, sharding_spec_for_x, sharding_spec_for_y])
-
-        # compute the computation cost of this strategy
-        sharding_dims = []
-        for mesh_dims in dim_partition_dict_for_output.values():
-            for mesh_dim in mesh_dims:
-                sharding_dims.append(self.device_mesh.shape[mesh_dim])
-        sharding_size = reduce(operator.mul, sharding_dims, 1)
-        memory_cost = self.output_data.numel() / sharding_size
-        compute_cost = memory_cost
-        communication_cost = 0
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=output_sharding_spec,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_condition, sharding_spec_for_x,
-                                                                sharding_spec_for_y))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        MESH_DIM_LIST = [0, 1]
-        output_sharding_specs = self._enumerate_all_possible_output(MESH_DIM_LIST[0], MESH_DIM_LIST[1])
-        for output_sharding_spec in output_sharding_specs:
-            self._register_strategy(output_sharding_spec)
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/options.py b/colossalai/auto_parallel/tensor_shard/deprecated/options.py
deleted file mode 100644
index 2d34f5c6447e..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/options.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from dataclasses import dataclass
-
-__all__ = ['SolverOptions']
-
-
-@dataclass
-class SolverOptions:
-    """
-    SolverOptions is a dataclass used to configure the preferences for the parallel execution plan search.
-    """
-    fast: bool = False
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/sharding_strategy.py b/colossalai/auto_parallel/tensor_shard/deprecated/sharding_strategy.py
deleted file mode 100644
index d468c858e9a9..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/sharding_strategy.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from copy import deepcopy
-from dataclasses import dataclass
-from abc import ABC, abstractmethod
-from enum import Enum
-import operator
-import torch
-from functools import reduce
-
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.sharding_spec import ShardingSpec
-from colossalai.tensor.shape_consistency import CollectiveCommPattern, CommSpec
-from typing import Dict, List, Union, Tuple, Any
-from torch.fx.node import Node
-from .constants import *
-
-__all__ = ['ShardingStrategy', 'StrategiesVector']
-
-
-@dataclass
-class ShardingStrategy:
-    '''
-    ShardingStrategy is a structure containing sharding strategies of inputs and output of this node
-    and costs information using in solver.
-
-    Argument:
-        name(str): express the sharding strategies in string, such as 'S0S1 = S0R x RS1'.
-        output_sharding_spec(ShardingSpec): ShardingSpec of the output node.
-        compute_cost(float): Computation cost to complete this strategy.(default to 0)
-        communication_cost(float): Communication cost to complete this strategy.(default to 0)
-        memory_cost(float): Memory cost of the output node using this strategy.(default to 0)
-        resharding_costs(Dict[int, List[float]]): resharding_cost[i][j] means the cost of i-th argument in the output node argument list
-                                                  with j-th strategy in its strategies_vector transforms to sharding spec wanted in this
-                                                  strategy.(default to None)
-        input_shardings(List(ShardingSpec)): The ShardingSpecs of the input nodes.
-    '''
-
-    name: str
-    # TODO: output of fx node,such as torch.var_mean, could be a tuple, so we cannot simply suppose it is a tensor.
-    output_sharding_spec: Union[ShardingSpec, Tuple[ShardingSpec]]
-    compute_cost: float = 0.
-    communication_cost: float = 0.
-    memory_cost: float = 0.
-    resharding_costs: Dict[Node, List[float]] = None
-    # sometimes the input node could be a tuple of nodes, but most of op won't accept tuple of node as input.
-    # Therefore, we could process them at the specific op(operator.getitem)
-    input_shardings: List[ShardingSpec] = None
-
-
-class StrategiesVector(list):
-    '''
-    Each node in fx graph will have a corresponding StrategiesVector, to store all the possible
-    strategies of the node.
-
-    Argument:
-        node (Node): node for which the list of sharding strategies are generated.
-    '''
-
-    def __init__(self, node: Node):
-        super().__init__()
-        self.node = node
-        # fetch its input and output nodes
-        # TODO: placeholder input nodes
-        self.predecessor_nodes = list(node._input_nodes.keys())
-        if self.node.op == 'output':
-            self.predecessor_nodes = list(node._input_nodes.keys())[:1]
-        self.successor_nodes = list(node.users.keys())
-
-    def check_merge(self):
-        merge_label = False
-        if self.node.op == 'call_module':
-            target = self.node.target
-            root_module = self.node.graph.owning_module
-            submod = root_module.get_submodule(target)
-            submod_type = type(submod)
-            # merge elementwise module node into source nodes
-            # we could merge element-wise op, because the output sharding spec is always same as the input sharding spec.
-            if submod_type in ELEMENTWISE_MODULE_OP:
-                merge_label = True
-
-        if self.node.op == 'call_function':
-            # we could merge element-wise op, because the output sharding spec is always same as the input sharding spec.
-            if self.node.target in ELEMENTWISE_FUNC_OP:
-                merge_label = True
-            # we could merge bcast op if the rhs is a scalar, because it will fall back to the element-wise case.
-            if self.node.target in BCAST_FUNC_OP and len(self.predecessor_nodes) == 1:
-                merge_label = True
-            # we could merge reshape op, because the output sharding spec of reshape op is always fully replicated.
-            if self.node.target in RESHAPE_FUNC_OP:
-                merge_label = True
-
-        return merge_label
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/solver.py b/colossalai/auto_parallel/tensor_shard/deprecated/solver.py
deleted file mode 100644
index 4c1d2f3bed5a..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/solver.py
+++ /dev/null
@@ -1,469 +0,0 @@
-import multiprocessing
-import time
-import warnings
-from typing import Dict
-
-import numpy as np
-from torch.fx.graph import Graph
-from torch.fx.node import Node
-
-from .constants import INFINITY_COST
-from .cost_graph import CostGraph
-from .graph_analysis import GraphAnalyser
-from .strategies_constructor import StrategiesConstructor
-
-try:
-    import pulp
-    from pulp import LpMinimize, LpProblem, LpStatus, LpVariable, lpDot, lpSum
-except:
-    warnings.warn(f'please install the pulp')
-
-__all___ = ['Solver']
-
-
-class Solver:
-
-    def __init__(self,
-                 graph: Graph,
-                 strategies_constructor: StrategiesConstructor,
-                 cost_graph: CostGraph,
-                 graph_analyser: GraphAnalyser,
-                 memory_budget: float = -1.0,
-                 solution_numbers: int = 1,
-                 memory_increasing_coefficient: float = 1.3):
-        '''
-        Solver class will integrate information provided by the components and use ILP solver to find a possible optimal strategies combination for target computing graph.
-
-        Argument:
-            graph: The computing graph to be optimized.
-            strategies_constructor: It will provide all the possible strategies for each node in the computing graph.
-            cost_graph: A graph data structure to simplify the edge cost graph.
-            graph_analyser: graph_analyser will analyse the graph to obtain the variable liveness information, which will be used to generate memory constraints.
-            memory_budget: Memory constraint for the solution.
-            solution_numbers: If solution_numbers is larger than one, solver will us a serious of solutions based on different memory budget.
-            memory_increasing_coefficient: If solution_numbers is larger than one, we will use this coefficient to generate new memory budget.
-        '''
-        self.graph = graph
-        self.strategies_constructor = strategies_constructor
-        self.cost_graph = cost_graph
-        self.graph_analyser = graph_analyser
-        self.leaf_strategies = self.strategies_constructor.leaf_strategies
-        self.nodes = [strategies_vector.node for strategies_vector in self.leaf_strategies]
-        self.strategy_map = self.strategies_constructor.strategy_map
-        self.memory_budget = memory_budget
-        self.solution_numbers = solution_numbers
-        if self.solution_numbers > 1:
-            self.memory_increasing_coefficient = memory_increasing_coefficient
-        else:
-            self.memory_increasing_coefficient = 1
-        self.liveness_list = self.graph_analyser.liveness_analysis()
-        self.node_index_dict = self._generate_node_index_dict()
-        # The last solution vector of auto sharding.
-        self.last_s_val = None
-        # The last objective value of the best ILP solution.
-        self.last_objective = None
-
-    def _recover_merged_node_strategy(self):
-        '''
-        During cost graph constructing, some nodes, such as unary element-wise node or ReshapeOp, were merged into the previous node.
-        Therefore, the index of those strategies are copied from the previous node. This method is used to recover the strategy index of those merged
-        node.
-        '''
-        for node_index, node in enumerate(self.nodes):
-            if node.strategies_vector.check_merge():
-                # the merged node has only one input, and its strategies follow the input sharding strategy
-                input_strategies_vector = node.args[0].strategies_vector
-                input_best_strategy_index = self.last_s_val[node_index - 1]
-                input_sharding_spec = input_strategies_vector[input_best_strategy_index].output_sharding_spec
-                for strategy_index, strategy in enumerate(node.strategies_vector):
-                    if strategy.input_shardings[0].sharding_sequence == input_sharding_spec.sharding_sequence:
-                        self.last_s_val[node_index] = strategy_index
-                        break
-
-    def _generate_node_index_dict(self) -> Dict[Node, int]:
-        node_index_dict = {}
-        for index, strategies_vector in enumerate(self.leaf_strategies):
-            node_index_dict[strategies_vector.node] = index
-        return node_index_dict
-
-    def _prepare_data_for_solver(self):
-        '''
-        Extract information from components for solver.
-        '''
-        node_nums = len(self.leaf_strategies)
-        memory_budget = self.memory_budget
-
-        # prepare strategies_len
-        strategies_len = []
-        for node in self.nodes:
-            strategies_len.append(self.cost_graph.node_lens[node])
-        strategies_len = np.array(strategies_len)
-
-        # prepare following_nodes
-        following_nodes = self.cost_graph.following_dict
-        index_following_nodes = {}
-        for src, target in following_nodes.items():
-            src_index = self.node_index_dict[src]
-            target_index = self.node_index_dict[target]
-            index_following_nodes[src_index] = target_index
-        following_nodes = index_following_nodes
-        for index in range(node_nums):
-            if index not in following_nodes:
-                following_nodes[index] = -1
-
-        # prepare edge_pairs and resharding costs
-        edge_pairs = []
-        resharding_costs = []
-        for pairs, edge_cost in self.cost_graph.edge_costs.items():
-            src_node = pairs[0]
-            dst_node = pairs[1]
-            src_node_index = self.node_index_dict[src_node]
-            dst_node_index = self.node_index_dict[dst_node]
-            edge_pairs.append(src_node_index)
-            edge_pairs.append(dst_node_index)
-
-            for i in range(strategies_len[src_node_index]):
-                for j in range(strategies_len[dst_node_index]):
-                    resharding_costs.append(edge_cost[(i, j)])
-        edge_pairs = np.array(edge_pairs)
-        resharding_costs = np.array(resharding_costs)
-
-        # prepare liveness_set
-        liveness_set = self.liveness_list
-
-        # omit alias_set now
-        alias_set = None
-        alias_convert_costs = None
-
-        # prepare compute_costs, communication_costs and memory_costs
-        compute_costs = []
-        communication_costs = []
-        memory_costs = []
-        extra_node_costs = self.cost_graph.extra_node_costs
-        for strategies_vector in self.leaf_strategies:
-            node = strategies_vector.node
-            for index, strategy in enumerate(strategies_vector):
-                compute_costs.append(strategy.compute_cost)
-                # node in extra_node_costs means it has some extra communication
-                # cost from node merging, so we need to add those extra communication
-                # cost into
-                if node in extra_node_costs:
-                    origin_communication_cost = strategy.communication_cost
-                    extra_node_cost = extra_node_costs[node][index]
-                    communication_cost = origin_communication_cost + extra_node_cost
-                    communication_costs.append(communication_cost)
-                else:
-                    communication_costs.append(strategy.communication_cost)
-                # temporarily we just consider the forward memory cost
-                memory_cost = strategy.memory_cost
-                if isinstance(memory_cost, tuple):
-                    memory_costs.append(memory_cost[0])
-                else:
-                    memory_costs.append(memory_cost)
-        compute_costs = np.array(compute_costs)
-        communication_costs = np.array(communication_costs)
-        memory_costs = np.array(memory_costs)
-
-        # omit initial value for nodes
-        s_init_np = None
-
-        return node_nums, memory_budget, strategies_len, following_nodes, edge_pairs, alias_set, liveness_set, compute_costs, communication_costs, memory_costs, resharding_costs, alias_convert_costs, s_init_np
-
-    def _call_solver_serialized_args(self,
-                                     node_nums,
-                                     memory_budget,
-                                     strategies_len,
-                                     following_nodes,
-                                     edge_pairs,
-                                     alias_set,
-                                     liveness_set,
-                                     compute_costs,
-                                     communication_costs,
-                                     memory_costs,
-                                     resharding_costs,
-                                     alias_convert_costs,
-                                     s_init_np=None):
-        """
-        Call the solver with serialized arguments.
-        """
-
-        tic = time.time()
-
-        for x in [strategies_len, edge_pairs, compute_costs, communication_costs, memory_costs, resharding_costs]:
-            assert isinstance(x, np.ndarray)
-        assert len(strategies_len) == node_nums, "strategies_len"
-
-        def get_non_zero_index(binary_vector):
-            """
-            Get the index of non-zero item in a vector.
-            """
-            ct = 0
-            ret = None
-            for i, elem in enumerate(binary_vector):
-                if pulp.value(elem):
-                    ret = i
-                    ct += 1
-
-            assert ct == 1
-            return ret
-
-        # 0. Unpack flatten numpy arrays
-        s_follow = following_nodes
-
-        E = edge_pairs.reshape((-1, 2))    # noqa
-        r = []
-        pt = 0
-        edge_set = set()
-        for (i, j) in E:
-            prod_length = strategies_len[i] * strategies_len[j]
-
-            if (i, j) in edge_set:
-                raise ValueError(f"Duplicated edges: {(i, j)}")
-
-            edge_set.add((i, j))
-            r.append(resharding_costs[pt:pt + prod_length])
-            pt += prod_length
-        assert pt == len(resharding_costs)
-
-        ######################
-        # omit alias set now #
-        ######################
-
-        # A = alias_set.reshape((-1, 2))  # noqa
-        # for (i, j) in A:
-        #     prod_length = strategies_len[i] * strategies_len[j]
-        #     v.append(alias_convert_costs[pt:pt + prod_length])
-        #     pt += prod_length
-        # assert pt == len(alias_convert_costs)
-
-        # L = []  # noqa
-        # pt = node_nums
-        # for i in range(node_nums):
-        #     length = liveness_set[i]
-        #     L.append(liveness_set[pt:pt + length])
-        #     pt += length
-        # assert pt == len(liveness_set)
-        v = []
-        pt = 0
-
-        c = []
-        d = []
-        m = []
-        pt = 0
-        for i in range(node_nums):
-            length = strategies_len[i]
-            c.append(compute_costs[pt:pt + length])
-            d.append(communication_costs[pt:pt + length])
-            m.append(memory_costs[pt:pt + length])
-            pt += length
-        assert pt == len(compute_costs), f"{pt} == {len(compute_costs)}"
-        assert pt == len(communication_costs), f"{pt} == {len(communication_costs)}"
-        assert pt == len(memory_costs), f"{pt} == {len(memory_costs)}"
-
-        # 1. Create variables
-
-        #############################
-        # create variables for node #
-        #############################
-        s = []
-        num_nodes = 0
-        reverse_follow_backpatch = []
-        for i in range(node_nums):
-            if s_follow[i] < 0:
-                if strategies_len[i] == 1:
-                    s.append([1])
-                else:
-                    num_nodes += 1
-                    s.append(LpVariable.matrix(f"s[{i}]", (range(strategies_len[i]),), cat="Binary"))
-            else:
-                if s_follow[i] < len(s):
-                    s.append(s[s_follow[i]])
-                else:
-                    s.append(None)
-                    reverse_follow_backpatch.append(i)
-
-        for i in reverse_follow_backpatch:
-            s[i] = s[s_follow[i]]
-
-        #############################
-        # create variables for edge #
-        #############################
-        e = []
-        num_edges = 0
-        for (idx, (i, j)) in enumerate(E):
-            if len(s[i]) == 1:
-                e.append(s[j])
-            elif len(s[j]) == 1:
-                e.append(s[i])
-            else:
-                num_edges += 1
-                e.append(LpVariable.matrix(f"e[{i},{j}]", (range(len(s[i]) * len(s[j])),), cat="Binary"))
-            assert len(e[idx]) == len(r[idx])
-        for element in s:
-            assert len(element) > 0
-        # 2. Set initial value
-        ######################################
-        # set a initial value for warm start #
-        ######################################
-        if s_init_np is not None:
-            s_init = s_init_np.reshape((-1, 3))
-            for (idx, value, fix) in s_init:
-                for i in range(len(s[idx])):
-                    s[idx][i].setInitialValue(i == value)
-                    if fix:
-                        s[idx][i].fixValue()
-
-        # 3. Objective
-        prob = LpProblem("myProblem", LpMinimize)
-        ###################################################################
-        # computing the node cost(computing cost and communication cost)  #
-        ###################################################################
-        obj = 0
-        for i in range(node_nums):
-            assert len(s[i]) == len(c[i])
-            assert len(s[i]) == len(d[i])
-
-            obj += lpDot(s[i], c[i]) + lpDot(s[i], d[i])
-
-        #############################################
-        # computing the edge cost(resharding cost)  #
-        #############################################
-        for i in range(len(E)):
-            assert len(e[i]) == len(r[i])
-            obj += lpDot(e[i], r[i])
-
-        prob += obj
-
-        # 4. Constraints
-        # (a). specified by `cat="Binary"`
-
-        # (b)
-        #################################################
-        # make sure each node only choose one strategy  #
-        #################################################
-        for i in range(node_nums):
-            if s_follow[i] < 0:
-                prob += lpSum(s[i]) == 1
-
-        # (c)
-        #################################################
-        # compute memory consumption with liveness set  #
-        #################################################
-        if memory_budget > 0:
-            for liveness_stage in liveness_set:
-                mem = 0
-                for live_variable in liveness_stage.unique_live_vars:
-                    node_index = self.node_index_dict[live_variable.node]
-                    mem += lpSum(s[node_index][j] * m[node_index][j] for j in range(len(s[node_index])))
-                prob += mem <= memory_budget
-
-        # (d). specified by `cat="Binary"`
-
-        for (idx, (i, j)) in enumerate(E):
-            if strategies_len[i] == 1 or strategies_len[j] == 1:
-                continue
-
-            # (e)
-            prob += lpSum(e[idx]) == 1
-
-            # (f)
-            for row in range(len(s[i])):
-                C = len(s[j])    # noqa
-                prob += lpSum(e[idx][row * C + col] for col in range(0, C)) <= s[i][row]
-
-            # (g)
-            for col in range(len(s[j])):
-                R = len(s[i])    # noqa
-                C = len(s[j])    # noqa
-                prob += lpSum(e[idx][row * C + col] for row in range(0, R)) <= s[j][col]
-
-        # (h)
-        ######################
-        # omit alias set now #
-        ######################
-
-        # alias_set = set()
-        # for (idx, (i, j)) in enumerate(A):
-        #     R = len(s[i])  # noqa
-        #     C = len(s[j])  # noqa
-        #     if (i, j) in alias_set:
-        #         raise ValueError(f"Duplicated edges: {(i, j)}")
-
-        #     alias_set.add((i, j))
-        #     alias_set.add((j, i))
-
-        #     for row in range(len(s[i])):
-        #         for col in range(len(s[j])):
-        #             if v[idx][row * C + col] > 0.5:
-        #                 prob += s[i][row] + s[j][col] <= 1
-
-        verbose = True
-
-        msg = verbose
-        time_limit = 600
-        assert "COIN_CMD" in pulp.listSolvers(
-            onlyAvailable=True), ("Please install ILP solvers by 'sudo apt install coinor-cbc'")
-
-        solver = pulp.COIN_CMD(mip=True, msg=msg, timeLimit=time_limit, threads=multiprocessing.cpu_count())
-        # solver = pulp.GLPK_CMD(mip=True, msg=msg, timeLimit=time_limit)
-        prob.solve(solver)
-
-        status = prob.status
-        objective = pulp.value(prob.objective)
-        objective = float(objective) if objective is not None else -1.0
-        if verbose:
-            print(f"ILP Status: {LpStatus[status]}\tObjective: {objective}\t"
-                  f"Time: {time.time() - tic}")
-            print(f"#nodes: {num_nodes},  #edges: {num_edges}")
-
-        if prob.status in [pulp.LpStatusInfeasible]:
-            raise RuntimeError("Cannot run the function under the given memory budget. "
-                               "Please increase the memory budget.")
-
-        # Get and check results
-        s_val = np.full((node_nums,), -1, dtype=np.int32)
-        for i in range(node_nums):
-            s_val[i] = get_non_zero_index(s[i])
-
-        e_val = np.full((len(E),), -1, dtype=np.int32)
-        for (idx, (i, j)) in enumerate(E):
-            e_val[idx] = get_non_zero_index(e[idx])
-            i_spec_index = e_val[idx] // len(s[j])
-            j_spec_index = e_val[idx] % len(s[j])
-            assert i_spec_index == s_val[i], f"e_val[{i}][{j}]"
-            assert j_spec_index == s_val[j], f"e_val[{i}][{j}]"
-            if verbose and r[idx][e_val[idx]] > 0:
-                print(f"Edge cost {(i, j)} : {r[idx][e_val[idx]]}")
-
-        self.last_s_val = list(s_val)
-        self._recover_merged_node_strategy()
-        self.last_objective = objective
-
-        if objective > INFINITY_COST:
-            warnings.warn("Detect unexpected behaviors in the auto-sharding pass.")
-
-        return self.last_s_val, e_val, self.last_objective, status
-
-    def call_solver_serialized_args(self):
-        """
-        Call the solver with serialized arguments and handle python errors. Additionally,
-        we could give a serious of solutions with different memory budget.
-        """
-        if self.solution_numbers == 1:
-            args = self._prepare_data_for_solver()
-            ret = self._call_solver_serialized_args(*args)
-
-            return ret
-
-        origin_memory_budget = self.memory_budget
-        memory_budget_list = [
-            origin_memory_budget * self.memory_increasing_coefficient**i for i in range(self.solution_numbers)
-        ]
-        ret_list = []
-        for memory_budget in memory_budget_list:
-            self.memory_budget = memory_budget
-            args = self._prepare_data_for_solver()
-            ret = self._call_solver_serialized_args(*args)
-            ret_list.append(ret)
-
-        return ret_list
diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/strategies_constructor.py b/colossalai/auto_parallel/tensor_shard/deprecated/strategies_constructor.py
deleted file mode 100644
index 7bebde9d65a0..000000000000
--- a/colossalai/auto_parallel/tensor_shard/deprecated/strategies_constructor.py
+++ /dev/null
@@ -1,426 +0,0 @@
-import builtins
-import math
-import operator
-from copy import deepcopy
-from typing import Dict, List
-
-import torch
-from torch.fx import Graph, Node
-
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from ._utils import generate_resharding_costs, generate_sharding_spec
-from .constants import *
-from .op_handler import *
-from .options import SolverOptions
-from .sharding_strategy import ShardingStrategy, StrategiesVector
-
-__all__ = ['StrategiesConstructor']
-
-
-class StrategiesConstructor:
-    """
-    StrategiesConstructor is used to construct the parallelization plan for the model execution.
-
-    Args:
-        graph (Graph): a Graph object used for analysis and strategy generation.
-        device_mesh (DeviceMesh): a DeviceMesh object which contains the meta information about the cluster.
-        solver_options (SolverOptions): a SolverOptions object which specifies the preferences for plan searching.
-    """
-
-    def __init__(self, graph: Graph, device_mesh: DeviceMesh, solver_options: SolverOptions):
-        self.graph = graph
-        assert graph.owning_module is not None, 'The given graph is not associated with a owning_module'
-        self.root_module = self.graph.owning_module
-        self.nodes = list(graph.nodes)
-        self.device_mesh = device_mesh
-        self.leaf_strategies = []
-        self.strategy_map = {}
-        self.solver_options = solver_options
-
-    def remove_duplicated_strategy(self, strategies_vector):
-        '''
-        In build_strategies_and_cost method, we may produce some duplicated strategies.
-        In this method, we will remove the duplicated strategies depending on the strategies name.
-        '''
-        name_checklist = []
-        remove_list = []
-        for strategy in strategies_vector:
-            if strategy.name not in name_checklist:
-                name_checklist.append(strategy.name)
-            else:
-                remove_list.append(strategy)
-
-        for strategy in remove_list:
-            strategies_vector.remove(strategy)
-
-    def _is_bcast_matmul(self, node):
-        is_bcast_matmul = False
-        if node.target is torch.matmul and len(node.args) == 2:
-            lhs_data = node.args[0]._meta_data
-            rhs_data = node.args[1]._meta_data
-            if lhs_data.dim() >= 3 and rhs_data.dim() >= 3:
-                is_bcast_matmul = True
-        return is_bcast_matmul
-
-    def build_strategies_and_cost(self):
-        for node in self.nodes:
-            strategies_vector = StrategiesVector(node)
-            input_nodes_len = 0
-            for check_node in strategies_vector.predecessor_nodes:
-                if isinstance(check_node._meta_data, torch.Tensor):
-                    input_nodes_len += 1
-            # input_nodes_len = len(strategies_vector.predecessor_nodes)
-            # placeholder node
-            if node.op == 'placeholder':
-                # For placeholder nodes, if solver_options.fast is True, we just let them in
-                # fully replicate status, then strategies of following node will be treated equally due
-                # to replicate status has no resharding cost to other status. At the same time, the searching
-                # space is smaller than enumerating all the possible sharding spec for the placeholder node.
-                # Otherwise, all the possible sharding spec for the placeholder node will be enumerated.
-
-                if self.solver_options.fast:
-                    # create sharding strategy for placeholder
-                    name = 'Replica Placeholder'
-                    dim_partition_dict = {}
-                    output_sharding_spec = generate_sharding_spec(node, self.device_mesh, dim_partition_dict)
-                    # TODO: use meta_info_prop to profile memory cost
-                    memory_cost = 0
-                    sharding_strategy_placeholder = ShardingStrategy(name,
-                                                                     output_sharding_spec,
-                                                                     memory_cost=memory_cost)
-                    strategies_vector.append(sharding_strategy_placeholder)
-
-            # get_attr node
-            if node.op == 'get_attr':
-                # Same as placeholder nodes, if solver_options.fast is True, we just let them in
-                # fully replicate status, then strategies of following node will be treated equally due
-                # to replicate status has no resharding cost to other status. At the same time, the searching
-                # space is smaller than enumerating all the possible sharding spec for the get_attr node.
-                # Otherwise, all the possible sharding spec for the get_attr node will be enumerated.
-                if self.solver_options.fast:
-                    # create sharding strategy for get_attr
-                    name = 'Replica Attribute'
-                    dim_partition_dict = {}
-                    output_sharding_spec = generate_sharding_spec(node, self.device_mesh, dim_partition_dict)
-                    # TODO: use meta_info_prop to profile memory cost
-                    memory_cost = 0
-                    sharding_strategy_attribute = ShardingStrategy(name, output_sharding_spec, memory_cost=memory_cost)
-                    strategies_vector.append(sharding_strategy_attribute)
-
-            # call_module node
-            if node.op == 'call_module':
-
-                target = node.target
-                submod = self.root_module.get_submodule(target)
-                submod_type = type(submod)
-
-                # conv module
-                if submod_type in CONV_MODULE_OP:
-                    # use ConvHandler to create sharding strategies for conv module node
-                    conv_handler = ConvHandler(node, self.device_mesh, strategies_vector)
-                    conv_handler.register_strategy()
-
-                # linear module
-                elif submod_type in LINEAR_MODULE_OP:
-                    # use DotHandler to create sharding strategies for linear module node
-                    dot_handler = DotHandler(node, self.device_mesh, strategies_vector)
-                    dot_handler.register_strategy()
-
-                # element-wise module
-                elif submod_type in ELEMENTWISE_MODULE_OP:
-                    unary_elementwise_handler = UnaryElementwiseHandler(node, self.device_mesh, strategies_vector)
-                    unary_elementwise_handler.register_strategy()
-
-                # BatchNormNd module
-                elif submod_type in BATCHNORM_MODULE_OP:
-                    # create sharding strategy for element-wise module
-                    norm_handler = BatchNormHandler(node, self.device_mesh, strategies_vector)
-                    norm_handler.register_strategy()
-                    # for strategy in norm_handler.strategies_vector:
-                    #     print(f'{strategy.name}, computation_cost: {strategy.compute_cost}, memory_cost: {strategy.memory_cost}')
-                    # assert False
-
-                # MaxPool module
-                elif submod_type in POOL_MODULE_OP:
-                    # TODO: add sharding constraints on image dimension
-                    # e.g.: for a 2D pooling input NCHW, we should promise no sharding happens on H and W dimension
-
-                    # create sharding strategy for element-wise module
-                    assert input_nodes_len == 1, f'Temporally, we just support single input element-wise op.'
-                    input_node = strategies_vector.predecessor_nodes[0]
-                    # For element-wise module, we keep the sharding spec of output node same as
-                    # the input. Therefore, the different strategies of input node with same
-                    # output sharding spec will generate same strategy for element-wise module.
-                    sharding_spec_checklist = []
-                    for strategy in input_node.strategies_vector:
-                        # It looks a little bit confusing, the input of the processing node
-                        # is the output of the input_node.
-                        input_sharding_spec = strategy.output_sharding_spec
-                        assert isinstance(input_sharding_spec,
-                                          ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-                        if input_sharding_spec in sharding_spec_checklist:
-                            continue
-
-                        sharding_spec_checklist.append(input_sharding_spec)
-                        dim_partition_dict = deepcopy(input_sharding_spec.dim_partition_dict)
-                        output_sharding_spec = generate_sharding_spec(node, self.device_mesh, dim_partition_dict)
-
-                        name = f'{input_sharding_spec.sharding_sequence} -> {output_sharding_spec.sharding_sequence}'
-
-                        # TODO: use meta_info_prop to profile memory cost and compute cost
-                        compute_cost = node._meta_data.numel()
-                        memory_cost = 0
-                        resharding_costs = generate_resharding_costs(strategies_vector.predecessor_nodes,
-                                                                     [input_sharding_spec])
-
-                        sharding_strategy = ShardingStrategy(name,
-                                                             output_sharding_spec,
-                                                             compute_cost=compute_cost,
-                                                             memory_cost=memory_cost,
-                                                             resharding_costs=resharding_costs,
-                                                             input_shardings=[input_sharding_spec])
-                        strategies_vector.append(sharding_strategy)
-
-                # embedding module
-                elif submod_type in EMBEDDING_MODULE_OP:
-                    embedding_handler = EmbeddingHandler(node, self.device_mesh, strategies_vector)
-                    embedding_handler.register_strategy()
-
-                # layernorm module
-                elif submod_type in LAYERNORM_MODULE_OP:
-                    layernorm_handler = LayerNormHandler(node, self.device_mesh, strategies_vector)
-                    layernorm_handler.register_strategy()
-                # other module
-                else:
-                    raise RuntimeError(f'{submod_type} module is NOT supported now.')
-
-            # call_function node
-            if node.op == 'call_function':
-                target = node.target
-                # conv function
-                if target in CONV_FUNC_OP:
-                    # use ConvHandler to create sharding strategies for conv node
-                    # TODO: the operator_handler does NOT support function node processing now.
-                    conv_handler = ConvHandler(node, self.device_mesh, strategies_vector)
-                    conv_handler.register_strategy()
-
-                # linear function
-                elif target in LINEAR_FUNC_OP and not self._is_bcast_matmul(node):
-                    # use DotHandler to create sharding strategies for linear node
-                    # TODO: the operator_handler does NOT support function node processing now.
-                    linear_handler = DotHandler(node, self.device_mesh, strategies_vector)
-                    linear_handler.register_strategy()
-
-                # where function
-                elif target == torch.where:
-                    if input_nodes_len == 1:
-                        # both of x and y are scalar
-                        pass
-
-                    elif input_nodes_len == 2:
-                        # one of x or y is type of scalar
-                        pass
-
-                    else:
-                        # general case
-                        where_handler = WhereHandler(node, self.device_mesh, strategies_vector)
-                        where_handler.register_strategy()
-
-                # reshape function
-                elif target in RESHAPE_FUNC_OP:
-                    # use ReshapeHandler to create sharding strategies for rehsape node
-                    reshape_handler = ReshapeHandler(node, self.device_mesh, strategies_vector)
-                    reshape_handler.register_strategy()
-
-                # element-wise function
-                elif target in ELEMENTWISE_FUNC_OP or (target in BCAST_FUNC_OP and input_nodes_len == 1):
-                    unary_elementwise_handler = UnaryElementwiseHandler(node, self.device_mesh, strategies_vector)
-                    unary_elementwise_handler.register_strategy()
-
-                # bcast op
-                elif target in BCAST_FUNC_OP:
-                    if isinstance(node._meta_data, torch.Tensor):
-                        bcast_op_handler = BcastOpHandler(node, self.device_mesh, strategies_vector)
-                        bcast_op_handler.register_strategy()
-
-                # torch.var_mean
-                elif target == torch.var_mean:
-                    dim = node.kwargs['dim']
-                    input_tensor_node = strategies_vector.predecessor_nodes[0]
-                    for strategy in input_tensor_node.strategies_vector:
-                        input_sharding_spec = strategy.output_sharding_spec
-                        assert isinstance(input_sharding_spec,
-                                          ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-                        entire_shape_input = input_sharding_spec.entire_shape
-                        dim_partition_dict_input = input_sharding_spec.dim_partition_dict
-                        name = f'{new_input_sharding_spec.sharding_sequence} -> ({output_sharding_spec.sharding_sequence}, {output_sharding_spec.sharding_sequence})'
-                        if dim in dim_partition_dict_input:
-                            # We need to make the action dimension in replicate status
-                            dim_partition_dict_for_input = deepcopy(dim_partition_dict_input)
-                            dim_partition_dict_for_input.pop(dim)
-                            new_input_sharding_spec = ShardingSpec(self.device_mesh,
-                                                                   entire_shape_input,
-                                                                   dim_partition_dict=dim_partition_dict_for_input)
-                            entire_shape_output = deepcopy(entire_shape_input)
-                            entire_shape_output.pop(dim)
-                            dim_partition_dict_for_output = deepcopy(dim_partition_dict_for_input)
-                            output_sharding_spec = ShardingSpec(self.device_mesh,
-                                                                entire_shape_output,
-                                                                dim_partition_dict=dim_partition_dict_for_input)
-                            # TODO: use meta_info_prop to profile origin memory cost and compute cost, then divide them depending on sharding spec.
-                            compute_cost = 0
-                            memory_cost = 0
-                            resharding_costs = generate_resharding_costs(strategies_vector.predecessor_nodes,
-                                                                         [new_input_sharding_spec])
-                            sharding_strategy = ShardingStrategy(name, (output_sharding_spec, output_sharding_spec),
-                                                                 compute_cost=compute_cost,
-                                                                 memory_cost=memory_cost,
-                                                                 resharding_costs=resharding_costs,
-                                                                 input_shardings=[new_input_sharding_spec])
-
-                        else:
-                            entire_shape_output = deepcopy(entire_shape_input)
-                            entire_shape_output.pop(dim)
-                            dim_partition_dict_for_output = deepcopy(dim_partition_dict_input)
-                            output_sharding_spec = ShardingSpec(self.device_mesh,
-                                                                entire_shape_output,
-                                                                dim_partion_dict=dim_partition_dict_input)
-                            # TODO: use meta_info_prop to profile origin memory cost and compute cost, then divide them depending on sharding spec.
-                            compute_cost = 0
-                            memory_cost = 0
-                            resharding_costs = generate_resharding_costs(strategies_vector.predecessor_nodes,
-                                                                         [input_sharding_spec])
-                            sharding_strategy = ShardingStrategy(name, (output_sharding_spec, output_sharding_spec),
-                                                                 compute_cost=compute_cost,
-                                                                 memory_cost=memory_cost,
-                                                                 resharding_costs=resharding_costs,
-                                                                 input_shardings=[input_sharding_spec])
-
-                        strategies_vector.append(sharding_strategy)
-
-                # operator.getitem
-                elif target == operator.getitem:
-                    index = node.args[1]
-                    input_tensor_node = strategies_vector.predecessor_nodes[0]
-                    for strategy in input_tensor_node.strategies_vector:
-                        if isinstance(strategy.output_sharding_spec, ShardingSpec):
-                            input_sharding_spec = strategy.output_sharding_spec
-                        else:
-                            input_sharding_spec = strategy.output_sharding_spec[index]
-                        assert isinstance(input_sharding_spec, ShardingSpec), f'This assertion is used to debug.'
-                        dim_partition_dict_for_output = deepcopy(input_sharding_spec.dim_partition_dict)
-                        entire_shape_output = deepcopy(input_sharding_spec.entire_shape)
-                        output_sharding_spec = ShardingSpec(self.device_mesh,
-                                                            entire_shape_output,
-                                                            dim_partition_dict=dim_partition_dict_for_output)
-                        # TODO: use meta_info_prop to profile origin memory cost and compute cost, then divide them depending on sharding spec.
-                        compute_cost = 0
-                        memory_cost = 0
-                        resharding_costs = generate_resharding_costs(strategies_vector.predecessor_nodes,
-                                                                     [input_sharding_spec],
-                                                                     index=index)
-                        # to prevent the resharding happening, set their resharding cost to inf.
-                        resharding_costs[input_tensor_node] = [
-                            cost if cost == 0 else INFINITY_COST for cost in resharding_costs[input_tensor_node]
-                        ]
-                        sharding_strategy = ShardingStrategy(name,
-                                                             output_sharding_spec,
-                                                             compute_cost=compute_cost,
-                                                             memory_cost=memory_cost,
-                                                             resharding_costs=resharding_costs,
-                                                             input_shardings=[strategy.output_sharding_spec])
-                        strategies_vector.append(sharding_strategy)
-
-                # torch.arange function
-                elif target == torch.arange:
-                    name = f'FULLY REPLICATED ARANGE'
-                    entire_shape_output = node._meta_data.shape
-                    dim_partition_dict_for_output = {}
-                    output_sharding_spec = ShardingSpec(self.device_mesh,
-                                                        entire_shape_output,
-                                                        dim_partition_dict=dim_partition_dict_for_output)
-                    memory_cost = node._meta_data.numel()
-                    sharding_strategy = ShardingStrategy(name,
-                                                         output_sharding_spec,
-                                                         compute_cost=0,
-                                                         memory_cost=memory_cost)
-                    strategies_vector.append(sharding_strategy)
-
-                # op list to be processed to support gpt2
-                elif target in (builtins.getattr, operator.le, torch.addmm):
-                    pass
-                # other function
-                else:
-                    raise RuntimeError(f'{target} function is NOT supported now.')
-
-            # call_method node
-            if node.op == 'call_method':
-                method = getattr(node.args[0]._meta_data.__class__, node.target)
-                if method in (torch.Tensor.size,):
-                    pass
-                elif method in ELEMENTWISE_METHOD_OP:
-                    unary_elementwise_handler = UnaryElementwiseHandler(node, self.device_mesh, strategies_vector)
-                    unary_elementwise_handler.register_strategy()
-
-                elif method in RESHAPE_METHOD_OP:
-                    reshape_handler = ReshapeHandler(node, self.device_mesh, strategies_vector)
-                    reshape_handler.register_strategy()
-                    # print(strategies_vector)
-                    # if len(strategies_vector) == 0:
-                    #     print(node)
-                    #     assert False
-                else:
-                    raise RuntimeError(f'{method} function is NOT supported now.')
-
-            # output node
-            if node.op == 'output':
-                if self.solver_options.fast:
-                    # create sharding strategy for output
-                    name = 'Replica Output'
-                    input_nodes = strategies_vector.predecessor_nodes
-                    input_sharding_specs = []
-                    for input_node in input_nodes:
-                        dim_partition_dict_for_input = {}
-                        entire_shape = input_node._meta_data.shape
-                        sharding_spec = ShardingSpec(self.device_mesh,
-                                                     entire_shape,
-                                                     dim_partition_dict=dim_partition_dict_for_input)
-                        input_sharding_specs.append(sharding_spec)
-
-                    dim_partition_dict = {}
-                    output_sharding_spec = input_sharding_specs
-                    # TODO: use meta_info_prop to profile memory cost
-                    memory_cost = 0
-                    resharding_costs = generate_resharding_costs(strategies_vector.predecessor_nodes,
-                                                                 input_sharding_specs)
-
-                    # clear the resharding cost for the output node
-                    # TODO: we may remove this in final version
-                    for prev_node, resharding_cost_list in resharding_costs.items():
-                        resharding_costs[prev_node] = [0] * len(resharding_cost_list)
-
-                    sharding_strategy_attribute = ShardingStrategy(name,
-                                                                   output_sharding_spec,
-                                                                   memory_cost=memory_cost,
-                                                                   resharding_costs=resharding_costs,
-                                                                   input_shardings=tuple(input_sharding_specs))
-                    strategies_vector.append(sharding_strategy_attribute)
-
-            self.remove_duplicated_strategy(strategies_vector)
-            setattr(node, 'strategies_vector', strategies_vector)
-            self.leaf_strategies.append(strategies_vector)
-            self.strategy_map[node] = strategies_vector
-
-        # remove no strategy nodes
-        remove_list = []
-        for strategies_vector in self.leaf_strategies:
-            if len(strategies_vector) == 0:
-                remove_list.append(strategies_vector.node)
-        for node in remove_list:
-            if node.strategies_vector in self.leaf_strategies:
-                self.leaf_strategies.remove(node.strategies_vector)
-            if node in self.strategy_map:
-                self.strategy_map.pop(node)
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_cost_graph.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_cost_graph.py
deleted file mode 100644
index 96d96a4594c3..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_cost_graph.py
+++ /dev/null
@@ -1,96 +0,0 @@
-from copy import deepcopy
-from pickletools import optimize
-
-import pytest
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated.cost_graph import CostGraph
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, c_in, c_out):
-        super().__init__()
-        self.conv1 = nn.Conv2d(c_in, c_out, kernel_size=3)
-        self.relu = nn.ReLU()
-
-    def forward(self, x):
-        x = x * 2
-        x = self.conv1(x)
-        x = x / 2
-        x = self.relu(x)
-        return x
-
-
-def test_cost_graph():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    entire_shape = torch.Size((4, 16, 64, 64))
-
-    tracer = ColoTracer()
-    model = ConvModel(16, 32)
-    input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-    #     %conv1 : [#users=1] = call_module[target=conv1](args = (%mul,), kwargs = {})
-    #     %truediv : [#users=1] = call_function[target=operator.truediv](args = (%conv1, 2), kwargs = {})
-    #     %relu : [#users=1] = call_module[target=relu](args = (%truediv,), kwargs = {})
-    #     return relu
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    # (x, mul):{(0, 0): 0}
-    # (mul, conv1):{(0, 0): 65547.1, (0, 1): 65547.1, (0, 2): 65547.1, (0, 3): 65547.1, (0, 4): 131105.30000000002, (0, 5): 131105.30000000002, (0, 6): 65547.1, (0, 7): 65547.1, (0, 8): 65547.1, (0, 9): 65547.1, (0, 10): 0, (0, 11): 0, (0, 12): 0, (0, 13): 131105.30000000002, (0, 14): 131105.30000000002}
-    # (conv1, truediv):{(0, 0): 0, (1, 0): inf, (2, 0): inf, (3, 0): inf, (4, 0): 0, (5, 0): inf, (6, 0): inf, (7, 0): inf, (8, 0): inf, (9, 0): inf, (10, 0): inf, (11, 0): inf, (12, 0): inf, (13, 0): inf, (14, 0): inf, (0, 1): inf, (1, 1): 0, (2, 1): inf, (3, 1): inf, (4, 1): inf, (5, 1): 0, (6, 1): inf, (7, 1): inf, (8, 1): inf, (9, 1): inf, (10, 1): inf, (11, 1): inf, (12, 1): inf, (13, 1): inf, (14, 1): inf, (0, 2): inf, (1, 2): inf, (2, 2): 0, (3, 2): inf, (4, 2): inf, (5, 2): inf, (6, 2): inf, (7, 2): inf, (8, 2): inf, (9, 2): inf, (10, 2): inf, (11, 2): inf, (12, 2): inf, (13, 2): inf, (14, 2): inf, (0, 3): inf, (1, 3): inf, (2, 3): inf, (3, 3): 0, (4, 3): inf, (5, 3): inf, (6, 3): inf, (7, 3): inf, (8, 3): inf, (9, 3): inf, (10, 3): inf, (11, 3): inf, (12, 3): inf, (13, 3): inf, (14, 3): inf, (0, 4): inf, (1, 4): inf, (2, 4): inf, (3, 4): inf, (4, 4): inf, (5, 4): inf, (6, 4): 0, (7, 4): inf, (8, 4): 0, (9, 4): inf, (10, 4): inf, (11, 4): inf, (12, 4): inf, (13, 4): inf, (14, 4): inf, (0, 5): inf, (1, 5): inf, (2, 5): inf, (3, 5): inf, (4, 5): inf, (5, 5): inf, (6, 5): inf, (7, 5): 0, (8, 5): inf, (9, 5): 0, (10, 5): inf, (11, 5): inf, (12, 5): inf, (13, 5): inf, (14, 5): inf, (0, 6): inf, (1, 6): inf, (2, 6): inf, (3, 6): inf, (4, 6): inf, (5, 6): inf, (6, 6): inf, (7, 6): inf, (8, 6): inf, (9, 6): inf, (10, 6): 0, (11, 6): 0, (12, 6): 0, (13, 6): inf, (14, 6): inf, (0, 7): inf, (1, 7): inf, (2, 7): inf, (3, 7): inf, (4, 7): inf, (5, 7): inf, (6, 7): inf, (7, 7): inf, (8, 7): inf, (9, 7): inf, (10, 7): inf, (11, 7): inf, (12, 7): inf, (13, 7): 0, (14, 7): inf, (0, 8): inf, (1, 8): inf, (2, 8): inf, (3, 8): inf, (4, 8): inf, (5, 8): inf, (6, 8): inf, (7, 8): inf, (8, 8): inf, (9, 8): inf, (10, 8): inf, (11, 8): inf, (12, 8): inf, (13, 8): inf, (14, 8): 0}
-    # (truediv, relu):{(0, 0): 0, (1, 0): inf, (2, 0): inf, (3, 0): inf, (4, 0): inf, (5, 0): inf, (6, 0): inf, (7, 0): inf, (8, 0): inf, (0, 1): inf, (1, 1): 0, (2, 1): inf, (3, 1): inf, (4, 1): inf, (5, 1): inf, (6, 1): inf, (7, 1): inf, (8, 1): inf, (0, 2): inf, (1, 2): inf, (2, 2): 0, (3, 2): inf, (4, 2): inf, (5, 2): inf, (6, 2): inf, (7, 2): inf, (8, 2): inf, (0, 3): inf, (1, 3): inf, (2, 3): inf, (3, 3): 0, (4, 3): inf, (5, 3): inf, (6, 3): inf, (7, 3): inf, (8, 3): inf, (0, 4): inf, (1, 4): inf, (2, 4): inf, (3, 4): inf, (4, 4): 0, (5, 4): inf, (6, 4): inf, (7, 4): inf, (8, 4): inf, (0, 5): inf, (1, 5): inf, (2, 5): inf, (3, 5): inf, (4, 5): inf, (5, 5): 0, (6, 5): inf, (7, 5): inf, (8, 5): inf, (0, 6): inf, (1, 6): inf, (2, 6): inf, (3, 6): inf, (4, 6): inf, (5, 6): inf, (6, 6): 0, (7, 6): inf, (8, 6): inf, (0, 7): inf, (1, 7): inf, (2, 7): inf, (3, 7): inf, (4, 7): inf, (5, 7): inf, (6, 7): inf, (7, 7): 0, (8, 7): inf, (0, 8): inf, (1, 8): inf, (2, 8): inf, (3, 8): inf, (4, 8): inf, (5, 8): inf, (6, 8): inf, (7, 8): inf, (8, 8): 0}
-    # (relu, output):{(0, 0): 246019.30000000002, (1, 0): 246019.30000000002, (2, 0): 123009.1, (3, 0): 123009.1, (4, 0): 123009.1, (5, 0): 123009.1, (6, 0): 0, (7, 0): 246019.30000000002, (8, 0): 246019.30000000002}
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-
-    # construct all node pairs
-    all_node_pairs = []
-
-    for node in graph.nodes:
-        if node.op == 'output':
-            continue
-        for child in node.users.keys():
-            all_node_pairs.append((node, child))
-
-    for node_pair in all_node_pairs:
-        assert node_pair in cost_graph.edge_costs
-
-    # construct merged node pairs
-    merged_node_pairs = []
-    node_list = list(graph.nodes)
-    # add (conv1_weight, conv2d), (conv1_bias, view), (conv2d, add), (view, add), (add, output), (x, conv2d) into check node pairs
-    merged_node_pairs.append((node_list[0], node_list[4]))
-    merged_node_pairs.append((node_list[2], node_list[4]))
-    merged_node_pairs.append((node_list[3], node_list[5]))
-    merged_node_pairs.append((node_list[5], node_list[6]))
-    merged_node_pairs.append((node_list[4], node_list[6]))
-    merged_node_pairs.append((node_list[6], node_list[-1]))
-    cost_graph.simplify_graph()
-    for node_pair in all_node_pairs:
-        if node_pair in merged_node_pairs:
-            assert node_pair in cost_graph.edge_costs
-        else:
-            assert node_pair not in cost_graph.edge_costs
-
-
-if __name__ == '__main__':
-    test_cost_graph()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_batch_norm_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_batch_norm_handler.py
deleted file mode 100644
index 2d3e71551eb2..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_batch_norm_handler.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import torch
-from torch.fx import GraphModule
-import torch.nn as nn
-import pytest
-
-from colossalai.fx.proxy import ColoProxy
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
-from colossalai.auto_parallel.tensor_shard.deprecated.op_handler.batch_norm_handler import BatchNormHandler
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.device.device_mesh import DeviceMesh
-
-
-class BNModel(nn.Module):
-
-    def __init__(self, c):
-        super().__init__()
-        self.bn = nn.BatchNorm2d(c)
-
-    def forward(self, x):
-        x = x * 2
-        x = self.bn(x)
-        return x
-
-
-def test_bn_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    entire_shape = torch.Size((4, 16, 64, 64))
-
-    tracer = ColoTracer()
-    model = BNModel(16)
-    input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-    #     %bn : [#users=1] = call_module[target=bn](args = (%mul,), kwargs = {})
-    #     return bn
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    # [x, mul, bn, output]
-    nodes = [node for node in gm.graph.nodes]
-
-    # find the sharding strategies for the input node of the bn node
-    # strategies_for_input = [[R, R, R, R], [R, S0, R, R], [R, S1, R, R], [S0, R, R, R], [S0, S1, R, R], [S1, R, R, R], [S1, S0, R, R]]
-    strategies_vector_for_input = StrategiesVector(nodes[1])
-    sharding_option = (None, 0, 1)
-    for first_sharding_index in sharding_option:
-        for second_sharding_index in sharding_option:
-            if first_sharding_index is not None and second_sharding_index == first_sharding_index:
-                continue
-            if first_sharding_index is None:
-                first_dim_spec = _DimSpec([])
-            else:
-                first_dim_spec = _DimSpec([first_sharding_index])
-
-            if second_sharding_index is None:
-                second_dim_spec = _DimSpec([])
-            else:
-                second_dim_spec = _DimSpec([second_sharding_index])
-
-            replica_dim_spec = _DimSpec([])
-            sharding_sequence = [first_dim_spec, second_dim_spec, replica_dim_spec, replica_dim_spec]
-            sharding_spec = ShardingSpec(device_mesh=device_mesh,
-                                         entire_shape=entire_shape,
-                                         sharding_sequence=sharding_sequence)
-            strategy_name = str(sharding_spec.sharding_sequence)
-            sharding_strategy = ShardingStrategy(name=strategy_name, output_sharding_spec=sharding_spec)
-            strategies_vector_for_input.append(sharding_strategy)
-    setattr(nodes[1], 'strategies_vector', strategies_vector_for_input)
-
-    # generate bn strategy
-    strategies_vector = StrategiesVector(node=nodes[2])
-    bn_handler = BatchNormHandler(
-        node=nodes[2],
-        device_mesh=device_mesh,
-        strategies_vector=strategies_vector,
-    )
-    bn_handler.register_strategy()
-    # ['RS0 = RS0 x S0', 'S1S0 = RS0 x S0', 'RS1 = RS1 x S1', 'S0S1 = RS1 x S1', 'RR = RR x R', 'S0R = RR x R', 'S1R = RR x R', 'S01R = RR x R', 'RS01 = RS01 x S01',
-    # 'S0R = S0R x R WITH SYNC_BN', 'S1R = S1R x R WITH SYNC_BN', 'S0S1 = S0S1 x S1 WITH SYNC_BN', 'S1S0 = S1S0 x S0 WITH SYNC_BN', 'S01R = S01R x R WITH SYNC_BN']
-    strategy_name_list = [strategy.name for strategy in bn_handler.strategies_vector]
-
-    # RS = RS x S and strategies based on it, such as
-    # SS = RS x S
-    assert 'RS0 = RS0 x S0' in strategy_name_list
-    assert 'S1S0 = RS0 x S0' in strategy_name_list
-    assert 'RS1 = RS1 x S1' in strategy_name_list
-    assert 'S0S1 = RS1 x S1' in strategy_name_list
-
-    # RR = RR x R and strategies based on it, such as
-    # SR = SR x R
-    assert 'RR = RR x R' in strategy_name_list
-    assert 'S0R = RR x R' in strategy_name_list
-    assert 'S1R = RR x R' in strategy_name_list
-    assert 'S01R = RR x R' in strategy_name_list
-
-    # RS01 = RS01 x S01
-    assert 'RS01 = RS01 x S01' in strategy_name_list
-
-    # SR = SR x R WITH SYNC_BN
-    assert 'S0R = S0R x R WITH SYNC_BN' in strategy_name_list
-    assert 'S1R = S1R x R WITH SYNC_BN' in strategy_name_list
-
-    # SS = SS x S WITH SYNC_BN
-    assert 'S0S1 = S0S1 x S1 WITH SYNC_BN' in strategy_name_list
-    assert 'S1S0 = S1S0 x S0 WITH SYNC_BN' in strategy_name_list
-
-    # S01R = S01R x R WITH SYNC_BN
-    assert 'S01R = S01R x R WITH SYNC_BN' in strategy_name_list
-
-
-if __name__ == '__main__':
-    test_bn_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_handler.py
deleted file mode 100644
index 7adc211cfc07..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_handler.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from cProfile import run
-
-import pytest
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, c_in, c_out):
-        super().__init__()
-        self.conv1 = nn.Conv2d(c_in, c_out, kernel_size=3, padding=1)
-        self.conv2 = nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, stride=2)
-
-    def forward(self, x):
-        x1 = self.conv1(x)
-        x2 = x1 + 1
-        x1 = torch.reshape(x1, [1, -1, 64, 1])
-        x3 = self.conv2(x1)
-        x3 = torch.reshape(x3, [4, 1, 64, -1])
-        x = x1 + x3
-
-        return x
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-def test_conv_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-
-    tracer = ColoTracer()
-    model = ConvModel(16, 32)
-    input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %conv1 : [#users=2] = call_module[target=conv1](args = (%x,), kwargs = {})
-    #     %add : [#users=0] = call_function[target=operator.add](args = (%conv1, 1), kwargs = {})
-    #     %reshape : [#users=2] = call_function[target=torch.reshape](args = (%conv1, [1, -1, 64, 1]), kwargs = {})
-    #     %conv2 : [#users=1] = call_module[target=conv2](args = (%reshape,), kwargs = {})
-    #     %reshape_1 : [#users=1] = call_function[target=torch.reshape](args = (%conv2, [4, 1, 64, -1]), kwargs = {})
-    #     %add_1 : [#users=1] = call_function[target=operator.add](args = (%reshape, %reshape_1), kwargs = {})
-    #     return add_1
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    # [x, conv1, add, reshape, conv2, reshape_1, add_1, output]
-    nodes = [node for node in gm.graph.nodes]
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-
-    strategies_constructor.build_strategies_and_cost()
-    strategy_map = strategies_constructor.strategy_map
-    # check a tensor add with a scalar case
-    conv1_strategies = strategy_map[nodes[1]]
-    add_strategies = strategy_map[nodes[2]]
-    add_strategies_cover_list = [strategy.input_shardings[0].sharding_sequence for strategy in add_strategies]
-    for strategy in conv1_strategies:
-        assert strategy.output_sharding_spec.sharding_sequence in add_strategies_cover_list
-
-    # check two tensors element-wise add case
-    add_1_strategies = strategy_map[nodes[6]]
-    assert len(add_1_strategies) == 25
-
-
-if __name__ == '__main__':
-    test_conv_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_matmul.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_matmul.py
deleted file mode 100644
index 426d179f10d5..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_bcast_matmul.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import pytest
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-
-
-class MatmulModel(nn.Module):
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x1, x2):
-        x = torch.matmul(x1, x2)
-
-        return x
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-def test_conv_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-
-    tracer = ColoTracer()
-    model = MatmulModel()
-    input_sample = {'x1': torch.rand(4, 4, 8).to('meta'), 'x2': torch.rand(4, 1, 8, 4).to('meta')}
-    # graph():
-    #     %x1 : torch.Tensor [#users=1] = placeholder[target=x1]
-    #     %x2 : torch.Tensor [#users=1] = placeholder[target=x2]
-    #     %matmul : [#users=1] = call_function[target=torch.matmul](args = (%x1, %x2), kwargs = {})
-    #     return matmul
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    # [x1, x2, matmul, output]
-    nodes = [node for node in gm.graph.nodes]
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-
-    strategies_constructor.build_strategies_and_cost()
-    strategy_map = strategies_constructor.strategy_map
-    matmul_strategies = strategy_map[nodes[2]]
-    assert len(matmul_strategies) == 30
-
-
-if __name__ == '__main__':
-    test_conv_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_conv_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_conv_handler.py
deleted file mode 100644
index 9342e06a040a..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_conv_handler.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import pytest
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated.op_handler.conv_handler import ConvHandler
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.proxy import ColoProxy
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, c_in, c_out):
-        super().__init__()
-        self.conv = nn.Conv2d(c_in, c_out, kernel_size=3)
-
-    def forward(self, x):
-        x = x * 2
-        x = self.conv(x)
-        return x
-
-
-def test_conv_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    entire_shape = torch.Size((4, 16, 64, 64))
-
-    tracer = ColoTracer()
-    model = ConvModel(16, 32)
-    input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-    #     %conv_weight : [#users=1] = get_attr[target=conv.weight]
-    #     %conv_bias : [#users=1] = get_attr[target=conv.bias]
-    #     %conv2d : [#users=1] = call_function[target=torch.conv2d](args = (%mul, %conv_weight), kwargs = {groups: 1, dilation: (1, 1), stride: (1, 1), padding: (0, 0)})
-    #     %view : [#users=1] = call_method[target=view](args = (%conv_bias, [1, -1, 1, 1]), kwargs = {})
-    #     %add : [#users=1] = call_function[target=operator.add](args = (%conv2d, %view), kwargs = {})
-    #     return add
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-
-    strategies_constructor.build_strategies_and_cost()
-    conv_node = list(graph.nodes)[4]
-    # ['S0S1 = S0R x RS1', 'S1S0 = S1R x RS0', 'S0R = S0R x RR', 'S1R = S1R x RR', 'S0R = S0S1 x S1R', 'S1R = S1S0 x S0R', 'RS1 = RS0 x S0S1', 'RS0 = RS1 x S1S0', 'RR = RS0 x S0R', 'RR = RS1 x S1R', 'RS0 = RR x RS0', 'RS1 = RR x RS1', 'RR = RR x RR', 'S01R = S01R x RR', 'RR = RS01 x S01R']
-    strategy_name_list = [strategy.name for strategy in conv_node.strategies_vector]
-
-    # SS = SR x RS
-    assert 'S0S1 = S0R x RS1' in strategy_name_list
-    assert 'S1S0 = S1R x RS0' in strategy_name_list
-
-    # SR = SS x SR
-    assert 'S0R = S0S1 x S1R' in strategy_name_list
-    assert 'S1R = S1S0 x S0R' in strategy_name_list
-
-    # RS = RS x SS
-    assert 'RS0 = RS1 x S1S0' in strategy_name_list
-    assert 'RS1 = RS0 x S0S1' in strategy_name_list
-
-    # RS = RR x RS
-    assert 'RS0 = RR x RS0' in strategy_name_list
-    assert 'RS1 = RR x RS1' in strategy_name_list
-
-    # RR= RR x RR
-    assert 'RR = RR x RR' in strategy_name_list
-
-    # SR = SR x RR
-    assert 'S0R = S0R x RR' in strategy_name_list
-    assert 'S1R = S1R x RR' in strategy_name_list
-    assert 'S01R = S01R x RR' in strategy_name_list
-
-    # RR = RS x SR
-    assert 'RR = RS0 x S0R' in strategy_name_list
-    assert 'RR = RS1 x S1R' in strategy_name_list
-    assert 'RR = RS01 x S01R' in strategy_name_list
-
-
-if __name__ == '__main__':
-    test_conv_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_dot_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_dot_handler.py
deleted file mode 100644
index 0a2dba1611f0..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_dot_handler.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import pytest
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated.op_handler.dot_handler import DotHandler
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.proxy import ColoProxy
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
-
-
-class LinearModel(nn.Module):
-
-    def __init__(self, in_features, out_features):
-        super().__init__()
-        self.linear = nn.Linear(in_features, out_features)
-
-    def forward(self, x):
-        x = x * 2
-        x = self.linear(x)
-        return x
-
-
-@pytest.mark.skip('F.linear is not supported in deprecated handler')
-def test_dot_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    entire_shape = torch.Size((4, 8))
-
-    tracer = ColoTracer()
-    model = LinearModel(8, 16)
-    input_sample = {'x': torch.rand(4, 8).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-    #     %linear_weight : [#users=1] = get_attr[target=linear.weight]
-    #     %linear_bias : [#users=1] = get_attr[target=linear.bias]
-    #     %linear : [#users=1] = call_function[target=torch._C._nn.linear](args = (%mul, %linear_weight), kwargs = {})
-    #     %add : [#users=1] = call_function[target=operator.add](args = (%linear, %linear_bias), kwargs = {})
-    #     return add
-    graph = tracer.trace(root=model, meta_args=input_sample)
-
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-
-    strategies_constructor.build_strategies_and_cost()
-    linear_node = list(graph.nodes)[4]
-
-    # ['S0S1 = S0R x RS1', 'S1S0 = S1R x RS0', 'S0R = S0S1 x S1R', 'S1R = S1S0 x S0R', 'RS1 = RS0 x S0S1', 'RS0 = RS1 x S1S0', 'RS0 = RR x RS0', 'RS1 = RR x RS1', 'RR = RR x RR']
-    strategy_name_list = [strategy.name for strategy in linear_node.strategies_vector]
-
-    # SS = SR x RS
-    assert 'S0S1 = S0R x RS1' in strategy_name_list
-    assert 'S1S0 = S1R x RS0' in strategy_name_list
-
-    # SR = SS x SR
-    assert 'S0R = S0S1 x S1R' in strategy_name_list
-    assert 'S1R = S1S0 x S0R' in strategy_name_list
-
-    # RS = RS x SS
-    assert 'RS0 = RS1 x S1S0' in strategy_name_list
-    assert 'RS1 = RS0 x S0S1' in strategy_name_list
-
-    # RR = RS x SR
-    assert 'RR = RS0 x S0R' in strategy_name_list
-    assert 'RR = RS1 x S1R' in strategy_name_list
-
-    # RS= RR x RS
-    assert 'RS0 = RR x RS0' in strategy_name_list
-    assert 'RS1 = RR x RS1' in strategy_name_list
-
-
-if __name__ == '__main__':
-    test_dot_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_layer_norm_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_layer_norm_handler.py
deleted file mode 100644
index 40e227cb53eb..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_layer_norm_handler.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import torch
-from torch.fx import GraphModule
-import torch.nn as nn
-import pytest
-from colossalai.auto_parallel.tensor_shard.deprecated import sharding_strategy
-
-from colossalai.fx.proxy import ColoProxy
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
-from colossalai.auto_parallel.tensor_shard.deprecated.op_handler.layer_norm_handler import LayerNormHandler
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.device.device_mesh import DeviceMesh
-
-
-class LNModel(nn.Module):
-
-    def __init__(self, c):
-        super().__init__()
-        self.ln = nn.LayerNorm(c)
-
-    def forward(self, x):
-        x = x * 2
-        x = self.ln(x)
-        return x
-
-
-def test_bn_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    entire_shape = torch.Size((4, 4, 128))
-
-    tracer = ColoTracer()
-    model = LNModel(128)
-    input_sample = {'x': torch.rand(4, 4, 128).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-    #     %ln : [#users=1] = call_module[target=ln](args = (%mul,), kwargs = {})
-    #     return ln
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    # [x, mul, ln, output]
-    nodes = [node for node in gm.graph.nodes]
-    sharding_spec_for_input = ShardingSpec(device_mesh, entire_shape, {})
-    sharding_strategy_for_input = ShardingStrategy('node_1', sharding_spec_for_input)
-    strategies_vector_for_input = StrategiesVector(nodes[1])
-    strategies_vector_for_input.append(sharding_strategy_for_input)
-    setattr(nodes[1], 'strategies_vector', strategies_vector_for_input)
-
-    # generate bn strategy
-    strategies_vector = StrategiesVector(node=nodes[2])
-    ln_handler = LayerNormHandler(
-        node=nodes[2],
-        device_mesh=device_mesh,
-        strategies_vector=strategies_vector,
-    )
-    ln_handler.register_strategy()
-    # ['[S0, R, R] = [S0, R, R] x [R]', '[R, S0, R] = [R, S0, R] x [R]', '[S1, R, R] = [S1, R, R] x [R]', '[R, S1, R] = [R, S1, R] x [R]',
-    # '[S0, S1, R] = [S0, S1, R] x [R]', '[S1, S0, R] = [S1, S0, R] x [R]', '[S01, R, R] = [S01, R, R] x [R]', '[R, S01, R] = [R, S01, R] x [R]', 'RR = RR x R']
-    strategy_name_list = [strategy.name for strategy in ln_handler.strategies_vector]
-
-    assert len(strategy_name_list) == 9
-
-
-if __name__ == '__main__':
-    test_bn_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_reshape_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_reshape_handler.py
deleted file mode 100644
index ac9df4cd825b..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_reshape_handler.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, c_in, c_out):
-        super().__init__()
-        self.conv = nn.Conv2d(c_in, c_out, kernel_size=3)
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = torch.flatten(x)
-        return x
-
-
-def test_conv_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-
-    tracer = ColoTracer()
-    model = ConvModel(16, 32)
-    input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %conv_weight : [#users=1] = get_attr[target=conv.weight]
-    #     %conv_bias : [#users=1] = get_attr[target=conv.bias]
-    #     %conv2d : [#users=1] = call_function[target=torch.conv2d](args = (%x, %conv_weight), kwargs = {groups: 1, dilation: (1, 1), stride: (1, 1), padding: (0, 0)})
-    #     %view : [#users=1] = call_method[target=view](args = (%conv_bias, [1, -1, 1, 1]), kwargs = {})
-    #     %add : [#users=1] = call_function[target=operator.add](args = (%conv2d, %view), kwargs = {})
-    #     %flatten : [#users=1] = call_function[target=torch.flatten](args = (%add,), kwargs = {})
-    #     return flatten
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    # [x, conv, flatten, output]
-    nodes = [node for node in gm.graph.nodes]
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-
-    strategies_constructor.build_strategies_and_cost()
-    strategy_map = strategies_constructor.strategy_map
-    add_strategies = strategy_map[nodes[5]]
-    flatten_strategies = strategy_map[nodes[6]]
-    flatten_strategies_cover_list = [strategy.input_shardings[0].sharding_sequence for strategy in flatten_strategies]
-    for strategy in add_strategies:
-        assert strategy.output_sharding_spec.sharding_sequence in flatten_strategies_cover_list
-
-
-if __name__ == '__main__':
-    test_conv_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_where_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_where_handler.py
deleted file mode 100644
index 294a59fc8548..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_op_handler/test_deprecated_where_handler.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import torch
-from torch.fx import GraphModule
-import torch.nn as nn
-import pytest
-
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, dim_in, dim_out):
-        super().__init__()
-        self.dim_in = dim_in
-        self.dim_out = dim_out
-
-    def forward(self, condition, x, y):
-        output = torch.where(condition, x, y)
-
-        return output
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-def test_where_handler():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-
-    tracer = ColoTracer()
-    model = ConvModel(16, 32)
-    input_sample = {
-        'condition': torch.rand(16, 32).to('meta'),
-        'x': torch.rand(16, 32).to('meta'),
-        'y': torch.rand(16, 32).to('meta')
-    }
-    # graph():
-    #     %condition : torch.Tensor [#users=1] = placeholder[target=condition]
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %y : torch.Tensor [#users=1] = placeholder[target=y]
-    #     %where : [#users=1] = call_function[target=torch.where](args = (%condition, %x, %y), kwargs = {})
-    #     return where
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-
-    # [condition, x, y, where, output]
-    nodes = [node for node in gm.graph.nodes]
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-
-    strategies_constructor.build_strategies_and_cost()
-    strategy_map = strategies_constructor.strategy_map
-    # check a tensor add with a scalar case
-    where_node = strategy_map[nodes[3]]
-    # ['[S0, S1] = [S0, S1] x [S0, S1] x [S0, S1]', '[S1, S0] = [S1, S0] x [S1, S0] x [S1, S0]', '[S01, R] = [S01, R] x [S01, R] x [S01, R]',
-    #  '[R, S01] = [R, S01] x [R, S01] x [R, S01]', '[S0, R] = [S0, R] x [S0, R] x [S0, R]', '[R, S0] = [R, S0] x [R, S0] x [R, S0]',
-    #  '[S1, R] = [S1, R] x [S1, R] x [S1, R]', '[R, S1] = [R, S1] x [R, S1] x [R, S1]', '[R, R] = [R, R] x [R, R] x [R, R]']
-    assert len(where_node) == 9
-
-
-if __name__ == '__main__':
-    test_where_handler()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_shape_consistency_pass.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_shape_consistency_pass.py
deleted file mode 100644
index 3286b325c8ab..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_shape_consistency_pass.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from functools import partial
-import pytest
-import torch
-import torch.multiprocessing as mp
-from torch.fx import GraphModule
-import torch.nn as nn
-import pytest
-from colossalai.initialize import launch
-from colossalai.utils import free_port
-from colossalai.testing import rerun_if_address_is_in_use
-from colossalai.logging import disable_existing_loggers
-from colossalai.auto_parallel.tensor_shard.deprecated.cost_graph import CostGraph
-from colossalai.auto_parallel.tensor_shard.deprecated.graph_analysis import GraphAnalyser
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.passes.experimental.adding_shape_consistency_pass import shape_consistency_pass, solution_annotatation_pass
-from colossalai.auto_parallel.tensor_shard.deprecated import Solver
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, c_in, c_out):
-        super().__init__()
-        self.conv = nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, bias=False)
-
-    def forward(self, x):
-        x = self.conv(x)
-        return x
-
-
-def check_apply(rank, world_size, port):
-    disable_existing_loggers()
-    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    input = torch.rand(4, 4, 4, 4).cuda()
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-    entire_shape = torch.Size((4, 4, 8, 8))
-
-    tracer = ColoTracer()
-    model = ConvModel(4, 4).cuda()
-    origin_output = model(input)
-    input_sample = {'x': torch.rand(4, 4, 4, 4).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %conv : [#users=1] = call_module[target=conv](args = (%mul,), kwargs = {})
-    #     return conv
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    graph_analyser = GraphAnalyser(gm)
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    ret = solver.call_solver_serialized_args()
-    solution = list(ret[0])
-    sharding_spec_dict, origin_spec_dict = solution_annotatation_pass(gm, solution, device_mesh)
-    shape_consistency_pass(gm)
-    gm.recompile()
-    nodes = [node for node in gm.graph.nodes]
-    # TODO: wrap the gm to avoid the influence of the user training code
-    output = gm(input, sharding_spec_dict, origin_spec_dict)
-    assert output.equal(origin_output)
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_apply():
-    world_size = 4
-    run_func = partial(check_apply, world_size=world_size, port=free_port())
-    mp.spawn(run_func, nprocs=world_size)
-
-
-if __name__ == '__main__':
-    test_apply()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver.py
deleted file mode 100644
index baa70727a2e5..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from copy import deepcopy
-
-import pytest
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated import Solver
-from colossalai.auto_parallel.tensor_shard.deprecated.cost_graph import CostGraph
-from colossalai.auto_parallel.tensor_shard.deprecated.graph_analysis import GraphAnalyser
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, c_in, c_out):
-        super().__init__()
-        self.conv1 = nn.Conv2d(c_in, c_out, kernel_size=3)
-        self.conv2 = nn.Conv2d(c_out, c_out, kernel_size=3)
-        self.conv3 = nn.Conv2d(c_out, c_out, kernel_size=3)
-        self.relu = nn.ReLU()
-
-    def forward(self, x):
-        x = x * 2
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = x / 2
-        x = self.conv3(x)
-        x = self.relu(x)
-        return x
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-def test_solver():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    shape_consistency_manager = ShapeConsistencyManager()
-
-    tracer = ColoTracer()
-    model = ConvModel(16, 32)
-    input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-    #     %conv1 : [#users=1] = call_module[target=conv1](args = (%mul,), kwargs = {})
-    #     %conv2 : [#users=1] = call_module[target=conv2](args = (%conv1,), kwargs = {})
-    #     %truediv : [#users=1] = call_function[target=operator.truediv](args = (%conv2, 2), kwargs = {})
-    #     %conv3 : [#users=1] = call_module[target=conv3](args = (%truediv,), kwargs = {})
-    #     %relu : [#users=1] = call_module[target=relu](args = (%conv3,), kwargs = {})
-    #     return relu
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    graph_analyser = GraphAnalyser(gm)
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    ret = solver.call_solver_serialized_args()
-
-    # [ 0 0 13 13 13 13 13 0]
-    strategies_combination_list = ret[0]
-    assert solver.leaf_strategies[2][13].name == 'S01R = S01R x RR'
-
-
-if __name__ == '__main__':
-    test_solver()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_gpt.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_gpt.py
deleted file mode 100644
index e90d6b15308c..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_gpt.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import torch
-from torch.fx import GraphModule
-import torch.nn as nn
-import pytest
-
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.auto_parallel.tensor_shard.deprecated.cost_graph import CostGraph
-from copy import deepcopy
-from colossalai.auto_parallel.tensor_shard.deprecated import Solver
-import transformers
-from colossalai.auto_parallel.tensor_shard.deprecated.constants import *
-from colossalai.auto_parallel.tensor_shard.deprecated.graph_analysis import GraphAnalyser
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-
-BATCH_SIZE = 8
-SEQ_LENGHT = 8
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-def test_cost_graph():
-    physical_mesh_id = torch.arange(0, 8)
-    mesh_shape = (2, 4)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    shape_consistency_manager = ShapeConsistencyManager()
-
-    tracer = ColoTracer()
-    config = transformers.GPT2Config(n_position=1024, n_layer=1, n_head=12)
-    model = transformers.GPT2LMHeadModel(config=config)
-    input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGHT), dtype=torch.int64)
-    token_type_ids = torch.zeros((BATCH_SIZE, SEQ_LENGHT), dtype=torch.int64)
-    attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGHT), dtype=torch.int64)
-    kwargs = dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
-    meta_args = {k: v.to('meta') for k, v in kwargs.items()}
-
-    graph = tracer.trace(root=model, meta_args=meta_args)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    graph_analyser = GraphAnalyser(gm)
-    liveness_list = graph_analyser.liveness_analysis()
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    print(graph)
-    strategies_constructor.build_strategies_and_cost()
-    for check_node, strategies_vector in strategies_constructor.strategy_map.items():
-        print(check_node, len(strategies_vector))
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    # solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser, memory_budget=1620017824.0)
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-
-    ret = solver.call_solver_serialized_args()
-    print(ret)
-    strategies_list = list(ret[0])
-    print(strategies_list)
-    computation_cost = 0
-    communication_cost = 0
-    memory_cost = 0
-    nodes = [strategies_vector.node for strategies_vector in strategies_constructor.leaf_strategies]
-    for index, node in enumerate(nodes):
-        print(node.name, node.strategies_vector[strategies_list[index]].name)
-        computation_cost += node.strategies_vector[strategies_list[index]].compute_cost
-        communication_cost += node.strategies_vector[strategies_list[index]].communication_cost
-        node_memory_cost = node.strategies_vector[strategies_list[index]].memory_cost
-        if isinstance(node_memory_cost, tuple):
-            node_memory_cost = node_memory_cost[0]
-        memory_cost += node_memory_cost
-
-    print(f'computation cost is {computation_cost}')
-    print(f'communication cost is {communication_cost}')
-    print(f'memory cost is {memory_cost}')
-
-
-if __name__ == '__main__':
-    test_cost_graph()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_mlp.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_mlp.py
deleted file mode 100644
index 415156ed6545..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_solver_with_mlp.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import torch
-from torch.fx import GraphModule
-import torch.nn as nn
-import pytest
-
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.auto_parallel.tensor_shard.deprecated.cost_graph import CostGraph
-from copy import deepcopy
-from colossalai.auto_parallel.tensor_shard.deprecated import Solver
-from torchvision.models import resnet34, resnet50
-from colossalai.auto_parallel.tensor_shard.deprecated.constants import *
-from colossalai.auto_parallel.tensor_shard.deprecated.graph_analysis import GraphAnalyser
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.testing.pytest_wrapper import run_on_environment_flag
-
-
-class MLP(torch.nn.Module):
-
-    def __init__(self, dim: int):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(dim, dim * 4)
-        self.linear2 = torch.nn.Linear(dim * 4, dim)
-        self.dropout = torch.nn.Dropout(0)
-        self.relu = torch.nn.ReLU()
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.dropout(x)
-        x = self.relu(x)
-        x = self.linear2(x)
-        return x
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-def test_cost_graph():
-    physical_mesh_id = torch.arange(0, 8)
-    mesh_shape = (2, 4)
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    shape_consistency_manager = ShapeConsistencyManager()
-
-    tracer = ColoTracer()
-    model = MLP(32)
-
-    input_sample = {'x': torch.rand(16, 32).to('meta')}
-
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %linear1 : [#users=1] = call_module[target=linear1](args = (%x,), kwargs = {})
-    #     %dropout : [#users=1] = call_module[target=dropout](args = (%linear1,), kwargs = {})
-    #     %relu : [#users=1] = call_module[target=relu](args = (%dropout,), kwargs = {})
-    #     %linear2 : [#users=1] = call_module[target=linear2](args = (%relu,), kwargs = {})
-    #     return linear2
-    graph = tracer.trace(root=model, meta_args=input_sample)
-
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-    graph_analyser = GraphAnalyser(gm)
-    liveness_list = graph_analyser.liveness_analysis()
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-    strategies_constructor.build_strategies_and_cost()
-
-    cost_graph = CostGraph(strategies_constructor.leaf_strategies)
-    cost_graph.simplify_graph()
-    # # megatron mode if no memory constraints
-    # solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
-    # all sharding on out feature dim if memory budget is not sufficient for megatron mode
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser, memory_budget=5500.0)
-
-    ret = solver.call_solver_serialized_args()
-    strategies_list = list(ret[0])
-    computation_cost = 0
-    communication_cost = 0
-    memory_cost = 0
-    for index, node in enumerate(graph.nodes):
-        print(node.name, node.strategies_vector[strategies_list[index]].name)
-        computation_cost += node.strategies_vector[strategies_list[index]].compute_cost
-        communication_cost += node.strategies_vector[strategies_list[index]].communication_cost
-        node_memory_cost = node.strategies_vector[strategies_list[index]].memory_cost
-        if isinstance(node_memory_cost, tuple):
-            node_memory_cost = node_memory_cost[0]
-        memory_cost += node_memory_cost
-
-    print(f'computation cost is {computation_cost}')
-    print(f'communication cost is {communication_cost}')
-    print(f'memory cost is {memory_cost}')
-
-
-if __name__ == '__main__':
-    test_cost_graph()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_strategies_constructor.py b/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_strategies_constructor.py
deleted file mode 100644
index 9be1a5d963a9..000000000000
--- a/tests/test_auto_parallel/test_tensor_shard/test_deprecated/test_deprecated_strategies_constructor.py
+++ /dev/null
@@ -1,103 +0,0 @@
-from copy import deepcopy
-
-import pytest
-import torch
-import torch.nn as nn
-from torch.fx import GraphModule
-
-from colossalai.auto_parallel.tensor_shard.deprecated.op_handler.conv_handler import CONV_STRATEGIES_LIST
-from colossalai.auto_parallel.tensor_shard.deprecated.options import SolverOptions
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.auto_parallel.tensor_shard.deprecated.strategies_constructor import StrategiesConstructor
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.proxy import ColoProxy
-from colossalai.fx.tracer.tracer import ColoTracer
-from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
-
-
-class ConvModel(nn.Module):
-
-    def __init__(self, c_in, c_out):
-        super().__init__()
-        self.conv = nn.Conv2d(c_in, c_out, kernel_size=3)
-
-    def forward(self, x):
-        x = x * 2
-        x = self.conv(x)
-        return x
-
-
-def test_strategies_constructor():
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    # [[0, 1]
-    #  [2, 3]]
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    entire_shape = torch.Size((4, 16, 64, 64))
-
-    tracer = ColoTracer()
-    model = ConvModel(16, 32)
-    input_sample = {'x': torch.rand(4, 16, 64, 64).to('meta')}
-    # graph():
-    #     %x : torch.Tensor [#users=1] = placeholder[target=x]
-    #     %mul : [#users=1] = call_function[target=operator.mul](args = (%x, 2), kwargs = {})
-    #     %conv_weight : [#users=1] = get_attr[target=conv.weight]
-    #     %conv_bias : [#users=1] = get_attr[target=conv.bias]
-    #     %conv2d : [#users=1] = call_function[target=torch.conv2d](args = (%mul, %conv_weight), kwargs = {groups: 1, dilation: (1, 1), stride: (1, 1), padding: (0, 0)})
-    #     %view : [#users=1] = call_method[target=view](args = (%conv_bias, [1, -1, 1, 1]), kwargs = {})
-    #     %add : [#users=1] = call_function[target=operator.add](args = (%conv2d, %view), kwargs = {})
-    #     return add
-    graph = tracer.trace(root=model, meta_args=input_sample)
-    print(graph)
-    gm = GraphModule(model, graph, model.__class__.__name__)
-    gm.recompile()
-
-    solver_options = SolverOptions(fast=True)
-    strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
-
-    assert strategies_constructor.leaf_strategies == []
-    assert strategies_constructor.strategy_map == {}
-    strategies_constructor.build_strategies_and_cost()
-
-    # check leaf_strategies
-
-    # In fast mode, placeholder node only has replica strategy.
-    assert strategies_constructor.leaf_strategies[0][0].name == 'Replica Placeholder'
-
-    # Second node is mul which is a element-wise node, therefore the output sharding spec is same as input sharding spec.
-    assert strategies_constructor.leaf_strategies[1][0].name == '[R, R, R, R] -> [R, R, R, R]_0'
-
-    # Third node is conv.
-    conv_check_list = deepcopy(CONV_STRATEGIES_LIST)
-    for strategy in strategies_constructor.leaf_strategies[4]:
-        conv_check_list.remove(strategy.name)
-    assert len(conv_check_list) == 0
-
-    # In fast mode, output node only has replica strategy.
-    assert strategies_constructor.leaf_strategies[7][0].name == 'Replica Output'
-
-    # check strategy_map
-
-    nodes = [node for node in graph.nodes]
-    # In fast mode, placeholder node only has replica strategy.
-    x = nodes[0]
-    assert strategies_constructor.strategy_map[x][0].name == 'Replica Placeholder'
-
-    # Second node is mul which is a element-wise node, therefore the output sharding spec is same as input sharding spec.
-    mul = nodes[1]
-    assert strategies_constructor.strategy_map[mul][0].name == '[R, R, R, R] -> [R, R, R, R]_0'
-
-    # fifth node is conv.
-    conv = nodes[4]
-    conv_check_list = deepcopy(CONV_STRATEGIES_LIST)
-    for strategy in strategies_constructor.strategy_map[conv]:
-        conv_check_list.remove(strategy.name)
-    assert len(conv_check_list) == 0
-
-    # In fast mode, output node only has replica strategy.
-    output = nodes[-1]
-    assert strategies_constructor.strategy_map[output][0].name == 'Replica Output'
-
-
-if __name__ == '__main__':
-    test_strategies_constructor()

From d03f4429c1155eb806d9b0763a43dfe4184a98f9 Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Wed, 15 Feb 2023 09:55:53 +0800
Subject: [PATCH 317/503] add ci (#2641)

---
 examples/images/diffusion/README.md  |  6 ++++++
 examples/images/diffusion/main.py    |  2 ++
 examples/images/diffusion/test_ci.sh | 17 +++++++++++++++++
 3 files changed, 25 insertions(+)
 mode change 100644 => 100755 examples/images/diffusion/test_ci.sh

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index 952da5d1c3b0..15932f1f524b 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -92,6 +92,12 @@ cd ColossalAI
 CUDA_EXT=1 pip install .
 ```
 
+#### Step 3: Accelerate with flash attention by xformers (Optional)
+
+```
+pip install xformers
+```
+
 ### Option #2: Use Docker
 
 To use the stable diffusion Docker image, you can either build using the provided the [Dockerfile](./docker/Dockerfile) or pull a Docker image from our Docker hub.
diff --git a/examples/images/diffusion/main.py b/examples/images/diffusion/main.py
index 5f166aa1f71f..4dd88a5eca44 100644
--- a/examples/images/diffusion/main.py
+++ b/examples/images/diffusion/main.py
@@ -539,6 +539,8 @@ def on_train_epoch_end(self, trainer, pl_module):
         raise ValueError("-n/--name and -r/--resume cannot be specified both."
                          "If you want to resume training in a new log folder, "
                          "use -n/--name in combination with --resume_from_checkpoint")
+
+    ckpt = None
     if opt.resume:
         rank_zero_info("Resuming from {}".format(opt.resume))
         if not os.path.exists(opt.resume):
diff --git a/examples/images/diffusion/test_ci.sh b/examples/images/diffusion/test_ci.sh
old mode 100644
new mode 100755
index e69de29bb2d1..51ceeb41d47e
--- a/examples/images/diffusion/test_ci.sh
+++ b/examples/images/diffusion/test_ci.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -euxo pipefail
+
+conda env create -f environment.yaml
+
+conda activate ldm
+
+conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
+pip install transformers diffusers invisible-watermark
+
+CUDA_EXT=1  pip install colossalai
+
+pip install pytorch-lightning
+
+wget https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt
+
+python main.py --logdir /tmp --train --base configs/Teyvat/train_colossalai_teyvat.yaml --ckpt 512-base-ema.ckpt

From b3d10db5f1bc58f79e6eeb010b76612aeb299730 Mon Sep 17 00:00:00 2001
From: Zihao <804673818@qq.com>
Date: Wed, 15 Feb 2023 09:57:22 +0800
Subject: [PATCH 318/503] [NFC] polish colossalai/cli/launcher/__init__.py code
 style (#2709)

---
 colossalai/cli/launcher/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/colossalai/cli/launcher/__init__.py b/colossalai/cli/launcher/__init__.py
index 4ada68b4b68f..8d9ec147d401 100644
--- a/colossalai/cli/launcher/__init__.py
+++ b/colossalai/cli/launcher/__init__.py
@@ -1,7 +1,9 @@
 import click
-from .run import launch_multi_processes
+
 from colossalai.context import Config
 
+from .run import launch_multi_processes
+
 
 @click.command(help="Launch distributed training on a single node or multiple nodes",
                context_settings=dict(ignore_unknown_options=True))

From 89f8975fb8f0ca7e637fce33f0d13248094ccb4d Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 15 Feb 2023 10:12:55 +0800
Subject: [PATCH 319/503] [workflow] fixed tensor-nvme build caching (#2711)

---
 .github/workflows/build_on_pr.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index c7882db6ec61..f595e677394a 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -96,6 +96,7 @@ jobs:
 
       - name: Store TensorNVMe Cache
         run: |
+          cd TensorNVMe
           cp -p -r ./build /github/home/tensornvme_cache/
 
       - name: Checkout Colossal-AI

From cb2c6a2415d6da40ad694e3c7a7b3dae647ac073 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 15 Feb 2023 10:36:19 +0800
Subject: [PATCH 320/503] [autoparallel] refactor runtime pass (#2644)

* [autoparallel] refactor runtime pass

* add unit test

* polish
---
 colossalai/auto_parallel/passes/constants.py  |   5 +
 .../passes/runtime_preparation_pass.py        | 437 +++++++++---------
 .../test_pass/test_node_converting_pass.py    |  54 +++
 .../test_size_value_converting_pass.py        |  65 +++
 .../test_node_handler/test_linear_handler.py  |   3 -
 5 files changed, 351 insertions(+), 213 deletions(-)
 create mode 100644 tests/test_auto_parallel/test_pass/test_node_converting_pass.py
 create mode 100644 tests/test_auto_parallel/test_pass/test_size_value_converting_pass.py

diff --git a/colossalai/auto_parallel/passes/constants.py b/colossalai/auto_parallel/passes/constants.py
index b86088474644..485a87492f4c 100644
--- a/colossalai/auto_parallel/passes/constants.py
+++ b/colossalai/auto_parallel/passes/constants.py
@@ -6,3 +6,8 @@
     torch.nn.ReLU,
     torch.nn.Softmax,
 ]
+
+# SHAPE_ARGUMENT_OPS contains node with (input, *shape) style args.
+# This list could be extended if any other method has the same
+# argument style as view and reshape.
+SHAPE_ARGUMENT_OPS = [torch.Tensor.view, torch.Tensor.reshape, torch.reshape]
diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index ecf3f1f18de5..bb419be35e55 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -19,6 +19,8 @@
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
 from colossalai.tensor.sharding_spec import ShardingSpec
 
+from .constants import SHAPE_ARGUMENT_OPS
+
 shape_consistency_manager = ShapeConsistencyManager()
 
 
@@ -51,23 +53,16 @@ def size_processing(size: Union[int, torch.Size],
     return size
 
 
-def _solution_annotatation(gm: torch.fx.GraphModule,
-                           solution: List[int],
-                           strategies_constructor: StrategiesConstructor = None):
+def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int],
+                               strategies_constructor: StrategiesConstructor):
     """
     This method is used to stick the solution strategy to the nodes and add the information
     required in runtime into graph as placeholder nodes.
     """
     mod_graph = gm.graph
-    # TODO: In future PR, strategies_constructor should be a required argument,
-    # instead of optional argument. This is because we don't need to consider nodes with
-    # no strategy in runtime preparation pass.
-    if strategies_constructor is not None:
-        nodes = [strategies_vector.node for strategies_vector in strategies_constructor.leaf_strategies]
-        no_strategy_nodes = strategies_constructor.no_strategy_nodes
-    else:
-        nodes = tuple(mod_graph.nodes)
-        no_strategy_nodes = []
+
+    nodes = [strategies_vector.node for strategies_vector in strategies_constructor.leaf_strategies]
+    no_strategy_nodes = strategies_constructor.no_strategy_nodes
 
     # the dict to get origin sharding spec of node
     origin_node_sharding_spec_dict = {}
@@ -97,6 +92,7 @@ def _solution_annotatation(gm: torch.fx.GraphModule,
             target_sharding_specs.append(target_sharding_spec)
         sharding_spec_convert_dict[index] = target_sharding_specs
         setattr(node, 'target_sharding_specs', target_sharding_specs)
+
         # the get_attr node strategy is kind of pending strategy, which means we will change it
         # to the same strategy of the user node.
         if node.op == 'get_attr':
@@ -134,7 +130,7 @@ def _solution_annotatation(gm: torch.fx.GraphModule,
     return gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict
 
 
-def _size_value_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
+def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
     """
     In the auto parallel system, tensors may get shard on different devices, so the size of tensors
     need to be converted to the size of original tensor and managed by the users, such as torch.view,
@@ -145,6 +141,80 @@ def _size_value_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
     nodes = tuple(mod_graph.nodes)
     node_pairs = {}
 
+    # DeviceMesh information instructs the scaling of the size value
+    device_mesh_info = {}
+    for dim, dim_size in enumerate(device_mesh.mesh_shape):
+        device_mesh_info[dim] = dim_size
+
+    def _extract_target_dim(node):
+        '''
+        A helper function to extract the target dimension from size node.
+        There are two usages of torch.Tensor.size:
+        1. tensor.size()
+        2. tensor.size(dim)
+
+        If a target_dim is assigned, then the output will be in type of int, instead of torch.Size.
+        Otherwise, the output will be in type of torch.Size and this function will return None.
+        '''
+        target_dim = None
+        if len(node.args) > 1:
+            target_dim = node.args[1]
+            if target_dim < 0:
+                target_dim += node.args[0]._meta_data.dim()
+        return target_dim
+
+    def _post_processing(node, size_processing_node):
+        '''
+        This function is used to process the dependency between the size node and its users after
+        inserting the size_process_node.
+        '''
+        # store original node and processing node pair in node_pairs dictionary
+        # It will be used to replace the original node with processing node in slice object
+        node_pairs[node] = size_processing_node
+        size_processing_node._meta_data = node._meta_data
+        if 'activation_checkpoint' in node.meta:
+            size_processing_node.meta['activation_checkpoint'] = node.meta['activation_checkpoint']
+
+        user_list = list(node.users.keys())
+        for user in user_list:
+            if user == size_processing_node:
+                continue
+            new_args = list(user.args)
+            new_kwargs = dict(user.kwargs)
+            # the origin node may be a positional argument or key word argument of user node
+            if node in new_args:
+                # substitute the origin node with size_processing_node
+                new_args[new_args.index(node)] = size_processing_node
+                user.args = tuple(new_args)
+            elif str(node) in new_kwargs:
+                # substitute the origin node with size_processing_node
+                new_kwargs[str(node)] = size_processing_node
+                user.kwargs = new_kwargs
+
+    def _update_slice_object_args(slice_object):
+        '''
+        This function is used to update the slice object argument list.
+        If the slice object contains a size node recorded in node_pairs, that node will be replaced with its size_processing node.
+        '''
+        if isinstance(slice_object, slice):
+            start = slice_object.start
+            stop = slice_object.stop
+            step = slice_object.step
+            if start in node_pairs:
+                start = node_pairs[start]
+            if stop in node_pairs:
+                stop = node_pairs[stop]
+            if step in node_pairs:
+                step = node_pairs[step]
+            return slice(start, stop, step)
+        elif isinstance(slice_object, int):
+            if slice_object in node_pairs:
+                return node_pairs[slice_object]
+            else:
+                return slice_object
+        else:
+            raise RuntimeError(f"Unsupported slice object type: {type(slice_object)}")
+
     for node in nodes:
 
         if node.op == 'call_method' and node.target == 'size':
@@ -154,49 +224,15 @@ def _size_value_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
             sharding_spec = node.args[0].sharding_spec
             dim_partition_dict = sharding_spec.dim_partition_dict
 
-            # there are two usages of torch.Tensor.size:
-            #   tensor.size()
-            #   tensor.size(dim)
-            # if a target_dim is assigned, then the output will be
-            # in type of int, instead of torch.Size
-            target_dim = None
-            if len(node.args) > 1:
-                target_dim = node.args[1]
-                if target_dim < 0:
-                    target_dim += node.args[0]._meta_data.dim()
-
-            # DeviceMesh information instructs the scaling of the size value
-            device_mesh_info = {}
-            for dim, dim_size in enumerate(device_mesh.mesh_shape):
-                device_mesh_info[dim] = dim_size
+            target_dim = _extract_target_dim(node)
 
+            # insert size_processing node
             with mod_graph.inserting_after(node):
                 size_processing_node = mod_graph.create_node('call_function',
                                                              size_processing,
                                                              args=(node, dim_partition_dict, device_mesh_info,
                                                                    target_dim, node.name))
-                # store original node and processing node pair in node_pairs dictioanry
-                # It will be used to replace the original node with processing node in slice object
-                node_pairs[node] = size_processing_node
-                size_processing_node._meta_data = node._meta_data
-                if 'activation_checkpoint' in node.meta:
-                    size_processing_node.meta['activation_checkpoint'] = node.meta['activation_checkpoint']
-
-            user_list = list(node.users.keys())
-            for user in user_list:
-                if user == size_processing_node:
-                    continue
-                new_args = list(user.args)
-                new_kwargs = dict(user.kwargs)
-                # the origin node may be a positional argument or key word argument of user node
-                if node in new_args:
-                    # substitute the origin node with size_processing_node
-                    new_args[new_args.index(node)] = size_processing_node
-                    user.args = tuple(new_args)
-                elif str(node) in new_kwargs:
-                    # substitute the origin node with size_processing_node
-                    new_kwargs[str(node)] = size_processing_node
-                    user.kwargs = new_kwargs
+            _post_processing(node, size_processing_node)
 
         if node.op == 'call_function' and node.target == operator.getitem:
 
@@ -217,14 +253,7 @@ def _size_value_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
             # In this pass, we need process the last two cases because
             # node arguments may potentially appear in these cases.
             if isinstance(getitem_index, slice):
-                new_start, new_stop, new_step = getitem_index.start, getitem_index.stop, getitem_index.step
-                if getitem_index.start in node_pairs:
-                    new_start = node_pairs[getitem_index.start]
-                elif getitem_index.stop in node_pairs:
-                    new_stop = node_pairs[getitem_index.stop]
-                elif getitem_index.step in node_pairs:
-                    new_step = node_pairs[getitem_index.step]
-                new_slice_item = slice(new_start, new_stop, new_step)
+                new_slice_item = _update_slice_object_args(getitem_index)
                 new_args = (node.args[0], new_slice_item)
                 node.args = new_args
 
@@ -237,16 +266,7 @@ def _size_value_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
                     if slice_item is None:
                         new_slice_items.append(None)
                         continue
-
-                    new_start, new_stop, new_step = slice_item.start, slice_item.stop, slice_item.step
-
-                    if slice_item.start in node_pairs:
-                        new_start = node_pairs[slice_item.start]
-                    elif slice_item.stop in node_pairs:
-                        new_stop = node_pairs[slice_item.stop]
-                    elif slice_item.step in node_pairs:
-                        new_step = node_pairs[slice_item.step]
-                    new_slice_item = slice(new_start, new_stop, new_step)
+                    new_slice_item = _update_slice_object_args(slice_item)
                     new_slice_items.append(new_slice_item)
 
                 new_args = (node.args[0], tuple(new_slice_items))
@@ -255,104 +275,109 @@ def _size_value_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
     return gm
 
 
-def _node_args_converting(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
+def node_args_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh):
     """
     This pass will process node args to adapt the distributed tensor layout.
     """
     mod_graph = gm.graph
     nodes = tuple(mod_graph.nodes)
 
-    for node in nodes:
-        # skip the placeholder node added in _solution_annotation pass
-        if not hasattr(node, 'sharding_spec'):
-            continue
-
-        def _process_sharding_spec(sharding_spec):
-            if isinstance(sharding_spec, ShardingSpec):
-                dim_partition_dict = sharding_spec.dim_partition_dict
-                device_mesh = sharding_spec.device_mesh
-                return dim_partition_dict, device_mesh
-            if sharding_spec is None:
-                return None, None
-            assert isinstance(sharding_spec,
-                              (tuple, list)), 'sharding_spec should be type of ShardingSpec, tuple, list or None'
-
-            device_mesh = sharding_spec[0].device_mesh
-            dim_partition_dict = []
-            for element in sharding_spec:
-                dim_partition_dict.append(_process_sharding_spec(element))
-            return dim_partition_dict, sharding_spec
-
-        output_dim_partition_dict, device_mesh = _process_sharding_spec(node.sharding_spec)
+    def _extract_info_from_sharding_spec(sharding_spec):
+        '''
+        This function is used to extract the dim_partition_dict and device_mesh from
+        sharding spec instance or a list of sharding spec.
+        '''
+        if isinstance(sharding_spec, ShardingSpec):
+            dim_partition_dict = sharding_spec.dim_partition_dict
+            device_mesh = sharding_spec.device_mesh
+            return dim_partition_dict, device_mesh
+        if sharding_spec is None:
+            return None, None
+        assert isinstance(sharding_spec,
+                          (tuple, list)), 'sharding_spec should be type of ShardingSpec, tuple, list or None'
+
+        device_mesh = sharding_spec[0].device_mesh
+        dim_partition_dict = []
+        for element in sharding_spec:
+            dim_partition_dict.append(_extract_info_from_sharding_spec(element))
+        return dim_partition_dict, sharding_spec
+
+    def _process_node_arguments(node):
         new_args = []
+        for arg in node.args:
+            # There are two args style:
+            # 1. (input, *shape)
+            # 2. (input, shape)
+            # We will extract the elements from shape and add them into the new_args
+            # Finally, the args style of new_args will be unified to (input, *shape)
+            if isinstance(arg, Node):
+                if isinstance(arg._meta_data, (tuple, list)):
+                    new_args.extend(arg._meta_data)
+                elif isinstance(arg._meta_data, int):
+                    new_args.append(arg._meta_data)
+                else:
+                    new_args.append(arg)
+            else:
+                assert isinstance(arg,
+                                  (int, tuple, list)), 'The argument in view node should be either type of Node or int.'
+                if isinstance(arg, (tuple, list)):
+                    new_args.extend(arg)
+                else:
+                    new_args.append(arg)
+        return new_args
+
+    def _scale_args_adapt_sharding_spec(dim_partition_dict, device_mesh, node):
+        new_args = _process_node_arguments(node)
+        if node.op == 'call_method':
+            args_to_process = list(new_args[1:])
+        else:
+            args_to_process = list(new_args)
+        for dim, shard_dims in dim_partition_dict.items():
+            total_shard_size = 1
+            for shard_dim in shard_dims:
+                total_shard_size *= device_mesh.shape[shard_dim]
+
+            # we will skip the dim with -1 value
+            if args_to_process[dim] == -1:
+                continue
+            else:
+                # TODO: add assertion here to make sure the dim size is divisible by total_shard_size
+                args_to_process[dim] //= total_shard_size
+
+        args_to_process = tuple(args_to_process)
 
         if node.op == 'call_method':
-            method = getattr(node.args[0]._meta_data.__class__, node.target)
-            # process the node with (input, *shape) style args
-            if method in (torch.Tensor.view, torch.Tensor.reshape):
-
-                for arg in node.args:
-                    if isinstance(arg, Node):
-                        if isinstance(arg._meta_data, (int, tuple, list)):
-                            new_args.append(arg._meta_data)
-                        else:
-                            new_args.append(arg)
-                    else:
-                        assert isinstance(
-                            arg, (int, tuple, list)), 'The argument in view node should be either type of Node or int.'
-                        new_args.append(arg)
-
-                for dim, shard_dims in output_dim_partition_dict.items():
-                    total_shard_size = 1
-                    for shard_dim in shard_dims:
-                        total_shard_size *= device_mesh.shape[shard_dim]
-                    # There are two ways to use torch.view:
-                    # 1. torch.view(input, *shape)
-                    # 2. torch.view(input, shape)
-                    if isinstance(new_args[1], int):
-                        # we will skip the dim with -1 value
-                        if new_args[dim + 1] == -1:
-                            continue
-                        else:
-                            new_args[dim + 1] //= total_shard_size
-                    else:
-                        new_args[1] = list(new_args[1])
-                        # we will skip the dim with -1 value
-                        if new_args[1][dim] == -1:
-                            continue
-                        else:
-                            new_args[1][dim] //= total_shard_size
-                node.args = tuple(new_args)
+            new_args = (new_args[0],) + args_to_process
+        else:
+            new_args = args_to_process
+
+        node.args = new_args
 
+    def _filter_node_with_shape_args(node):
+        if node.op == 'call_method':
+            target = getattr(node.args[0]._meta_data.__class__, node.target)
         elif node.op == 'call_function':
             target = node.target
-            # process the node with (input, torch.Size) style args
-            if target in (torch.reshape,):
-                for arg in node.args:
-                    if isinstance(arg, Node):
-                        if isinstance(arg._meta_data, (tuple, list)):
-                            new_args.append(list(arg._meta_data))
-                        else:
-                            new_args.append(arg)
-                    else:
-                        assert isinstance(
-                            arg, (tuple, list)), 'The argument in reshape node should be either type of Node or tuple.'
-                        new_args.append(list(arg))
-
-                for dim, shard_dims in output_dim_partition_dict.items():
-                    # we will skip the dim with -1 value
-                    if new_args[1][dim] == -1:
-                        continue
-                    total_shard_size = 1
-                    for shard_dim in shard_dims:
-                        total_shard_size *= device_mesh.shape[shard_dim]
-                    new_args[1][dim] //= total_shard_size
-                node.args = tuple(new_args)
+        else:
+            target = None
+
+        if target in SHAPE_ARGUMENT_OPS:
+            return True
+        return False
+
+    for node in nodes:
+        # skip the placeholder node added in _solution_annotation pass
+        if not hasattr(node, 'sharding_spec'):
+            continue
+
+        output_dim_partition_dict, device_mesh = _extract_info_from_sharding_spec(node.sharding_spec)
+        if _filter_node_with_shape_args(node):
+            _scale_args_adapt_sharding_spec(output_dim_partition_dict, device_mesh, node)
 
     return gm
 
 
-def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, overlap=False):
+def module_params_sharding_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, overlap=False):
     """
     Apply the sharding action to the module parameters and buffers following the
     instructions of solver solution.
@@ -361,6 +386,49 @@ def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, o
     nodes = tuple(mod_graph.nodes)
     # This stream is created for overlaping the communication and computation.
     reduction_stream = torch.cuda.Stream()
+
+    def _add_hook_for_grad_communication(node, param):
+
+        comm_actions = node.best_strategy.communication_actions
+
+        def _filter_param_to_hook(node, op_data, comm_action):
+            if node.op == 'call_module' and op_data.type == OperationDataType.PARAM and op_data.name == param.name and comm_action.comm_type == CommType.HOOK:
+                return True
+            if node.op == 'get_attr' and isinstance(
+                    node._meta_data, torch.nn.parameter.Parameter) and comm_action.comm_type == CommType.HOOK:
+                return True
+            return False
+
+        for operation_data, comm_action in comm_actions.items():
+            comm_spec_to_use = comm_action.comm_spec
+            # register hook to the parameters
+            if _filter_param_to_hook(node, operation_data, comm_action):
+
+                def wrapper(param, comm_spec, stream, overlap):
+
+                    def hook_fn(grad):
+                        if overlap:
+                            with torch.cuda.stream(stream):
+                                _all_reduce(grad, comm_spec, async_op=True)
+                        else:
+                            _all_reduce(grad, comm_spec, async_op=False)
+
+                    param.register_hook(hook_fn)
+
+                wrapper(param, comm_spec_to_use, reduction_stream, overlap=overlap)
+
+    def _shard_param(param, target_sharding_spec):
+        # apply the sharding spec of parameters
+        if target_sharding_spec.dim_partition_dict != {}:
+            origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {})
+            setattr(param, 'sharding_spec', origin_sharding_spec)
+            # TODO: build a ColoParamter class to manager the distributed parameters
+            # we could use .data here, because all the operations just happen before the real training
+            # loop, so we don't need to track these operations in the autograd graph.
+            param = torch.nn.Parameter(
+                        shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec,
+                                                                                 target_sharding_spec).detach().clone())
+
     for node in nodes:
         if node.op == 'call_module':
             target_module = node.graph.owning_module.get_submodule(node.target)
@@ -370,36 +438,10 @@ def _module_params_sharding(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, o
             setattr(target_module, 'processed', True)
             for name, param in target_module.named_parameters():
                 target_sharding_spec = node.best_strategy.get_sharding_spec_by_name(name)
-                # apply the sharding spec of parameters
-                if target_sharding_spec.dim_partition_dict != {}:
-                    origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {})
-                    setattr(param, 'sharding_spec', origin_sharding_spec)
-                    # TODO: build a ColoParamter class to manager the distributed parameters
-                    # we could use .data here, because all the operations just happen before the real training
-                    # loop, so we don't need to track these operations in the autograd graph.
-                    param = torch.nn.Parameter(
-                        shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec,
-                                                                                 target_sharding_spec).detach().clone())
+                _shard_param(param, target_sharding_spec)
 
                 setattr(target_module, name, param)
-                comm_actions = node.best_strategy.communication_actions
-                for operation_data, comm_action in comm_actions.items():
-                    comm_spec_to_use = comm_action.comm_spec
-                    # register hook to the parameters
-                    if operation_data.type == OperationDataType.PARAM and operation_data.name == name and comm_action.comm_type == CommType.HOOK:
-
-                        def wrapper(param, comm_spec, stream, overlap):
-
-                            def hook_fn(grad):
-                                if overlap:
-                                    with torch.cuda.stream(stream):
-                                        _all_reduce(grad, comm_spec, async_op=True)
-                                else:
-                                    _all_reduce(grad, comm_spec, async_op=False)
-
-                            param.register_hook(hook_fn)
-
-                        wrapper(param, comm_spec_to_use, reduction_stream, overlap=overlap)
+                _add_hook_for_grad_communication(node, param)
 
             sharded_buffer_dict = {}
             # apply the sharding spec of buffers
@@ -427,37 +469,12 @@ def hook_fn(grad):
                 target = getattr(target_module, atoms[-1])
 
             target_sharding_spec = node.sharding_spec
-            if target_sharding_spec.dim_partition_dict != {}:
-                origin_sharding_spec = ShardingSpec(device_mesh, target.shape, {})
-                setattr(target, 'sharding_spec', origin_sharding_spec)
-                # TODO: build a ColoParamter class to manager the distributed parameters
-                # we could use .data here, because all the operations just happen before the real training
-                # loop, so we don't need to track these operations in the autograd graph.
-                target = torch.nn.Parameter(
-                    shape_consistency_manager.apply_for_autoparallel_runtime(target.data, target.sharding_spec,
-                                                                             target_sharding_spec).detach().clone())
+            _shard_param(target, target_sharding_spec)
 
             assert hasattr(target_module, atoms[-1])
             setattr(target_module, atoms[-1], target)
+            _add_hook_for_grad_communication(node, target)
 
-            comm_actions = node.best_strategy.communication_actions
-            for operation_data, comm_action in comm_actions.items():
-                comm_spec_to_use = comm_action.comm_spec
-                # register hook to the parameters
-                if isinstance(node._meta_data, torch.nn.parameter.Parameter) and comm_action.comm_type == CommType.HOOK:
-
-                    def wrapper(param, comm_spec, stream, overlap):
-
-                        def hook_fn(grad):
-                            if overlap:
-                                with torch.cuda.stream(stream):
-                                    _all_reduce(grad, comm_spec, async_op=True)
-                            else:
-                                _all_reduce(grad, comm_spec, async_op=False)
-
-                        param.register_hook(hook_fn)
-
-                    wrapper(target, comm_spec_to_use, reduction_stream, overlap=overlap)
     return gm
 
 
@@ -471,14 +488,14 @@ def implicit_comm_action_apply(gm: torch.fx.GraphModule):
 def runtime_preparation_pass(gm: torch.fx.GraphModule,
                              solution: List[int],
                              device_mesh: DeviceMesh,
-                             strategies_constructor: StrategiesConstructor = None,
+                             strategies_constructor: StrategiesConstructor,
                              overlap=False):
-    gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = _solution_annotatation(
+    gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotatation_pass(
         gm, solution, strategies_constructor)
-    gm = _size_value_converting(gm, device_mesh)
-    gm = _node_args_converting(gm, device_mesh)
+    gm = size_value_converting_pass(gm, device_mesh)
+    gm = node_args_converting_pass(gm, device_mesh)
     # TODO: the pass below should be uncommented after the implementation of implicit_comm_action_apply_pass completed.
     # gm = implicit_comm_action_apply(gm)
-    gm = _module_params_sharding(gm, device_mesh, overlap=overlap)
+    gm = module_params_sharding_pass(gm, device_mesh, overlap=overlap)
 
     return gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict
diff --git a/tests/test_auto_parallel/test_pass/test_node_converting_pass.py b/tests/test_auto_parallel/test_pass/test_node_converting_pass.py
new file mode 100644
index 000000000000..d0d107610f7a
--- /dev/null
+++ b/tests/test_auto_parallel/test_pass/test_node_converting_pass.py
@@ -0,0 +1,54 @@
+import torch
+import torch.nn.functional as F
+
+from colossalai.auto_parallel.passes.runtime_preparation_pass import node_args_converting_pass
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.tracer import ColoTracer
+from colossalai.tensor.sharding_spec import ShardingSpec
+
+
+class TestModule(torch.nn.Module):
+
+    def forward(self, x):
+        x = x.view(4, 4, 2)
+        return x
+
+
+def insert_narrow(gm, x_node):
+    graph = gm.graph
+    with graph.inserting_after(x_node):
+        shard_node = graph.create_node('call_method', 'narrow', args=(x_node, 0, 0, 2), kwargs={})
+    view_node = list(x_node.users.keys())[0]
+    new_args = list(view_node.args)
+    new_args[0] = shard_node
+    view_node.args = tuple(new_args)
+    return gm
+
+
+def test_node_args_converting_pass():
+    model = TestModule()
+    physical_mesh_id = torch.arange(0, 4)
+    mesh_shape = (2, 2)
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+    meta_args = {'x': torch.rand(4, 8).to('meta')}
+    input = torch.rand(4, 8)
+    tracer = ColoTracer()
+    graph = tracer.trace(root=model, meta_args=meta_args)
+
+    x_node = list(graph.nodes)[0]
+    view_node = list(graph.nodes)[1]
+    sharding_spec = ShardingSpec(device_mesh, entire_shape=(4, 8), dim_partition_dict={0: [0]})
+    setattr(x_node, 'sharding_spec', sharding_spec)
+    setattr(view_node, 'sharding_spec', sharding_spec)
+
+    gm = ColoGraphModule(model, graph)
+    gm = node_args_converting_pass(gm, device_mesh)
+    gm = insert_narrow(gm, x_node)
+    gm.recompile()
+    output = gm(input)
+    assert output.shape == torch.Size([2, 4, 2])
+
+
+if __name__ == '__main__':
+    test_node_args_converting_pass()
diff --git a/tests/test_auto_parallel/test_pass/test_size_value_converting_pass.py b/tests/test_auto_parallel/test_pass/test_size_value_converting_pass.py
new file mode 100644
index 000000000000..3494830080ff
--- /dev/null
+++ b/tests/test_auto_parallel/test_pass/test_size_value_converting_pass.py
@@ -0,0 +1,65 @@
+import torch
+import torch.nn.functional as F
+
+from colossalai.auto_parallel.passes.runtime_preparation_pass import size_value_converting_pass
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.tracer import ColoTracer
+from colossalai.tensor.sharding_spec import ShardingSpec
+
+
+class TestModule(torch.nn.Module):
+
+    def forward(self, x):
+        size = x.size()
+        return size
+
+
+def insert_narrow(gm, x_node):
+    graph = gm.graph
+    with graph.inserting_after(x_node):
+        shard_node = graph.create_node('call_method', 'narrow', args=(x_node, 0, 0, 2), kwargs={})
+    size_node = list(x_node.users.keys())[0]
+    size_node.args = (shard_node,)
+    return gm
+
+
+def recover_narrow(gm, narrow_node):
+    graph = gm.graph
+    size_node = list(graph.nodes)[2]
+    x_node = narrow_node.args[0]
+    size_node.args = (x_node,)
+    graph.erase_node(narrow_node)
+    return gm
+
+
+def test_size_value_converting_pass():
+    model = TestModule()
+    physical_mesh_id = torch.arange(0, 4)
+    mesh_shape = (2, 2)
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+    meta_args = {'x': torch.rand(4, 8).to('meta')}
+    input = torch.rand(4, 8)
+    tracer = ColoTracer()
+    graph = tracer.trace(root=model, meta_args=meta_args)
+
+    x_node = list(graph.nodes)[0]
+    x_sharding_spec = ShardingSpec(device_mesh, entire_shape=(4, 8), dim_partition_dict={0: [0]})
+    setattr(x_node, 'sharding_spec', x_sharding_spec)
+    gm = ColoGraphModule(model, graph)
+    gm = insert_narrow(gm, x_node)
+    gm.recompile()
+    size = gm(input)
+    assert size == torch.Size([2, 8])
+
+    narrow_node = list(gm.graph.nodes)[1]
+    gm = recover_narrow(gm, narrow_node)
+    gm = size_value_converting_pass(gm, device_mesh)
+    gm = insert_narrow(gm, x_node)
+    gm.recompile()
+    size = gm(input)
+    assert size == torch.Size([4, 8])
+
+
+if __name__ == '__main__':
+    test_size_value_converting_pass()
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_linear_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_linear_handler.py
index 3d268ea43fc3..18afacf56b8e 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_linear_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_linear_handler.py
@@ -1,12 +1,9 @@
-from faulthandler import disable
 from functools import partial
-from xml.dom import WrongDocumentErr
 
 import pytest
 import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
-from typing_extensions import Self
 
 from colossalai.auto_parallel.tensor_shard.node_handler import LinearFunctionHandler, LinearModuleHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import (

From 4603538dddc7957bc3ebc29caa066471da2417ba Mon Sep 17 00:00:00 2001
From: Ziyue Jiang <ziyue.jiang97@gmail.com>
Date: Wed, 15 Feb 2023 10:53:38 +0800
Subject: [PATCH 321/503] [NFC] polish
 colossalai/context/process_group_initializer/initializer_sequence.py code
 style (#2712)

Co-authored-by: Ziyue Jiang <ziyue.jiang@gmail.com>
---
 .../context/process_group_initializer/initializer_sequence.py  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/context/process_group_initializer/initializer_sequence.py b/colossalai/context/process_group_initializer/initializer_sequence.py
index 682fe4bb7633..eaacb14d2282 100644
--- a/colossalai/context/process_group_initializer/initializer_sequence.py
+++ b/colossalai/context/process_group_initializer/initializer_sequence.py
@@ -3,9 +3,10 @@
 import torch.distributed as dist
 
 from colossalai.registry import DIST_GROUP_INITIALIZER
+
+from ..parallel_mode import ParallelMode
 from .initializer_tensor import Initializer_Tensor
 from .process_group_initializer import ProcessGroupInitializer
-from ..parallel_mode import ParallelMode
 
 
 @DIST_GROUP_INITIALIZER.register_module

From f6b4ca4e6cc2e7822e38cdc61da0566aee129828 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Wed, 15 Feb 2023 10:53:54 +0800
Subject: [PATCH 322/503] [devops] add chatgpt ci (#2713)

---
 .github/workflows/run_chatgpt_examples.yml   | 41 ++++++++++++++++++++
 .github/workflows/run_chatgpt_unit_tests.yml | 41 ++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 .github/workflows/run_chatgpt_examples.yml
 create mode 100644 .github/workflows/run_chatgpt_unit_tests.yml

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
new file mode 100644
index 000000000000..9d7c1ff99d92
--- /dev/null
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -0,0 +1,41 @@
+name: Run ChatGPT examples
+
+on:
+  pull_request:
+    types: [synchronize, opened, reopened]
+    paths:
+      - 'applications/ChatGPT/chatgpt/**'
+      - 'applications/ChatGPT/requirements.txt'
+      - 'applications/ChatGPT/setup.py'
+      - 'applications/ChatGPT/examples/**'
+
+
+jobs:
+  tests:
+    name: Run ChatGPT examples
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt
+    timeout-minutes: 30
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout ColossalAI
+        uses: actions/checkout@v2
+
+      - name: Install ColossalAI and ChatGPT
+        run: |
+          pip install -v .
+          cd applications/ChatGPT
+          pip install -v .
+          pip install -r examples/requirements.txt
+
+      - name: Execute Examples
+        run: |
+          ./examples/test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
+          MAX_JOBS: 8
+          PROMPT_PATH: /data/scratch/chatgpt/prompts.csv
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
new file mode 100644
index 000000000000..3ac0d2d8ca0b
--- /dev/null
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -0,0 +1,41 @@
+name: Run ChatGPT unit tests
+
+on:
+  pull_request:
+    types: [synchronize, opened, reopened]
+    paths:
+      - 'applications/ChatGPT/chatgpt/**'
+      - 'applications/ChatGPT/requirements.txt'
+      - 'applications/ChatGPT/setup.py'
+      - 'applications/ChatGPT/requirements-test.txt'
+      - 'applications/ChatGPT/tests/**'
+      - 'applications/ChatGPT/pytest.ini'
+
+jobs:
+  tests:
+    name: Run ChatGPT unit tests
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt
+    timeout-minutes: 30
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout ColossalAI
+        uses: actions/checkout@v2
+
+      - name: Install ColossalAI and ChatGPT
+        run: |
+          pip install -v .
+          cd applications/ChatGPT
+          pip install -v .
+          pip install -r requirements-test.txt
+
+      - name: Execute Unit Testing
+        run: |
+          pytest tests/
+        env:
+          NCCL_SHM_DISABLE: 1
+          MAX_JOBS: 8

From d4d3387f452a26720506ac75cca4e754987eb748 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Wed, 15 Feb 2023 11:08:35 +0800
Subject: [PATCH 323/503] [doc] add open-source contribution invitation (#2714)

* [doc] fix typo

* [doc] add invitation
---
 README.md                      |  2 +-
 applications/ChatGPT/README.md | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 20a5f2606ca5..c2ad6ffc78fa 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
    [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)
 
-   Colossal-AI: Make big AI models cheaper, easier, and scalable
+   Colossal-AI: Making big AI models cheaper, easier, and scalable
 
    <h3> <a href="https://arxiv.org/abs/2110.14883"> Paper </a> |
    <a href="https://www.colossalai.org/"> Documentation </a> |
diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
index 43085f3abfa6..b3ea239a9919 100644
--- a/applications/ChatGPT/README.md
+++ b/applications/ChatGPT/README.md
@@ -60,6 +60,19 @@ We also support training reward model with true-world data. See `examples/train_
 - [ ] integrate with Ray
 - [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL)
 
+## Invitation to open-source contribution
+Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing power, datasets, or models are welcome to join and build an ecosystem with Colossal-AI, making efforts towards the era of big AI models from the starting point of replicating ChatGPT!
+
+You may contact us or participate in the following ways:
+1. Post an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) or submit a [PR](https://github.com/hpcaitech/ColossalAI/pulls) on GitHub
+2. Join the Colossal-AI community on
+[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
+and [WeChat](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
+3. Check out and fill in the [cooperation proposal](https://www.hpc-ai.tech/partners)
+4. Send your proposal by email to contact@hpcaitech.com
+
+Thanks so much to all of our amazing contributors!
+
 ## Quick Preview
 <p id="ChatGPT_scaling" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>

From 2045d45ab73d6f0458964f38edd92b4637d34556 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 15 Feb 2023 11:24:18 +0800
Subject: [PATCH 324/503] [doc] updated documentation version list (#2715)

---
 .github/workflows/doc_build_after_merge.yml | 2 +-
 docs/versions.json                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/doc_build_after_merge.yml b/.github/workflows/doc_build_after_merge.yml
index dae3b70e1f4c..2f7b708ab3a8 100644
--- a/.github/workflows/doc_build_after_merge.yml
+++ b/.github/workflows/doc_build_after_merge.yml
@@ -5,6 +5,7 @@ on:
   pull_request:
     paths:
       - 'version.txt'
+      - 'docs/'
     types:
       - closed
 
@@ -16,7 +17,6 @@ jobs:
     steps:
       - name: trigger workflow in ColossalAI-Documentation
         run: |
-          gh
           curl \
             -X POST \
             -H "Accept: application/vnd.github+json" \
diff --git a/docs/versions.json b/docs/versions.json
index dde32982b798..49a0fab2bd55 100644
--- a/docs/versions.json
+++ b/docs/versions.json
@@ -1,3 +1,3 @@
 [
-  "current"
+  "v0.2.4"
 ]

From 5b24987fa75adee654aac0b02c8805fb8042cc05 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 15 Feb 2023 12:25:50 +0800
Subject: [PATCH 325/503] [autoparallel] fix parameters sharding bug (#2716)

---
 .../auto_parallel/passes/runtime_preparation_pass.py     | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index bb419be35e55..e63bfdfe730c 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -426,8 +426,9 @@ def _shard_param(param, target_sharding_spec):
             # we could use .data here, because all the operations just happen before the real training
             # loop, so we don't need to track these operations in the autograd graph.
             param = torch.nn.Parameter(
-                        shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec,
-                                                                                 target_sharding_spec).detach().clone())
+                shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec,
+                                                                         target_sharding_spec).detach().clone())
+        return param
 
     for node in nodes:
         if node.op == 'call_module':
@@ -438,7 +439,7 @@ def _shard_param(param, target_sharding_spec):
             setattr(target_module, 'processed', True)
             for name, param in target_module.named_parameters():
                 target_sharding_spec = node.best_strategy.get_sharding_spec_by_name(name)
-                _shard_param(param, target_sharding_spec)
+                param = _shard_param(param, target_sharding_spec)
 
                 setattr(target_module, name, param)
                 _add_hook_for_grad_communication(node, param)
@@ -469,7 +470,7 @@ def _shard_param(param, target_sharding_spec):
                 target = getattr(target_module, atoms[-1])
 
             target_sharding_spec = node.sharding_spec
-            _shard_param(target, target_sharding_spec)
+            target = _shard_param(target, target_sharding_spec)
 
             assert hasattr(target_module, atoms[-1])
             setattr(target_module, atoms[-1], target)

From 21d6a48f4d9a4c3880110aecde9015fbe303ce9f Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 15 Feb 2023 13:48:28 +0800
Subject: [PATCH 326/503] [autoparallel] add shard option (#2696)

* [autoparallel] add shard option

* polish
---
 .../auto_parallel/tensor_shard/initialize.py  | 70 ++++++++++++++++---
 .../tensor_shard/node_handler/__init__.py     |  5 +-
 .../tensor_shard/node_handler/node_handler.py | 31 +++++---
 .../tensor_shard/node_handler/option.py       | 17 -----
 .../auto_parallel/tensor_shard/options.py     | 49 +++++++++++++
 .../tensor_shard/solver/__init__.py           |  3 +-
 .../tensor_shard/solver/options.py            | 30 --------
 .../tensor_shard/solver/solver.py             |  2 +-
 .../solver/strategies_constructor.py          | 26 +++++--
 .../test_gpt/test_solver_with_gpt_module.py   |  9 +--
 .../test_tensor_shard/test_metainfo/utils.py  |  3 +-
 .../test_node_handler/test_shard_option.py    | 14 +++-
 .../test_node_handler/utils.py                |  3 +-
 .../test_param_resharding_cost.py             |  9 +--
 .../test_solver_with_resnet_v2.py             |  9 +--
 15 files changed, 176 insertions(+), 104 deletions(-)
 delete mode 100644 colossalai/auto_parallel/tensor_shard/node_handler/option.py
 create mode 100644 colossalai/auto_parallel/tensor_shard/options.py
 delete mode 100644 colossalai/auto_parallel/tensor_shard/solver/options.py

diff --git a/colossalai/auto_parallel/tensor_shard/initialize.py b/colossalai/auto_parallel/tensor_shard/initialize.py
index 23ed0f433731..012b0ff43c5d 100644
--- a/colossalai/auto_parallel/tensor_shard/initialize.py
+++ b/colossalai/auto_parallel/tensor_shard/initialize.py
@@ -8,14 +8,9 @@
 
 from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
 from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
+from colossalai.auto_parallel.tensor_shard.options import DataloaderOption, ShardOption, SolverOptions, SolverPerference
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import CommAction
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
-)
+from colossalai.auto_parallel.tensor_shard.solver import CostGraph, GraphAnalyser, Solver, StrategiesConstructor
 from colossalai.device.alpha_beta_profiler import AlphaBetaProfiler
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.graph_module import ColoGraphModule
@@ -69,13 +64,43 @@ def extract_alpha_beta_for_device_mesh(alpha_beta_dict: Dict[Tuple[int], Tuple[f
     pass
 
 
-def build_strategy_constructor(graph: Graph, device_mesh: DeviceMesh):
+def build_strategy_constructor(graph: Graph, device_mesh: DeviceMesh, solver_preference: str, dataloader_option: str,
+                               shard_option: str):
     '''
     This method is used to build the strategy_constructor for the given graph.
     After this method, each node in the graph will have a strategies_vector which
     is constructed by the related node handler.
     '''
-    solver_options = SolverOptions()
+    if solver_preference == 'standard':
+        solver_preference = SolverPerference.STANDARD
+    elif solver_preference == 'tp':
+        solver_preference = SolverPerference.TP
+    elif solver_preference == 'dp':
+        solver_preference = SolverPerference.DP
+    else:
+        raise ValueError(f'Invalid solver_preference: {solver_preference}')
+
+    if dataloader_option == 'replicated':
+        dataloader_option = DataloaderOption.REPLICATED
+    elif dataloader_option == 'distributed':
+        dataloader_option = DataloaderOption.DISTRIBUTED
+    else:
+        raise ValueError(f'Invalid dataloader_option: {dataloader_option}')
+
+    if shard_option == 'standard':
+        shard_option = ShardOption.STANDARD
+    elif shard_option == 'shard':
+        shard_option = ShardOption.SHARD
+    elif shard_option == 'shard_last_axis':
+        shard_option = ShardOption.SHARD_LAST_AXIS
+    elif shard_option == 'full_shard':
+        shard_option = ShardOption.FULL_SHARD
+    else:
+        raise ValueError(f'Invalid shard_option: {shard_option}')
+
+    solver_options = SolverOptions(solver_perference=solver_preference,
+                                   dataloader_option=dataloader_option,
+                                   shard_option=shard_option)
     strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
     strategies_constructor.build_strategies_and_cost()
 
@@ -183,6 +208,9 @@ def initialize_model(model: nn.Module,
                      device_mesh: DeviceMesh,
                      memory_budget: float = -1.0,
                      overlap: bool = False,
+                     solver_preference: str = 'standard',
+                     dataloader_option: str = 'replicated',
+                     shard_option: str = 'standard',
                      save_solver_solution: bool = False,
                      load_solver_solution: bool = False,
                      solution_path: str = None,
@@ -198,6 +226,12 @@ def initialize_model(model: nn.Module,
             the memory budget will be infinity.
         overlap(optional): the overlap is used to specify whether to overlap gradient communication and
             backward computing.
+        solver_preference(optional): the solver_preference is used to specify which parallelism algorithm
+            has higher priority. The valid solver_preference could be 'standard', 'tp', or 'dp'.
+        dataloader_option(optional): the dataloader_option is used to specify which kind of data_loader will
+            be used. The valid dataloader_option could be 'replicated' or 'distributed'.
+        shard_option(optional): the shard_option is used to specify how many axes will be used to shard the
+            model. The valid shard_option could be 'standard', 'shard', 'shard_last_axis', or 'full_shard'.
         save_solver_solution(optional): if the save_solver_solution is True, the solution will be saved
             to the solution_path.
         load_solver_solution(optional): if the load_solver_solution is True, the solution will be loaded
@@ -212,7 +246,12 @@ def initialize_model(model: nn.Module,
     graph = tracer.trace(root=model, meta_args=meta_args)
     gm = ColoGraphModule(model, graph, model.__class__.__name__)
     gm.recompile()
-    strategies_constructor = build_strategy_constructor(graph, device_mesh)
+
+    strategies_constructor = build_strategy_constructor(graph,
+                                                        device_mesh,
+                                                        solver_preference=solver_preference,
+                                                        dataloader_option=dataloader_option,
+                                                        shard_option=shard_option)
     if load_solver_solution:
         solution = torch.load(solution_path)
     else:
@@ -240,6 +279,9 @@ def autoparallelize(model: nn.Module,
                     alpha_beta_dict: Dict[Tuple[int], Tuple[float]] = None,
                     logical_mesh_shape: Tuple[int] = None,
                     logical_mesh_id: torch.Tensor = None,
+                    solver_preference: str = 'standard',
+                    dataloader_option: str = 'replicated',
+                    shard_option: str = 'standard',
                     save_solver_solution: bool = False,
                     load_solver_solution: bool = False,
                     solver_solution_path: str = None,
@@ -262,6 +304,12 @@ def autoparallelize(model: nn.Module,
             mesh shape. If the logical_mesh_shape is None, the logical_mesh_shape will be
             generated by search_best_logical_mesh_shape function.
         logical_mesh_id(optional): the logical_mesh_id is used to specify the logical mesh id.
+        solver_preference(optional): the solver_preference is used to specify which parallelism algorithm
+            has higher priority. The valid solver_preference could be 'standard', 'tp', or 'dp'.
+        dataloader_option(optional): the dataloader_option is used to specify which kind of data_loader will
+            be used. The valid dataloader_option could be 'replicated' or 'distributed'.
+        shard_option(optional): the shard_option is used to specify how many axes will be used to shard the
+            model. The valid shard_option could be 'standard', 'shard', 'shard_last_axis', or 'full_shard'.
         save_solver_solution(optional): if the save_solver_solution is True, the solution will be saved
             to the solution_path.
         load_solver_solution(optional): if the load_solver_solution is True, the solution will be loaded
@@ -280,6 +328,8 @@ def autoparallelize(model: nn.Module,
     rst_to_unpack = initialize_model(model,
                                      meta_args,
                                      device_mesh,
+                                     solver_preference=solver_preference,
+                                     dataloader_option=dataloader_option,
                                      save_solver_solution=save_solver_solution,
                                      load_solver_solution=load_solver_solution,
                                      solution_path=solver_solution_path,
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py b/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
index 0050358ce093..9903ca54e52c 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
@@ -11,7 +11,6 @@
 from .linear_handler import LinearFunctionHandler, LinearModuleHandler
 from .matmul_handler import MatMulHandler
 from .normal_pooling_handler import NormPoolingHandler
-from .option import ShardOption
 from .output_handler import OutputHandler
 from .permute_handler import PermuteHandler
 from .placeholder_handler import PlaceholderHandler
@@ -31,6 +30,6 @@
     'UnaryElementwiseHandler', 'DefaultReshapeHandler', 'PlaceholderHandler', 'OutputHandler', 'WhereHandler',
     'NormPoolingHandler', 'BinaryElementwiseHandler', 'MatMulHandler', 'operator_registry', 'ADDMMFunctionHandler',
     'GetItemHandler', 'GetattrHandler', 'ViewHandler', 'PermuteHandler', 'TensorConstructorHandler',
-    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler', 'ShardOption',
-    'TransposeHandler', 'SplitHandler'
+    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler', 'TransposeHandler',
+    'SplitHandler'
 ]
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
index c6f8d035a820..136e57c5e0f5 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
@@ -5,7 +5,7 @@
 from torch.fx.node import Node
 
 from colossalai.auto_parallel.meta_profiler.metainfo import MetaInfo, meta_register
-from colossalai.auto_parallel.tensor_shard.node_handler.option import ShardOption
+from colossalai.auto_parallel.tensor_shard.options import ShardOption, SolverPerference
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
     OperationData,
     OperationDataType,
@@ -32,19 +32,19 @@ class NodeHandler(ABC):
         strategies_vector (StrategiesVector): all the strategies generated in this handler will be recorded into the strategies_vector.
     '''
 
-    def __init__(
-        self,
-        node: Node,
-        device_mesh: DeviceMesh,
-        strategies_vector: StrategiesVector,
-        shard_option: ShardOption = ShardOption.STANDARD,
-    ) -> None:
+    def __init__(self,
+                 node: Node,
+                 device_mesh: DeviceMesh,
+                 strategies_vector: StrategiesVector,
+                 shard_option: ShardOption = ShardOption.STANDARD,
+                 solver_perference: SolverPerference = SolverPerference.STANDARD) -> None:
         self.node = node
         self.predecessor_node = list(node._input_nodes.keys())
         self.successor_node = list(node.users.keys())
         self.device_mesh = device_mesh
         self.strategies_vector = strategies_vector
         self.shard_option = shard_option
+        self.solver_perference = solver_perference
 
     def update_resharding_cost(self, strategy: ShardingStrategy) -> None:
         """
@@ -187,15 +187,24 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
 
         remove_strategy_list = []
         for strategy in self.strategies_vector:
-            shard_level = 0
+            shard_axis_list = []
+            last_axis = len(self.device_mesh.mesh_shape) - 1
             for op_data, sharding_spec in strategy.sharding_specs.items():
                 if op_data.data is not None and isinstance(op_data.data, torch.Tensor):
-                    for dim, shard_axis in sharding_spec.dim_partition_dict.items():
-                        shard_level += len(shard_axis)
+                    for dim, shard_axes in sharding_spec.dim_partition_dict.items():
+                        for shard_axis in shard_axes:
+                            if shard_axis not in shard_axis_list:
+                                shard_axis_list.append(shard_axis)
+
+            shard_level = len(shard_axis_list)
+            using_last_axis = last_axis in shard_axis_list or -1 in shard_axis_list
             if self.shard_option == ShardOption.SHARD and shard_level == 0:
                 remove_strategy_list.append(strategy)
             if self.shard_option == ShardOption.FULL_SHARD and shard_level <= 1:
                 remove_strategy_list.append(strategy)
+            if self.shard_option == ShardOption.SHARD_LAST_AXIS:
+                if shard_level != 1 or using_last_axis == False:
+                    remove_strategy_list.append(strategy)
 
         for strategy in remove_strategy_list:
             self.strategies_vector.remove(strategy)
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/option.py b/colossalai/auto_parallel/tensor_shard/node_handler/option.py
deleted file mode 100644
index dffb0386df62..000000000000
--- a/colossalai/auto_parallel/tensor_shard/node_handler/option.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from enum import Enum
-
-__all__ = ['ShardOption']
-
-
-class ShardOption(Enum):
-    """
-    This enum class is to define the shard level required in node strategies.
-
-    Notes:
-        STANDARD: We do not add any extra shard requirements.
-        SHARD: We require the node to be shard using at least one device mesh axis.
-        FULL_SHARD: We require the node to be shard using all device mesh axes.
-    """
-    STANDARD = 0
-    SHARD = 1
-    FULL_SHARD = 2
diff --git a/colossalai/auto_parallel/tensor_shard/options.py b/colossalai/auto_parallel/tensor_shard/options.py
new file mode 100644
index 000000000000..f0ea502a6f0e
--- /dev/null
+++ b/colossalai/auto_parallel/tensor_shard/options.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass
+from enum import Enum
+
+__all__ = ['SolverOptions', 'SolverPerference', 'DataloaderOption', 'ShardOption']
+
+
+class SolverPerference(Enum):
+    """
+    This enum class is to define the solver preference.
+    """
+    STANDARD = 0
+    DP = 1
+    TP = 2
+
+
+class ShardOption(Enum):
+    """
+    This enum class is to define the shard level required in node strategies.
+
+    Notes:
+        STANDARD: We do not add any extra shard requirements.
+        SHARD: We require the node to be sharded using at least one device mesh axis.
+        SHARD_LAST_AXIS: We require the node to be sharded using only the last device mesh axis.
+        FULL_SHARD: We require the node to be sharded using all device mesh axes.
+        TP_SHARD: We require the node to be shard using tensor parallel strategies on last device mesh axis.
+        TP_FULL_SHARD: We require the node to be shard using tensor parallel strategies on all device mesh axes.
+    """
+    STANDARD = 0
+    SHARD = 1
+    SHARD_LAST_AXIS = 2
+    FULL_SHARD = 3
+
+
+class DataloaderOption(Enum):
+    """
+    This enum class is to define the dataloader option.
+    """
+    REPLICATED = 0
+    DISTRIBUTED = 1
+
+
+@dataclass
+class SolverOptions:
+    """
+    SolverOptions is a dataclass used to configure the preferences for the parallel execution plan search.
+    """
+    solver_perference: SolverPerference = SolverPerference.STANDARD
+    dataloader_option: DataloaderOption = DataloaderOption.REPLICATED
+    shard_option: ShardOption = ShardOption.STANDARD
diff --git a/colossalai/auto_parallel/tensor_shard/solver/__init__.py b/colossalai/auto_parallel/tensor_shard/solver/__init__.py
index e9f9ba8814a7..f9e6bd923921 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/__init__.py
@@ -1,7 +1,6 @@
 from .cost_graph import CostGraph
 from .graph_analysis import GraphAnalyser
-from .options import SolverOptions
 from .solver import Solver
 from .strategies_constructor import StrategiesConstructor
 
-__all__ = ['GraphAnalyser', 'Solver', 'StrategiesConstructor', 'CostGraph', 'SolverOptions']
+__all__ = ['GraphAnalyser', 'Solver', 'StrategiesConstructor', 'CostGraph']
diff --git a/colossalai/auto_parallel/tensor_shard/solver/options.py b/colossalai/auto_parallel/tensor_shard/solver/options.py
deleted file mode 100644
index b52e55708dfd..000000000000
--- a/colossalai/auto_parallel/tensor_shard/solver/options.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-
-__all__ = ['SolverOptions']
-
-
-class SolverPerference(Enum):
-    """
-    This enum class is to define the solver preference.
-    """
-    STANDARD = 0
-    DP = 1
-    TP = 2
-
-
-class DataloaderOption(Enum):
-    """
-    This enum class is to define the dataloader option.
-    """
-    REPLICATED = 0
-    DISTRIBUTED = 1
-
-
-@dataclass
-class SolverOptions:
-    """
-    SolverOptions is a dataclass used to configure the preferences for the parallel execution plan search.
-    """
-    solver_perference: SolverPerference = SolverPerference.STANDARD
-    dataloader_option: DataloaderOption = DataloaderOption.REPLICATED
diff --git a/colossalai/auto_parallel/tensor_shard/solver/solver.py b/colossalai/auto_parallel/tensor_shard/solver/solver.py
index 89d0da2235a2..3bc3e8960cc8 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/solver.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py
@@ -33,7 +33,7 @@ def __init__(self,
                  solution_numbers: int = 1,
                  forward_only: bool = False,
                  memory_increasing_coefficient: float = 1.3,
-                 verbose=True):
+                 verbose=False):
         '''
         Solver class will integrate information provided by the components and use ILP solver to find a possible optimal strategies combination for target computing graph.
         Argument:
diff --git a/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py b/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py
index 042b9bb4b0d1..40741daca702 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py
@@ -17,7 +17,7 @@
 from colossalai.auto_parallel.tensor_shard.utils import generate_resharding_costs, generate_sharding_spec
 from colossalai.device.device_mesh import DeviceMesh
 
-from .options import DataloaderOption, SolverOptions
+from ..options import DataloaderOption, SolverOptions
 
 __all__ = ['StrategiesConstructor']
 
@@ -101,7 +101,11 @@ def _check_no_strategy_for_data(data):
 
             # get_attr node
             elif node.op == 'get_attr':
-                getattr_handler = GetattrHandler(node, self.device_mesh, strategies_vector)
+                getattr_handler = GetattrHandler(node,
+                                                 self.device_mesh,
+                                                 strategies_vector,
+                                                 shard_option=self.solver_options.shard_option,
+                                                 solver_perference=self.solver_options.solver_perference)
                 getattr_handler.register_strategy()
 
             # call_module node
@@ -109,7 +113,11 @@ def _check_no_strategy_for_data(data):
                 target = node.target
                 submod = self.root_module.get_submodule(target)
                 submod_type = type(submod)
-                handler = operator_registry.get(submod_type)(node, self.device_mesh, strategies_vector)
+                handler = operator_registry.get(submod_type)(node,
+                                                             self.device_mesh,
+                                                             strategies_vector,
+                                                             shard_option=self.solver_options.shard_option,
+                                                             solver_perference=self.solver_options.solver_perference)
                 handler.register_strategy()
                 # attach metainfo_vector to node
                 if hasattr(handler, 'metainfo_vector'):
@@ -118,7 +126,11 @@ def _check_no_strategy_for_data(data):
             # call_function node
             elif node.op == 'call_function':
                 target = node.target
-                handler = operator_registry.get(target)(node, self.device_mesh, strategies_vector)
+                handler = operator_registry.get(target)(node,
+                                                        self.device_mesh,
+                                                        strategies_vector,
+                                                        shard_option=self.solver_options.shard_option,
+                                                        solver_perference=self.solver_options.solver_perference)
                 handler.register_strategy()
                 # attach metainfo_vector to node
                 if hasattr(handler, 'metainfo_vector'):
@@ -127,7 +139,11 @@ def _check_no_strategy_for_data(data):
             # call_method node
             elif node.op == 'call_method':
                 method = getattr(node.args[0]._meta_data.__class__, node.target)
-                handler = operator_registry.get(method)(node, self.device_mesh, strategies_vector)
+                handler = operator_registry.get(method)(node,
+                                                        self.device_mesh,
+                                                        strategies_vector,
+                                                        shard_option=self.solver_options.shard_option,
+                                                        solver_perference=self.solver_options.solver_perference)
                 handler.register_strategy()
                 # attach metainfo_vector to node
                 if hasattr(handler, 'metainfo_vector'):
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py
index 26ad0d3a08a7..a6be1928b547 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py
@@ -4,13 +4,8 @@
 from torch.fx import GraphModule
 
 from colossalai.auto_parallel.tensor_shard.constants import BATCHNORM_MODULE_OP
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
-)
+from colossalai.auto_parallel.tensor_shard.options import SolverOptions
+from colossalai.auto_parallel.tensor_shard.solver import CostGraph, GraphAnalyser, Solver, StrategiesConstructor
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
index b8c01d35842e..60ecd1dd9801 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/utils.py
@@ -7,8 +7,9 @@
 
 from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
 from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
+from colossalai.auto_parallel.tensor_shard.options import SolverOptions
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationDataType, TrainCycleItem
-from colossalai.auto_parallel.tensor_shard.solver import SolverOptions, StrategiesConstructor
+from colossalai.auto_parallel.tensor_shard.solver import StrategiesConstructor
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py
index fda0411104b8..f6895d92ab03 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_shard_option.py
@@ -5,7 +5,7 @@
 import torch.nn as nn
 
 from colossalai.auto_parallel.tensor_shard.node_handler import LinearFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.option import ShardOption
+from colossalai.auto_parallel.tensor_shard.options import ShardOption
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx import ColoGraphModule, ColoTracer
@@ -49,6 +49,15 @@ def check_shard_option(shard_option):
     strategies_vector = handler.register_strategy(compute_resharding_cost=False)
     strategy_name_list = [val.name for val in strategies_vector]
 
+    if shard_option == ShardOption.SHARD_LAST_AXIS:
+        # RR = RS x SR
+        assert 'RR = RS1 x S1R' in strategy_name_list
+
+        # RS= RR x RS
+        assert 'RS1 = RR x RS1' in strategy_name_list
+
+        return
+
     # SS = SR x RS
     assert 'S1S0 = S1R x RS0_0' in strategy_name_list
     assert 'S0S1 = S0R x RS1_1' in strategy_name_list
@@ -104,7 +113,8 @@ def check_shard_option(shard_option):
 
 @run_on_environment_flag(name='AUTO_PARALLEL')
 def test_shard_option():
-    for shard_option in [ShardOption.STANDARD, ShardOption.SHARD, ShardOption.FULL_SHARD]:
+    # for shard_option in [ShardOption.STANDARD, ShardOption.SHARD, ShardOption.FULL_SHARD, ShardOption.SHARD_LAST_AXIS]:
+    for shard_option in [ShardOption.SHARD_LAST_AXIS]:
         check_shard_option(shard_option)
 
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
index db76ed9b85df..14c8cb296949 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
@@ -6,7 +6,8 @@
 
 from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
 from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.auto_parallel.tensor_shard.solver import SolverOptions, StrategiesConstructor
+from colossalai.auto_parallel.tensor_shard.options import SolverOptions
+from colossalai.auto_parallel.tensor_shard.solver import StrategiesConstructor
 from colossalai.auto_parallel.tensor_shard.solver.cost_graph import CostGraph
 from colossalai.auto_parallel.tensor_shard.solver.graph_analysis import GraphAnalyser
 from colossalai.auto_parallel.tensor_shard.solver.solver import Solver
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_param_resharding_cost.py b/tests/test_auto_parallel/test_tensor_shard/test_param_resharding_cost.py
index b504d59c971f..92f011ba30d2 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_param_resharding_cost.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_param_resharding_cost.py
@@ -1,13 +1,8 @@
 import torch
 
+from colossalai.auto_parallel.tensor_shard.options import SolverOptions
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationDataType
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
-)
+from colossalai.auto_parallel.tensor_shard.solver import CostGraph, GraphAnalyser, Solver, StrategiesConstructor
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx import ColoGraphModule, ColoTracer
 from colossalai.testing.pytest_wrapper import run_on_environment_flag
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py b/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py
index f4a5ae7ac1c0..6f64acd525c2 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py
@@ -3,13 +3,8 @@
 from torchvision.models import resnet50
 
 from colossalai.auto_parallel.tensor_shard.constants import BATCHNORM_MODULE_OP
-from colossalai.auto_parallel.tensor_shard.solver import (
-    CostGraph,
-    GraphAnalyser,
-    Solver,
-    SolverOptions,
-    StrategiesConstructor,
-)
+from colossalai.auto_parallel.tensor_shard.options import SolverOptions
+from colossalai.auto_parallel.tensor_shard.solver import CostGraph, GraphAnalyser, Solver, StrategiesConstructor
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager

From 9c0943ecdbd0a1f489de94c22e202cd3ebf8efb0 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Wed, 15 Feb 2023 13:59:58 +0800
Subject: [PATCH 327/503] [chatgpt] optimize generation kwargs (#2717)

* [chatgpt] ppo trainer use default generate args

* [chatgpt] example remove generation preparing fn

* [chatgpt] benchmark remove generation preparing fn

* [chatgpt] fix ci
---
 .github/workflows/run_chatgpt_examples.yml    |  1 +
 .github/workflows/run_chatgpt_unit_tests.yml  |  1 +
 .../ChatGPT/benchmarks/benchmark_gpt_dummy.py |  3 --
 .../benchmarks/benchmark_opt_lora_dummy.py    |  3 --
 applications/ChatGPT/chatgpt/trainer/ppo.py   | 10 +++++
 applications/ChatGPT/examples/train_dummy.py  | 45 ++++++++-----------
 .../ChatGPT/examples/train_prompts.py         | 37 ++++++++-------
 7 files changed, 48 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 9d7c1ff99d92..af59c8db2d6b 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -34,6 +34,7 @@ jobs:
 
       - name: Execute Examples
         run: |
+          cd applications/ChatGPT
           ./examples/test_ci.sh
         env:
           NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index 3ac0d2d8ca0b..8dcf21fe2146 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -35,6 +35,7 @@ jobs:
 
       - name: Execute Unit Testing
         run: |
+          cd applications/ChatGPT
           pytest tests/
         env:
           NCCL_SHM_DISABLE: 1
diff --git a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
index 8474f3ba7b7c..3e66e4e7a40a 100644
--- a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
+++ b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
@@ -5,7 +5,6 @@
 import torch.distributed as dist
 import torch.nn as nn
 from chatgpt.nn import GPTActor, GPTCritic, RewardModel
-from chatgpt.nn.generation_utils import gpt_prepare_inputs_fn, update_model_kwargs_fn
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.callbacks import PerformanceEvaluator
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy
@@ -151,8 +150,6 @@ def main(args):
                          top_k=50,
                          pad_token_id=tokenizer.pad_token_id,
                          eos_token_id=tokenizer.eos_token_id,
-                         prepare_inputs_fn=gpt_prepare_inputs_fn,
-                         update_model_kwargs_fn=update_model_kwargs_fn,
                          callbacks=[performance_evaluator])
 
     random_prompts = torch.randint(tokenizer.vocab_size, (1000, 400), device=torch.cuda.current_device())
diff --git a/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
index accbc4155fb1..8cee5489e212 100644
--- a/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
@@ -5,7 +5,6 @@
 import torch.distributed as dist
 import torch.nn as nn
 from chatgpt.nn import OPTActor, OPTCritic, RewardModel
-from chatgpt.nn.generation_utils import opt_prepare_inputs_fn, update_model_kwargs_fn
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.callbacks import PerformanceEvaluator
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy
@@ -144,8 +143,6 @@ def main(args):
                          top_k=50,
                          pad_token_id=tokenizer.pad_token_id,
                          eos_token_id=tokenizer.eos_token_id,
-                         prepare_inputs_fn=opt_prepare_inputs_fn,
-                         update_model_kwargs_fn=update_model_kwargs_fn,
                          callbacks=[performance_evaluator])
 
     random_prompts = torch.randint(tokenizer.vocab_size, (1000, 400), device=torch.cuda.current_device())
diff --git a/applications/ChatGPT/chatgpt/trainer/ppo.py b/applications/ChatGPT/chatgpt/trainer/ppo.py
index 85beb223e33a..b1d11b2242ca 100644
--- a/applications/ChatGPT/chatgpt/trainer/ppo.py
+++ b/applications/ChatGPT/chatgpt/trainer/ppo.py
@@ -3,6 +3,7 @@
 import torch.nn as nn
 from chatgpt.experience_maker import Experience, NaiveExperienceMaker
 from chatgpt.nn import Actor, Critic, PolicyLoss, ValueLoss
+from chatgpt.nn.generation_utils import update_model_kwargs_fn
 from chatgpt.replay_buffer import NaiveReplayBuffer
 from torch.optim import Optimizer
 
@@ -59,6 +60,7 @@ def __init__(self,
                  dataloader_pin_memory: bool = True,
                  callbacks: List[Callback] = [],
                  **generate_kwargs) -> None:
+        self._set_default_generate_kwargs(generate_kwargs, actor)
         actor = Actor(strategy.setup_model(actor.model))
         critic = strategy.setup_model(critic)
         reward_model = strategy.setup_model(reward_model)
@@ -102,3 +104,11 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
         self.critic_optim.zero_grad()
 
         return {'actor_loss': actor_loss.item(), 'critic_loss': critic_loss.item()}
+
+    def _set_default_generate_kwargs(self, generate_kwargs: dict, actor: Actor) -> None:
+        # use huggingface models method directly
+        if 'prepare_inputs_fn' not in generate_kwargs and hasattr(actor.model, 'prepare_inputs_for_generation'):
+            generate_kwargs['prepare_inputs_fn'] = actor.model.prepare_inputs_for_generation
+
+        if 'update_model_kwargs_fn' not in generate_kwargs:
+            generate_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn
diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index 313be2c3b841..a14117ed5cd4 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -3,12 +3,6 @@
 
 import torch
 from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
-from chatgpt.nn.generation_utils import (
-    bloom_prepare_inputs_fn,
-    gpt_prepare_inputs_fn,
-    opt_prepare_inputs_fn,
-    update_model_kwargs_fn,
-)
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from torch.optim import Adam
@@ -66,36 +60,33 @@ def main(args):
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         tokenizer.pad_token = tokenizer.eos_token
-        prepare_inputs_fn = gpt_prepare_inputs_fn
     elif args.model == 'bloom':
         tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
         tokenizer.pad_token = tokenizer.eos_token
-        prepare_inputs_fn = bloom_prepare_inputs_fn
     elif args.model == 'opt':
         tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-        prepare_inputs_fn = opt_prepare_inputs_fn
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
     # configure trainer
-    trainer = PPOTrainer(strategy,
-                         actor,
-                         critic,
-                         reward_model,
-                         initial_model,
-                         actor_optim,
-                         critic_optim,
-                         max_epochs=args.max_epochs,
-                         train_batch_size=args.train_batch_size,
-                         tokenizer=preprocess_batch,
-                         max_length=128,
-                         do_sample=True,
-                         temperature=1.0,
-                         top_k=50,
-                         pad_token_id=tokenizer.pad_token_id,
-                         eos_token_id=tokenizer.eos_token_id,
-                         prepare_inputs_fn=prepare_inputs_fn,
-                         update_model_kwargs_fn=update_model_kwargs_fn)
+    trainer = PPOTrainer(
+        strategy,
+        actor,
+        critic,
+        reward_model,
+        initial_model,
+        actor_optim,
+        critic_optim,
+        max_epochs=args.max_epochs,
+        train_batch_size=args.train_batch_size,
+        tokenizer=preprocess_batch,
+        max_length=128,
+        do_sample=True,
+        temperature=1.0,
+        top_k=50,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+    )
 
     random_prompts = torch.randint(tokenizer.vocab_size, (1000, 64), device=torch.cuda.current_device())
     trainer.fit(random_prompts,
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index 994b10fe0734..cf351b91a461 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -3,7 +3,6 @@
 
 import pandas as pd
 from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
-from chatgpt.nn.generation_utils import gpt_prepare_inputs_fn, update_model_kwargs_fn
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from torch.optim import Adam
@@ -70,24 +69,24 @@ def tokenize_fn(texts):
         return {k: v.cuda() for k, v in batch.items()}
 
     # configure trainer
-    trainer = PPOTrainer(strategy,
-                         actor,
-                         critic,
-                         reward_model,
-                         initial_model,
-                         actor_optim,
-                         critic_optim,
-                         max_epochs=args.max_epochs,
-                         train_batch_size=args.train_batch_size,
-                         tokenizer=tokenize_fn,
-                         max_length=128,
-                         do_sample=True,
-                         temperature=1.0,
-                         top_k=50,
-                         pad_token_id=tokenizer.pad_token_id,
-                         eos_token_id=tokenizer.eos_token_id,
-                         prepare_inputs_fn=gpt_prepare_inputs_fn,
-                         update_model_kwargs_fn=update_model_kwargs_fn)
+    trainer = PPOTrainer(
+        strategy,
+        actor,
+        critic,
+        reward_model,
+        initial_model,
+        actor_optim,
+        critic_optim,
+        max_epochs=args.max_epochs,
+        train_batch_size=args.train_batch_size,
+        tokenizer=tokenize_fn,
+        max_length=128,
+        do_sample=True,
+        temperature=1.0,
+        top_k=50,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+    )
 
     trainer.fit(dataset,
                 num_episodes=args.num_episodes,

From 7aacfad8aff2fe3654aa3f0204e5dc6fad813ed3 Mon Sep 17 00:00:00 2001
From: "CH.Li" <32587096+lich99@users.noreply.github.com>
Date: Wed, 15 Feb 2023 14:54:53 +0800
Subject: [PATCH 328/503] fix typo (#2721)

---
 applications/ChatGPT/benchmarks/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/applications/ChatGPT/benchmarks/README.md b/applications/ChatGPT/benchmarks/README.md
index f7212fc89908..b4e28ba1d764 100644
--- a/applications/ChatGPT/benchmarks/README.md
+++ b/applications/ChatGPT/benchmarks/README.md
@@ -37,7 +37,7 @@ We only support `torchrun` to launch now. E.g.
 
 ```shell
 # run GPT2-S on single-node single-GPU with min batch size
-torchrun --standalone --nproc_pero_node 1 benchmark_gpt_dummy.py --model s --strategy ddp --experience_batch_size 1 --train_batch_size 1
+torchrun --standalone --nproc_per_node 1 benchmark_gpt_dummy.py --model s --strategy ddp --experience_batch_size 1 --train_batch_size 1
 # run GPT2-XL on single-node 4-GPU
 torchrun --standalone --nproc_per_node 4 benchmark_gpt_dummy.py --model xl --strategy colossalai_zero2
 # run GPT3 on 8-node 8-GPU
@@ -84,7 +84,7 @@ We only support `torchrun` to launch now. E.g.
 
 ```shell
 # run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
-torchrun --standalone --nproc_pero_node 1 benchmark_opt_lora_dummy.py --model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
+torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py --model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
 # run OPT-350M with lora_rank=4 on single-node 4-GPU
 torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py --model 350m --strategy colossalai_zero2 --lora_rank 4
 ```

From 51c45c2460aa183bbd0f5d9347faaf2018b58bb3 Mon Sep 17 00:00:00 2001
From: yuxuan-lou <83441848+yuxuan-lou@users.noreply.github.com>
Date: Wed, 15 Feb 2023 16:12:24 +0800
Subject: [PATCH 329/503] [NFC] polish
 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
 code style (#2723)

---
 .../deprecated/op_handler/where_handler.py             | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
index 6991e913d463..e1d679b8e2f4 100644
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
@@ -6,10 +6,12 @@
 
 import torch
 
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import (enumerate_all_possible_1d_sharding,
-                                                                     enumerate_all_possible_2d_sharding,
-                                                                     ignore_sharding_exception)
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
+from colossalai.auto_parallel.tensor_shard.deprecated._utils import (
+    enumerate_all_possible_1d_sharding,
+    enumerate_all_possible_2d_sharding,
+    ignore_sharding_exception,
+)
+from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
 from colossalai.tensor.sharding_spec import ShardingSpec
 

From e81caeb4bc20ed14be0dd5f52d14c0f11813c817 Mon Sep 17 00:00:00 2001
From: Xue Fuzhao <57164838+XueFuzhao@users.noreply.github.com>
Date: Wed, 15 Feb 2023 16:12:45 +0800
Subject: [PATCH 330/503] [NFC] polish
 colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py code style
 (#2720)

Co-authored-by: Fuzhao Xue <fuzhao@login2.ls6.tacc.utexas.edu>
---
 .../tensor_shard/deprecated/cost_graph.py          | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
index 239d02115d0e..50220bca6482 100644
--- a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
@@ -1,6 +1,8 @@
-from typing import List
 import math
+from typing import List
+
 from torch.fx.node import Node
+
 from .constants import INFINITY_COST
 
 
@@ -9,7 +11,7 @@ class CostGraph:
     A graph data structure to simplify the edge cost graph. It has two main functions:
     1. To feed the quadratic resharding costs into solver, we need to linearize it. We build edge_cost in
     CostGraph, and it stored every combinations of strategies for a src-dst node pair in an 1D list.
-    2. To reduce the searching space, we merge computationally-trivial operators, such as 
+    2. To reduce the searching space, we merge computationally-trivial operators, such as
     element-wise operators, transpose, and reduction, into their following nodes. The merging infomation will
     be given by the StrategiesVector depending on the type of target node and following nodes.
 
@@ -75,14 +77,14 @@ def get_edge_cost(self, src_node, dst_node):
     def merge_node(self, src_node, dst_node):
         '''
         To merge dst_node into src_node, we need to do it in following steps:
-        
+
         1. For each strategy in dst_node, we need to pick an appropriate strategy
-        of src_node to merge, it is important because the logical resharding costs 
-        between the parents node of src_node and merged node depend on the src_node 
+        of src_node to merge, it is important because the logical resharding costs
+        between the parents node of src_node and merged node depend on the src_node
         strategies dispatching. For example, for the graph 0->1->2, after merging node 1
         into node 2, edge_costs[(node 0, node 2)][(0, 0)] = edge_costs[(node 0, node 1)][(0, x)]
         x represents the picking strategy of node 1 merged into node 2 strategy 0.
-        
+
         2. We need to accumulate the extra costs introduced by merging nodes, the extra costs
         contains two parts, one is resharding costs between src_node strategy and dst_node strategy,
         another is the origin extra costs in src_node strategy.

From d344313533de84ebd6876e0da86303218a954a4f Mon Sep 17 00:00:00 2001
From: ziyuhuang123 <99854690+ziyuhuang123@users.noreply.github.com>
Date: Wed, 15 Feb 2023 16:31:40 +0800
Subject: [PATCH 331/503] [NFC] polish
 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
 code style (#2725)

---
 .../deprecated/op_handler/embedding_handler.py     | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
index d01a487ad673..d3f51d489cd9 100644
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
@@ -5,9 +5,9 @@
 from typing import Dict, List
 
 import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
+
+from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
+from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
 from colossalai.tensor.sharding_spec import ShardingSpec
 
@@ -42,19 +42,19 @@ def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_ac
         Argument:
             sharding_size_forward(int): The forward activation will be divided
                 into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will 
+            sharding_size_backward_activation(int): The backward activation will
                 be divided into sharding_size_backward_activation number partions.
             sharding_size_weight(int): The backward weight will be divided
                 into sharding_size_weight number partions.
 
         Return:
-            memory_cost(Tuple[float]): Memory cost per device with this 
+            memory_cost(Tuple[float]): Memory cost per device with this
                 specific strategy, the first element of this tuple is forward
                 memory cost, and the second element of this tuple is backward
                 memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per 
+            memory_cost_forward(float): Memory cost of forward activation per
                 device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation 
+            memory_cost_backward_activation(float): Memory cost of backward activation
                 per device with this specific strategy.
         '''
         # compute the memory cost of this strategy

From c5be83afbf8d64c9966d802504e4619f3c3fc4a9 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Wed, 15 Feb 2023 16:48:08 +0800
Subject: [PATCH 332/503] Update version.txt (#2727)

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index abd410582dea..3a4036fb450f 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.4
+0.2.5

From 5479fdd5b86a809e2dad20b3279abd1d58816a44 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 15 Feb 2023 17:39:50 +0800
Subject: [PATCH 333/503] [doc] updated documentation version list (#2730)

---
 docs/versions.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/versions.json b/docs/versions.json
index 49a0fab2bd55..6dd417a18024 100644
--- a/docs/versions.json
+++ b/docs/versions.json
@@ -1,3 +1,3 @@
 [
-  "v0.2.4"
+  "v0.2.5"
 ]

From 8331420520dfdccf9e9eea7bf730d39051441729 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wangbo=20Zhao=28=E9=BB=91=E8=89=B2=E6=9E=B7=E9=94=81=29?=
 <56866854+wangbo-zhao@users.noreply.github.com>
Date: Wed, 15 Feb 2023 22:25:28 +0800
Subject: [PATCH 334/503] [NFC] polish colossalai/cli/cli.py code style (#2734)

---
 colossalai/cli/cli.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/colossalai/cli/cli.py b/colossalai/cli/cli.py
index 3e5b9ae6343f..a94e1150e49f 100644
--- a/colossalai/cli/cli.py
+++ b/colossalai/cli/cli.py
@@ -1,7 +1,8 @@
 import click
-from .launcher import run
-from .check import check
+
 from .benchmark import benchmark
+from .check import check
+from .launcher import run
 
 
 class Arguments():

From 1819373e5ce1ffc44a7d3d59f19c4290c8bfc027 Mon Sep 17 00:00:00 2001
From: Zangwei Zheng <zangwei@comp.nus.edu.sg>
Date: Wed, 15 Feb 2023 22:26:13 +0800
Subject: [PATCH 335/503] [NFC] polish
 colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
 code style (#2728)

---
 .../deprecated/op_handler/batch_norm_handler.py  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
index 519436270828..868600b39f2c 100644
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
@@ -2,9 +2,9 @@
 from functools import reduce
 
 import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
+
+from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
+from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
 
 from .operator_handler import OperatorHandler
 
@@ -76,19 +76,19 @@ def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_ac
         Argument:
             sharding_size_forward(int): The forward activation will be divided
                 into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will 
+            sharding_size_backward_activation(int): The backward activation will
                 be divided into sharding_size_backward_activation number partions.
             sharding_size_weight(int): The backward weight will be divided
                 into sharding_size_weight number partions.
 
         Return:
-            memory_cost(Tuple[float]): Memory cost per device with this 
+            memory_cost(Tuple[float]): Memory cost per device with this
                 specific strategy, the first element of this tuple is forward
                 memory cost, and the second element of this tuple is backward
                 memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per 
+            memory_cost_forward(float): Memory cost of forward activation per
                 device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation 
+            memory_cost_backward_activation(float): Memory cost of backward activation
                 per device with this specific strategy.
         '''
         # compute the memory cost of this strategy
@@ -458,7 +458,7 @@ def register_strategy(self) -> StrategiesVector:
             norm_handler.register_strategy()
             for strategy in norm_handler.strategies_vector:
                 print(f'{strategy.name}, computation_cost: {strategy.compute_cost}, memory_cost: {strategy.memory_cost}')
-        
+
         Output:
             RS0 = RS0 x S0, computation_cost: 131072, memory_cost: 524288.0
             RS1 = RS1 x S1, computation_cost: 131072, memory_cost: 524288.0

From c9e3ee389eea822c856cce243ab2c7a477594d67 Mon Sep 17 00:00:00 2001
From: Zirui Zhu <zhuzr21@gmail.com>
Date: Wed, 15 Feb 2023 22:27:13 +0800
Subject: [PATCH 336/503] [NFC] polish
 colossalai/context/process_group_initializer/initializer_2d.py code style
 (#2726)

---
 .../context/process_group_initializer/initializer_2d.py      | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/colossalai/context/process_group_initializer/initializer_2d.py b/colossalai/context/process_group_initializer/initializer_2d.py
index fe0ba553d6f3..7fbe3be5901f 100644
--- a/colossalai/context/process_group_initializer/initializer_2d.py
+++ b/colossalai/context/process_group_initializer/initializer_2d.py
@@ -2,10 +2,11 @@
 
 import torch.distributed as dist
 
+from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.registry import DIST_GROUP_INITIALIZER
-from .process_group_initializer import ProcessGroupInitializer
+
 from ..parallel_mode import ParallelMode
-from colossalai.global_variables import tensor_parallel_env as env
+from .process_group_initializer import ProcessGroupInitializer
 
 
 def _check_summa_env_var(summa_dim):

From 2fd528b9f4ca2a29e23989cafb7f99230e8c31eb Mon Sep 17 00:00:00 2001
From: xyupeng <99191637+xyupeng@users.noreply.github.com>
Date: Wed, 15 Feb 2023 22:57:45 +0800
Subject: [PATCH 337/503] [NFC] polish
 colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py code style
 (#2737)

---
 .../tensor_shard/deprecated/graph_analysis.py             | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
index 831e7eadd179..9f7a6a5ec286 100644
--- a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
@@ -1,9 +1,11 @@
+from collections import OrderedDict as ODict
 from dataclasses import dataclass
-from torch.fx.node import Node
+from typing import Any, List, OrderedDict, Union
+
 from torch.fx.graph import Graph
 from torch.fx.graph_module import GraphModule
-from collections import OrderedDict as ODict
-from typing import List, OrderedDict, Union, Any
+from torch.fx.node import Node
+
 from colossalai.fx.passes.utils import get_node_module
 
 __all__ = ['LiveVariable', 'LiveVariableVector', 'LiveStage', 'GraphAnalyser']

From 43dffdaba58ffc9f2de131ea80c91e99d5dfe756 Mon Sep 17 00:00:00 2001
From: cloudhuang <liping.huang@live.com>
Date: Wed, 15 Feb 2023 22:24:45 +0800
Subject: [PATCH 338/503] [doc] fixed a typo in GPT readme (#2736)

---
 examples/language/gpt/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index 3d5ce7c8807c..fe7b23beb41b 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -36,7 +36,7 @@ If you want to test ZeRO1 and ZeRO2 in Colossal-AI, you need to ensure Colossal-
 
 ## Dataset
 
-For simplicity, the input data is randonly generated here.
+For simplicity, the input data is randomly generated here.
 
 ## Training
 We provide two stable solutions.

From ae86a29e2379314da3cb4abf95b5306db6156794 Mon Sep 17 00:00:00 2001
From: YH <100389977+yhna940@users.noreply.github.com>
Date: Wed, 15 Feb 2023 23:27:58 +0900
Subject: [PATCH 339/503] Refactor methods of grad store (#2687)

---
 .../bookkeeping/gradient_store.py             | 23 ++++++++++++++++---
 .../zero/sharded_optim/low_level_optim.py     | 18 +++++++++------
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py b/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
index 8a9128a18964..b166752cc400 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
@@ -6,7 +6,6 @@
 
 
 class GradientStore(BaseStore):
-
     def __init__(self, *args):
         super().__init__(*args)
         # bookkeeping data structures
@@ -15,7 +14,7 @@ def __init__(self, *args):
         # for backward reduction hooks
         self._grad_acc_objs = []
 
-    def add_accumulate_grad_object(self, obj):
+    def append_accumulate_grad_object(self, obj):
         """
         Keep :class:`AccumulateGrad` objects. If these objects are not kept, reduction hooks may not
         be attached successfully.
@@ -36,10 +35,12 @@ def get_averaged_gradients_by_group(self, group_id: int) -> List[Tensor]:
         :return: Return the list of averaged gradients of a parameter group. Each element is a gradient, not a parameter.
         :rtype: List[torch.Tensor]
         """
+        if group_id not in self._averaged_gradients:
+            self._averaged_gradients[group_id] = []
 
         return self._averaged_gradients[group_id]
 
-    def add_average_gradient_by_group(self, group_id: int, tensor: Tensor) -> None:
+    def append_average_gradient_by_group(self, group_id: int, tensor: Tensor) -> None:
         """
         Append an average gradient to the list of averaged gradients of a parameter group
 
@@ -55,6 +56,22 @@ def add_average_gradient_by_group(self, group_id: int, tensor: Tensor) -> None:
         else:
             self._averaged_gradients[group_id] = [tensor]
 
+    def add_average_gradient_by_group(
+        self, group_id: int, tensor_idx: int, tensor: Tensor
+    ) -> None:
+        """
+        Add an average gradient to the list of averaged gradients of a parameter group
+
+        :param group_id: The index of a parameter group
+        :param tensor_idx: The index of a tensor in the list of averaged gradients
+        :param tensor: A :class:`torch.Tensor` object
+        :type group_id: int
+        :type tensor_idx: int
+        :type tensor: torch.Tensor
+
+        """
+        self._averaged_gradients[group_id][tensor_idx].add_(tensor)
+
     def reset_average_gradients_by_group(self, group_id: int) -> None:
         """
         Reset the bookkeeping data structure for averaged gradients to an empty list
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index 89f5f9fadca4..f5e03ce28532 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -550,20 +550,24 @@ def _sync_grad(self):
             reduction_states[tensor] = False
 
         # accumulate gradient
-        avg_gradients = self._grad_store._averaged_gradients
         for group_id in range(self.num_param_groups):
             param_group = self._param_store.get_fp16_params_by_rank_group(self._local_rank, group_id)
 
-            if group_id not in avg_gradients:
-                avg_gradients[group_id] = []
+            avg_gradients_group = self._grad_store.get_averaged_gradients_by_group(
+                group_id
+            )
 
             param_idx = 0
             for param in param_group:
                 if param.grad is not None:
-                    if len(avg_gradients[group_id]) == param_idx:
-                        avg_gradients[group_id].append(param.grad)
+                    if len(avg_gradients_group) == param_idx:
+                        self._grad_store.append_average_gradient_by_group(
+                            group_id, param.grad
+                        )
                     else:
-                        avg_gradients[group_id][param_idx].add_(param.grad)
+                        self._grad_store.add_average_gradient_by_group(
+                            group_id, param_idx, param.grad
+                        )
                     param_idx += 1
 
         # the gradients needed are stored in the avg_gradients buffer
@@ -590,4 +594,4 @@ def _reduce_grad_stage2(self):
         # only need to reduce the gradients
         # left in the communication bucket
         for reduce_rank in range(self._world_size):
-            self._run_reduction(reduce_rank)
+            self._run_reduction(reduce_rank)
\ No newline at end of file

From 1dc003c1698730234ca6a10248d1d3b800fc9ad9 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 15 Feb 2023 22:28:28 +0800
Subject: [PATCH 340/503] [autoparallel] distinguish different parallel
 strategies (#2699)

---
 .../node_handler/linear_handler.py            |   5 +-
 .../strategy/matmul_strategy_generator.py     |  58 ++++++--
 .../test_gpt/test_runtime_with_gpt_modules.py |   2 +-
 .../test_permute_and_transpose_handler.py     | 138 +++++++++---------
 .../test_node_handler/test_softmax_handler.py |  88 +++++------
 .../test_node_handler/test_split_handler.py   |  88 +++++------
 .../test_node_handler/test_view_handler.py    |  95 ++++++------
 7 files changed, 255 insertions(+), 219 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py
index 37ff3c3ab572..59091dab519f 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py
@@ -152,7 +152,10 @@ def get_strategy_generator(self) -> List[StrategyGenerator]:
         op_data_mapping = self.get_operation_data_mapping()
         generators = []
         generators.append(
-            LinearProjectionStrategyGenerator(op_data_mapping, self.device_mesh, linear_projection_type='linear'))
+            LinearProjectionStrategyGenerator(op_data_mapping,
+                                              self.device_mesh,
+                                              linear_projection_type='linear',
+                                              solver_perference=self.solver_perference))
         return generators
 
     def get_operation_data_mapping(self) -> Dict[str, OperationData]:
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
index fa2246f952a9..5d70e131d1e9 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/matmul_strategy_generator.py
@@ -3,6 +3,7 @@
 from functools import reduce
 from typing import List
 
+from colossalai.auto_parallel.tensor_shard.options import SolverPerference
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
     CommType,
     MemoryCost,
@@ -209,9 +210,14 @@ def collate_strategies(self) -> List[ShardingStrategy]:
 
 class LinearProjectionStrategyGenerator(MatMulStrategyGenerator):
 
-    def __init__(self, operation_data_mapping, device_mesh, linear_projection_type='linear'):
+    def __init__(self,
+                 operation_data_mapping,
+                 device_mesh,
+                 linear_projection_type='linear',
+                 solver_perference=SolverPerference.STANDARD):
         super().__init__(operation_data_mapping, device_mesh)
         self.linear_projection_type = linear_projection_type
+        self.solver_perference = solver_perference
 
     def update_compute_cost(self, strategy: ShardingStrategy) -> ShardingStrategy:
         # C = AB
@@ -231,16 +237,22 @@ def update_compute_cost(self, strategy: ShardingStrategy) -> ShardingStrategy:
                                       total=fwd_compute_cost + bwd_compute_cost)
         strategy.compute_cost = compute_cost
 
-    def collate_strategies(self) -> List[ShardingStrategy]:
+    def dp_strategies(self) -> List[ShardingStrategy]:
         strategies = []
 
-        # SS = SR x RS
-        strategies.append(self.split_lhs_space_rhs_space(0, 1))
-        strategies.append(self.split_lhs_space_rhs_space(1, 0))
+        # S01R = S01R x RR
+        strategies.append(self.split_lhs_1st_dim_1d(0, 1))
 
-        # SR = SS x SR
-        strategies.append(self.split_lhs_space_both_contract(0, 1))
-        strategies.append(self.split_lhs_space_both_contract(1, 0))
+        return strategies
+
+    def tp_strategies(self) -> List[ShardingStrategy]:
+        strategies = []
+
+        # RR = RS01 x S01R
+        strategies.append(self.split_lhs_2nd_dim_1d(0, 1))
+
+        # RS01 = RR x RS01
+        strategies.append(self.split_rhs_2nd_dim_1d(0, 1))
 
         # RS = RS x SS
         strategies.append(self.split_rhs_space_both_contract(0, 1))
@@ -254,20 +266,38 @@ def collate_strategies(self) -> List[ShardingStrategy]:
         strategies.append(self.split_rhs_space_only(0))
         strategies.append(self.split_rhs_space_only(1))
 
-        # S01R = S01R x RR
-        strategies.append(self.split_lhs_1st_dim_1d(0, 1))
+        return strategies
 
-        # RR = RS01 x S01R
-        strategies.append(self.split_lhs_2nd_dim_1d(0, 1))
+    def mix_strategies(self) -> List[ShardingStrategy]:
+        strategies = []
 
-        # RS01 = RR x RS01
-        strategies.append(self.split_rhs_2nd_dim_1d(0, 1))
+        # SS = SR x RS
+        strategies.append(self.split_lhs_space_rhs_space(0, 1))
+        strategies.append(self.split_lhs_space_rhs_space(1, 0))
+
+        # SR = SS x SR
+        strategies.append(self.split_lhs_space_both_contract(0, 1))
+        strategies.append(self.split_lhs_space_both_contract(1, 0))
 
         # RR = RR x RR
         strategies.append(self.non_split())
 
         return strategies
 
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategies = []
+        # STANDARD: dp + tp + mix; DP / TP: that family only; any other value: empty list
+        if self.solver_perference == SolverPerference.STANDARD:
+            strategies.extend(self.dp_strategies())
+            strategies.extend(self.tp_strategies())
+            strategies.extend(self.mix_strategies())
+        elif self.solver_perference == SolverPerference.DP:
+            strategies.extend(self.dp_strategies())
+        elif self.solver_perference == SolverPerference.TP:
+            strategies.extend(self.tp_strategies())
+
+        return strategies
+
     @ignore_sharding_exception
     def split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):
         # handle case SS = SR x RS
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py
index 753ecff5374c..ebeef9870fe9 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_runtime_with_gpt_modules.py
@@ -117,7 +117,7 @@ def check_attention_layer(rank, model_cls, world_size, port):
     gm = GraphModule(model, graph, model.__class__.__name__)
     gm.recompile()
 
-    strategies_constructor = build_strategy_constructor(graph, device_mesh)
+    strategies_constructor = build_strategy_constructor(graph, device_mesh, 'standard', 'replicated', 'standard')
     solution = solve_solution(gm, strategies_constructor, memory_budget=-1)
     gm, sharding_spec_dicts = transform_to_sharded_model(gm, solution, device_mesh, strategies_constructor)
     gm = ModuleWrapper(gm, *sharding_spec_dicts)
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py
index b12db13324c0..af03481d830e 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_permute_and_transpose_handler.py
@@ -243,79 +243,79 @@ def check_view_handler(rank, call_function, reshape_dims, model_cls, world_size,
     if model_cls.__name__ == 'LinearReshapeModel':
 
         if reshape_dims == ((0, 2, 1, 3), (1, 2)):
-            assert '[S0, R, R, S1] -> [S0, R, R, S1]_0' in strategy_name_list
-            assert '[R, S0, R, S1] -> [R, R, S0, S1]_1' in strategy_name_list
-            assert '[R, R, S0, S1] -> [R, S0, R, S1]_2' in strategy_name_list
-            assert '[S1, R, R, S0] -> [S1, R, R, S0]_3' in strategy_name_list
-            assert '[R, S1, R, S0] -> [R, R, S1, S0]_4' in strategy_name_list
-            assert '[R, R, S1, S0] -> [R, S1, R, S0]_5' in strategy_name_list
-            assert '[S0, R, R, R] -> [S0, R, R, R]_6' in strategy_name_list
-            assert '[R, S0, R, R] -> [R, R, S0, R]_7' in strategy_name_list
-            assert '[R, R, S0, R] -> [R, S0, R, R]_8' in strategy_name_list
-            assert '[S1, R, R, R] -> [S1, R, R, R]_9' in strategy_name_list
-            assert '[R, S1, R, R] -> [R, R, S1, R]_10' in strategy_name_list
-            assert '[R, R, S1, R] -> [R, S1, R, R]_11' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, S1]_12' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, S0]_13' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_14' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_15' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, S0]_16' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, S1]_17' in strategy_name_list
-            assert '[S01, R, R, R] -> [S01, R, R, R]_18' in strategy_name_list
-            assert '[R, S01, R, R] -> [R, R, S01, R]_19' in strategy_name_list
-            assert '[R, R, S01, R] -> [R, S01, R, R]_20' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_21' in strategy_name_list
-            assert '[R, R, R, S01] -> [R, R, R, S01]_22' in strategy_name_list
+            assert '[S0, R, R, S1] -> [S0, R, R, S1]_11' in strategy_name_list
+            assert '[R, S0, R, S1] -> [R, R, S0, S1]_12' in strategy_name_list
+            assert '[R, R, S0, S1] -> [R, S0, R, S1]_13' in strategy_name_list
+            assert '[S1, R, R, S0] -> [S1, R, R, S0]_14' in strategy_name_list
+            assert '[R, S1, R, S0] -> [R, R, S1, S0]_15' in strategy_name_list
+            assert '[R, R, S1, S0] -> [R, S1, R, S0]_16' in strategy_name_list
+            assert '[S0, R, R, R] -> [S0, R, R, R]_17' in strategy_name_list
+            assert '[R, S0, R, R] -> [R, R, S0, R]_18' in strategy_name_list
+            assert '[R, R, S0, R] -> [R, S0, R, R]_19' in strategy_name_list
+            assert '[S1, R, R, R] -> [S1, R, R, R]_20' in strategy_name_list
+            assert '[R, S1, R, R] -> [R, R, S1, R]_21' in strategy_name_list
+            assert '[R, R, S1, R] -> [R, S1, R, R]_22' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, S1]_10' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, S0]_9' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_8' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_7' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, S0]_6' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, S1]_5' in strategy_name_list
+            assert '[S01, R, R, R] -> [S01, R, R, R]_0' in strategy_name_list
+            assert '[R, S01, R, R] -> [R, R, S01, R]_1' in strategy_name_list
+            assert '[R, R, S01, R] -> [R, S01, R, R]_2' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_3' in strategy_name_list
+            assert '[R, R, R, S01] -> [R, R, R, S01]_4' in strategy_name_list
 
         if reshape_dims == (2, 0, 1, 3):
-            assert '[S0, R, R, S1] -> [R, S0, R, S1]_0' in strategy_name_list
-            assert '[R, S0, R, S1] -> [R, R, S0, S1]_1' in strategy_name_list
-            assert '[R, R, S0, S1] -> [S0, R, R, S1]_2' in strategy_name_list
-            assert '[S1, R, R, S0] -> [R, S1, R, S0]_3' in strategy_name_list
-            assert '[R, S1, R, S0] -> [R, R, S1, S0]_4' in strategy_name_list
-            assert '[R, R, S1, S0] -> [S1, R, R, S0]_5' in strategy_name_list
-            assert '[S0, R, R, R] -> [R, S0, R, R]_6' in strategy_name_list
-            assert '[R, S0, R, R] -> [R, R, S0, R]_7' in strategy_name_list
-            assert '[R, R, S0, R] -> [S0, R, R, R]_8' in strategy_name_list
-            assert '[S1, R, R, R] -> [R, S1, R, R]_9' in strategy_name_list
-            assert '[R, S1, R, R] -> [R, R, S1, R]_10' in strategy_name_list
-            assert '[R, R, S1, R] -> [S1, R, R, R]_11' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, S1]_12' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, S0]_13' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_14' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_15' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, S0]_16' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, S1]_17' in strategy_name_list
-            assert '[S01, R, R, R] -> [R, S01, R, R]_18' in strategy_name_list
-            assert '[R, S01, R, R] -> [R, R, S01, R]_19' in strategy_name_list
-            assert '[R, R, S01, R] -> [S01, R, R, R]_20' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_21' in strategy_name_list
-            assert '[R, R, R, S01] -> [R, R, R, S01]_22' in strategy_name_list
+            assert '[S0, R, R, S1] -> [R, S0, R, S1]_11' in strategy_name_list
+            assert '[R, S0, R, S1] -> [R, R, S0, S1]_12' in strategy_name_list
+            assert '[R, R, S0, S1] -> [S0, R, R, S1]_13' in strategy_name_list
+            assert '[S1, R, R, S0] -> [R, S1, R, S0]_14' in strategy_name_list
+            assert '[R, S1, R, S0] -> [R, R, S1, S0]_15' in strategy_name_list
+            assert '[R, R, S1, S0] -> [S1, R, R, S0]_16' in strategy_name_list
+            assert '[S0, R, R, R] -> [R, S0, R, R]_17' in strategy_name_list
+            assert '[R, S0, R, R] -> [R, R, S0, R]_18' in strategy_name_list
+            assert '[R, R, S0, R] -> [S0, R, R, R]_19' in strategy_name_list
+            assert '[S1, R, R, R] -> [R, S1, R, R]_20' in strategy_name_list
+            assert '[R, S1, R, R] -> [R, R, S1, R]_21' in strategy_name_list
+            assert '[R, R, S1, R] -> [S1, R, R, R]_22' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, S1]_10' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, S0]_9' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_8' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_7' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, S0]_6' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, S1]_5' in strategy_name_list
+            assert '[S01, R, R, R] -> [R, S01, R, R]_0' in strategy_name_list
+            assert '[R, S01, R, R] -> [R, R, S01, R]_1' in strategy_name_list
+            assert '[R, R, S01, R] -> [S01, R, R, R]_2' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_3' in strategy_name_list
+            assert '[R, R, R, S01] -> [R, R, R, S01]_4' in strategy_name_list
 
         if reshape_dims == (1, 3):
-            assert '[S0, R, R, S1] -> [S0, S1, R, R]_0' in strategy_name_list
-            assert '[R, S0, R, S1] -> [R, S1, R, S0]_1' in strategy_name_list
-            assert '[R, R, S0, S1] -> [R, S1, S0, R]_2' in strategy_name_list
-            assert '[S1, R, R, S0] -> [S1, S0, R, R]_3' in strategy_name_list
-            assert '[R, S1, R, S0] -> [R, S0, R, S1]_4' in strategy_name_list
-            assert '[R, R, S1, S0] -> [R, S0, S1, R]_5' in strategy_name_list
-            assert '[S0, R, R, R] -> [S0, R, R, R]_6' in strategy_name_list
-            assert '[R, S0, R, R] -> [R, R, R, S0]_7' in strategy_name_list
-            assert '[R, R, S0, R] -> [R, R, S0, R]_8' in strategy_name_list
-            assert '[S1, R, R, R] -> [S1, R, R, R]_9' in strategy_name_list
-            assert '[R, S1, R, R] -> [R, R, R, S1]_10' in strategy_name_list
-            assert '[R, R, S1, R] -> [R, R, S1, R]_11' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, S1, R, R]_12' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, S0, R, R]_13' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_14' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_15' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, S0, R, R]_16' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, S1, R, R]_17' in strategy_name_list
-            assert '[S01, R, R, R] -> [S01, R, R, R]_18' in strategy_name_list
-            assert '[R, S01, R, R] -> [R, R, R, S01]_19' in strategy_name_list
-            assert '[R, R, S01, R] -> [R, R, S01, R]_20' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R]_21' in strategy_name_list
-            assert '[R, R, R, S01] -> [R, S01, R, R]_22' in strategy_name_list
+            assert '[S0, R, R, S1] -> [S0, S1, R, R]_11' in strategy_name_list
+            assert '[R, S0, R, S1] -> [R, S1, R, S0]_12' in strategy_name_list
+            assert '[R, R, S0, S1] -> [R, S1, S0, R]_13' in strategy_name_list
+            assert '[S1, R, R, S0] -> [S1, S0, R, R]_14' in strategy_name_list
+            assert '[R, S1, R, S0] -> [R, S0, R, S1]_15' in strategy_name_list
+            assert '[R, R, S1, S0] -> [R, S0, S1, R]_16' in strategy_name_list
+            assert '[S0, R, R, R] -> [S0, R, R, R]_17' in strategy_name_list
+            assert '[R, S0, R, R] -> [R, R, R, S0]_18' in strategy_name_list
+            assert '[R, R, S0, R] -> [R, R, S0, R]_19' in strategy_name_list
+            assert '[S1, R, R, R] -> [S1, R, R, R]_20' in strategy_name_list
+            assert '[R, S1, R, R] -> [R, R, R, S1]_21' in strategy_name_list
+            assert '[R, R, S1, R] -> [R, R, S1, R]_22' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, S1, R, R]_10' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, S0, R, R]_9' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_8' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_7' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, S0, R, R]_6' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, S1, R, R]_5' in strategy_name_list
+            assert '[S01, R, R, R] -> [S01, R, R, R]_0' in strategy_name_list
+            assert '[R, S01, R, R] -> [R, R, R, S01]_1' in strategy_name_list
+            assert '[R, R, S01, R] -> [R, R, S01, R]_2' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R]_3' in strategy_name_list
+            assert '[R, R, R, S01] -> [R, S01, R, R]_4' in strategy_name_list
 
 
 @run_on_environment_flag(name='AUTO_PARALLEL')
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_softmax_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_softmax_handler.py
index b5e8e32778be..c43ee292bedf 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_softmax_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_softmax_handler.py
@@ -117,54 +117,54 @@ def check_split_handler(rank, softmax_dim, model_cls, world_size, port):
     strategy_name_list = [strategy.name for strategy in split_strategies_vector]
 
     if softmax_dim == 0:
-        assert '[R, R, R, S1] -> [R, R, R, S1]_0' in strategy_name_list
-        assert '[R, S0, R, S1] -> [R, S0, R, S1]_1' in strategy_name_list
-        assert '[R, R, S0, S1] -> [R, R, S0, S1]_2' in strategy_name_list
-        assert '[R, R, R, S0] -> [R, R, R, S0]_3' in strategy_name_list
-        assert '[R, S1, R, S0] -> [R, S1, R, S0]_4' in strategy_name_list
-        assert '[R, R, S1, S0] -> [R, R, S1, S0]_5' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_6' in strategy_name_list
-        assert '[R, S0, R, R] -> [R, S0, R, R]_7' in strategy_name_list
-        assert '[R, R, S0, R] -> [R, R, S0, R]_8' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_9' in strategy_name_list
-        assert '[R, S1, R, R] -> [R, S1, R, R]_10' in strategy_name_list
-        assert '[R, R, S1, R] -> [R, R, S1, R]_11' in strategy_name_list
-        assert '[R, R, R, S1] -> [R, R, R, S1]_12' in strategy_name_list
-        assert '[R, R, R, S0] -> [R, R, R, S0]_13' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_14' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_15' in strategy_name_list
-        assert '[R, R, R, S0] -> [R, R, R, S0]_16' in strategy_name_list
-        assert '[R, R, R, S1] -> [R, R, R, S1]_17' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_18' in strategy_name_list
-        assert '[R, S01, R, R] -> [R, S01, R, R]_19' in strategy_name_list
-        assert '[R, R, S01, R] -> [R, R, S01, R]_20' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_21' in strategy_name_list
-        assert '[R, R, R, S01] -> [R, R, R, S01]_22' in strategy_name_list
+        assert '[R, R, R, S1] -> [R, R, R, S1]_11' in strategy_name_list
+        assert '[R, S0, R, S1] -> [R, S0, R, S1]_12' in strategy_name_list
+        assert '[R, R, S0, S1] -> [R, R, S0, S1]_13' in strategy_name_list
+        assert '[R, R, R, S0] -> [R, R, R, S0]_14' in strategy_name_list
+        assert '[R, S1, R, S0] -> [R, S1, R, S0]_15' in strategy_name_list
+        assert '[R, R, S1, S0] -> [R, R, S1, S0]_16' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_17' in strategy_name_list
+        assert '[R, S0, R, R] -> [R, S0, R, R]_18' in strategy_name_list
+        assert '[R, R, S0, R] -> [R, R, S0, R]_19' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_20' in strategy_name_list
+        assert '[R, S1, R, R] -> [R, S1, R, R]_21' in strategy_name_list
+        assert '[R, R, S1, R] -> [R, R, S1, R]_22' in strategy_name_list
+        assert '[R, R, R, S1] -> [R, R, R, S1]_10' in strategy_name_list
+        assert '[R, R, R, S0] -> [R, R, R, S0]_9' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_8' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_7' in strategy_name_list
+        assert '[R, R, R, S0] -> [R, R, R, S0]_6' in strategy_name_list
+        assert '[R, R, R, S1] -> [R, R, R, S1]_5' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_0' in strategy_name_list
+        assert '[R, S01, R, R] -> [R, S01, R, R]_1' in strategy_name_list
+        assert '[R, R, S01, R] -> [R, R, S01, R]_2' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_3' in strategy_name_list
+        assert '[R, R, R, S01] -> [R, R, R, S01]_4' in strategy_name_list
 
     if softmax_dim == 1:
-        assert '[S0, R, R, S1] -> [S0, R, R, S1]_0' in strategy_name_list
-        assert '[R, R, R, S1] -> [R, R, R, S1]_1' in strategy_name_list
-        assert '[R, R, S0, S1] -> [R, R, S0, S1]_2' in strategy_name_list
-        assert '[S1, R, R, S0] -> [S1, R, R, S0]_3' in strategy_name_list
-        assert '[R, R, R, S0] -> [R, R, R, S0]_4' in strategy_name_list
-        assert '[R, R, S1, S0] -> [R, R, S1, S0]_5' in strategy_name_list
-        assert '[S0, R, R, R] -> [S0, R, R, R]_6' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_7' in strategy_name_list
-        assert '[R, R, S0, R] -> [R, R, S0, R]_8' in strategy_name_list
-        assert '[S1, R, R, R] -> [S1, R, R, R]_9' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_10' in strategy_name_list
-        assert '[R, R, S1, R] -> [R, R, S1, R]_11' in strategy_name_list
+        assert '[S0, R, R, S1] -> [S0, R, R, S1]_11' in strategy_name_list
         assert '[R, R, R, S1] -> [R, R, R, S1]_12' in strategy_name_list
-        assert '[R, R, R, S0] -> [R, R, R, S0]_13' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_14' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_15' in strategy_name_list
-        assert '[R, R, R, S0] -> [R, R, R, S0]_16' in strategy_name_list
-        assert '[R, R, R, S1] -> [R, R, R, S1]_17' in strategy_name_list
-        assert '[S01, R, R, R] -> [S01, R, R, R]_18' in strategy_name_list
-        assert '[R, R, R, R] -> [R, R, R, R]_19' in strategy_name_list
-        assert '[R, R, S01, R] -> [R, R, S01, R]_20' in strategy_name_list
+        assert '[R, R, S0, S1] -> [R, R, S0, S1]_13' in strategy_name_list
+        assert '[S1, R, R, S0] -> [S1, R, R, S0]_14' in strategy_name_list
+        assert '[R, R, R, S0] -> [R, R, R, S0]_15' in strategy_name_list
+        assert '[R, R, S1, S0] -> [R, R, S1, S0]_16' in strategy_name_list
+        assert '[S0, R, R, R] -> [S0, R, R, R]_17' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_18' in strategy_name_list
+        assert '[R, R, S0, R] -> [R, R, S0, R]_19' in strategy_name_list
+        assert '[S1, R, R, R] -> [S1, R, R, R]_20' in strategy_name_list
         assert '[R, R, R, R] -> [R, R, R, R]_21' in strategy_name_list
-        assert '[R, R, R, S01] -> [R, R, R, S01]_22' in strategy_name_list
+        assert '[R, R, S1, R] -> [R, R, S1, R]_22' in strategy_name_list
+        assert '[R, R, R, S1] -> [R, R, R, S1]_10' in strategy_name_list
+        assert '[R, R, R, S0] -> [R, R, R, S0]_9' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_8' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_7' in strategy_name_list
+        assert '[R, R, R, S0] -> [R, R, R, S0]_6' in strategy_name_list
+        assert '[R, R, R, S1] -> [R, R, R, S1]_5' in strategy_name_list
+        assert '[S01, R, R, R] -> [S01, R, R, R]_0' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_1' in strategy_name_list
+        assert '[R, R, S01, R] -> [R, R, S01, R]_2' in strategy_name_list
+        assert '[R, R, R, R] -> [R, R, R, R]_3' in strategy_name_list
+        assert '[R, R, R, S01] -> [R, R, R, S01]_4' in strategy_name_list
 
 
 @run_on_environment_flag(name='AUTO_PARALLEL')
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py
index 813651869454..044aef19d38d 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_split_handler.py
@@ -198,54 +198,54 @@ def check_split_handler(rank, split_size, split_dim, model_cls, world_size, port
     if model_cls.__name__ == 'LinearSplitModel':
 
         if split_dim == 0:
-            assert '[R, R, R, S1]_0' in strategy_name_list
-            assert '[R, S0, R, S1]_1' in strategy_name_list
-            assert '[R, R, S0, S1]_2' in strategy_name_list
-            assert '[R, R, R, S0]_3' in strategy_name_list
-            assert '[R, S1, R, S0]_4' in strategy_name_list
-            assert '[R, R, S1, S0]_5' in strategy_name_list
-            assert '[R, R, R, R]_6' in strategy_name_list
-            assert '[R, S0, R, R]_7' in strategy_name_list
-            assert '[R, R, S0, R]_8' in strategy_name_list
-            assert '[R, R, R, R]_9' in strategy_name_list
-            assert '[R, S1, R, R]_10' in strategy_name_list
-            assert '[R, R, S1, R]_11' in strategy_name_list
-            assert '[R, R, R, S1]_12' in strategy_name_list
-            assert '[R, R, R, S0]_13' in strategy_name_list
-            assert '[R, R, R, R]_14' in strategy_name_list
-            assert '[R, R, R, R]_15' in strategy_name_list
-            assert '[R, R, R, S0]_16' in strategy_name_list
-            assert '[R, R, R, S1]_17' in strategy_name_list
-            assert '[R, R, R, R]_18' in strategy_name_list
-            assert '[R, S01, R, R]_19' in strategy_name_list
-            assert '[R, R, S01, R]_20' in strategy_name_list
-            assert '[R, R, R, R]_21' in strategy_name_list
-            assert '[R, R, R, S01]_22' in strategy_name_list
+            assert '[R, R, R, S1]_11' in strategy_name_list
+            assert '[R, S0, R, S1]_12' in strategy_name_list
+            assert '[R, R, S0, S1]_13' in strategy_name_list
+            assert '[R, R, R, S0]_14' in strategy_name_list
+            assert '[R, S1, R, S0]_15' in strategy_name_list
+            assert '[R, R, S1, S0]_16' in strategy_name_list
+            assert '[R, R, R, R]_17' in strategy_name_list
+            assert '[R, S0, R, R]_18' in strategy_name_list
+            assert '[R, R, S0, R]_19' in strategy_name_list
+            assert '[R, R, R, R]_20' in strategy_name_list
+            assert '[R, S1, R, R]_21' in strategy_name_list
+            assert '[R, R, S1, R]_22' in strategy_name_list
+            assert '[R, R, R, S1]_10' in strategy_name_list
+            assert '[R, R, R, S0]_9' in strategy_name_list
+            assert '[R, R, R, R]_8' in strategy_name_list
+            assert '[R, R, R, R]_7' in strategy_name_list
+            assert '[R, R, R, S0]_6' in strategy_name_list
+            assert '[R, R, R, S1]_5' in strategy_name_list
+            assert '[R, R, R, R]_0' in strategy_name_list
+            assert '[R, S01, R, R]_1' in strategy_name_list
+            assert '[R, R, S01, R]_2' in strategy_name_list
+            assert '[R, R, R, R]_3' in strategy_name_list
+            assert '[R, R, R, S01]_4' in strategy_name_list
 
         if split_dim == 1:
-            assert '[S0, R, R, S1]_0' in strategy_name_list
-            assert '[R, R, R, S1]_1' in strategy_name_list
-            assert '[R, R, S0, S1]_2' in strategy_name_list
-            assert '[S1, R, R, S0]_3' in strategy_name_list
-            assert '[R, R, R, S0]_4' in strategy_name_list
-            assert '[R, R, S1, S0]_5' in strategy_name_list
-            assert '[S0, R, R, R]_6' in strategy_name_list
-            assert '[R, R, R, R]_7' in strategy_name_list
-            assert '[R, R, S0, R]_8' in strategy_name_list
-            assert '[S1, R, R, R]_9' in strategy_name_list
-            assert '[R, R, R, R]_10' in strategy_name_list
-            assert '[R, R, S1, R]_11' in strategy_name_list
+            assert '[S0, R, R, S1]_11' in strategy_name_list
             assert '[R, R, R, S1]_12' in strategy_name_list
-            assert '[R, R, R, S0]_13' in strategy_name_list
-            assert '[R, R, R, R]_14' in strategy_name_list
-            assert '[R, R, R, R]_15' in strategy_name_list
-            assert '[R, R, R, S0]_16' in strategy_name_list
-            assert '[R, R, R, S1]_17' in strategy_name_list
-            assert '[S01, R, R, R]_18' in strategy_name_list
-            assert '[R, R, R, R]_19' in strategy_name_list
-            assert '[R, R, S01, R]_20' in strategy_name_list
+            assert '[R, R, S0, S1]_13' in strategy_name_list
+            assert '[S1, R, R, S0]_14' in strategy_name_list
+            assert '[R, R, R, S0]_15' in strategy_name_list
+            assert '[R, R, S1, S0]_16' in strategy_name_list
+            assert '[S0, R, R, R]_17' in strategy_name_list
+            assert '[R, R, R, R]_18' in strategy_name_list
+            assert '[R, R, S0, R]_19' in strategy_name_list
+            assert '[S1, R, R, R]_20' in strategy_name_list
             assert '[R, R, R, R]_21' in strategy_name_list
-            assert '[R, R, R, S01]_22' in strategy_name_list
+            assert '[R, R, S1, R]_22' in strategy_name_list
+            assert '[R, R, R, S1]_10' in strategy_name_list
+            assert '[R, R, R, S0]_9' in strategy_name_list
+            assert '[R, R, R, R]_8' in strategy_name_list
+            assert '[R, R, R, R]_7' in strategy_name_list
+            assert '[R, R, R, S0]_6' in strategy_name_list
+            assert '[R, R, R, S1]_5' in strategy_name_list
+            assert '[S01, R, R, R]_0' in strategy_name_list
+            assert '[R, R, R, R]_1' in strategy_name_list
+            assert '[R, R, S01, R]_2' in strategy_name_list
+            assert '[R, R, R, R]_3' in strategy_name_list
+            assert '[R, R, R, S01]_4' in strategy_name_list
 
 
 @run_on_environment_flag(name='AUTO_PARALLEL')
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py
index d07d2f76c178..8a96ac0d66f0 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_view_handler.py
@@ -196,54 +196,57 @@ def check_view_handler(rank, tgt_shape, model_cls, world_size, port):
     if model_cls.__name__ == 'LinearViewModel':
 
         if tgt_shape == (32, 4, 64, 16, 4):
-            assert '[S0, R, R, S1] -> [S0, R, R, S1, R]_0' in strategy_name_list
-            assert '[R, S0, R, S1] -> FULLY REPLICATED_1' in strategy_name_list
-            assert '[R, R, S0, S1] -> [R, R, S0, S1, R]_2' in strategy_name_list
-            assert '[S1, R, R, S0] -> [S1, R, R, S0, R]_3' in strategy_name_list
-            assert '[R, S1, R, S0] -> FULLY REPLICATED_4' in strategy_name_list
-            assert '[R, R, S1, S0] -> [R, R, S1, S0, R]_5' in strategy_name_list
-            assert '[S0, R, R, R] -> [S0, R, R, R, R]_6' in strategy_name_list
-            assert '[R, S0, R, R] -> FULLY REPLICATED_7' in strategy_name_list
-            assert '[R, R, S0, R] -> [R, R, S0, R, R]_8' in strategy_name_list
-            assert '[S1, R, R, R] -> [S1, R, R, R, R]_9' in strategy_name_list
-            assert '[R, S1, R, R] -> FULLY REPLICATED_10' in strategy_name_list
-            assert '[R, R, S1, R] -> [R, R, S1, R, R]_11' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, S1, R]_12' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, S0, R]_13' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R, R]_14' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R, R]_15' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, S0, R]_16' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, S1, R]_17' in strategy_name_list
-            assert '[S01, R, R, R] -> [S01, R, R, R, R]_18' in strategy_name_list
-            assert '[R, S01, R, R] -> FULLY REPLICATED_19' in strategy_name_list
-            assert '[R, R, S01, R] -> [R, R, S01, R, R]_20' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R, R]_21' in strategy_name_list
-            assert '[R, R, R, S01] -> [R, R, R, S01, R]_22' in strategy_name_list
+            for strategy in strategy_name_list:
+                print(strategy)
+            # print(strategy_name_list)
+            assert '[S0, R, R, S1] -> [S0, R, R, S1, R]_11' in strategy_name_list
+            assert '[R, S0, R, S1] -> FULLY REPLICATED_12' in strategy_name_list
+            assert '[R, R, S0, S1] -> [R, R, S0, S1, R]_13' in strategy_name_list
+            assert '[S1, R, R, S0] -> [S1, R, R, S0, R]_14' in strategy_name_list
+            assert '[R, S1, R, S0] -> FULLY REPLICATED_15' in strategy_name_list
+            assert '[R, R, S1, S0] -> [R, R, S1, S0, R]_16' in strategy_name_list
+            assert '[S0, R, R, R] -> [S0, R, R, R, R]_17' in strategy_name_list
+            assert '[R, S0, R, R] -> FULLY REPLICATED_18' in strategy_name_list
+            assert '[R, R, S0, R] -> [R, R, S0, R, R]_19' in strategy_name_list
+            assert '[S1, R, R, R] -> [S1, R, R, R, R]_20' in strategy_name_list
+            assert '[R, S1, R, R] -> FULLY REPLICATED_21' in strategy_name_list
+            assert '[R, R, S1, R] -> [R, R, S1, R, R]_22' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, S1, R]_10' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, S0, R]_9' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R, R]_8' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R, R]_7' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, S0, R]_6' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, S1, R]_5' in strategy_name_list
+            assert '[S01, R, R, R] -> [S01, R, R, R, R]_0' in strategy_name_list
+            assert '[R, S01, R, R] -> FULLY REPLICATED_1' in strategy_name_list
+            assert '[R, R, S01, R] -> [R, R, S01, R, R]_2' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R, R]_3' in strategy_name_list
+            assert '[R, R, R, S01] -> [R, R, R, S01, R]_4' in strategy_name_list
 
         if tgt_shape == (8, 4, 4, 64, 16, 4):
-            assert '[S0, R, R, S1] -> [S0, R, R, R, S1, R]_0' in strategy_name_list
-            assert '[R, S0, R, S1] -> [R, S0, R, R, S1, R]_1' in strategy_name_list
-            assert '[R, R, S0, S1] -> [R, R, R, S0, S1, R]_2' in strategy_name_list
-            assert '[S1, R, R, S0] -> [S1, R, R, R, S0, R]_3' in strategy_name_list
-            assert '[R, S1, R, S0] -> [R, S1, R, R, S0, R]_4' in strategy_name_list
-            assert '[R, R, S1, S0] -> [R, R, R, S1, S0, R]_5' in strategy_name_list
-            assert '[S0, R, R, R] -> [S0, R, R, R, R, R]_6' in strategy_name_list
-            assert '[R, S0, R, R] -> [R, S0, R, R, R, R]_7' in strategy_name_list
-            assert '[R, R, S0, R] -> [R, R, R, S0, R, R]_8' in strategy_name_list
-            assert '[S1, R, R, R] -> [S1, R, R, R, R, R]_9' in strategy_name_list
-            assert '[R, S1, R, R] -> [R, S1, R, R, R, R]_10' in strategy_name_list
-            assert '[R, R, S1, R] -> [R, R, R, S1, R, R]_11' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, R, S1, R]_12' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, R, S0, R]_13' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R, R, R]_14' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R, R, R]_15' in strategy_name_list
-            assert '[R, R, R, S0] -> [R, R, R, R, S0, R]_16' in strategy_name_list
-            assert '[R, R, R, S1] -> [R, R, R, R, S1, R]_17' in strategy_name_list
-            assert '[S01, R, R, R] -> [S01, R, R, R, R, R]_18' in strategy_name_list
-            assert '[R, S01, R, R] -> [R, S01, R, R, R, R]_19' in strategy_name_list
-            assert '[R, R, S01, R] -> [R, R, R, S01, R, R]_20' in strategy_name_list
-            assert '[R, R, R, R] -> [R, R, R, R, R, R]_21' in strategy_name_list
-            assert '[R, R, R, S01] -> [R, R, R, R, S01, R]_22' in strategy_name_list
+            assert '[S0, R, R, S1] -> [S0, R, R, R, S1, R]_11' in strategy_name_list
+            assert '[R, S0, R, S1] -> [R, S0, R, R, S1, R]_12' in strategy_name_list
+            assert '[R, R, S0, S1] -> [R, R, R, S0, S1, R]_13' in strategy_name_list
+            assert '[S1, R, R, S0] -> [S1, R, R, R, S0, R]_14' in strategy_name_list
+            assert '[R, S1, R, S0] -> [R, S1, R, R, S0, R]_15' in strategy_name_list
+            assert '[R, R, S1, S0] -> [R, R, R, S1, S0, R]_16' in strategy_name_list
+            assert '[S0, R, R, R] -> [S0, R, R, R, R, R]_17' in strategy_name_list
+            assert '[R, S0, R, R] -> [R, S0, R, R, R, R]_18' in strategy_name_list
+            assert '[R, R, S0, R] -> [R, R, R, S0, R, R]_19' in strategy_name_list
+            assert '[S1, R, R, R] -> [S1, R, R, R, R, R]_20' in strategy_name_list
+            assert '[R, S1, R, R] -> [R, S1, R, R, R, R]_21' in strategy_name_list
+            assert '[R, R, S1, R] -> [R, R, R, S1, R, R]_22' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, R, S1, R]_10' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, R, S0, R]_9' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R, R, R]_8' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R, R, R]_7' in strategy_name_list
+            assert '[R, R, R, S0] -> [R, R, R, R, S0, R]_6' in strategy_name_list
+            assert '[R, R, R, S1] -> [R, R, R, R, S1, R]_5' in strategy_name_list
+            assert '[S01, R, R, R] -> [S01, R, R, R, R, R]_0' in strategy_name_list
+            assert '[R, S01, R, R] -> [R, S01, R, R, R, R]_1' in strategy_name_list
+            assert '[R, R, S01, R] -> [R, R, R, S01, R, R]_2' in strategy_name_list
+            assert '[R, R, R, R] -> [R, R, R, R, R, R]_3' in strategy_name_list
+            assert '[R, R, R, S01] -> [R, R, R, R, S01, R]_4' in strategy_name_list
 
 
 @run_on_environment_flag(name='AUTO_PARALLEL')

From b6e3b955c3e1a58c79bdd143e6c0de96b9e3f247 Mon Sep 17 00:00:00 2001
From: fastalgo <youyang@cs.berkeley.edu>
Date: Thu, 16 Feb 2023 07:39:46 +0800
Subject: [PATCH 341/503] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c2ad6ffc78fa..e415bcc8a50f 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
    [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)
 
-   Colossal-AI: Making big AI models cheaper, easier, and scalable
+   Colossal-AI: Making big AI models cheaper, easier, and more scalable
 
    <h3> <a href="https://arxiv.org/abs/2110.14883"> Paper </a> |
    <a href="https://www.colossalai.org/"> Documentation </a> |

From 648183a96037a0d9e758154f98e1e1b8004eea0b Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Thu, 16 Feb 2023 10:25:17 +0800
Subject: [PATCH 342/503] [chatgpt]fix train_rm bug with lora (#2741)

---
 applications/ChatGPT/chatgpt/trainer/rm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
index c24289502830..b76ae537306c 100644
--- a/applications/ChatGPT/chatgpt/trainer/rm.py
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -43,7 +43,7 @@ def fit(self, use_lora):
             # train
             if use_lora > 0:
                 print("Using Lora")
-                lora.mark_only_lora_as_trainable(self.model)
+                lora.mark_only_lora_as_trainable(self.model.model)
             else:
                 self.model.train()
             for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:

From 613efebc5c2254abc09c87feaeea514857577d3e Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Thu, 16 Feb 2023 11:24:07 +0800
Subject: [PATCH 343/503] [chatgpt] support colossalai strategy to train rm
 (#2742)

* [chatgpt]fix train_rm bug with lora

* [chatgpt]support colossalai strategy to train rm

* fix pre-commit

* fix pre-commit 2
---
 applications/ChatGPT/chatgpt/trainer/rm.py    | 44 ++++++++++++-------
 applications/ChatGPT/examples/train_dummy.sh  |  4 +-
 .../ChatGPT/examples/train_prompts.sh         |  4 +-
 .../ChatGPT/examples/train_reward_model.py    | 39 +++++++++++++---
 applications/ChatGPT/examples/train_rm.sh     |  4 +-
 5 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
index b76ae537306c..f6639edcbbb4 100644
--- a/applications/ChatGPT/chatgpt/trainer/rm.py
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -3,10 +3,13 @@
 import loralib as lora
 from chatgpt.dataset import RewardDataset
 from chatgpt.nn import PairWiseLoss
-from torch.optim import Adam
+from torch.optim import Adam, Optimizer
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
+from .strategies import Strategy
+from .utils import is_rank_0
+
 
 class RewardModelTrainer(ABC):
     """
@@ -14,32 +17,41 @@ class RewardModelTrainer(ABC):
 
     Args:
         model (torch.nn.Module): the model to train
+        strategy (Strategy): the strategy to use for training
+        optim(Optimizer): the optimizer to use for training
         train_dataset (RewardDataset): the dataset to use for training
         eval_dataset (RewardDataset): the dataset to use for evaluation
         batch_size (int, defaults to 1): the batch size while training
-        num_epochs (int, defaults to 2): the number of epochs to train
+        max_epochs (int, defaults to 2): the number of epochs to train
         optim_kwargs (dict, defaults to {'lr':1e-4}): the kwargs to use while initializing optimizer
     """
 
-    def __init__(self,
-                 model,
-                 train_dataset: RewardDataset,
-                 eval_dataset: RewardDataset,
-                 batch_size: int = 1,
-                 num_epochs: int = 2,
-                 optim_kwargs: dict = {'lr': 1e-4}) -> None:
+    def __init__(
+        self,
+        model,
+        strategy: Strategy,
+        optim: Optimizer,
+        train_dataset: RewardDataset,
+        eval_dataset: RewardDataset,
+        batch_size: int = 1,
+        max_epochs: int = 2,
+    ) -> None:
         super().__init__()
-        self.model = model
+        self.strategy = strategy
+        self.epochs = max_epochs
         self.train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
         self.eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)
+
+        self.model = strategy.setup_model(model)
         self.loss_fn = PairWiseLoss()
-        self.optimizer = Adam(self.model.parameters(), **optim_kwargs)
-        self.epochs = num_epochs
+        self.optimizer = strategy.setup_optimizer(optim, self.model)
 
     def fit(self, use_lora):
-        epoch_bar = tqdm(range(self.epochs), desc='Train epoch')
+        epoch_bar = tqdm(range(self.epochs), desc='Train epoch', disable=not is_rank_0())
         for epoch in range(self.epochs):
-            step_bar = tqdm(range(self.train_dataloader.__len__()), desc='Train step of epoch %d' % epoch)
+            step_bar = tqdm(range(self.train_dataloader.__len__()),
+                            desc='Train step of epoch %d' % epoch,
+                            disable=not is_rank_0())
             # train
             if use_lora > 0:
                 print("Using Lora")
@@ -54,8 +66,8 @@ def fit(self, use_lora):
                 chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
                 reject_reward = self.model(reject_ids, attention_mask=r_mask)
                 loss = self.loss_fn(chosen_reward, reject_reward)
-                loss.backward()
-                self.optimizer.step()
+                self.strategy.backward(loss, self.model, self.optimizer)
+                self.strategy.optimizer_step(self.optimizer)
                 self.optimizer.zero_grad()
                 step_bar.update()
                 step_bar.set_postfix({'loss': loss.item()})
diff --git a/applications/ChatGPT/examples/train_dummy.sh b/applications/ChatGPT/examples/train_dummy.sh
index 559d338ee021..595da573e2b1 100755
--- a/applications/ChatGPT/examples/train_dummy.sh
+++ b/applications/ChatGPT/examples/train_dummy.sh
@@ -13,6 +13,6 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 1
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
-python train_dummy.py --model bloom --pretrain '/data2/users/lczht/bloom-560m' --lora_rank 16
+torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy colossalai_zero2
diff --git a/applications/ChatGPT/examples/train_prompts.sh b/applications/ChatGPT/examples/train_prompts.sh
index 0b82d3f1cd5e..db73ac8e8e85 100755
--- a/applications/ChatGPT/examples/train_prompts.sh
+++ b/applications/ChatGPT/examples/train_prompts.sh
@@ -13,6 +13,6 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 1
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
-python train_prompts.py prompts.csv --pretrain '/data2/users/lczht/bloom-560m' --lora_rank 16
+torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai_zero2
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index fd78a2ac6325..47688325ed7a 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -5,33 +5,55 @@
 from chatgpt.dataset import RewardDataset
 from chatgpt.nn import BLOOMRM
 from chatgpt.trainer import RewardModelTrainer
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from datasets import load_dataset
+from torch.optim import Adam
 from transformers import BloomTokenizerFast
 
+from colossalai.nn.optimizer import HybridAdam
+
 
 def train(args):
+    # configure strategy
+    if args.strategy == 'naive':
+        strategy = NaiveStrategy()
+    elif args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    # configure model
     tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
     tokenizer.pad_token = tokenizer.eos_token
-    model = BLOOMRM(pretrained=args.pretrain)
-
-    model.cuda()
-
+    model = BLOOMRM(pretrained=args.pretrain).cuda()
     max_len = 1024
 
+    # configure optimizer
+    if args.strategy.startswith('colossalai'):
+        optim = HybridAdam(model.parameters(), lr=5e-5)
+    else:
+        optim = Adam(model.parameters(), lr=5e-5)
+
     # prepare for data and dataset
     data = load_dataset(args.dataset)
-    train_data = data["train"]
-    eval_data = data['test']
+    train_data = data["train"].select(range(100))
+    eval_data = data['test'].select(range(5))
     train_dataset = RewardDataset(train_data, tokenizer, max_len)
     eval_dataset = RewardDataset(eval_data, tokenizer, max_len)
 
     # batch_size here is expected to be C(k,2), k means # response of each prompt
     # be limited with the format of dataset 'Dahoas/rm-static', we'd better use batch_size as 1
     trainer = RewardModelTrainer(model=model,
+                                 strategy=strategy,
+                                 optim=optim,
                                  train_dataset=train_dataset,
                                  eval_dataset=eval_dataset,
                                  batch_size=args.batch_size,
-                                 num_epochs=args.max_epochs)
+                                 max_epochs=args.max_epochs)
 
     trainer.fit(use_lora=args.lora_rank)
 
@@ -43,6 +65,9 @@ def train(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='naive')
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--dataset', type=str, default='Dahoas/rm-static')
     parser.add_argument('--save_path', type=str, default='rm_ckpt.pth')
diff --git a/applications/ChatGPT/examples/train_rm.sh b/applications/ChatGPT/examples/train_rm.sh
index bf46d7e43ff2..ed91deee2c59 100755
--- a/applications/ChatGPT/examples/train_rm.sh
+++ b/applications/ChatGPT/examples/train_rm.sh
@@ -13,6 +13,6 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 1
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
-python train_reward_model.py --pretrain '/data2/users/lczht/bloom-560m' --lora_rank 16
+torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain '/data2/users/lczht/bloom-560m' --strategy colossalai_zero2

From e37695430548064cb0e2d2e60bc8065697a52612 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 16 Feb 2023 15:45:26 +0800
Subject: [PATCH 344/503] [doc] add opt service doc (#2747)

---
 docs/sidebars.json | 3 ++-
 docs/versions.json | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/sidebars.json b/docs/sidebars.json
index 9e9ef89ba63f..44287c17eadf 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -72,7 +72,8 @@
         "advanced_tutorials/add_your_parallel",
         "advanced_tutorials/meet_gemini",
         "advanced_tutorials/parallelize_your_training_like_Megatron",
-        "advanced_tutorials/integrate_mixture_of_experts_into_your_model"
+        "advanced_tutorials/integrate_mixture_of_experts_into_your_model",
+        "advanced_tutorials/opt_service"
       ]
     }
   ]
diff --git a/docs/versions.json b/docs/versions.json
index 6dd417a18024..dde32982b798 100644
--- a/docs/versions.json
+++ b/docs/versions.json
@@ -1,3 +1,3 @@
 [
-  "v0.2.5"
+  "current"
 ]

From d6d6dec1902f22323b8c16e0fff582baa8a5e56a Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Thu, 16 Feb 2023 20:07:25 +0800
Subject: [PATCH 345/503] [doc] update example and OPT serving link (#2769)

* [doc] update OPT serving link

* [doc] update example and OPT serving link

* [doc] update example and OPT serving link
---
 README-zh-Hans.md | 6 +++---
 README.md         | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 18623d67a920..fef2a02ec42b 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -145,9 +145,9 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_update.png" width=800/>
 
 - [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), 由Meta发布的1750亿语言模型，由于完全公开了预训练参数权重，因此促进了下游任务和应用部署的发展。
-- 加速45%，仅用几行代码以低成本微调OPT。[[样例]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[在线推理]](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/zh-Hans/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md)
+- 加速45%，仅用几行代码以低成本微调OPT。[[样例]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/opt) [[在线推理]](https://colossalai.org/docs/advanced_tutorials/opt_service)
 
-请访问我们的 [文档](https://www.colossalai.org/) 和 [例程](https://github.com/hpcaitech/ColossalAI-Examples) 以了解详情。
+请访问我们的 [文档](https://www.colossalai.org/) 和 [例程](https://github.com/hpcaitech/ColossalAI/tree/main/examples) 以了解详情。
 
 ### ViT
 <p align="center">
@@ -199,7 +199,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_serving.png" width=800/>
 </p>
 
-- [OPT推理服务](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/zh-Hans/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md): 无需注册，免费体验1750亿参数OPT在线推理服务
+- [OPT推理服务](https://colossalai.org/docs/advanced_tutorials/opt_service): 体验1750亿参数OPT在线推理服务
 
 <p id="BLOOM-Inference" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
diff --git a/README.md b/README.md
index e415bcc8a50f..35b2cca6b47e 100644
--- a/README.md
+++ b/README.md
@@ -148,9 +148,9 @@ distributed training and inference in a few lines.
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_update.png" width=800/>
 
 - [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-Billion parameter AI language model released by Meta, which stimulates AI programmers to perform various downstream tasks and application deployments because public pretrained model weights.
-- 45% speedup fine-tuning OPT at low cost in lines. [[Example]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[Online Serving]](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/en/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md)
+- 45% speedup fine-tuning OPT at low cost in a few lines of code. [[Example]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/opt) [[Online Serving]](https://colossalai.org/docs/advanced_tutorials/opt_service)
 
-Please visit our [documentation](https://www.colossalai.org/) and [examples](https://github.com/hpcaitech/ColossalAI-Examples) for more details.
+Please visit our [documentation](https://www.colossalai.org/) and [examples](https://github.com/hpcaitech/ColossalAI/tree/main/examples) for more details.
 
 ### ViT
 <p align="center">
@@ -201,7 +201,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_serving.png" width=800/>
 </p>
 
-- [OPT Serving](https://github.com/hpcaitech/ColossalAI-Documentation/blob/main/i18n/en/docusaurus-plugin-content-docs/current/advanced_tutorials/opt_service.md): Try 175-billion-parameter OPT online services for free, without any registration whatsoever.
+- [OPT Serving](https://colossalai.org/docs/advanced_tutorials/opt_service): Try 175-billion-parameter OPT online services
 
 <p id="BLOOM-Inference" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>

From a88bc828d5b20cc177e49bcfdc7e253a49646597 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Thu, 16 Feb 2023 20:09:34 +0800
Subject: [PATCH 346/503] [chatgpt] disable shard init for colossalai (#2767)

---
 .../ChatGPT/chatgpt/trainer/strategies/colossalai.py      | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
index 665bfa913d00..578844bdbdbd 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Optional
 
 import torch
@@ -23,6 +24,7 @@ class ColossalAIStrategy(DDPStrategy):
         stage(int): The stage to use in ZeRO. Choose in (1, 2, 3)
         seed(int): The seed for the random number generator.
         shard_init(bool): Whether to shard the model parameters during initialization. Only for ZeRO-3.
+            This is not compatible with `from_pretrained()`. We temporarily disable this and will support it in the future.
         placement_policy(str): The placement policy for gemini. Choose in ('cpu', 'cuda')
                           If it is “cpu”, parameters, gradients and optimizer states will be offloaded to CPU,
                           If it is “cuda”, they will not be offloaded, which means max CUDA memory will be used. It is the fastest.
@@ -50,7 +52,7 @@ def __init__(
             self,
             stage: int = 3,
             seed: int = 42,
-            shard_init: bool = True,    # only for stage 3
+            shard_init: bool = False,    # only for stage 3
             placement_policy: str = 'cuda',
             pin_memory: bool = True,    # only for stage 3
             force_outputs_fp32: bool = False,    # only for stage 3
@@ -72,6 +74,10 @@ def __init__(
         super().__init__(seed)
         assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
         self.stage = stage
+        # TODO(ver217): support shard_init when using from_pretrained()
+        if shard_init:
+            warnings.warn(f'Shard init is not supported yet. Ignore.')
+            shard_init = False
         self.shard_init = shard_init
         self.gemini_config = dict(device=get_current_device(),
                                   placement_policy=placement_policy,

From 01066152f12f05e3e3f843c180814a1b55262555 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nikita.shulga@gmail.com>
Date: Thu, 16 Feb 2023 17:22:45 -0800
Subject: [PATCH 347/503] Don't use `torch._six` (#2775)

* Don't use `torch._six`

This is a private API which is gone after https://github.com/pytorch/pytorch/pull/94709

* Update common.py
---
 colossalai/utils/common.py              | 2 +-
 colossalai/zero/sharded_optim/_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index 2099883fbdf7..e35b29c2ac56 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -11,7 +11,7 @@
 
 import torch
 import torch.distributed as dist
-from torch._six import inf
+from torch import inf
 from torch.nn.parameter import Parameter
 
 from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES
diff --git a/colossalai/zero/sharded_optim/_utils.py b/colossalai/zero/sharded_optim/_utils.py
index e674344018d4..68928b232660 100644
--- a/colossalai/zero/sharded_optim/_utils.py
+++ b/colossalai/zero/sharded_optim/_utils.py
@@ -3,7 +3,7 @@
 
 import torch
 import torch.distributed as dist
-from torch._six import inf
+from torch.six import inf
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
 from colossalai.tensor import ColoParameter

From ba84cd80b28337d97984b008282ff931a71051f8 Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Fri, 17 Feb 2023 09:54:21 +0800
Subject: [PATCH 348/503] fix pip install colossal (#2764)

---
 examples/images/diffusion/README.md  | 2 +-
 examples/images/dreambooth/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index 15932f1f524b..cc57f6d54a8e 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -79,7 +79,7 @@ pip install pytorch-lightning
 For example, you can install  v0.2.0 from our official website.
 
 ```
-pip install colossalai==0.2.0+torch1.12cu11.3 -f https://release.colossalai.org
+pip install colossalai
 ```
 
 ##### From source
diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md
index a306a3abfc2c..83b7e4c06d54 100644
--- a/examples/images/dreambooth/README.md
+++ b/examples/images/dreambooth/README.md
@@ -16,7 +16,7 @@ pip install -r requirements_colossalai.txt
 ### Install [colossalai](https://github.com/hpcaitech/ColossalAI.git)
 
 ```bash
-pip install colossalai==0.2.0+torch1.12cu11.3 -f https://release.colossalai.org
+pip install colossalai
 ```
 
 **From source**

From 8e3f66a0d1ab44ecbee5917d15320e33ad118b5f Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Fri, 17 Feb 2023 10:26:07 +0800
Subject: [PATCH 349/503] [zero] fix wrong import (#2777)

---
 colossalai/zero/sharded_optim/_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/zero/sharded_optim/_utils.py b/colossalai/zero/sharded_optim/_utils.py
index 68928b232660..9ca2fdf5aa06 100644
--- a/colossalai/zero/sharded_optim/_utils.py
+++ b/colossalai/zero/sharded_optim/_utils.py
@@ -3,7 +3,7 @@
 
 import torch
 import torch.distributed as dist
-from torch.six import inf
+from torch import inf
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
 from colossalai.tensor import ColoParameter

From a2b43e393d3e17b7386ea42a655730bd3b107a67 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Fri, 17 Feb 2023 10:39:48 +0800
Subject: [PATCH 350/503] [autoparallel] Patch meta information of
 `torch.nn.Embedding` (#2760)

* [autoparallel] embedding metainfo

* [autoparallel] fix function name in test_activation_metainfo

* [autoparallel] undo changes in activation metainfo and related tests
---
 .../meta_profiler/meta_registry/__init__.py   |  1 +
 .../meta_profiler/meta_registry/embedding.py  | 52 +++++++++++++
 .../test_metainfo/test_embedding_metainfo.py  | 77 +++++++++++++++++++
 3 files changed, 130 insertions(+)
 create mode 100644 colossalai/auto_parallel/meta_profiler/meta_registry/embedding.py
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_embedding_metainfo.py

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
index aa5f77f6591e..359590c1fc04 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
@@ -1,6 +1,7 @@
 from .activation import *
 from .binary_elementwise_ops import *
 from .conv import *
+from .embedding import *
 from .linear import *
 from .norm import *
 from .pooling import *
diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/embedding.py b/colossalai/auto_parallel/meta_profiler/meta_registry/embedding.py
new file mode 100644
index 000000000000..2997f31adff8
--- /dev/null
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/embedding.py
@@ -0,0 +1,52 @@
+from typing import List, Tuple
+
+import torch
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
+from colossalai.fx.profiler.memory_utils import activation_size
+from colossalai.fx.profiler.opcount import flop_mapping
+
+from ..registry import meta_register
+
+__all__ = ["embedding_meta_info"]
+
+
+@meta_register.register(torch.nn.Embedding)
+def embedding_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+    """torch.nn.Embedding metainfo generator
+
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
+    """
+    input_tensor = next(filter(lambda x: x.type == OperationDataType.ARG, args)).data
+    weight_tensor = next(filter(lambda x: x.type == OperationDataType.PARAM, args)).data
+    output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
+
+    # compute cost
+    fwd_compute_cost = flop_mapping[torch.ops.aten.embedding.default]([weight_tensor, input_tensor], [output_tensor])
+    bwd_compute_cost = flop_mapping[torch.ops.aten.embedding_dense_backward.default]([output_tensor, weight_tensor],
+                                                                                     [weight_tensor])
+
+    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
+
+    # memory cost
+    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
+    # NOTE: during the backward phase of torch.nn.Embedding, it seems when the input is large enough, it will
+    # have a temp memory which is kind of weird and we don't know the reason yet, so currently we just assume
+    # that there will be no temp memory, as the temp memory is significantly smaller than the gradient memory
+    fwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, output_tensor]),
+                                 parameter=0,
+                                 temp=0,
+                                 buffer=0)
+    bwd_memory_cost = MemoryCost(activation=activation_size([weight_tensor]), parameter=0, temp=0, buffer=0)
+
+    total_memory_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation)
+
+    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_memory_cost)
+
+    # store fwd_in, fwd_buffer, fwd_out
+    fwd_in = [torch.zeros_like(input_tensor)]
+    fwd_buffer = []
+    fwd_out = [torch.zeros_like(output_tensor)]
+
+    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_embedding_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_embedding_metainfo.py
new file mode 100644
index 000000000000..2fb1306546ca
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_embedding_metainfo.py
@@ -0,0 +1,77 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+from colossalai.auto_parallel.tensor_shard.node_handler import LinearModuleHandler
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
+    MemoryCost,
+    OperationData,
+    OperationDataType,
+    ShardingStrategy,
+    StrategiesVector,
+    TrainCycleItem,
+)
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx import ColoGraphModule, ColoTracer
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import print_results
+
+if torch.__version__ >= '1.12.0':
+    from colossalai.auto_parallel.meta_profiler import MetaInfo, meta_register
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
+def test_embedding_meta_info():
+    meta_func = meta_register.get(torch.nn.Embedding)
+
+    # construct meta tensors
+    input_tensor = torch.randint(0, 50256, (8, 1024), device="meta")
+    weight_tensor = torch.rand(50257, 1024, device="meta")
+    output_tensor = torch.rand(8, 1024, 1024, device="meta")
+
+    # construct operation data
+    input_data = OperationData(name="input", type=OperationDataType.ARG, data=input_tensor)
+
+    weight_data = OperationData(name="weight", type=OperationDataType.PARAM, data=weight_tensor)
+
+    output_data = OperationData(name="output", type=OperationDataType.OUTPUT, data=output_tensor)
+
+    # construct args and kwargs
+    args = [input_data, weight_data, output_data]
+    kwargs = {'inplace': False}
+
+    # estimated results
+    compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out = meta_func(*args, **kwargs)
+
+    # actual results
+    input_real_tensor = torch.randint(0, 50256, (8, 1024), device="cuda")
+    embedding_module = torch.nn.Embedding(50257, 1024).cuda()
+
+    # fwd
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    output_real_tensor = embedding_module(input_real_tensor)
+    fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    # bwd
+    upstream_grad = torch.rand_like(output_real_tensor)
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    torch.autograd.backward(output_real_tensor, upstream_grad)
+    bwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    bwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    print_results([input_real_tensor], [output_real_tensor], compute_cost, memory_cost, fwd_allocated, fwd_peak,
+                  bwd_allocated, bwd_peak)
+
+
+if __name__ == '__main__':
+    test_embedding_meta_info()

From 4ee311c0262dfbca9b5da7e18f04dd8f1f23fe4c Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 17 Feb 2023 11:27:27 +0800
Subject: [PATCH 351/503] [chatgpt] strategy add prepare method (#2766)

* [chatgpt] strategy add prepare method

* [chatgpt] refactor examples

* [chatgpt] refactor strategy.prepare

* [chatgpt] support save/load checkpoint

* [chatgpt] fix unwrap actor

* [chatgpt] fix unwrap actor
---
 .../ChatGPT/benchmarks/benchmark_gpt_dummy.py |  3 +
 .../benchmarks/benchmark_opt_lora_dummy.py    |  3 +
 applications/ChatGPT/chatgpt/trainer/ppo.py   | 15 ++--
 .../chatgpt/trainer/strategies/base.py        | 88 ++++++++++++++++++-
 .../chatgpt/trainer/strategies/colossalai.py  | 27 +++++-
 .../ChatGPT/chatgpt/trainer/strategies/ddp.py | 18 ++++
 .../chatgpt/trainer/strategies/naive.py       | 19 ++++
 applications/ChatGPT/examples/train_dummy.py  |  3 +
 .../ChatGPT/examples/train_prompts.py         |  3 +
 9 files changed, 164 insertions(+), 15 deletions(-)

diff --git a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
index 3e66e4e7a40a..b5730c7c7bbc 100644
--- a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
+++ b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
@@ -133,6 +133,9 @@ def main(args):
     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
     tokenizer.pad_token = tokenizer.eos_token
 
+    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
+        (actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
+
     trainer = PPOTrainer(strategy,
                          actor,
                          critic,
diff --git a/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
index 8cee5489e212..6777cb770d53 100644
--- a/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
@@ -126,6 +126,9 @@ def main(args):
     tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
     tokenizer.pad_token = tokenizer.eos_token
 
+    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
+        (actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
+
     trainer = PPOTrainer(strategy,
                          actor,
                          critic,
diff --git a/applications/ChatGPT/chatgpt/trainer/ppo.py b/applications/ChatGPT/chatgpt/trainer/ppo.py
index b1d11b2242ca..2c1fd2fb6cd3 100644
--- a/applications/ChatGPT/chatgpt/trainer/ppo.py
+++ b/applications/ChatGPT/chatgpt/trainer/ppo.py
@@ -60,11 +60,6 @@ def __init__(self,
                  dataloader_pin_memory: bool = True,
                  callbacks: List[Callback] = [],
                  **generate_kwargs) -> None:
-        self._set_default_generate_kwargs(generate_kwargs, actor)
-        actor = Actor(strategy.setup_model(actor.model))
-        critic = strategy.setup_model(critic)
-        reward_model = strategy.setup_model(reward_model)
-        initial_model = Actor(strategy.setup_model(initial_model.model))
         experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef)
         replay_buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
         super().__init__(strategy, experience_maker, replay_buffer, experience_batch_size, max_epochs, tokenizer,
@@ -75,8 +70,9 @@ def __init__(self,
         self.actor_loss_fn = PolicyLoss(eps_clip)
         self.critic_loss_fn = ValueLoss(value_clip)
 
-        self.actor_optim = strategy.setup_optimizer(actor_optim, self.actor.model)
-        self.critic_optim = strategy.setup_optimizer(critic_optim, self.critic)
+        self.actor_optim = actor_optim
+        self.critic_optim = critic_optim
+        self._set_default_generate_kwargs(generate_kwargs, actor)
 
     def training_step(self, experience: Experience) -> Dict[str, float]:
         self.actor.train()
@@ -106,9 +102,10 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
         return {'actor_loss': actor_loss.item(), 'critic_loss': critic_loss.item()}
 
     def _set_default_generate_kwargs(self, generate_kwargs: dict, actor: Actor) -> None:
+        origin_model = self.strategy._unwrap_actor(actor)
         # use huggingface models method directly
-        if 'prepare_inputs_fn' not in generate_kwargs and hasattr(actor.model, 'prepare_inputs_for_generation'):
-            generate_kwargs['prepare_inputs_fn'] = actor.model.prepare_inputs_for_generation
+        if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
+            generate_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
 
         if 'update_model_kwargs_fn' not in generate_kwargs:
             generate_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/base.py b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
index 3a2923b8c678..2c6aefcd969f 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/base.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
@@ -1,12 +1,17 @@
 from abc import ABC, abstractmethod
 from contextlib import nullcontext
+from typing import Any, List, Tuple, Union
 
 import torch
 import torch.nn as nn
-import torch.optim as optim
+from chatgpt.nn import Actor, Critic, RewardModel
 from chatgpt.replay_buffer import ReplayBuffer
+from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 
+ModelOptimPair = Tuple[nn.Module, Optimizer]
+ModelOrModelOptimPair = Union[nn.Module, ModelOptimPair]
+
 
 class Strategy(ABC):
     """
@@ -18,11 +23,11 @@ def __init__(self) -> None:
         self.setup_distributed()
 
     @abstractmethod
-    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimizer, **kwargs) -> None:
+    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
         pass
 
     @abstractmethod
-    def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
+    def optimizer_step(self, optimizer: Optimizer, **kwargs) -> None:
         pass
 
     @abstractmethod
@@ -34,7 +39,7 @@ def setup_model(self, model: nn.Module) -> nn.Module:
         pass
 
     @abstractmethod
-    def setup_optimizer(self, optimizer: optim.Optimizer, model: nn.Module) -> optim.Optimizer:
+    def setup_optimizer(self, optimizer: Optimizer, model: nn.Module) -> Optimizer:
         pass
 
     @abstractmethod
@@ -43,3 +48,78 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False
 
     def model_init_context(self):
         return nullcontext()
+
+    def prepare(
+        self, *models_or_model_optim_pairs: ModelOrModelOptimPair
+    ) -> Union[List[ModelOrModelOptimPair], ModelOrModelOptimPair]:
+        """Prepare models or model-optimizer-pairs based on each strategy.
+
+        Example::
+            >>> # when fine-tuning actor and critic
+            >>> (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
+            >>> # or when training reward model
+            >>> (reward_model, reward_model_optim) = strategy.prepare((reward_model, reward_model_optim))
+            >>> # or just inference
+            >>> actor, critic = strategy.prepare(actor, critic)
+
+        Returns:
+            Union[List[ModelOrModelOptimPair], ModelOrModelOptimPair]: Models or model-optimizer-pairs in the original order.
+        """
+
+        def prepare_model(model: nn.Module):
+            if isinstance(model, Actor):
+                return Actor(self.setup_model(self._unwrap_model(model)))
+            return self.setup_model(self._unwrap_model(model))
+
+        rets = []
+        for arg in models_or_model_optim_pairs:
+            if isinstance(arg, tuple):
+                assert len(arg) == 2, f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"'
+                model, optimizer = arg
+                model = prepare_model(model)
+                optimizer = self.setup_optimizer(optimizer, self._unwrap_model(model))
+                rets.append((model, optimizer))
+            elif isinstance(arg, nn.Module):
+                rets.append(prepare_model(arg))
+            else:
+                raise RuntimeError(f'Expect model or (model, optimizer) pair, got {type(arg)}')
+
+        if len(rets) == 1:
+            return rets[0]
+        return rets
+
+    @staticmethod
+    def _unwrap_model(model: nn.Module) -> nn.Module:
+        """Useful for saving state dict. As actor is wrapped by Actor class again in `prepare()`, we should unwrap it before saving.
+
+        Args:
+            model (nn.Module): an actor or a critic
+        """
+        if isinstance(model, Actor):
+            return model.model
+        return model
+
+    @staticmethod
+    def _unwrap_actor(actor: Actor) -> nn.Module:
+        """Get `actor.model` from a wrapped (by `prepare()`) actor. Useful for getting original huggingface model.
+
+        Args:
+            actor (Actor): a wrapped actor
+        """
+        return Strategy._unwrap_model(actor)
+
+    @abstractmethod
+    def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
+        pass
+
+    @abstractmethod
+    def load_model(self, model: nn.Module, path: str, map_location: Any = None, strict: bool = True) -> None:
+        pass
+
+    @abstractmethod
+    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
+        pass
+
+    @abstractmethod
+    def load_optimizer(self, optimizer: Optimizer, path: str, map_location: Any = None) -> None:
+        pass
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
index 578844bdbdbd..bf4ecdfdf336 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
@@ -1,18 +1,21 @@
 import warnings
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 import torch.optim as optim
+from chatgpt.nn import Actor
+from torch.optim import Optimizer
 
 import colossalai
 from colossalai.nn.optimizer import CPUAdam, HybridAdam
-from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
+from colossalai.nn.parallel import ZeroDDP, zero_model_wrapper, zero_optim_wrapper
 from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
 
+from .base import Strategy
 from .ddp import DDPStrategy
 
 
@@ -129,3 +132,23 @@ def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimi
 
     def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
         optimizer.step()
+
+    @staticmethod
+    def _unwrap_actor(actor: Actor) -> nn.Module:
+        model: Union[nn.Module, ZeroDDP] = Strategy._unwrap_actor(actor)
+        if isinstance(model, ZeroDDP):
+            return model.module
+        return model
+
+    def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
+        unwrapped_model = self._unwrap_model(model)
+        state_dict = unwrapped_model.state_dict()
+        if only_rank0 and dist.get_rank() != 0:
+            return
+        torch.save(state_dict, path)
+
+    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
+        if only_rank0:
+            raise RuntimeError(
+                f'Optimizer states are sharded when using ColossalAIStrategy. Only rank0 is not supported.')
+        torch.save(optimizer.state_dict(), path)
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
index b636515b443e..7ceb3a3ca2ba 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
@@ -5,10 +5,13 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from chatgpt.nn import Actor
 from chatgpt.replay_buffer import ReplayBuffer
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim import Optimizer
 from torch.utils.data import DataLoader, DistributedSampler
 
+from .base import Strategy
 from .naive import NaiveStrategy
 
 
@@ -57,3 +60,18 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False
                           sampler=sampler,
                           pin_memory=pin_memory,
                           collate_fn=replay_buffer.collate_fn)
+
+    @staticmethod
+    def _unwrap_actor(actor: Actor) -> nn.Module:
+        model: DDP = Strategy._unwrap_actor(actor)
+        return model.module
+
+    def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
+        if only_rank0 and dist.get_rank() != 0:
+            return
+        super().save_model(model, path, only_rank0)
+
+    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
+        if only_rank0 and dist.get_rank() != 0:
+            return
+        super().save_optimizer(optimizer, path, only_rank0)
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/naive.py b/applications/ChatGPT/chatgpt/trainer/strategies/naive.py
index 1bb472ae657e..99b8d6635394 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/naive.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/naive.py
@@ -1,7 +1,10 @@
+from typing import Any
+
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from chatgpt.replay_buffer import ReplayBuffer
+from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 
 from .base import Strategy
@@ -34,3 +37,19 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False
                           drop_last=True,
                           pin_memory=pin_memory,
                           collate_fn=replay_buffer.collate_fn)
+
+    def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
+        unwrapped_model = self._unwrap_model(model)
+        torch.save(unwrapped_model.state_dict(), path)
+
+    def load_model(self, model: nn.Module, path: str, map_location: Any = None, strict: bool = True) -> None:
+        unwrapped_model = self._unwrap_model(model)
+        state_dict = torch.load(path, map_location=map_location)
+        unwrapped_model.load_state_dict(state_dict, strict=strict)
+
+    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
+        torch.save(optimizer.state_dict(), path)
+
+    def load_optimizer(self, optimizer: Optimizer, path: str, map_location: Any = None) -> None:
+        state_dict = torch.load(path, map_location=map_location)
+        optimizer.load_state_dict(state_dict)
diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index a14117ed5cd4..f98b4792d978 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -68,6 +68,9 @@ def main(args):
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
+    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
+        (actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
+
     # configure trainer
     trainer = PPOTrainer(
         strategy,
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index cf351b91a461..e79b2acf11b1 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -68,6 +68,9 @@ def tokenize_fn(texts):
         batch = tokenizer(texts, return_tensors='pt', max_length=96, padding=True, truncation=True)
         return {k: v.cuda() for k, v in batch.items()}
 
+    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
+        (actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
+
     # configure trainer
     trainer = PPOTrainer(
         strategy,

From a619a190df71ea3600e8487bc7070330a9574e06 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 17 Feb 2023 12:43:31 +0800
Subject: [PATCH 352/503] [chatgpt] update readme about checkpoint (#2792)

* [chatgpt] add save/load checkpoint sample code

* [chatgpt] add save/load checkpoint readme

* [chatgpt] refactor save/load checkpoint readme
---
 applications/ChatGPT/README.md | 89 +++++++++++++++++++++++++++++++---
 1 file changed, 83 insertions(+), 6 deletions(-)

diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
index b3ea239a9919..0516991de288 100644
--- a/applications/ChatGPT/README.md
+++ b/applications/ChatGPT/README.md
@@ -34,26 +34,103 @@ Simplest usage:
 ```python
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy
+from chatgpt.nn import GPTActor, GPTCritic, RewardModel
+from copy import deepcopy
+from colossalai.nn.optimizer import HybridAdam
 
 strategy = ColossalAIStrategy()
 
 with strategy.model_init_context():
   # init your model here
-  actor = Actor()
-  critic = Critic()
-
-trainer = PPOTrainer(actor = actor, critic= critic, strategy, ...)
+  # load pretrained gpt2
+  actor = GPTActor(pretrained='gpt2')
+  critic = GPTCritic()
+  initial_model = deepcopy(actor).cuda()
+  reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda()
+
+actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
+critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
+
+# prepare models and optimizers
+(actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
+        (actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
+
+# load saved model checkpoint after preparing
+strategy.load_model(actor, 'actor_checkpoint.pt', strict=False)
+# load saved optimizer checkpoint after preparing
+strategy.load_optimizer(actor_optim, 'actor_optim_checkpoint.pt')
+
+trainer = PPOTrainer(strategy,
+                     actor,
+                     critic,
+                     reward_model,
+                     initial_model,
+                     actor_optim,
+                     critic_optim,
+                     ...)
 
 trainer.fit(dataset, ...)
+
+# save model checkpoint after fitting on only rank0
+strategy.save_model(actor, 'actor_checkpoint.pt', only_rank0=True)
+# save optimizer checkpoint on all ranks
+strategy.save_optimizer(actor_optim, 'actor_optim_checkpoint.pt', only_rank0=False)
 ```
 
 For more details, see `examples/`.
 
 We also support training reward model with true-world data. See `examples/train_reward_model.py`.
 
+## FAQ
+
+### How to save/load checkpoint
+
+To load pretrained model, you can simply use huggingface pretrained models:
+
+```python
+# load OPT-350m pretrained model
+actor = OPTActor(pretrained='facebook/opt-350m')
+```
+
+To save model checkpoint:
+
+```python
+# save model checkpoint on only rank0
+strategy.save_model(actor, 'actor_checkpoint.pt', only_rank0=True)
+```
+
+This function must be called after `strategy.prepare()`.
+
+For the DDP strategy, model weights are replicated on all ranks. For the ColossalAI strategy, model weights may be sharded, but an all-gather is applied before the state dict is returned. With either strategy you can set `only_rank0=True`, which saves the checkpoint only on rank 0 and so reduces disk usage. The checkpoint is float32.
+
+To save optimizer checkpoint:
+
+```python
+# save optimizer checkpoint on all ranks
+strategy.save_optimizer(actor_optim, 'actor_optim_checkpoint.pt', only_rank0=False)
+```
+
+For the DDP strategy, optimizer states are replicated on all ranks, so you can set `only_rank0=True`. For the ColossalAI strategy, however, optimizer states are sharded over all ranks and no all-gather is applied, so you can only set `only_rank0=False`. That is to say, each rank will save its own checkpoint. When loading, each rank should load its corresponding part.
+
+Note that different strategies may have different shapes of optimizer checkpoint.
+
+To load model checkpoint:
+
+```python
+# load saved model checkpoint after preparing
+strategy.load_model(actor, 'actor_checkpoint.pt', strict=False)
+```
+
+To load optimizer checkpoint:
+
+```python
+# load saved optimizer checkpoint after preparing
+strategy.load_optimizer(actor_optim, 'actor_optim_checkpoint.pt')
+```
+
 ## Todo
 
-- [x] implement PPO training
+- [x] implement PPO fine-tuning
 - [x] implement training reward model
 - [x] support LoRA
 - [ ] implement PPO-ptx fine-tuning
@@ -65,7 +142,7 @@ Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.c
 
 You may contact us or participate in the following ways:
 1. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) or submitting a [PR](https://github.com/hpcaitech/ColossalAI/pulls) on GitHub
-2. Join the Colossal-AI community on 
+2. Join the Colossal-AI community on
 [Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
 and [WeChat](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
 3. Check out and fill in the [cooperation proposal](https://www.hpc-ai.tech/partners)

From 56ddc9ca7a150fa49460306bdc4d4672c7cb3ee7 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Fri, 17 Feb 2023 15:29:07 +0800
Subject: [PATCH 353/503] [hotfix] add correct device for fake_param (#2796)

---
 colossalai/nn/optimizer/zero_optimizer.py             | 5 +++--
 tests/test_gemini/update/test_zerooptim_state_dict.py | 2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/colossalai/nn/optimizer/zero_optimizer.py b/colossalai/nn/optimizer/zero_optimizer.py
index 402e28ce8458..712daed06400 100644
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@@ -136,7 +136,7 @@ def _update_fp16_params(self):
         for group in self.param_groups:
             for fake_param in group['params']:
                 assert fake_param.grad is None
-                fake_param.data = none_tensor
+                fake_param.data = none_tensor.to(fake_param.device)
 
         for chunk16 in self.chunk16_set:
             chunk16.optim_update()
@@ -307,7 +307,8 @@ def get_range_pair(local_chunk: Chunk, local_param: Parameter):
                 if range_pair[0] >= range_pair[1]:
                     continue
 
-                fake_param = torch.nn.Parameter(torch.empty([0]))
+                grad_device = self.module.grads_device[param]
+                fake_param = torch.nn.Parameter(torch.empty([0], device=grad_device))
                 self.param_to_chunk32[fake_param] = chunk16.paired_chunk
                 self.param_to_range[fake_param] = range_pair
 
diff --git a/tests/test_gemini/update/test_zerooptim_state_dict.py b/tests/test_gemini/update/test_zerooptim_state_dict.py
index dc3dda9d6df4..fd13af6b2b0a 100644
--- a/tests/test_gemini/update/test_zerooptim_state_dict.py
+++ b/tests/test_gemini/update/test_zerooptim_state_dict.py
@@ -70,8 +70,6 @@ def exam_zero_optim_state_dict(placement_policy, keep_gathered):
         for n, m in v.items():
             if isinstance(m, torch.Tensor):
                 o = w[n]
-                if m.device != o.device:
-                    o = o.to(m.device)
                 assert torch.equal(m, o)
             else:
                 assert m == w[n]

From 09f457479db6567352de157133743dbd046c9b9a Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Fri, 17 Feb 2023 23:21:42 +0800
Subject: [PATCH 354/503] [doc] update OPT serving (#2804)

* [doc] update OPT serving

* [doc] update OPT serving
---
 README-zh-Hans.md | 4 ----
 README.md         | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index fef2a02ec42b..1ef8ade8520b 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -195,10 +195,6 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 
 - [Energon-AI](https://github.com/hpcaitech/EnergonAI) ：用相同的硬件推理加速50%
 
-<p id="OPT-Serving" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_serving.png" width=800/>
-</p>
-
 - [OPT推理服务](https://colossalai.org/docs/advanced_tutorials/opt_service): 体验1750亿参数OPT在线推理服务
 
 <p id="BLOOM-Inference" align="center">
diff --git a/README.md b/README.md
index 35b2cca6b47e..7d18be39ab1a 100644
--- a/README.md
+++ b/README.md
@@ -197,10 +197,6 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 
 - [Energon-AI](https://github.com/hpcaitech/EnergonAI): 50% inference acceleration on the same hardware
 
-<p id="OPT-Serving" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_serving.png" width=800/>
-</p>
-
 - [OPT Serving](https://colossalai.org/docs/advanced_tutorials/opt_service): Try 175-billion-parameter OPT online services
 
 <p id="BLOOM-Inference" align="center">

From 8593ae1a3fada934da66cf680b93c73cff718139 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Sat, 18 Feb 2023 11:30:15 +0800
Subject: [PATCH 355/503] [autoparallel] rotor solver refactor (#2813)

* [autoparallel] rotor solver refactor

* [autoparallel] rotor solver refactor
---
 .../checkpoint/ckpt_solver_rotor.c            | 28 +++++++++++++------
 .../checkpoint/ckpt_solver_rotor.py           | 16 +++++------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c
index 0fdcfd58a399..8dad074bc894 100644
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c
@@ -1,6 +1,12 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
+/*
+Rotor solver for checkpointing problem in C. We follow the modeling mentioned in
+paper `Optimal checkpointing for heterogeneous chains: how to train deep neural
+networks with limited memory` https://hal.inria.fr/hal-02352969. Some lines of
+the code are adapted from https://gitlab.inria.fr/hiepacs/rotor.
+*/
 long* PySequenceToLongArray(PyObject* pylist) {
   if (!(pylist && PySequence_Check(pylist))) return NULL;
   Py_ssize_t len = PySequence_Size(pylist);
@@ -81,14 +87,16 @@ static PyObject* computeTable(PyObject* self, PyObject* args) {
       (mmax + 1) * (chainLength + 1) * (chainLength + 1), sizeof(long));
 
   for (long m = 0; m <= mmax; ++m)
-    for (long i = 0; i <= chainLength; ++i)
+    for (long i = 0; i <= chainLength; ++i) {
       if ((m >= x[i + 1] + xbar[i + 1] + btmp[i]) &&
-          (m >= x[i + 1] + xbar[i + 1] + ftmp[i]))
+          (m >= x[i + 1] + xbar[i + 1] + ftmp[i])) {
         COST_TABLE(m, i, i) = ftime[i] + btime[i];
-      else
+      } else {
         COST_TABLE(m, i, i) = INFINITY;
+      }
+    }
 
-  for (long m = 0; m <= mmax; ++m)
+  for (long m = 0; m <= mmax; ++m) {
     for (long d = 1; d <= chainLength; ++d) {
       for (long i = 0; i <= chainLength - d; ++i) {
         long idx = i + d;
@@ -116,9 +124,10 @@ static PyObject* computeTable(PyObject* self, PyObject* args) {
             }
           }
           double chainCost = INFINITY;
-          if (m >= xbar[i + 1])
+          if (m >= xbar[i + 1]) {
             chainCost =
                 COST_TABLE(m, i, i) + COST_TABLE(m - xbar[i + 1], i + 1, idx);
+          }
           if (bestLeafCost <= chainCost) {
             COST_TABLE(m, i, idx) = bestLeafCost;
             BACK_PTR(m, i, idx) = bestLeaf;
@@ -126,10 +135,12 @@ static PyObject* computeTable(PyObject* self, PyObject* args) {
             COST_TABLE(m, i, idx) = chainCost;
             BACK_PTR(m, i, idx) = -1;
           }
-        } else
+        } else {
           COST_TABLE(m, i, idx) = INFINITY;
+        }
       }
     }
+  }
 
   free(ftime);
   free(btime);
@@ -158,10 +169,11 @@ static PyObject* computeTable(PyObject* self, PyObject* args) {
         PyDict_SetItem(pyCostTable_m_i, pyVar_l, pyCostTable_m_i_l);
         Py_DECREF(pyCostTable_m_i_l);
         PyObject* pyBackPtr_m_i_l;
-        if (BACK_PTR(m, i, l) < 0)
+        if (BACK_PTR(m, i, l) < 0) {
           pyBackPtr_m_i_l = Py_BuildValue("(O)", Py_True);
-        else
+        } else {
           pyBackPtr_m_i_l = Py_BuildValue("(Ol)", Py_False, BACK_PTR(m, i, l));
+        }
         PyDict_SetItem(pyBackPtr_m_i, pyVar_l, pyBackPtr_m_i_l);
         Py_DECREF(pyBackPtr_m_i_l);
         Py_DECREF(pyVar_l);
diff --git a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py
index 41d23be5c952..21c3bf0da758 100644
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py
@@ -207,11 +207,10 @@ def _compute_table(chain: Chain, mmax: int) -> Tuple:
             mmax (int): Maximum number of memory slots.
 
         Returns:
-            cost_table (List): cost_table[m][lhs][rhs] with lhs = 0...chain.length
-                                     and rhs = lhs...chain.length (lhs is not included) and m = 0...mmax
-            back_ptr (List): back_ptr[m][lhs][rhs] is (True,) if the optimal choice
-                                     is a chain checkpoint (False, j) if the optimal choice is a leaf checkpoint
-                                     of length j
+            cost_table (List): cost_table[m][lhs][rhs] indicates the optimal cost of the subproblem from lhs to rhs
+            with m memory slots.
+            back_ptr (List): back_ptr[m][lhs][rhs] indicates the best operation at this point. It is (True,) if the optimal choice
+            is a chain checkpoint, it is (False, j) if the optimal choice is a leaf checkpoint of length j
         """
 
         ftime = chain.ftime + [0.0]
@@ -224,18 +223,17 @@ def _compute_table(chain: Chain, mmax: int) -> Tuple:
         # Build table
         cost_table = [[{} for _ in range(len(chain) + 1)] for _ in range(mmax + 1)]
         back_ptr = [[{} for _ in range(len(chain) + 1)] for _ in range(mmax + 1)]
-        # Last one is a dict because its indices go from i to l. Renumbering will wait for C implementation
 
-        # Initialize borders of the tables for lmax-lmin = 0
+        # Initialize corner cases where length of sequence equals to 1, i.e. lhs == rhs
         for m in range(mmax + 1):
             for i in range(len(chain) + 1):
                 limit = max(x[i + 1] + xbar[i + 1] + ftmp[i], x[i + 1] + xbar[i + 1] + btmp[i])
-                if m >= limit:    # Equation (1)
+                if m >= limit:
                     cost_table[m][i][i] = ftime[i] + btime[i]
                 else:
                     cost_table[m][i][i] = float("inf")
 
-        # Compute everything
+        # Compute tables
         for m in range(mmax + 1):
             for d in range(1, len(chain) + 1):
                 for i in range(len(chain) + 1 - d):

From dbd0fd1522c8b779a6d26da3c4cce52442ce494d Mon Sep 17 00:00:00 2001
From: LuGY <74758262+Gy-Lu@users.noreply.github.com>
Date: Sat, 18 Feb 2023 13:27:13 +0800
Subject: [PATCH 356/503] [CI/CD] fix nightly release CD running on forked repo
 (#2812)

* [CI/CD] fix nightly release CD running on forker repo

* fix misunderstanding of dispatch

* remove some build condition, enable notify even when release failed
---
 .github/workflows/release_nightly_on_schedule.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/release_nightly_on_schedule.yml b/.github/workflows/release_nightly_on_schedule.yml
index aab42e1d754d..4125f333f301 100644
--- a/.github/workflows/release_nightly_on_schedule.yml
+++ b/.github/workflows/release_nightly_on_schedule.yml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   build-n-publish:
-    if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI'
+    if: github.repository == 'hpcaitech/ColossalAI'
     name: Build and publish Python 🐍 distributions 📦 to PyPI
     runs-on: ubuntu-latest
     timeout-minutes: 20
@@ -31,9 +31,9 @@ jobs:
 
   notify:
     name: Notify Lark via webhook
-    needs: release
+    needs: build-n-publish
     runs-on: ubuntu-latest
-    if: ${{ always() }}
+    if: ${{ always() && github.repository == 'hpcaitech/ColossalAI' }}    # notify even on build failure, but only in the upstream repo
     steps:
       - uses: actions/checkout@v2
 

From 2059fdd6b00b65d605247de0fa2b4f5878a39e85 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Sat, 18 Feb 2023 21:14:38 +0800
Subject: [PATCH 357/503] [hotfix] add copyright for solver and device mesh
 (#2803)

* [hotfix] add copyright for solver and device mesh

* add readme

* add alpa license

* polish
---
 LICENSE                                       | 15 ++++++++++++
 colossalai/auto_parallel/README.md            | 23 +++++++++++++++++++
 .../tensor_shard/solver/solver.py             |  4 ++++
 colossalai/device/device_mesh.py              | 13 ++++++-----
 .../Colossal-Auto/get_started/introduction.md |  3 ---
 .../en/Colossal-Auto/get_started/run_demo.md  |  4 ----
 6 files changed, 49 insertions(+), 13 deletions(-)
 create mode 100644 colossalai/auto_parallel/README.md

diff --git a/LICENSE b/LICENSE
index 0528c89ea9ec..51a166040d54 100644
--- a/LICENSE
+++ b/LICENSE
@@ -200,3 +200,18 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+## Some of colossal-ai's code is derived from Alpa, which is subject to the following copyright notice:
+
+Copyright 2021 The Alpa team.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       https://github.com/alpa-projects/alpa/blob/979a45a3e6187df941ef4a4c4c6eea664527d68d/LICENSE
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/colossalai/auto_parallel/README.md b/colossalai/auto_parallel/README.md
new file mode 100644
index 000000000000..8e47e1bb0b4a
--- /dev/null
+++ b/colossalai/auto_parallel/README.md
@@ -0,0 +1,23 @@
+# Colossal-AUTO
+
+## Challenges
+Recently, large models have achieved state-of-the-art performance in various fields. In order to support large model training, we have to use distributed training techniques. However, finding an efficient distributed execution plan not only requires fine-grained model statistics, such as the memory and computing overhead of each operator, but is also a labor-intensive task even for an expert in the field of distributed training.
+
+## Our solution
+To simplify the process of distributed training for foundational models, recent advancements in machine learning systems have led to the emergence of automatic parallel systems. We investigated a number of current automatic parallel systems (<a href="https://arxiv.org/abs/1807.08887"> Tofu </a>, <a href="https://arxiv.org/abs/1807.05358"> Flexflow </a>, <a href="https://arxiv.org/abs/2201.12023"> Alpa </a>) and some automatic activation checkpoint algorithms (<a href="https://hal.inria.fr/hal-02352969"> Rotor </a>, <a href="https://arxiv.org/abs/1604.06174"> Sublinear </a>). Inspired by these advanced systems, we built an automatic parallel system on top of the PyTorch framework. The input of the system is serial PyTorch code, and the output is a PyTorch program with an optimized distributed execution plan. It is worth emphasizing that the output is a regular PyTorch program, so it is compatible with runtime optimization methods, such as ZeRO-Offload and PatrickStar.
+
+## Key modules
+
+### Analyzer
+
+**Analyzer** is a static analysis system consisting of three parts:
+a *symbolic profiler* for collecting the computing and memory overhead of the static computation graph, a *cluster detector* for collecting hardware characteristics and detecting the cluster topology, and a *tensor layout manager* for finding efficient tensor layout conversion paths between different sharding specs and recording the conversion cost.
+
+### Solver
+
+**Solver** is designed to find the optimal execution plan for a given computation graph and cluster in two stages:
+1) *Intra-op parallelism stage* is to find the plan with the minimum total execution time of all nodes with respect to the constraint of the memory budget. The optimization goal of the intra-op parallelism solver is modified from <a href="https://arxiv.org/abs/2201.12023"> Alpa </a>'s intra-op parallelism ILP solver.
+2) *Activation checkpoint stage* is to search for the fastest execution plan that meets the memory budget on the computation graph after inserting the communication nodes by the intra-op parallelism stage. The algorithm to find the optimal activation checkpoints is modified from <a href="https://hal.inria.fr/hal-02352969"> Rotor </a>. The reason we use two-stage optimization is that if the two tasks are formulated together, the solving time will be significantly increased, which will greatly affect the user experience of the system. In contrast, solving in two hierarchical levels has many advantages. Firstly, compared with the computation graph with activation checkpointing, the original graph has fewer nodes, which can reduce the solving cost of the intra-op parallelism solver. In addition, a better solution can be found by adding the communication overhead into the activation checkpoint modeling.
+
+### Generator
+**Generator** applies the searched execution plan to the computation graph and recompiles the computation graph to optimized PyTorch code. It has *a series of compile passes* to insert communication nodes or perform kernel substitution as required by the intra-op parallelism solver. Additionally, we implement a *code generation* feature to recognize the annotation from the activation checkpoint solver and inject the activation checkpoint block following annotation instructions.
diff --git a/colossalai/auto_parallel/tensor_shard/solver/solver.py b/colossalai/auto_parallel/tensor_shard/solver/solver.py
index 3bc3e8960cc8..5449fb5a149d 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/solver.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py
@@ -1,3 +1,7 @@
+"""This code is adapted from Alpa
+    https://github.com/alpa-projects/alpa/
+   with some changes. """
+
 import multiprocessing
 import time
 import warnings
diff --git a/colossalai/device/device_mesh.py b/colossalai/device/device_mesh.py
index 22a01dddb869..2a5f747fbc23 100644
--- a/colossalai/device/device_mesh.py
+++ b/colossalai/device/device_mesh.py
@@ -1,3 +1,7 @@
+"""This code is adapted from Alpa
+    https://github.com/alpa-projects/alpa/
+   with some changes. """
+
 import operator
 from functools import reduce
 from typing import List, Tuple
@@ -6,13 +10,10 @@
 import torch.distributed as dist
 
 
+# modified from alpa LogicalDeviceMesh(https://github.com/alpa-projects/alpa/blob/main/alpa/shard_parallel/auto_sharding.py)
 class DeviceMesh:
-    """A logical view of a physical mesh. The logical view is used in the
-    search process.
-    A physical mesh can have multiple logical views. (e.g., a 2x8 physical mesh
-    can be viewed as a 1x16 or a 4x4 logical mesh). Each mesh dimension has its
-    own latency and bandwidth. We use alpha-beta model to model the
-    communication cost.
+    """A logical view of a physical cluster. For example, we could view a physical cluster
+    with 16 devices as a device mesh with shape (2, 2, 4) or (4, 4).
 
     Arguments:
         physical_mesh_id (torch.Tensor): physical view of the devices in global rank.
diff --git a/docs/source/en/Colossal-Auto/get_started/introduction.md b/docs/source/en/Colossal-Auto/get_started/introduction.md
index 3d504d9c9cf8..9c2123756e57 100644
--- a/docs/source/en/Colossal-Auto/get_started/introduction.md
+++ b/docs/source/en/Colossal-Auto/get_started/introduction.md
@@ -37,9 +37,6 @@ Colossal-AI’s auto-parallelism searches for strategies in regard to each opera
 ## Distributed Tensor and Shape-Consistency System
 
 The Colossal-AI system uses a device-mesh, similar to PyTorch's latest DTensor release, to manage its cluster. Colossal-AI uses a sharding-spec to annotate the storage status of each tensor and facilitate their distribution across the cluster. The system also employs a shape-consistency manager to automatically transform tensors between different sharding-specs, allowing for seamless slicing and dicing of tensors, while the shape-consistency manager ensures that the output of upstream operands is consistently stored in the cluster, regardless of how the input of downstream operands is stored. This makes Colossal-AI highly versatile and easy to use without users worrying about the storage status of tensors when performing operations on them.
-<figure style={{textAlign: "center"}}>
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/shape_consistency.png"/>
-</figure>
 
 Here are some key advantages of Colossal-AI compared to PyTorch DTensor:
 Colossal-AI's device-mesh uses cluster performance metrics and profiling results to estimate the time consumption of different communication operators. This helps Colossal-AI optimize communication between nodes and improve overall system efficiency.
diff --git a/docs/source/en/Colossal-Auto/get_started/run_demo.md b/docs/source/en/Colossal-Auto/get_started/run_demo.md
index bcf88cafc786..6f7a82966f20 100644
--- a/docs/source/en/Colossal-Auto/get_started/run_demo.md
+++ b/docs/source/en/Colossal-Auto/get_started/run_demo.md
@@ -11,7 +11,3 @@ Detailed instructions can be found in its `README.md`.
 
 Colossal-Auto's automatic search function for activation checkpointing finds the most efficient checkpoint within a given memory budget, rather than just aiming for maximum memory compression. To avoid a lengthy search process for an optimal activation checkpoint, Colossal-Auto has implemented a two-stage search process. This allows the system to find a feasible distributed training solution in a reasonable amount of time while still benefiting from activation checkpointing for memory management. The integration of activation checkpointing in Colossal-AI improves the efficiency and effectiveness of large model training. You can follow the [Resnet example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/auto_parallel).
 Detailed instructions can be found in its `README.md`.
-
-<figure style={{textAlign: "center"}}>
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/auto_ckpt.jpg"/>
-</figure>

From cf6409dd40979e2cd2ca0bbe5697ca06e695a659 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Sun, 19 Feb 2023 15:57:14 +0800
Subject: [PATCH 358/503] Hotfix/auto parallel zh doc (#2820)

* [hotfix] fix autoparallel zh docs

* polish

* polish
---
 docs/source/en/Colossal-Auto/get_started/introduction.md    | 2 +-
 .../zh-Hans/Colossal-Auto/get_started/introduction.md       | 6 ++----
 docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md   | 4 ----
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/Colossal-Auto/get_started/introduction.md b/docs/source/en/Colossal-Auto/get_started/introduction.md
index 9c2123756e57..a2606dd2bf9f 100644
--- a/docs/source/en/Colossal-Auto/get_started/introduction.md
+++ b/docs/source/en/Colossal-Auto/get_started/introduction.md
@@ -30,7 +30,7 @@ Colossal-Auto is **the first auto-parallelism system** that uses static graph an
 
 
 ## Fine-grained Parallelism Search
-Colossal-AI’s auto-parallelism searches for strategies in regard to each operand with the goal of achieving the fastest runtime while meeting memory budget constraints. It ultimately determines the actual training time strategy, including the tensor split strategy for each tensor, the type of communication operators to be inserted between different computing nodes, whether to replace operators, etc. The tensor, data, and hybrid parallelism such as column and row split used by NVIDIA in Megatron-LM and other parallelism systems are all subsets of strategies that can be searched by Colossal-AI. In addition to these parallelisms that can be manually specified, Colossal-AI can specify a unique parallelism method for each operation and, potentially finding a better parallelism strategy than what human experts could provide.
+We investigated a number of current automatic parallel systems (<a href="https://arxiv.org/abs/1807.08887"> Tofu </a>, <a href="https://arxiv.org/abs/1807.05358"> Flexflow </a>, <a href="https://arxiv.org/abs/2201.12023"> Alpa </a>) and some automatic activation checkpoint algorithms (<a href="https://hal.inria.fr/hal-02352969"> Rotor </a>, <a href="https://arxiv.org/abs/1604.06174"> Sublinear </a>). Inspired by these advanced systems, we built Colossal-Auto, an automatic parallel system on top of the PyTorch framework. Colossal-Auto searches for strategies in regard to each operand with the goal of achieving the fastest runtime while meeting memory budget constraints. It ultimately determines the actual training time strategy, including the tensor split strategy for each tensor, the type of communication operators to be inserted between different computing nodes, whether to replace operators, etc. The tensor, data, and hybrid parallelism such as column and row split used by NVIDIA in Megatron-LM and other parallelism systems are all subsets of strategies that can be searched by Colossal-AI. In addition to these parallelisms that can be manually specified, Colossal-AI can specify a unique parallelism method for each operation, potentially finding a better parallelism strategy than human experts could provide.
 
 
diff --git a/docs/source/zh-Hans/Colossal-Auto/get_started/introduction.md b/docs/source/zh-Hans/Colossal-Auto/get_started/introduction.md
index 1d41e3b501e6..bd5326d43220 100644
--- a/docs/source/zh-Hans/Colossal-Auto/get_started/introduction.md
+++ b/docs/source/zh-Hans/Colossal-Auto/get_started/introduction.md
@@ -25,7 +25,8 @@ Colossal-Auto 是**首个基于 PyTorch 框架使用静态图分析的自动并
 
 
 ## 细粒度分布式训练策略搜索
-Colossal-AI 的自动并行策略会在满足内存预算的限制下，以最快运行时间为目标，为每个 op 进行策略搜索，最终得到真实训练时的策略，包括每个 tensor 的切分策略，不同计算节点间需要插入的通信算子类型，是否要进行算子替换等。现有系统中的张量并行，数据并行，NVIDIA 在 Megatron-LM 等并行系统中使用的 column 切分和 row 切分并行等混合并行，都是自动并行可以搜索到的策略的子集。除了这些可以手动指定的并行方式外，Colossal-AI 有能力为每个 op 指定独特的并行方式，因此有可能找到比依赖专家经验和试错配置的手动切分更好的并行策略。
+
+我们调研了很多现有的自动并行系统（<a href="https://arxiv.org/abs/1807.08887"> Tofu </a>, <a href="https://arxiv.org/abs/1807.05358"> Flexflow </a>, <a href="https://arxiv.org/abs/2201.12023"> Alpa </a>），以及自动激活值检查点算法（<a href="https://hal.inria.fr/hal-02352969"> Rotor </a>, <a href="https://arxiv.org/abs/1604.06174"> Sublinear </a>），在他们的启发下，我们开发一个基于PyTorch框架的自动并行系统Colossal-Auto。Colossal-Auto会在满足内存预算的限制下，以最快运行时间为目标，为每个 op 进行策略搜索，最终得到真实训练时的策略，包括每个 tensor 的切分策略，不同计算节点间需要插入的通信算子类型，是否要进行算子替换等。现有系统中的张量并行，数据并行，NVIDIA 在 Megatron-LM 等并行系统中使用的 column 切分和 row 切分并行等混合并行，都是自动并行可以搜索到的策略的子集。除了这些可以手动指定的并行方式外，Colossal-AI 有能力为每个 op 指定独特的并行方式，因此有可能找到比依赖专家经验和试错配置的手动切分更好的并行策略。
 
 
@@ -33,9 +34,6 @@ Colossal-AI 的自动并行策略会在满足内存预算的限制下，以最
 
 与 PyTorch 最新发布的 DTensor 类似，Colossal-AI 也使用了 device mesh 对集群进行了抽象管理。具体来说，Colossal-AI 使用 sharding spec 对 tensor 的分布式存储状态进行标注，使用 shape consistency manager 自动地对同一 tensor 在不同 sharding spec 间进行转换。这让 Colossal-AI 的通用性和易用性极大地提升，借助 shape consistency manager 可以没有负担地切分 tensor，而不用担心上游 op 的 output 与下游的 input 在集群中的存储方式不同。
 
-<figure style={{textAlign: "center"}}>
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/shape_consistency.png"/>
-</figure>
 
 相较于 PyTorch DTensor，Colossal-AI 有以下优势：
 + Colossal-AI 的 device mesh 可以 profiling 到集群性能指标，对不同的通信算子进行耗时估算。
diff --git a/docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md b/docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md
index 1050dcec6842..19316e12b4d5 100644
--- a/docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md
+++ b/docs/source/zh-Hans/Colossal-Auto/get_started/run_demo.md
@@ -10,7 +10,3 @@ Colossal-Auto 可被用于为每一次操作寻找一个包含数据、张量（
 
 作为大模型训练中必不可少的显存压缩技术，Colossal-AI 也提供了对于 activation checkpoint 的自动搜索功能。相比于大部分将最大显存压缩作为目标的技术方案，Colossal-AI 的搜索目标是在显存预算以内，找到最快的 activation checkpoint 方案。同时，为了避免将 activation checkpoint 的搜索一起建模到 SPMD solver 中导致搜索时间爆炸，Colossal-AI 做了 2-stage search 的设计，因此可以在合理的时间内搜索到有效可行的分布式训练方案。 您可参考 [Resnet 示例](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/auto_parallel)。
 详细的操作指引见其 `README.md`。
-
-<figure style={{textAlign: "center"}}>
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/auto_parallel/auto_ckpt.jpg"/>
-</figure>

From bf0204604f61314316fdd43795ead7abf8b8c356 Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Mon, 20 Feb 2023 10:35:55 +0800
Subject: [PATCH 359/503] [exmaple] add bert and albert (#2824)

---
 examples/language/bert/run_gemini.sh      |  22 ++
 examples/language/bert/test_ci.sh         |   2 +
 examples/language/bert/train_bert_demo.py | 332 ++++++++++++++++++++++
 3 files changed, 356 insertions(+)
 create mode 100644 examples/language/bert/run_gemini.sh
 create mode 100644 examples/language/bert/test_ci.sh
 create mode 100644 examples/language/bert/train_bert_demo.py

diff --git a/examples/language/bert/run_gemini.sh b/examples/language/bert/run_gemini.sh
new file mode 100644
index 000000000000..d791334e8c97
--- /dev/null
+++ b/examples/language/bert/run_gemini.sh
@@ -0,0 +1,22 @@
+set -x
+# distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]
+export DISTPLAN=${DISTPLAN:-"CAI_Gemini"}
+
+# The following options only valid when DISTPLAN="colossalai"
+export GPUNUM=${GPUNUM:-1}
+export PLACEMENT=${PLACEMENT:-"cpu"}
+export BATCH_SIZE=${BATCH_SIZE:-16}
+
+# bert | albert
+export MODEL_TYPE=${MODEL_TYPE:-"bert"}
+export TRAIN_STEP=${TRAIN_STEP:-10}
+
+mkdir -p gemini_logs
+
+env CUDA_LAUNCH_BLOCKING=1 torchrun --standalone --nproc_per_node=${GPUNUM} ./train_bert_demo.py \
+--model_type=${MODEL_TYPE} \
+--batch_size=${BATCH_SIZE} \
+--placement=${PLACEMENT} \
+--distplan=${DISTPLAN} \
+--train_step=${TRAIN_STEP} \
+2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_${PLACEMENT}.log
diff --git a/examples/language/bert/test_ci.sh b/examples/language/bert/test_ci.sh
new file mode 100644
index 000000000000..42c63fec50c0
--- /dev/null
+++ b/examples/language/bert/test_ci.sh
@@ -0,0 +1,2 @@
+set -x
+env GPUNUM=1 bash run_gemini.sh
diff --git a/examples/language/bert/train_bert_demo.py b/examples/language/bert/train_bert_demo.py
new file mode 100644
index 000000000000..b690ff787d01
--- /dev/null
+++ b/examples/language/bert/train_bert_demo.py
@@ -0,0 +1,332 @@
+import os
+from functools import partial
+from time import time
+
+import psutil
+import torch
+from packaging import version
+from torch import nn
+from torch.nn.parallel import DistributedDataParallel as DDP
+from transformers import AlbertConfig, AlbertForSequenceClassification, BertConfig, BertForSequenceClassification
+
+import colossalai
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
+from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+
+CAI_VERSION = colossalai.__version__
+
+
+def get_tflops(model_numel, batch_size, seq_len, step_time):
+    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
+
+
+def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir):
+    from contextlib import nullcontext
+
+    from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler
+    if enable_flag:
+        return profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                       schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps),
+                       on_trace_ready=tensorboard_trace_handler(save_dir),
+                       record_shapes=True,
+                       profile_memory=True)
+    else:
+
+        class DummyProfiler:
+
+            def __init__(self):
+                self.step_number = 0
+
+            def step(self):
+                self.step_number += 1
+
+        return nullcontext(DummyProfiler())
+
+
+def get_time_stamp():
+    import time
+    cur_time = time.strftime("%d-%H:%M", time.localtime())
+    return cur_time
+
+
+def get_bert_data(batch_size: int, sequence_length: int, vacob_size: int, n_class: int, device: torch.device):
+    input = torch.randint(
+        low=0,
+        high=vacob_size,
+        size=(batch_size, sequence_length),
+        device=device,
+        dtype=torch.long,
+    )
+    label = torch.randint(low=0, high=n_class, size=(batch_size,), device=device, dtype=torch.long)
+    return input, label
+
+
+def parse_args():
+    parser = colossalai.get_default_parser()
+    parser.add_argument(
+        "--distplan",
+        type=str,
+        default='CAI_Gemini',
+        help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
+    )
+    parser.add_argument(
+        "--placement",
+        type=str,
+        default='cpu',
+        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=8,
+        help="batch size per DP group of training.",
+    )
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default="bert",
+        help="bert or albert",
+    )
+    parser.add_argument(
+        "--train_step",
+        type=int,
+        default=10,
+        help="training iterations for test",
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+SEQ_LEN = 512
+VOCAB_SIZE = 1000
+NUM_LABELS = 10
+
+
+# Parameter Sharding Strategies for Tensor Parallelism
+def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
+    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    param.set_tensor_spec(*spec)
+
+
+def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(0, param, pg)
+
+
+def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(-1, param, pg)
+
+
+def get_cpu_mem():
+    return psutil.Process().memory_info().rss / 1024**2
+
+
+def get_gpu_mem():
+    return torch.cuda.memory_allocated() / 1024**2
+
+
+def get_mem_info(prefix=''):
+    return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB'
+
+
+def get_model_size(model: nn.Module):
+    total_numel = 0
+    for module in model.modules():
+        for p in module.parameters(recurse=False):
+            total_numel += p.numel()
+    return total_numel
+
+
+def model_builder(args):
+    if args.model_type == "bert":
+        cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
+        return BertForSequenceClassification(cfg)
+    elif args.model_type == "albert":
+        cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
+        return AlbertForSequenceClassification(cfg)
+    else:
+        raise RuntimeError
+
+
+def model_size_formatter(numel: int) -> str:
+    GB_SIZE = 10**9
+    MB_SIZE = 10**6
+    KB_SIZE = 10**3
+    if numel >= GB_SIZE:
+        return f'{numel / GB_SIZE:.1f}B'
+    elif numel >= MB_SIZE:
+        return f'{numel / MB_SIZE:.1f}M'
+    elif numel >= KB_SIZE:
+        return f'{numel / KB_SIZE:.1f}K'
+    else:
+        return str(numel)
+
+
+def set_cpu_maximum_parallelism():
+    # Take the hardware_concurrency() value from torch's parallel_info() report and export it as OMP_NUM_THREADS.
+    marker = "hardware_concurrency() : "
+    max_concurrency = torch.__config__.parallel_info().split(marker)[1].split('\n')[0]
+    os.environ["OMP_NUM_THREADS"] = max_concurrency
+    print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.")
+
+
+def main():
+    """Benchmark the model chosen by the CLI args under args.distplan and log the median TFLOPS."""
+    # version check: this example is supposed to work for versions greater than 0.2.0
+    assert version.parse(CAI_VERSION) >= version.parse("0.2.0")
+
+    set_cpu_maximum_parallelism()
+    args = parse_args()
+
+    # fail fast on an unknown plan; every name listed here has a matching branch below
+    if args.distplan not in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]:
+        raise TypeError(f"{args.distplan} is error")
+
+    # batch size per DP degree
+    BATCH_SIZE = args.batch_size
+
+    NUM_STEPS = args.train_step
+
+    WARMUP_STEPS = 1
+    assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
+    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median"
+    PROF_FLAG = False    # The flag of profiling, False by default
+
+    disable_existing_loggers()
+    colossalai.launch_from_torch(config={})
+
+    logger = get_dist_logger()
+    logger.info(f" {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])
+
+    torch.manual_seed(123)
+    if args.distplan.startswith("CAI"):
+        # all param must use the same process group.
+        world_size = torch.distributed.get_world_size()
+
+        # build a base-bert model
+        with ColoInitContext(device=get_current_device(), dtype=torch.half):
+            model = model_builder(args)
+            # model_builder() selects the architecture from args.model_type
+
+        # assign running configurations
+        gemini_config = None
+        if args.distplan.startswith("CAI_ZeRO"):
+            optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
+        elif args.distplan == "CAI_Gemini":
+            gemini_config = dict(strict_ddp_mode=True,
+                                 device=get_current_device(),
+                                 placement_policy=args.placement,
+                                 pin_memory=True,
+                                 hidden_dim=model.config.hidden_size,
+                                 search_range_mb=128)
+            optim_config = dict(gpu_margin_mem_ratio=0.)
+        else:
+            raise RuntimeError
+
+        # build a highly optimized gpu/cpu optimizer
+        optimizer = HybridAdam(model.parameters(), lr=1e-3)
+
+        if args.distplan == "CAI_ZeRO1":
+            zero_stage = 1
+        elif args.distplan == "CAI_ZeRO2":
+            zero_stage = 2
+        elif args.distplan == "CAI_Gemini":
+            zero_stage = 3
+        else:
+            raise RuntimeError
+
+        # wrap your model and optimizer
+        model = zero_model_wrapper(model, zero_stage, gemini_config)
+        optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)
+
+        logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
+    elif args.distplan.startswith("Pytorch"):
+        model = model_builder(args).cuda()
+        model = DDP(model)
+        if args.distplan.endswith("DDP"):
+            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+        elif args.distplan.endswith("ZeRO"):
+            from torch.distributed.optim import ZeroRedundancyOptimizer
+            optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3)
+    else:
+        raise RuntimeError
+
+    # model is shared after TP
+    numel = get_model_size(model)
+    logger.info(f"the size of testing model size is {model_size_formatter(numel)}.")
+    logger.info(get_mem_info(prefix='After init model, '), ranks=[0])
+
+    # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
+    # = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree)
+    # = batch_per_DP_group * numel * seq_len * 8
+    get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN)
+
+    torch.cuda.synchronize()
+    model.train()
+    tflops_list = []
+
+    def train_step():    # reads `n`, the loop variable of the enclosing for-loop further down
+        # we just use randomly generated data here
+        input_ids, labels = get_bert_data(BATCH_SIZE,
+                                          SEQ_LEN,
+                                          VOCAB_SIZE,
+                                          NUM_LABELS,
+                                          device=torch.cuda.current_device())
+        optimizer.zero_grad()
+
+        start = time()
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+        torch.cuda.synchronize()
+        fwd_end = time()
+        fwd_time = fwd_end - start
+        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0])
+
+        if args.distplan.startswith("CAI"):
+            optimizer.backward(loss)
+        elif args.distplan.startswith("Pytorch"):
+            loss.backward()
+        else:
+            raise RuntimeError
+
+        torch.cuda.synchronize()
+        bwd_end = time()
+        bwd_time = bwd_end - fwd_end
+        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Backward '), ranks=[0])
+
+        optimizer.step()
+        torch.cuda.synchronize()
+        optim_time = time() - bwd_end
+        step_time = time() - start
+        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Optimizer step '), ranks=[0])
+
+        step_tflops = get_tflops_func(step_time)
+        logger.info(
+            f"[{n + 1}/{NUM_STEPS}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}, FWD time: {fwd_time:.3f}s, BWD time: {bwd_time:.3f}s, OPTIM time: {optim_time:.3f}s",
+            ranks=[0],
+        )
+        if n >= WARMUP_STEPS:    # warmup steps are excluded from the statistics
+            tflops_list.append(step_tflops)
+
+    demo_profiler = get_profile_context(PROF_FLAG,
+                                        WARMUP_STEPS,
+                                        NUM_STEPS - WARMUP_STEPS,
+                                        save_dir=f"profile/{get_time_stamp()}-demo")
+
+    with demo_profiler as prof:
+        for n in range(NUM_STEPS):
+            train_step()
+            prof.step()
+
+    tflops_list.sort()
+    median_index = (NUM_STEPS - WARMUP_STEPS) >> 1    # the list holds only post-warmup steps, so no WARMUP_STEPS offset
+    logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':    # run the benchmark only when this file is executed as a script
+    main()

From 89f0017a9cc653cc193a333e19c2862a44310edf Mon Sep 17 00:00:00 2001
From: Marco Rodrigues <gothicx@gmail.com>
Date: Mon, 20 Feb 2023 03:36:23 +0100
Subject: [PATCH 360/503] Typo (#2826)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7d18be39ab1a..ca92508527b1 100644
--- a/README.md
+++ b/README.md
@@ -278,7 +278,7 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 
 ### Install from PyPI
 
-You can easily install Colossal-AI with the following command. **By defualt, we do not build PyTorch extensions during installation.**
+You can easily install Colossal-AI with the following command. **By default, we do not build PyTorch extensions during installation.**
 
 ```bash
 pip install colossalai

From 58abde285776616d4e217d5ed89ae76c9687bc20 Mon Sep 17 00:00:00 2001
From: mickogoin <79158859+mickogoin@users.noreply.github.com>
Date: Mon, 20 Feb 2023 10:37:57 +0800
Subject: [PATCH 361/503] Update README.md (#2791)

Fixed typo on line 285 from "defualt" to "default"

From c008d4ad0c755586d5e7cb74b483ad707c78967c Mon Sep 17 00:00:00 2001
From: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Date: Mon, 20 Feb 2023 10:38:40 +0800
Subject: [PATCH 362/503] [NFC] polish
 colossalai/engine/schedule/_pipeline_schedule.py code style (#2744)

---
 colossalai/engine/schedule/_pipeline_schedule.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py
index 97571fa024ba..712ae8242409 100644
--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@@ -4,8 +4,9 @@
 import inspect
 from typing import Callable, List, Tuple, Union
 
-import colossalai.communication as comm
 import torch.cuda
+
+import colossalai.communication as comm
 from colossalai.amp.naive_amp import NaiveAMPModel
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
@@ -72,9 +73,9 @@ class PipelineSchedule(BaseSchedule):
         tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
         scatter_gather_tensors (bool, optional):
             If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
-    
+
     Example:
-    
+
         # this shows an example of customized data_process_func
         def data_process_func(stage_output, dataloader_output):
             output1, output2 = stage_output
@@ -157,6 +158,7 @@ def load_micro_batch(self):
 
     def pre_processing(self, engine):
         from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2
+
         # TODO: remove this after testing new zero with pipeline parallelism
         model = engine.model
         if isinstance(model, NaiveAMPModel):
@@ -229,7 +231,7 @@ def _get_data_label_for_current_step(self, stage_output, micro_batch_data, crite
         return data, label
 
     def _forward_step(self, engine, input_obj, return_tensors, return_output_label=True, accum_loss=None):
-        """Forward step for passed-in model. If it is the first stage, the input tensor 
+        """Forward step for passed-in model. If it is the first stage, the input tensor
         is obtained from data_iterator, otherwise the passed-in input_obj is used.
         Returns output tensor. This is a helper function and can be ignored by users.
 
@@ -266,7 +268,7 @@ def _forward_step(self, engine, input_obj, return_tensors, return_output_label=T
             return output_obj
 
     def _backward_step(self, engine, input_obj, output_obj, output_obj_grad):
-        """Backward step through the passed-in output tensor. If it is the last stage, the 
+        """Backward step through the passed-in output tensor. If it is the last stage, the
         output_obj_grad is None, otherwise it is the gradients with respect to stage's output tensor.
         Returns the gradients with respect to the input tensor (None if first stage).
         This is a helper function and can be ignored by users.
@@ -511,7 +513,7 @@ def _forward_step(self,
                       return_tensors,
                       return_output_label=True,
                       accum_loss=None):
-        """Forward step for passed-in model. If it is the first stage, the input tensor 
+        """Forward step for passed-in model. If it is the first stage, the input tensor
         is obtained from data_iterator, otherwise the passed-in input_obj is used.
         Returns output tensor. This is a helper function and can be ignored by users.
 

From b6a108cb916417abf339fc41a794d77222135ced Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Mon, 20 Feb 2023 15:22:36 +0800
Subject: [PATCH 363/503] [chatgpt] add test checkpoint (#2797)

* [chatgpt] add test checkpoint

* [chatgpt] test checkpoint use smaller model
---
 applications/ChatGPT/tests/test_checkpoint.py | 98 +++++++++++++++++++
 applications/ChatGPT/tests/test_data.py       |  7 +-
 2 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 applications/ChatGPT/tests/test_checkpoint.py

diff --git a/applications/ChatGPT/tests/test_checkpoint.py b/applications/ChatGPT/tests/test_checkpoint.py
new file mode 100644
index 000000000000..6cbe51569ff3
--- /dev/null
+++ b/applications/ChatGPT/tests/test_checkpoint.py
@@ -0,0 +1,98 @@
+import os
+import tempfile
+from contextlib import nullcontext
+from functools import partial
+
+import pytest
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from chatgpt.nn import GPTActor
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.utils import free_port
+
+GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)    # small GPT-2 config (128-dim, 4 layers, 4 heads) for the test actor
+
+
+def get_data(batch_size: int, seq_len: int = 10) -> dict:
+    token_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')    # random ids in [0, 50257)
+    # The mask is all ones and mirrors the ids tensor.
+    return {'input_ids': token_ids, 'attention_mask': torch.ones_like(token_ids)}
+
+
+def run_test_checkpoint(strategy):    # one training step, save and reload model/optimizer, then one more step
+    BATCH_SIZE = 2
+
+    if strategy == 'ddp':    # from here on `strategy` is the strategy object, no longer its name
+        strategy = DDPStrategy()
+    elif strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+    elif strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{strategy}"')
+
+    with strategy.model_init_context():
+        actor = GPTActor(config=GPT_CONFIG).cuda()
+
+    actor_optim = HybridAdam(actor.parameters())
+
+    actor, actor_optim = strategy.prepare((actor, actor_optim))
+
+    def run_step():    # forward on random data, sum of the outputs as loss, backward, optimizer step
+        data = get_data(BATCH_SIZE)
+        action_mask = torch.ones_like(data['attention_mask'], dtype=torch.bool)
+        action_log_probs = actor(data['input_ids'], action_mask.size(1), data['attention_mask'])
+        loss = action_log_probs.sum()
+        strategy.backward(loss, actor, actor_optim)
+        strategy.optimizer_step(actor_optim)
+
+    run_step()
+
+    ctx = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else nullcontext()    # only rank 0 creates a directory
+
+    with ctx as dirname:    # dirname is None on the other ranks (nullcontext)
+        rank0_dirname = [dirname]
+        dist.broadcast_object_list(rank0_dirname)    # presumably broadcast from rank 0 (default source) - confirm
+        rank0_dirname = rank0_dirname[0]
+
+        model_path = os.path.join(rank0_dirname, 'model.pt')
+        optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')    # one optimizer file per rank
+
+        strategy.save_model(actor, model_path, only_rank0=True)
+        strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)
+
+        dist.barrier()    # every rank has finished saving before any rank starts loading
+
+        strategy.load_model(actor, model_path, strict=False)
+        strategy.load_optimizer(actor_optim, optim_path)
+
+        dist.barrier()    # every rank has finished loading before the with-block exits
+
+    run_step()    # a second step must still run after the reload
+
+
+def run_dist(rank, world_size, port, strategy):
+    # Export the rendezvous variables for this worker process, then run the test body.
+    os.environ.update({
+        'RANK': str(rank), 'LOCAL_RANK': str(rank), 'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost', 'MASTER_PORT': str(port),
+    })
+    run_test_checkpoint(strategy)
+
+
+@pytest.mark.dist
+@pytest.mark.parametrize('world_size', [2])
+@pytest.mark.parametrize('strategy', ['ddp', 'colossalai_zero2', 'colossalai_gemini'])
+@rerun_if_address_is_in_use()
+def test_checkpoint(world_size, strategy):
+    # Spawn `world_size` worker processes; mp.spawn supplies the rank as run_dist's first positional argument.
+    mp.spawn(partial(run_dist, world_size=world_size, port=free_port(), strategy=strategy), nprocs=world_size)
+
+
+if __name__ == '__main__':    # direct run: exercise only the 2-process 'colossalai_zero2' case
+    test_checkpoint(2, 'colossalai_zero2')
diff --git a/applications/ChatGPT/tests/test_data.py b/applications/ChatGPT/tests/test_data.py
index b0a9433c2e4f..9571c2843e07 100644
--- a/applications/ChatGPT/tests/test_data.py
+++ b/applications/ChatGPT/tests/test_data.py
@@ -10,10 +10,13 @@
 from chatgpt.nn import GPTActor, GPTCritic, RewardModel
 from chatgpt.replay_buffer import NaiveReplayBuffer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 
 from colossalai.testing import rerun_if_address_is_in_use
 from colossalai.utils import free_port
 
+GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)
+
 
 def get_data(batch_size: int, seq_len: int = 10) -> dict:
     input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
@@ -42,8 +45,8 @@ def run_test_data(strategy):
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
 
-    actor = GPTActor().cuda()
-    critic = GPTCritic().cuda()
+    actor = GPTActor(config=GPT_CONFIG).cuda()
+    critic = GPTCritic(config=GPT_CONFIG).cuda()
 
     initial_model = deepcopy(actor)
     reward_model = RewardModel(deepcopy(critic.model)).cuda()

From 47ecb2238749e46d3bcfa30523850c92864a1837 Mon Sep 17 00:00:00 2001
From: Haofan Wang <haofanwang.ai@gmail.com>
Date: Mon, 20 Feb 2023 16:23:12 +0800
Subject: [PATCH 364/503] [example] add LoRA support (#2821)

* add lora

* format
---
 .../train_dreambooth_colossalai_lora.py       | 691 ++++++++++++++++++
 1 file changed, 691 insertions(+)
 create mode 100644 examples/images/dreambooth/train_dreambooth_colossalai_lora.py

diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
new file mode 100644
index 000000000000..3d789ae2ce0f
--- /dev/null
+++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
@@ -0,0 +1,691 @@
+import argparse
+import hashlib
+import math
+import os
+from pathlib import Path
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.cross_attention import LoRACrossAttnProcessor
+from diffusers.optimization import get_scheduler
+from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import colossalai
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
+from colossalai.nn.parallel.utils import get_static_torch_model
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+
+disable_existing_loggers()    # executed at import time, before the module-level logger below is created
+logger = get_dist_logger()    # module-level logger used by parse_args() and main()
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
+    # Return the text-encoder class named first in the checkpoint's "text_encoder" config.
+    # NOTE: the revision comes from the module-level `args`, which must exist before this is called.
+    encoder_config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path,
+                                                      subfolder="text_encoder",
+                                                      revision=args.revision)
+    model_class = encoder_config.architectures[0]
+
+    # Each supported class is imported only in the branch that returns it.
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+        return CLIPTextModel
+
+    if model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+        return RobertaSeriesModelWithTransformation
+
+    raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):    # build the CLI parser, parse, then validate the prior-preservation options
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default="a photo of sks dog",
+        required=False,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=("Minimal class images for prior preservation loss. If there are not enough images already present in"
+              " class_data_dir, additional images will be sampled with class_prompt."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=("The resolution for input images, all the images in the train/validation dataset will be resized to this"
+              " resolution"),
+    )
+    parser.add_argument(
+        "--placement",
+        type=str,
+        default="cpu",
+        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=("Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+              " cropped. The images will be resized to the resolution first before cropping."),
+    )
+    parser.add_argument("--train_batch_size",
+                        type=int,
+                        default=4,
+                        help="Batch size (per device) for the training dataloader.")
+    parser.add_argument("--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images.")
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=('The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+              ' "constant", "constant_with_warmup"]'),
+    )
+    parser.add_argument("--lr_warmup_steps",
+                        type=int,
+                        default=500,
+                        help="Number of steps for the warmup in the lr scheduler.")
+    parser.add_argument("--use_8bit_adam",
+                        action="store_true",
+                        help="Whether or not to use 8-bit Adam from bitsandbytes.")
+
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=("[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+              " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:    # an explicit argument list takes precedence over the process command line
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))    # -1 when LOCAL_RANK is not set
+    if env_local_rank != -1 and env_local_rank != args.local_rank:    # the environment value overrides --local_rank
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:    # prior preservation needs both a class image directory and a class prompt
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:    # without the flag, given class options only trigger a warning
+        if args.class_data_dir is not None:
+            logger.warning("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            logger.warning("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    Pairs instance images (and, when a class directory is given, class images) with tokenized prompts.
+    Images are converted to RGB and passed through the resize / crop / normalize pipeline built in __init__.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        # Instance images: the directory has to exist already.
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        # Class images are optional; their directory is created when missing.
+        if class_data_root is None:
+            self.class_data_root = None
+        else:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            self.num_class_images = len(self.class_images_path)
+            # The dataset is as long as the larger set; the smaller one wraps around in __getitem__.
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+
+        crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
+        self.image_transforms = transforms.Compose([
+            transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+            crop,
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ])
+
+    def __len__(self):
+        return self._length
+
+    def _load_image(self, path):
+        # Open `path`, force RGB mode, and apply the transform pipeline.
+        image = Image.open(path)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        return self.image_transforms(image)
+
+    def _tokenize(self, prompt):
+        # Token ids of `prompt`, tokenized with truncation=True, padding="do_not_pad" and the tokenizer's model_max_length.
+        encoded = self.tokenizer(prompt, padding="do_not_pad", truncation=True,
+                                 max_length=self.tokenizer.model_max_length)
+        return encoded.input_ids
+
+    def __getitem__(self, index):
+        # Indices are taken modulo each list's length, so the shorter list is cycled.
+        example = {
+            "instance_images": self._load_image(self.instance_images_path[index % self.num_instance_images]),
+            "instance_prompt_ids": self._tokenize(self.instance_prompt),
+        }
+        if self.class_data_root:
+            example["class_images"] = self._load_image(self.class_images_path[index % self.num_class_images])
+            example["class_prompt_ids"] = self._tokenize(self.class_prompt)
+        return example
+
+
+class PromptDataset(Dataset):
+    """Serves the same prompt `num_samples` times, tagging each item with its index."""
+
+    def __init__(self, prompt, num_samples):
+        # Both values are stored unchanged under the attribute names used by the methods below.
+        self.prompt, self.num_samples = prompt, num_samples
+
+    def __len__(self):
+        # One item per requested sample.
+        return self.num_samples
+
+    def __getitem__(self, index):
+        # No bounds check is made: any index yields the shared prompt plus that index.
+        sample = {"prompt": self.prompt, "index": index}
+        return sample
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    # "<organization>/<model_id>" when an organization is given, else "<name reported by whoami(token)>/<model_id>".
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is not None:
+        return f"{organization}/{model_id}"
+    username = whoami(token)["name"]
+    return f"{username}/{model_id}"
+
+
+# Gemini + ZeRO DDP
+def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"):
+    # Wrap `model` in GeminiDDP on the current device with pin_memory=True and search_range_mb=64.
+    from colossalai.nn.parallel import GeminiDDP
+
+    gemini_kwargs = dict(device=get_current_device(),
+                         placement_policy=placememt_policy,
+                         pin_memory=True,
+                         search_range_mb=64)
+    return GeminiDDP(model, **gemini_kwargs)
+
+
+def main(args):
+    """DreamBooth training entry point: LoRA attention processors on a Gemini-wrapped UNet."""
+    if args.seed is None:
+        colossalai.launch_from_torch(config={})
+    else:
+        colossalai.launch_from_torch(config={}, seed=args.seed)
+
+    local_rank = gpc.get_local_rank(ParallelMode.DATA)
+    world_size = gpc.get_world_size(ParallelMode.DATA)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if torch.device(get_current_device()).type == "cuda" else torch.float32
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            pipeline.to(get_current_device())
+
+            for example in tqdm(
+                    sample_dataloader,
+                    desc="Generating class images",
+                    disable=not local_rank == 0,
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+
+    # Handle the repository creation
+    if local_rank == 0:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            create_repo(repo_name, exist_ok=True, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        logger.info(f"Loading tokenizer from {args.tokenizer_name}", ranks=[0])
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.tokenizer_name,
+            revision=args.revision,
+            use_fast=False,
+        )
+    elif args.pretrained_model_name_or_path:
+        logger.info("Loading tokenizer from pretrained model", ranks=[0])
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
+
+    # Load models and create wrapper for stable diffusion
+    logger.info(f"Loading text_encoder from {args.pretrained_model_name_or_path}", ranks=[0])
+
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+
+    logger.info(f"Loading AutoencoderKL from {args.pretrained_model_name_or_path}", ranks=[0])
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+    )
+
+    logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
+    with ColoInitContext(device=get_current_device()):
+        unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path,
+                                                    subfolder="unet",
+                                                    revision=args.revision,
+                                                    low_cpu_mem_usage=False)
+        unet.requires_grad_(False)
+
+        # Set correct lora layers
+        lora_attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+
+            lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size,
+                                                           cross_attention_dim=cross_attention_dim)
+
+        unet.set_attn_processor(lora_attn_procs)
+        lora_layers = AttnProcsLayers(unet.attn_processors)
+
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * args.train_batch_size * world_size
+
+    unet = gemini_zero_dpp(unet, args.placement)
+
+    # config optimizer for colossalai zero
+    optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
+
+    # load noise_scheduler
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    # prepare dataset
+    logger.info(f"Prepare dataset from {args.instance_data_dir}", ranks=[0])
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad(
+            {
+                "input_ids": input_ids
+            },
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+        }
+        return batch
+
+    train_dataloader = torch.utils.data.DataLoader(train_dataset,
+                                                   batch_size=args.train_batch_size,
+                                                   shuffle=True,
+                                                   collate_fn=collate_fn,
+                                                   num_workers=1)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    vae.to(get_current_device(), dtype=weight_dtype)
+    text_encoder.to(get_current_device(), dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Train!
+    total_batch_size = args.train_batch_size * world_size
+
+    logger.info("***** Running training *****", ranks=[0])
+    logger.info(f"  Num examples = {len(train_dataset)}", ranks=[0])
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}", ranks=[0])
+    logger.info(f"  Num Epochs = {args.num_train_epochs}", ranks=[0])
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}", ranks=[0])
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0])
+    logger.info(f"  Total optimization steps = {args.max_train_steps}", ranks=[0])
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not local_rank == 0)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    torch.cuda.synchronize()
+    for epoch in range(args.num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            torch.cuda.reset_peak_memory_stats()
+            # Move batch to gpu
+            for key, value in batch.items():
+                batch[key] = value.to(get_current_device(), non_blocking=True)
+
+            # Convert images to latent space
+            optimizer.zero_grad()
+
+            latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+            latents = latents * 0.18215    # NOTE(review): presumably the VAE latent scaling factor - confirm
+
+            # Sample noise that we'll add to the latents
+            noise = torch.randn_like(latents)
+            bsz = latents.shape[0]
+            # Sample a random timestep for each image
+            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+            timesteps = timesteps.long()
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+            # Predict the noise residual
+            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            if args.with_prior_preservation:
+                # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                target, target_prior = torch.chunk(target, 2, dim=0)
+
+                # Compute instance loss
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
+
+                # Compute prior loss
+                prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                # Add the prior loss to the instance loss.
+                loss = loss + args.prior_loss_weight * prior_loss
+            else:
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+            optimizer.backward(loss)
+
+            optimizer.step()
+            lr_scheduler.step()
+            logger.info(f"max GPU_mem cost is {torch.cuda.max_memory_allocated()/2**20} MB", ranks=[0])
+            # One optimizer step has completed: advance the progress bar and the global step counter
+            progress_bar.update(1)
+            global_step += 1
+            logs = {
+                "loss": loss.detach().item(),
+                "lr": optimizer.param_groups[0]["lr"],
+            }    # lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step % args.save_steps == 0:
+                torch.cuda.synchronize()
+                torch_unet = get_static_torch_model(unet)
+                if local_rank == 0:
+                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                    torch_unet = torch_unet.to(torch.float32)
+                    torch_unet.save_attn_procs(save_path)
+                    logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])
+            if global_step >= args.max_train_steps:
+                break
+
+    torch.cuda.synchronize()
+    torch_unet = get_static_torch_model(unet)
+
+    if local_rank == 0:
+        torch_unet = torch_unet.to(torch.float32)
+        torch_unet.save_attn_procs(args.output_dir)    # final weights go to output_dir, not the last checkpoint dir
+        logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0])
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+
+if __name__ == "__main__":    # script entry point: parse the arguments, then run main
+    args = parse_args()
+    main(args)

From a5721229d952a24332f3d5cb03422489150041e0 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 20 Feb 2023 17:35:46 +0800
Subject: [PATCH 365/503] Automated submodule synchronization (#2740)

Co-authored-by: github-actions <github-actions@github.com>
---
 examples/tutorial/fastfold/FastFold | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tutorial/fastfold/FastFold b/examples/tutorial/fastfold/FastFold
index f05e712982ae..03ff54e56157 160000
--- a/examples/tutorial/fastfold/FastFold
+++ b/examples/tutorial/fastfold/FastFold
@@ -1 +1 @@
-Subproject commit f05e712982aeba6a32a9b3d1ee4dee6492426cec
+Subproject commit 03ff54e561576d38118bf4cba8e73ef728f099e3

From 7ea6bc7f69418379a489b1be4a3a428d9c6c1823 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Mon, 20 Feb 2023 17:38:55 +0800
Subject: [PATCH 366/503] [autoparallel] Patch tensor related operations meta
 information (#2789)

* [autoparallel] tensor related meta information prototype

* [autoparallel] tensor related meta information

* [autoparallel] tensor related meta information

* [autoparallel] tensor related meta information

* [autoparallel] tensor related meta information
---
 .../meta_profiler/meta_registry/__init__.py   |   1 +
 .../meta_profiler/meta_registry/pooling.py    |   1 -
 .../meta_profiler/meta_registry/tensor.py     |  79 ++++++++++++++
 .../test_metainfo/test_tensor_metainfo.py     | 103 ++++++++++++++++++
 4 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 colossalai/auto_parallel/meta_profiler/meta_registry/tensor.py
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_tensor_metainfo.py

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
index 359590c1fc04..df9eb6498377 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
@@ -5,3 +5,4 @@
 from .linear import *
 from .norm import *
 from .pooling import *
+from .tensor import *
diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/pooling.py b/colossalai/auto_parallel/meta_profiler/meta_registry/pooling.py
index 79780c92eed4..21272ea09ac1 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/pooling.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/pooling.py
@@ -14,7 +14,6 @@
 @meta_register.register(torch.nn.AdaptiveAvgPool1d)
 @meta_register.register(torch.nn.AdaptiveAvgPool2d)
 @meta_register.register(torch.nn.AdaptiveAvgPool3d)
-@meta_register.register(torch.flatten)
 def avgpool_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
     """Meta info for AdaptiveAvgPool
     The aten graph of AdaptiveAvgPool is
diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/tensor.py b/colossalai/auto_parallel/meta_profiler/meta_registry/tensor.py
new file mode 100644
index 000000000000..332e649d2d7e
--- /dev/null
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/tensor.py
@@ -0,0 +1,79 @@
+from typing import Callable, List, Tuple
+
+import torch
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
+from colossalai.fx.profiler.memory_utils import activation_size
+from colossalai.fx.profiler.opcount import flop_mapping
+
+from ..registry import meta_register
+
+__all__ = ["tensor_related_metainfo"]
+
+
+def tensor_related_metainfo(bwd_mem_out_factor: float = 1, bwd_mem_tmp_factor: float = 0) -> Callable:
+    """torch.Tensor related metainfo generator template
+
+    Args:
+        bwd_mem_out_factor (float, optional): backward activation memory cost factor. Defaults to 1.
+        bwd_mem_tmp_factor (float, optional): backward temp memory cost factor. Defaults to 0.
+
+    Returns:
+        Callable: torch.Tensor related metainfo generator
+    """
+
+    def meta_func(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor], List[torch.Tensor],
+                                            List[torch.Tensor]]:
+        """torch.Tensor related metainfo generator; reads the first OperationDataType.OUTPUT entry in args
+
+        Returns:
+            Tuple: compute cost, memory cost, fwd_in, fwd_buffer and fwd_out (fwd_in and fwd_buffer are empty)
+        """
+        outputs = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
+
+        # compute costs are all zero
+        compute_cost = TrainCycleItem(fwd=0, bwd=0, total=0)
+
+        # memory costs
+        # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
+        fwd_mem_cost = MemoryCost(activation=activation_size(outputs) * 2, parameter=0, temp=0, buffer=0)
+
+        bwd_mem_cost = MemoryCost(activation=activation_size(outputs) * bwd_mem_out_factor,
+                                  parameter=0,
+                                  temp=activation_size(outputs) * bwd_mem_tmp_factor,
+                                  buffer=0)
+
+        total_mem_cost = MemoryCost(activation=fwd_mem_cost.activation + bwd_mem_cost.activation,
+                                    parameter=fwd_mem_cost.parameter + bwd_mem_cost.parameter,
+                                    temp=fwd_mem_cost.temp + bwd_mem_cost.temp,
+                                    buffer=fwd_mem_cost.buffer + bwd_mem_cost.buffer)
+
+        memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
+
+        # store fwd_in, fwd_buffer, fwd_out
+        fwd_in, fwd_buffer = [], []
+        if isinstance(outputs, (tuple, list, dict)):
+            # container of tensors; for a dict the tensors are the values, not the keys
+            fwd_out = [torch.zeros_like(t) for t in (outputs.values() if isinstance(outputs, dict) else outputs)]
+        else:
+            # outputs is a single tensor
+            fwd_out = [torch.zeros_like(outputs)]
+
+        return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+
+    return meta_func
+
+
+# register torch.Tensor related metainfo; each pair is (bwd_mem_out_factor, bwd_mem_tmp_factor)
+# (0, 0): backward activation and backward temp memory are both counted as zero
+meta_register.register([torch.tensor, torch.Tensor.to, torch.Tensor.unsqueeze, torch.unsqueeze,
+                        torch.arange])(tensor_related_metainfo(0, 0))
+
+# (1, 0): backward activation equals one output size, no backward temp memory
+meta_register.register([
+    torch.Tensor.flatten, torch.flatten, torch.Tensor.transpose, torch.transpose, torch.Tensor.permute, torch.permute,
+    torch.Tensor.split, torch.split, torch.Tensor.view
+])(tensor_related_metainfo(1, 0))
+
+# (1, 1): backward activation and backward temp memory each equal one output size
+meta_register.register([torch.Tensor.type, torch.Tensor.contiguous])(tensor_related_metainfo(1, 1))
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_tensor_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_tensor_metainfo.py
new file mode 100644
index 000000000000..a0ab66fdc060
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_tensor_metainfo.py
@@ -0,0 +1,103 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+from colossalai.auto_parallel.tensor_shard.node_handler import LinearModuleHandler
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
+    MemoryCost,
+    OperationData,
+    OperationDataType,
+    ShardingStrategy,
+    StrategiesVector,
+    TrainCycleItem,
+)
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx import ColoGraphModule, ColoTracer
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import print_results
+
+if torch.__version__ >= '1.12.0':
+    from colossalai.auto_parallel.meta_profiler import MetaInfo, meta_register
+
+
+class SplitModule(nn.Module):
+    """Parameter-free module whose forward splits its input into chunks of 512 along dim 0."""
+
+    def forward(self, x):
+        # torch.Tensor.split with a fixed chunk size of 512 on the first dimension
+        chunks = x.split(512, dim=0)
+        return chunks
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
+def test_tensor_meta_info():
+    """Pass the estimated torch.Tensor.split meta info and the measured CUDA memory to print_results.
+    NOTE(review): the skipif compares version strings lexicographically, and the body requires a CUDA device.
+    """
+    meta_func = meta_register.get(torch.Tensor.split)
+
+    # construct meta tensors (a 1024x1024 input and its split into chunks of 512 rows)
+    input_tensor = torch.rand(1024, 1024, device="meta")
+    output_tensor = input_tensor.split(512, dim=0)
+
+    # construct operation data
+    input_data = OperationData(
+        name="input",
+        data=input_tensor,
+        type=OperationDataType.ARG,
+        logical_shape=input_tensor.shape,
+    )
+    output_data = OperationData(
+        name="output",
+        data=output_tensor,
+        type=OperationDataType.OUTPUT,
+        logical_shape=input_tensor.shape,
+    )
+    split_info_data = OperationData(
+        name='split_info',
+        type=OperationDataType.ARG,
+        data=0,
+        logical_shape=None,
+    )
+
+    # construct the positional args and keyword args handed to the registered meta function
+    args = [input_data, output_data, split_info_data]
+    kwargs = {'inplace': False}
+
+    # estimated results
+    compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out = meta_func(*args, **kwargs)
+
+    # actual results, measured on a real CUDA tensor of the same shape
+    model = SplitModule()
+    input_real_tensor = torch.rand(1024, 1024).cuda()
+
+    input_real_tensor.requires_grad = True
+
+    # fwd: allocated and peak memory relative to the stamp taken just before the forward call
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    output_real_tensor = model(input_real_tensor)
+    fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    # bwd: upstream grads are created before the stamp so they are excluded from the backward numbers
+    upstream_grad = [torch.rand_like(tensor) for tensor in output_real_tensor]
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    torch.autograd.backward(output_real_tensor, upstream_grad)
+    bwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    bwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    print_results([input_real_tensor], output_real_tensor, compute_cost, memory_cost, fwd_allocated, fwd_peak,
+                  bwd_allocated, bwd_peak)
+
+
+if __name__ == "__main__":
+    test_tensor_meta_info()    # NOTE: needs a CUDA device, the test calls .cuda()

From 918bc94b6bb27eaa18769bbee270bb268c792059 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 21 Feb 2023 11:25:57 +0800
Subject: [PATCH 367/503] [triton] added copyright information for flash
 attention (#2835)

* [triton] added copyright information for flash attention

* polish code
---
 LICENSE                                       | 20 ++++++++++++++++---
 .../kernel/cuda_native/flash_attention.py     | 15 +++++++++-----
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/LICENSE b/LICENSE
index 51a166040d54..f05b54bd5dd4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -201,17 +201,31 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    See the License for the specific language governing permissions and
    limitations under the License.
 
-## Some of colossal-ai's code is derived from Alpa, which is subject to the following copyright notice:
+   ## Some of colossal-ai's code is derived from other projects, which are subject to the following copyright notices:
+
+   Copyright 2021 The Alpa team.
 
-Copyright 2021 The Alpa team.
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-       https://github.com/alpa-projects/alpa/blob/979a45a3e6187df941ef4a4c4c6eea664527d68d/LICENSE
+         https://github.com/alpa-projects/alpa/blob/979a45a3e6187df941ef4a4c4c6eea664527d68d/LICENSE
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+   -------------------------------------------------
+
+   Copyright 2018-2020 Philippe Tillet
+   Copyright 2020-2022 OpenAI
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files
+   (the "Software"), to deal in the Software without restriction,
+   including without limitation the rights to use, copy, modify, merge,
+   publish, distribute, sublicense, and/or sell copies of the Software,
+   and to permit persons to whom the Software is furnished to do so,
+   subject to the following conditions:
diff --git a/colossalai/kernel/cuda_native/flash_attention.py b/colossalai/kernel/cuda_native/flash_attention.py
index 7bd646d3935f..907fa640d826 100644
--- a/colossalai/kernel/cuda_native/flash_attention.py
+++ b/colossalai/kernel/cuda_native/flash_attention.py
@@ -1,8 +1,12 @@
 """
-Fused Attention
-===============
-This is a Triton implementation of the Flash Attention algorithm
-(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf; Triton https://github.com/openai/triton)
+The triton-based flash attention implementation is copied from the OpenAI/triton repository
+
+You can find the repository in Triton https://github.com/openai/triton
+You can find the source file in https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py
+
+Reference:
+1. Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf
+2. Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf
 """
 
 import math
@@ -56,7 +60,8 @@ def triton_cuda_check():
     print('please install xformers from https://github.com/facebookresearch/xformers')
 
 if HAS_TRITON:
-
+    # the following functions are adapted from the OpenAI Triton tutorial
+    # https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py
     @triton.jit
     def _fwd_kernel(
         Q,

From 3eebc4dff70f893b669ae78a4fb811a56d347974 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Tue, 21 Feb 2023 11:35:45 +0800
Subject: [PATCH 368/503] [chatgpt] fix rm eval (#2829)

* [chatgpt]fix train_rm bug with lora

* [chatgpt]support colossalai strategy to train rm

* fix pre-commit

* fix pre-commit 2

* [chatgpt]fix rm eval typo

* fix rm eval

* fix pre commit
---
 .../ChatGPT/chatgpt/dataset/__init__.py       |  3 +-
 .../ChatGPT/chatgpt/dataset/reward_dataset.py |  4 ++-
 applications/ChatGPT/chatgpt/dataset/utils.py |  5 ++++
 .../ChatGPT/chatgpt/nn/reward_model.py        |  4 +--
 applications/ChatGPT/chatgpt/trainer/rm.py    | 29 ++++++++++++-------
 .../ChatGPT/examples/train_reward_model.py    |  7 +++--
 6 files changed, 34 insertions(+), 18 deletions(-)
 create mode 100644 applications/ChatGPT/chatgpt/dataset/utils.py

diff --git a/applications/ChatGPT/chatgpt/dataset/__init__.py b/applications/ChatGPT/chatgpt/dataset/__init__.py
index 2f330ee67afe..b4599c82ba75 100644
--- a/applications/ChatGPT/chatgpt/dataset/__init__.py
+++ b/applications/ChatGPT/chatgpt/dataset/__init__.py
@@ -1,3 +1,4 @@
 from .reward_dataset import RewardDataset
+from .utils import is_rank_0
 
-__all__ = ['RewardDataset']
+__all__ = ['RewardDataset', 'is_rank_0']
diff --git a/applications/ChatGPT/chatgpt/dataset/reward_dataset.py b/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
index 14edcce30d19..8bc850f2d52d 100644
--- a/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
+++ b/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
@@ -3,6 +3,8 @@
 from torch.utils.data import Dataset
 from tqdm import tqdm
 
+from .utils import is_rank_0
+
 
 class RewardDataset(Dataset):
     """
@@ -18,7 +20,7 @@ def __init__(self, dataset, tokenizer: Callable, max_length: int) -> None:
         super().__init__()
         self.chosen = []
         self.reject = []
-        for data in tqdm(dataset):
+        for data in tqdm(dataset, disable=not is_rank_0()):
             prompt = data['prompt']
 
             chosen = prompt + data['chosen'] + "<|endoftext|>"
diff --git a/applications/ChatGPT/chatgpt/dataset/utils.py b/applications/ChatGPT/chatgpt/dataset/utils.py
new file mode 100644
index 000000000000..6c9f7f085f8c
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/dataset/utils.py
@@ -0,0 +1,5 @@
+import torch.distributed as dist
+
+
+def is_rank_0() -> bool:
+    return dist.get_rank() == 0 if dist.is_initialized() else True    # no process group: treat as rank 0
diff --git a/applications/ChatGPT/chatgpt/nn/reward_model.py b/applications/ChatGPT/chatgpt/nn/reward_model.py
index 5108f61a6186..baaa8b768766 100644
--- a/applications/ChatGPT/chatgpt/nn/reward_model.py
+++ b/applications/ChatGPT/chatgpt/nn/reward_model.py
@@ -23,7 +23,7 @@ def __init__(self,
                  lora_rank: int = 0,
                  lora_train_bias: str = 'none') -> None:
         super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
-        self.model = model
+        self.body = model
         if value_head is not None:
             if value_head.out_features != 1:
                 raise ValueError("The value head of reward model's output dim should be 1!")
@@ -34,7 +34,7 @@ def __init__(self,
         self.convert_to_lora()
 
     def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        outputs = self.model(sequences, attention_mask=attention_mask)
+        outputs = self.body(sequences, attention_mask=attention_mask)
         last_hidden_states = outputs['last_hidden_state']
         values = self.value_head(last_hidden_states)[:, :-1]
         value = values.mean(dim=1).squeeze(1)    # ensure shape is (B)
diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
index f6639edcbbb4..f9000eb7efe5 100644
--- a/applications/ChatGPT/chatgpt/trainer/rm.py
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -1,6 +1,7 @@
 from abc import ABC
 
 import loralib as lora
+import torch
 from chatgpt.dataset import RewardDataset
 from chatgpt.nn import PairWiseLoss
 from torch.optim import Adam, Optimizer
@@ -55,7 +56,8 @@ def fit(self, use_lora):
             # train
             if use_lora > 0:
                 print("Using Lora")
-                lora.mark_only_lora_as_trainable(self.model.model)
+                lora.mark_only_lora_as_trainable(self.model.body)
+
             else:
                 self.model.train()
             for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
@@ -74,16 +76,21 @@ def fit(self, use_lora):
 
             # eval
             self.model.eval()
-            for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
+            with torch.no_grad():
                 dist = 0
-                chosen_ids = chosen_ids.squeeze(1).cuda()
-                c_mask = c_mask.squeeze(1).cuda()
-                reject_ids = reject_ids.squeeze(1).cuda()
-                r_mask = r_mask.squeeze(1).cuda()
-                chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
-                reject_reward = self.model(reject_ids, attention_mask=r_mask)
-                dist += (chosen_reward - reject_reward)
-            dist_mean = dist / self.eval_dataloader.__len__()
+                loss_sum = 0
+                for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
+                    chosen_ids = chosen_ids.squeeze(1).cuda()
+                    c_mask = c_mask.squeeze(1).cuda()
+                    reject_ids = reject_ids.squeeze(1).cuda()
+                    r_mask = r_mask.squeeze(1).cuda()
+                    chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
+                    reject_reward = self.model(reject_ids, attention_mask=r_mask)
+                    dist += (chosen_reward - reject_reward).mean().item()
+                    loss = self.loss_fn(chosen_reward, reject_reward)
+                    loss_sum += loss.item()
+                dist_mean = dist / self.eval_dataloader.__len__()
+                loss_mean = loss_sum / self.eval_dataloader.__len__()
             epoch_bar.update()
-            step_bar.set_postfix({'loss': loss.item(), 'dist_mean': dist_mean.item()})
+            step_bar.set_postfix({'loss': loss_mean, 'dist_mean': dist_mean})
             step_bar.close()
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index 47688325ed7a..57d47b6959a1 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -29,7 +29,8 @@ def train(args):
     # configure model
     tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
     tokenizer.pad_token = tokenizer.eos_token
-    model = BLOOMRM(pretrained=args.pretrain).cuda()
+    with strategy.model_init_context():
+        model = BLOOMRM(pretrained=args.pretrain).cuda()
     max_len = 1024
 
     # configure optimizer
@@ -71,8 +72,8 @@ def train(args):
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--dataset', type=str, default='Dahoas/rm-static')
     parser.add_argument('--save_path', type=str, default='rm_ckpt.pth')
-    parser.add_argument('--max_epochs', type=int, default=2)
-    parser.add_argument('--batch_size', type=int, default=1)
+    parser.add_argument('--max_epochs', type=int, default=10)
+    parser.add_argument('--batch_size', type=int, default=4)
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
     args = parser.parse_args()
     train(args)

From 935346430f25bcb75dedc35c1207920172a3514f Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 21 Feb 2023 17:04:49 +0800
Subject: [PATCH 369/503] [cli] handled version check exceptions (#2848)

* [cli] handled version check exceptions

* polish code
---
 colossalai/cli/check/check_installation.py | 52 ++++++++++++++--------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py
index 22c169577495..44d7840700ef 100644
--- a/colossalai/cli/check/check_installation.py
+++ b/colossalai/cli/check/check_installation.py
@@ -31,7 +31,7 @@ def check_installation():
     found_aot_cuda_ext = _check_aot_built_cuda_extension_installed()
     cuda_version = _check_cuda_version()
     torch_version, torch_cuda_version = _check_torch_version()
-    colossalai_verison, torch_version_required, cuda_version_required = _parse_colossalai_version()
+    colossalai_verison, prebuilt_torch_version_required, prebuilt_cuda_version_required = _parse_colossalai_version()
 
     # if cuda_version is None, that means either
     # CUDA_HOME is not found, thus cannot compare the version compatibility
@@ -43,33 +43,36 @@ def check_installation():
     # if cuda_version or cuda_version_required is None, that means either
     # CUDA_HOME is not found or AOT compilation is not enabled
     # thus, there is no need to compare the version compatibility at all
-    if not cuda_version or not cuda_version_required:
+    if not cuda_version or not prebuilt_cuda_version_required:
         sys_colossalai_cuda_compatibility = None
     else:
-        sys_colossalai_cuda_compatibility = _is_compatible([cuda_version, cuda_version_required])
+        sys_colossalai_cuda_compatibility = _is_compatible([cuda_version, prebuilt_cuda_version_required])
 
     # if torch_version_required is None, that means AOT compilation is not enabled
     # thus there is no need to compare the versions
-    if torch_version_required is None:
+    if prebuilt_torch_version_required is None:
         torch_compatibility = None
     else:
-        torch_compatibility = _is_compatible([torch_version, torch_version_required])
+        torch_compatibility = _is_compatible([torch_version, prebuilt_torch_version_required])
 
     click.echo(f'#### Installation Report ####')
     click.echo(f'\n------------ Environment ------------')
     click.echo(f"Colossal-AI version: {to_click_output(colossalai_verison)}")
     click.echo(f"PyTorch version: {to_click_output(torch_version)}")
-    click.echo(f"CUDA version: {to_click_output(cuda_version)}")
+    click.echo(f"System CUDA version: {to_click_output(cuda_version)}")
     click.echo(f"CUDA version required by PyTorch: {to_click_output(torch_cuda_version)}")
     click.echo("")
     click.echo(f"Note:")
     click.echo(f"1. The table above checks the versions of the libraries/tools in the current environment")
-    click.echo(f"2. If the CUDA version is N/A, you can set the CUDA_HOME environment variable to locate it")
+    click.echo(f"2. If the System CUDA version is N/A, you can set the CUDA_HOME environment variable to locate it")
+    click.echo(
+        f"3. If the CUDA version required by PyTorch is N/A, you probably did not install a CUDA-compatible PyTorch. This value is give by torch.version.cuda and you can go to https://pytorch.org/get-started/locally/ to download the correct version."
+    )
 
     click.echo(f'\n------------ CUDA Extensions AOT Compilation ------------')
     click.echo(f"Found AOT CUDA Extension: {to_click_output(found_aot_cuda_ext)}")
-    click.echo(f"PyTorch version used for AOT compilation: {to_click_output(torch_version_required)}")
-    click.echo(f"CUDA version used for AOT compilation: {to_click_output(cuda_version_required)}")
+    click.echo(f"PyTorch version used for AOT compilation: {to_click_output(prebuilt_torch_version_required)}")
+    click.echo(f"CUDA version used for AOT compilation: {to_click_output(prebuilt_cuda_version_required)}")
     click.echo("")
     click.echo(f"Note:")
     click.echo(
@@ -169,12 +172,19 @@ def _check_torch_version():
         torch_cuda_version: CUDA version required by PyTorch.
     """
     # get torch version
+    # torch version can be of two formats
+    # - 1.13.1+cu113
+    # - 1.13.1.devxxx
     torch_version = torch.__version__.split('+')[0]
+    torch_version = '.'.join(torch_version.split('.')[:3])
 
     # get cuda version in pytorch build
-    torch_cuda_major = torch.version.cuda.split(".")[0]
-    torch_cuda_minor = torch.version.cuda.split(".")[1]
-    torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
+    try:
+        torch_cuda_major = torch.version.cuda.split(".")[0]
+        torch_cuda_minor = torch.version.cuda.split(".")[1]
+        torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
+    except:
+        torch_cuda_version = None
 
     return torch_version, torch_cuda_version
 
@@ -186,15 +196,19 @@ def _check_cuda_version():
     Returns:
         cuda_version: CUDA version found on the system.
     """
+
     # get cuda version
     if CUDA_HOME is None:
         cuda_version = CUDA_HOME
     else:
-        raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
-        output = raw_output.split()
-        release_idx = output.index("release") + 1
-        release = output[release_idx].split(".")
-        bare_metal_major = release[0]
-        bare_metal_minor = release[1][0]
-        cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
+        try:
+            raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
+            output = raw_output.split()
+            release_idx = output.index("release") + 1
+            release = output[release_idx].split(".")
+            bare_metal_major = release[0]
+            bare_metal_minor = release[1][0]
+            cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
+        except:
+            cuda_version = None
     return cuda_version

From 597914317ba1a4ffcaa93ccabfbb4cb5e24aae44 Mon Sep 17 00:00:00 2001
From: Zheng Zeng <zengzheng17@gmail.com>
Date: Tue, 21 Feb 2023 17:16:13 +0800
Subject: [PATCH 370/503] [doc] fix typo in opt inference tutorial (#2849)

---
 examples/tutorial/opt/inference/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/tutorial/opt/inference/README.md b/examples/tutorial/opt/inference/README.md
index 5bacac0d74ad..20ad4a23fdeb 100644
--- a/examples/tutorial/opt/inference/README.md
+++ b/examples/tutorial/opt/inference/README.md
@@ -50,7 +50,7 @@ python opt_fastapi.py <model> --queue_size <QueueSize>
 ```
 The `<QueueSize>` can be an integer in `[0, MAXINT]`. If it's `0`, the request queue size is infinite. If it's a positive integer, when the request queue is full, incoming requests will be dropped (the HTTP status code of response will be 406).
 
-### Configure bathcing
+### Configure batching
 ```shell
 python opt_fastapi.py <model> --max_batch_size <MaxBatchSize>
 ```
@@ -85,4 +85,4 @@ Then open the web interface link which is on your console.
 See [script/processing_ckpt_66b.py](./script/processing_ckpt_66b.py).
 
 ## OPT-175B
-See [script/process-opt-175b](./script/process-opt-175b/).
\ No newline at end of file
+See [script/process-opt-175b](./script/process-opt-175b/).

From 34ca324b0d193623c89d8aea1aedb3c00ac2f654 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Wed, 22 Feb 2023 10:00:26 +0800
Subject: [PATCH 371/503] [chatgpt] Support saving ckpt in examples (#2846)

* [chatgpt]fix train_rm bug with lora

* [chatgpt]support colossalai strategy to train rm

* fix pre-commit

* fix pre-commit 2

* [chatgpt]fix rm eval typo

* fix rm eval

* fix pre commit

* add support of saving ckpt in examples

* fix single-gpu save
---
 applications/ChatGPT/examples/train_dummy.py   | 7 +++++++
 applications/ChatGPT/examples/train_prompts.py | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index f98b4792d978..a27d77a50fdf 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -97,6 +97,13 @@ def main(args):
                 max_timesteps=args.max_timesteps,
                 update_timesteps=args.update_timesteps)
 
+    # save model checkpoint after fitting on only rank0
+    strategy.save_model(actor, 'actor_checkpoint_dummy.pt', only_rank0=True)
+    # save optimizer checkpoint on all ranks
+    strategy.save_optimizer(actor_optim,
+                            'actor_optim_checkpoint_dummy_%d.pt' % (torch.cuda.current_device()),
+                            only_rank0=False)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index e79b2acf11b1..53aa150a06fd 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -2,6 +2,7 @@
 from copy import deepcopy
 
 import pandas as pd
+import torch
 from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
@@ -95,6 +96,12 @@ def tokenize_fn(texts):
                 num_episodes=args.num_episodes,
                 max_timesteps=args.max_timesteps,
                 update_timesteps=args.update_timesteps)
+    # save model checkpoint after fitting on only rank0
+    strategy.save_model(actor, 'actor_checkpoint_prompts.pt', only_rank0=True)
+    # save optimizer checkpoint on all ranks
+    strategy.save_optimizer(actor_optim,
+                            'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
+                            only_rank0=False)
 
 
 if __name__ == '__main__':

From fcc4097efa4ee31ff41ed8f80089c42d34c8001d Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Wed, 22 Feb 2023 10:27:59 +0800
Subject: [PATCH 372/503] [autoparallel] Patch meta information of
 `torch.tanh()` and `torch.nn.Dropout` (#2773)

* [autoparallel] tanh meta information

* [autoparallel] remove redundant code

* [autoparallel] patch meta information of torch.nn.Dropout
---
 .../meta_profiler/meta_registry/activation.py | 181 +++++++-----------
 .../test_metainfo/test_activation_metainfo.py |  57 +-----
 2 files changed, 81 insertions(+), 157 deletions(-)

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py b/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
index c659cd9ac389..faeed9f29e61 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
@@ -1,124 +1,85 @@
-from typing import List, Tuple
+from typing import Callable, List, Tuple
 
 import torch
 
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
 from colossalai.fx.profiler.memory_utils import activation_size
-from colossalai.fx.profiler.opcount import flop_mapping
+from colossalai.fx.profiler.opcount import elementwise_flop_counter
 
 from ..registry import meta_register
 
-__all__ = ["relu_meta_info"]
+__all__ = ["elementwise_meta_info"]
 
 
-@meta_register.register(torch.nn.ReLU)
-def relu_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
-    """torch.nn.ReLU metainfo generator
-    The aten graph of torch.nn.ReLU is
-    graph():
-    %input_2 : [#users=1] = placeholder[target=placeholder](default=)
-    %relu_default : [#users=2] = call_function[target=torch.ops.aten.relu.default](args = (%input_2,), kwargs = {})
-    %zeros_like_default : [#users=1] = call_function[target=torch.ops.aten.zeros_like.default](args = (%relu_default,), kwargs = {dtype: None, layout: None, device: None, pin_memory: None})
-    %detach_default : [#users=1] = call_function[target=torch.ops.aten.detach.default](args = (%relu_default,), kwargs = {})
-    %threshold_backward_default : [#users=1] = call_function[target=torch.ops.aten.threshold_backward.default](args = (%zeros_like_default, %detach_default, None), kwargs = {})
-    %detach_default_1 : [#users=1] = call_function[target=torch.ops.aten.detach.default](args = (%threshold_backward_default,), kwargs = {})
-    %detach_default_2 : [#users=0] = call_function[target=torch.ops.aten.detach.default](args = (%detach_default_1,), kwargs = {})
+def elementwise_meta_info(temp_mem_scale: float = 0, buffer_mem_scale: float = 0) -> Callable:
+    """This is a function to create the meta information generator for elementwise operations
 
-    Returns:
-        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
-    """
-
-    input_tensor = args[0].data
-    output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
-    is_inplace = kwargs.get("inplace", False)
-
-    # construct input args for forward
-    fwd_in_args = [input_tensor]
-
-    # construct input args for backward
-    bwd_in_args = [output_tensor]
-
-    # calculate cost
-    # the fwd op with compute cost is relu.default
-    # the bwd op with compute cost is threshold_backward
-
-    # calculate compute cost
-    fwd_compute_cost = flop_mapping[torch.ops.aten.relu.default](fwd_in_args, (output_tensor,))
-    bwd_compute_cost = flop_mapping[torch.ops.aten.threshold_backward.default](bwd_in_args, (input_tensor,))
-    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
-
-    # calculate memory cost
-    # NOTE: the inplace ReLU don't have forward memory cost
-    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
-    fwd_memory_cost = MemoryCost(
-        activation=activation_size(input_tensor) if is_inplace else activation_size([output_tensor, input_tensor]),
-        parameter=0,
-        temp=0,
-        buffer=0)
+    Args:
+        temp_mem_scale (float, optional): temp memory scaling factor for backward. Defaults to 0.
+        buffer_mem_scale (float, optional): buffer memory scaling factor for forward. Defaults to 0.
 
-    bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor), parameter=0, temp=0, buffer=0)
-
-    # total cost is the sum of forward and backward cost
-    total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
-                            parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter)
-
-    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
-
-    # store fwd_in, fwd_buffer, fwd_out
-    # NOTE: It might seems a little bit weird here, we just want to align it with the older version
-    # of MetaInfoProp. In the future we might modify this part to make it clearer.
-    fwd_in = []
-    fwd_buffer = [torch.zeros_like(output_tensor, device='meta')]
-    fwd_out = [torch.zeros_like(output_tensor, device='meta')]
-
-    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
-
-
-@meta_register.register(torch.nn.Softmax)
-@meta_register.register(torch.nn.functional.softmax)
-def softmax_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
-    """torch.nn.Softmax metainfo generator
     Returns:
-        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
+        Callable: meta information generator
     """
-    input_tensor = next(
-        filter(
-            lambda x:
-            (x.type == OperationDataType.ARG or x.type == OperationDataType.PARAM) and x.name != 'softmax_dim',
-            args)).data
-    output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
-    softmax_dim = next(filter(lambda x: x.name == 'softmax_dim', args)).data
-
-    # calculate cost
-
-    # calculate compute cost
-    fwd_compute_cost = flop_mapping[torch.ops.aten._softmax.default]([input_tensor], [output_tensor])
-    bwd_compute_cost = flop_mapping[torch.ops.aten._softmax_backward_data.default]([output_tensor], [input_tensor])
-
-    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
-
-    # calculate memory cost
-    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
-    fwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, output_tensor]),
-                                 parameter=0,
-                                 temp=0,
-                                 buffer=0)
-    bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor),
-                                 parameter=0,
-                                 temp=activation_size(input_tensor),
-                                 buffer=0)
-
-    # total cost is the sum of forward and backward cost
-    total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
-                            parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter,
-                            temp=fwd_memory_cost.temp + bwd_memory_cost.temp,
-                            buffer=fwd_memory_cost.buffer + bwd_memory_cost.buffer)
-
-    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
-
-    # store fwd_in, fwd_buffer, fwd_out
-    fwd_in = []
-    fwd_buffer = [torch.zeros_like(output_tensor, device='meta')]
-    fwd_out = [torch.zeros_like(output_tensor, device='meta')]
-
-    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+
+    def meta_func(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+        input_tensor = next(
+            filter(
+                lambda x:
+                (x.type == OperationDataType.ARG or x.type == OperationDataType.PARAM) and x.name != 'softmax_dim',
+                args)).data
+        output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
+        is_inplace = 1 if kwargs.get('inplace', False) else 0
+
+        flop_counter = elementwise_flop_counter(1, 0)
+        # calculate compute cost
+        fwd_compute_cost = flop_counter([input_tensor], [output_tensor])
+        bwd_compute_cost = flop_counter([output_tensor], [input_tensor])
+
+        compute_cost = TrainCycleItem(fwd=fwd_compute_cost,
+                                      bwd=bwd_compute_cost,
+                                      total=fwd_compute_cost + bwd_compute_cost)
+
+        # calculate memory cost
+        # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
+        # NOTE: if in_place is True, we will not create a new tensor in forward
+        fwd_memory_cost = MemoryCost(activation=activation_size(input_tensor) * (2 - is_inplace),
+                                     parameter=0,
+                                     temp=0,
+                                     buffer=activation_size(input_tensor) * buffer_mem_scale)
+
+        # temp_mem_scale is for situations like softmax backward
+        # the buffer will be removed during backward phase
+        bwd_memory_cost = MemoryCost(
+            activation=activation_size(input_tensor) - activation_size(input_tensor) * buffer_mem_scale,
+            parameter=0,
+            temp=activation_size(input_tensor) * temp_mem_scale + activation_size(input_tensor) * buffer_mem_scale,
+            buffer=0)
+
+        # total cost is the sum of forward and backward cost
+        total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
+                                parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter,
+                                temp=fwd_memory_cost.temp + bwd_memory_cost.temp,
+                                buffer=fwd_memory_cost.buffer + bwd_memory_cost.buffer)
+
+        memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
+
+        # store fwd_in, fwd_buffer, fwd_out
+        fwd_in = []
+        fwd_buffer = [torch.zeros_like(output_tensor, device='meta')]
+        fwd_out = [torch.zeros_like(output_tensor, device='meta')]
+
+        return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+
+    return meta_func
+
+
+# register meta information
+# (0, 0)
+meta_register.register([torch.nn.ReLU, torch.nn.functional.relu, torch.tanh])(elementwise_meta_info(0, 0))
+
+# (1, 0)
+meta_register.register([torch.nn.Softmax, torch.nn.functional.softmax])(elementwise_meta_info(1, 0))
+
+# (0, 0.25) for dropout: the buffer is of bool type, so its memory cost is 0.25 times that of the input tensor
+meta_register.register([torch.nn.Dropout, torch.nn.functional.dropout])(elementwise_meta_info(0, 0.25))
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py
index b9b42f8c161d..e41ac4fa690b 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_activation_metainfo.py
@@ -17,51 +17,15 @@
 from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy, print_results
 
 
-def _ReLU_module_mem_test(rank, world_size, port):
-    """This function is for ReLU memory test
-    Test and print real memory cost and estimated, this test will not be executed except with the tag AUTO_PARALLEL
-
-    Args:
-    Args:
-        rank: device rank
-        bias: indicate whether conv module need bias
-        world_size: number of devices
-        port: port for initializing process group
-    """
-    disable_existing_loggers()
-    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    model = nn.Sequential(nn.ReLU()).cuda()
-    input = torch.rand(4, 128, 64, 64).cuda()
-    input.requires_grad = True
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    # index of target node in computation graph
-    node_index = 1
-    # total number of target node strategies
-    strategy_number = 1
-    mem_test_for_node_strategy(rank=rank,
-                               model=model,
-                               device_mesh=device_mesh,
-                               node_index=node_index,
-                               strategy_number=strategy_number,
-                               input_args=[input],
-                               meta_arg_names=['input'])
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_ReLU_meta_concrete_info_match():
-    world_size = 4
-    run_func_module = partial(_ReLU_module_mem_test, world_size=world_size, port=free_port())
-    mp.spawn(run_func_module, nprocs=world_size)
-
-
 @pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
-def test_sofmax_meta_info():
-    meta_func = meta_register.get(torch.nn.functional.softmax)
+@parameterize('func', [
+    torch.nn.functional.softmax,
+    torch.nn.functional.relu,
+    torch.tanh,
+    torch.nn.functional.dropout,
+])
+def test_activation_meta_info(func):
+    meta_func = meta_register.get(func)
     # construct meta tensors
     input_tensor = torch.rand(256, 1024, device="meta")
     output_tensor = torch.rand(256, 1024, device="meta")
@@ -87,7 +51,7 @@ def test_sofmax_meta_info():
     # fwd
     torch.cuda.reset_peak_memory_stats()
     mem_stamp0 = torch.cuda.memory_allocated()
-    output_real_tensor = torch.nn.functional.softmax(input_real_tensor, dim=softmax_dim)
+    output_real_tensor = func(input_real_tensor)
     fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
     fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
 
@@ -104,5 +68,4 @@ def test_sofmax_meta_info():
 
 
 if __name__ == '__main__':
-    # test_ReLU_meta_concrete_info_match()
-    test_sofmax_meta_info()
+    test_activation_meta_info()

From c7764d3f227f78a0f3d22f0e4d4b9b28612cd120 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Wed, 22 Feb 2023 10:28:21 +0800
Subject: [PATCH 373/503] [autoparallel] Patch meta information of
 `torch.where` (#2822)

* [autoparallel] patch meta information of torch.where

* [autoparallel] pre-commit modified
---
 .../meta_profiler/meta_registry/__init__.py   |   1 +
 .../meta_profiler/meta_registry/where.py      |  60 ++++++++++
 .../test_metainfo/test_where_metainfo.py      | 104 ++++++++++++++++++
 3 files changed, 165 insertions(+)
 create mode 100644 colossalai/auto_parallel/meta_profiler/meta_registry/where.py
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_where_metainfo.py

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
index df9eb6498377..d005ac813bc6 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
@@ -6,3 +6,4 @@
 from .norm import *
 from .pooling import *
 from .tensor import *
+from .where import *
diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/where.py b/colossalai/auto_parallel/meta_profiler/meta_registry/where.py
new file mode 100644
index 000000000000..c67eb40bc80e
--- /dev/null
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/where.py
@@ -0,0 +1,60 @@
+from typing import List, Tuple
+
+import torch
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
+from colossalai.fx.profiler.memory_utils import activation_size
+from colossalai.fx.profiler.opcount import flop_mapping
+
+from ..registry import meta_register
+
+__all__ = ["where_meta_info"]
+
+
+@meta_register.register(torch.where)
+def where_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+    """torch.where meta information generator
+
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
+    """
+
+    condition_tensor, x_tensor, y_tensor, output_tensor = [arg.data for arg in args]
+
+    # compute cost
+    fwd_compute_cost = 0
+
+    # if we need to broadcast the condition tensor, during backward we need to do a reduce_sum
+    bwd_compute_cost = 0
+    if x_tensor.shape != output_tensor.shape:
+        bwd_compute_cost += flop_mapping[torch.ops.aten.sum.dim_IntList]([output_tensor], [x_tensor])
+    if y_tensor.shape != output_tensor.shape:
+        bwd_compute_cost += flop_mapping[torch.ops.aten.sum.dim_IntList]([output_tensor], [y_tensor])
+
+    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
+
+    # memory cost
+    # during the forward phase, torch.where will allocate memory for output tensor and condition tensor
+    # during the backward phase, torch.where allocates temp memory 3 times the size of the output tensor, generates
+    # gradients for input x and input y, then removes the temp memory and condition tensor generated in forward phase
+    # NOTE: currently in SPMD solver we always believe that there will be a new input tensor created in forward
+    fwd_mem_cost = MemoryCost(activation=activation_size([condition_tensor, x_tensor, y_tensor, output_tensor]))
+    bwd_mem_cost = MemoryCost(activation=activation_size([x_tensor, y_tensor]) - activation_size([condition_tensor]),
+                              parameter=0,
+                              temp=activation_size([output_tensor]) * 3 + activation_size([condition_tensor]) -
+                              activation_size([x_tensor, y_tensor]),
+                              buffer=0)
+
+    total_mem_cost = MemoryCost(activation=fwd_mem_cost.activation + bwd_mem_cost.activation,
+                                parameter=fwd_mem_cost.parameter + bwd_mem_cost.parameter,
+                                temp=fwd_mem_cost.temp + bwd_mem_cost.temp,
+                                buffer=fwd_mem_cost.buffer + bwd_mem_cost.buffer)
+
+    memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
+
+    # store fwd_in, fwd_buffer, fwd_out
+    fwd_in = [condition_tensor]
+    fwd_buffer = []
+    fwd_out = [output_tensor]
+
+    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_where_metainfo.py b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_where_metainfo.py
new file mode 100644
index 000000000000..20156f9ab4d5
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_metainfo/test_where_metainfo.py
@@ -0,0 +1,104 @@
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+from colossalai.auto_parallel.tensor_shard.node_handler import LinearModuleHandler
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
+    MemoryCost,
+    OperationData,
+    OperationDataType,
+    ShardingStrategy,
+    StrategiesVector,
+    TrainCycleItem,
+)
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx import ColoGraphModule, ColoTracer
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+from colossalai.testing.utils import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import print_results
+
+if torch.__version__ >= '1.12.0':
+    from colossalai.auto_parallel.meta_profiler import MetaInfo, meta_register
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
+def test_where_meta_info():
+    """Estimate torch.where costs with the registered meta function and measure real CUDA memory usage."""
+    meta_func = meta_register.get(torch.where)
+    # construct meta tensors that mirror the shapes and dtypes of the real tensors used below
+    condition_tensor = torch.rand(1, 1, 1024, 1024) > 0.5
+    condition_tensor = condition_tensor.to(device="meta")
+    x_tensor = torch.rand(8, 16, 1024, 1024, device="meta")
+    y_tensor = torch.tensor(0.0, device="meta")
+    output_tensor = torch.rand(8, 16, 1024, 1024, device="meta")
+
+    # construct operation data
+    condition_data = OperationData(
+        name="condition",
+        data=condition_tensor,
+        type=OperationDataType.ARG,
+        logical_shape=condition_tensor.shape,
+    )
+    x_data = OperationData(
+        name="x",
+        data=x_tensor,
+        type=OperationDataType.ARG,
+        logical_shape=x_tensor.shape,
+    )
+    y_data = OperationData(
+        name="y",
+        data=y_tensor,
+        type=OperationDataType.ARG,
+        logical_shape=y_tensor.shape,
+    )
+    output_data = OperationData(
+        name="output",
+        data=output_tensor,
+        type=OperationDataType.OUTPUT,
+        logical_shape=output_tensor.shape,
+    )
+
+    # construct args and kwargs
+    args = [condition_data, x_data, y_data, output_data]
+    kwargs = {'inplace': False}
+
+    # estimated results
+    compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out = meta_func(*args, **kwargs)
+
+    # actual results
+    condition_real_tensor = torch.rand(1, 1, 1024, 1024) > 0.5
+    condition_real_tensor = condition_real_tensor.to(device="cuda")
+    x_real_tensor = torch.rand(8, 16, 1024, 1024, device="cuda")
+    y_real_tensor = torch.tensor(0.0, device="cuda")
+
+    x_real_tensor.requires_grad = True
+    y_real_tensor.requires_grad = True
+
+    # fwd
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    output_real_tensor = torch.where(condition_real_tensor, x_real_tensor, y_real_tensor)
+    fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    # bwd
+    upstream_grad = torch.rand_like(output_real_tensor)
+    torch.cuda.reset_peak_memory_stats()
+    mem_stamp0 = torch.cuda.memory_allocated()
+    torch.autograd.backward(output_real_tensor, upstream_grad)
+    bwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
+    bwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
+
+    compute_cost: TrainCycleItem
+    memory_cost: TrainCycleItem
+
+    print_results([condition_real_tensor, x_real_tensor, y_real_tensor], [output_real_tensor], compute_cost,
+                  memory_cost, fwd_allocated, fwd_peak, bwd_allocated, bwd_peak)
+
+
+if __name__ == '__main__':
+    test_where_meta_info()

From eae77c831d93aa0190bf3af4cc87ff9a153851d2 Mon Sep 17 00:00:00 2001
From: Boyuan Yao <70263930+Cypher30@users.noreply.github.com>
Date: Wed, 22 Feb 2023 10:28:56 +0800
Subject: [PATCH 374/503] [autoparallel] Patch meta information for nodes that
 will not be handled by SPMD solver (#2823)

* [autoparallel] non spmd meta information generator

* [autoparallel] patch meta information for non spmd nodes
---
 .../meta_profiler/meta_registry/__init__.py   |  1 +
 .../meta_profiler/meta_registry/non_spmd.py   | 29 +++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 colossalai/auto_parallel/meta_profiler/meta_registry/non_spmd.py

diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
index d005ac813bc6..4d8b656e17e1 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
@@ -3,6 +3,7 @@
 from .conv import *
 from .embedding import *
 from .linear import *
+from .non_spmd import *
 from .norm import *
 from .pooling import *
 from .tensor import *
diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/non_spmd.py b/colossalai/auto_parallel/meta_profiler/meta_registry/non_spmd.py
new file mode 100644
index 000000000000..4634d3ccdcfd
--- /dev/null
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/non_spmd.py
@@ -0,0 +1,29 @@
+import operator
+from typing import List, Tuple
+
+import torch
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
+from colossalai.fx.profiler.memory_utils import activation_size
+from colossalai.fx.profiler.opcount import flop_mapping
+
+from ..registry import meta_register
+
+__all__ = ["non_spmd_meta_info"]
+
+
+@meta_register.register(torch.Size)
+@meta_register.register(torch.Tensor.size)
+@meta_register.register(torch.finfo)
+@meta_register.register(operator.le)
+def non_spmd_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List, List, List]:
+    """Non-SPMD node meta information generator.
+    These nodes are not handled by the SPMD solver, so zero compute cost, default memory costs and empty lists are returned.
+
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List, List, List]: compute cost, memory cost, fwd_in, fwd_buffer, fwd_out
+    """
+    compute_cost = TrainCycleItem(fwd=0, bwd=0, total=0)
+    memory_cost = TrainCycleItem(fwd=MemoryCost(), bwd=MemoryCost(), total=MemoryCost())
+    fwd_in, fwd_buffer, fwd_out = [], [], []
+    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out

From 55424a16a54372bc89f460c54a2051b681adb5a7 Mon Sep 17 00:00:00 2001
From: dawei-wang <dawei-wang@users.noreply.github.com>
Date: Tue, 21 Feb 2023 18:58:52 -0800
Subject: [PATCH 375/503] [doc] fix GPT tutorial (#2860)

Fix hpcaitech/ColossalAI#2851
---
 examples/language/gpt/titans/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/gpt/titans/README.md b/examples/language/gpt/titans/README.md
index fe1854c9ffdf..e954f35fae0d 100644
--- a/examples/language/gpt/titans/README.md
+++ b/examples/language/gpt/titans/README.md
@@ -26,7 +26,7 @@ Use the following commands to execute training.
 # run on a single node
 colossalai run --nproc_per_node=<num_gpus> train_gpt.py --config configs/<config_file> --from_torch --use_dummy_dataset
 
-# run on multiple nodes with slurm
+# run on multiple nodes
 colossalai run --nproc_per_node=<num_gpus> \
    --master_addr <hostname> \
    --master_port <port-number> \

From a4fc125c34c0302c15774ae2824d4de3a2ed3580 Mon Sep 17 00:00:00 2001
From: Alex_996 <45281765+koking0@users.noreply.github.com>
Date: Wed, 22 Feb 2023 10:59:48 +0800
Subject: [PATCH 376/503] Fix typos (#2863)

Fix typos, `6.7 -> 6.7b`
---
 examples/language/opt/run_gemini.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/opt/run_gemini.sh b/examples/language/opt/run_gemini.sh
index d9625723a1ae..92fd481c5bc3 100644
--- a/examples/language/opt/run_gemini.sh
+++ b/examples/language/opt/run_gemini.sh
@@ -1,7 +1,7 @@
 set -x
 export BS=${BS:-16}
 export MEMCAP=${MEMCAP:-0}
-# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7`, `13b`, `30b`, `66b`. For `175b`
+# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b`
 export MODEL=${MODEL:-"125m"}
 export GPUNUM=${GPUNUM:-1}
 

From 6e4ac08172f04984f6c703d8677c9f6d1756473e Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Wed, 22 Feb 2023 15:04:46 +0800
Subject: [PATCH 377/503] [hotfix] fix chunk size can not be divided (#2867)

* [hotfix] fix chunk size can not be divided

* [hotfix] use numpy for python3.8
---
 colossalai/gemini/chunk/manager.py      | 3 +++
 colossalai/gemini/chunk/search_utils.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/colossalai/gemini/chunk/manager.py b/colossalai/gemini/chunk/manager.py
index e73c59b251fb..30ac4d354647 100644
--- a/colossalai/gemini/chunk/manager.py
+++ b/colossalai/gemini/chunk/manager.py
@@ -72,6 +72,9 @@ def register_tensor(self,
 
             if tensor.numel() > chunk_size:
                 chunk_size = tensor.numel()
+                dp_size = tensor.process_group.dp_world_size()
+                chunk_size = chunk_size + (-chunk_size % dp_size)
+
             chunk = Chunk(
                 chunk_size=chunk_size,
                 process_group=tensor.process_group,
diff --git a/colossalai/gemini/chunk/search_utils.py b/colossalai/gemini/chunk/search_utils.py
index 57a708135708..fe9650721d74 100644
--- a/colossalai/gemini/chunk/search_utils.py
+++ b/colossalai/gemini/chunk/search_utils.py
@@ -119,6 +119,7 @@ def search_chunk_configuration(
     assert search_range_byte >= 0
 
     params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
+    size_lcm = np.lcm.reduce(list(params_dict.keys()))
     config_dict: Dict[int, Dict] = dict()
     total_param_size = 0
 
@@ -154,6 +155,8 @@ def search_chunk_configuration(
             min_chunk_waste = temp_waste
             best_chunk_size = chunk_size
 
+    # round the chunk size up so that it is divisible by every dp degree
+    best_chunk_size = best_chunk_size + (-best_chunk_size % size_lcm)
     for dp_degree in params_dict:
         if dp_degree in config_dict:
             continue

From c52edcf0eb36bf186482c04c50c8c719d246ea82 Mon Sep 17 00:00:00 2001
From: junxu <xujun@cmss.chinamobile.com>
Date: Wed, 22 Feb 2023 15:05:53 +0800
Subject: [PATCH 378/503] Rename class method of ZeroDDP (#2692)

---
 colossalai/nn/parallel/data_parallel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index 8e0192c71313..a9d001bd0a9c 100644
--- a/colossalai/nn/parallel/data_parallel.py
+++ b/colossalai/nn/parallel/data_parallel.py
@@ -294,7 +294,7 @@ def _setup_grads_ptr(self):
                 continue
             p.grad = None
 
-    def _pre_bacward(self):
+    def _pre_backward(self):
         # set a visit label for all parameters
         # the label is used to check whether the parameter is correctly reduced
         for param in self.param2name:
@@ -318,7 +318,7 @@ def _post_backward(self):
         self.gemini_manager.post_iter()
 
     def backward(self, loss: torch.Tensor):
-        self._pre_bacward()
+        self._pre_backward()
         with self.param_op_hook.switch_to_backward(), ColoParamOpHookManager.use_hooks(self.param_op_hook):
             loss.backward()
         self._post_backward()

From 2e16f842a9e5b1fb54e7e41070e9d2bb5cd64d7c Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Wed, 22 Feb 2023 16:58:11 +0800
Subject: [PATCH 379/503] [chatgpt]support opt & gpt for rm training (#2876)

---
 applications/ChatGPT/chatgpt/nn/bloom_rm.py   |  1 -
 applications/ChatGPT/chatgpt/nn/gpt_rm.py     |  9 +++-
 applications/ChatGPT/chatgpt/nn/opt_rm.py     | 10 +++--
 .../ChatGPT/examples/train_reward_model.py    | 41 ++++++++++++++-----
 applications/ChatGPT/examples/train_rm.sh     |  4 +-
 5 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/nn/bloom_rm.py b/applications/ChatGPT/chatgpt/nn/bloom_rm.py
index 0d4dd43fa07a..12c37957dd83 100644
--- a/applications/ChatGPT/chatgpt/nn/bloom_rm.py
+++ b/applications/ChatGPT/chatgpt/nn/bloom_rm.py
@@ -1,6 +1,5 @@
 from typing import Optional
 
-import torch
 import torch.nn as nn
 from transformers import BloomConfig, BloomForCausalLM, BloomModel
 
diff --git a/applications/ChatGPT/chatgpt/nn/gpt_rm.py b/applications/ChatGPT/chatgpt/nn/gpt_rm.py
index c6c41a45a684..fcfb61cd4b82 100644
--- a/applications/ChatGPT/chatgpt/nn/gpt_rm.py
+++ b/applications/ChatGPT/chatgpt/nn/gpt_rm.py
@@ -15,12 +15,16 @@ class GPTRM(RewardModel):
         pretrained (str): Pretrained model name or path.
         config (GPT2Config): Model config.
         checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): Rank of the low-rank approximation.
+        lora_train_bias (str): LoRA bias training mode.
     """
 
     def __init__(self,
                  pretrained: Optional[str] = None,
                  config: Optional[GPT2Config] = None,
-                 checkpoint: bool = False) -> None:
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
         if pretrained is not None:
             model = GPT2Model.from_pretrained(pretrained)
         elif config is not None:
@@ -29,5 +33,6 @@ def __init__(self,
             model = GPT2Model(GPT2Config())
         if checkpoint:
             model.gradient_checkpointing_enable()
+
         value_head = nn.Linear(model.config.n_embd, 1)
-        super().__init__(model, value_head)
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/nn/opt_rm.py b/applications/ChatGPT/chatgpt/nn/opt_rm.py
index 150f832e0c35..5f518a3cc05e 100644
--- a/applications/ChatGPT/chatgpt/nn/opt_rm.py
+++ b/applications/ChatGPT/chatgpt/nn/opt_rm.py
@@ -1,8 +1,7 @@
 from typing import Optional
 
 import torch.nn as nn
-from transformers.models.opt.configuration_opt import OPTConfig
-from transformers.models.opt.modeling_opt import OPTModel
+from transformers import OPTConfig, OPTModel
 
 from .reward_model import RewardModel
 
@@ -14,6 +13,7 @@ class OPTRM(RewardModel):
     Args:
         pretrained (str): Pretrained model name or path.
         config (OPTConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
         lora_rank (int): Rank of the low-rank approximation.
         lora_train_bias (str): LoRA bias training mode.
     """
@@ -21,6 +21,7 @@ class OPTRM(RewardModel):
     def __init__(self,
                  pretrained: Optional[str] = None,
                  config: Optional[OPTConfig] = None,
+                 checkpoint: bool = False,
                  lora_rank: int = 0,
                  lora_train_bias: str = 'none') -> None:
         if pretrained is not None:
@@ -29,5 +30,8 @@ def __init__(self,
             model = OPTModel(config)
         else:
             model = OPTModel(OPTConfig())
-        value_head = nn.Linear(model.config.hidden_size, 1)
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+
+        value_head = nn.Linear(model.config.word_embed_proj_dim, 1)
         super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index 57d47b6959a1..bf2071793b47 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -3,12 +3,13 @@
 import loralib as lora
 import torch
 from chatgpt.dataset import RewardDataset
-from chatgpt.nn import BLOOMRM
+from chatgpt.nn import BLOOMRM, GPTRM, OPTRM
 from chatgpt.trainer import RewardModelTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from datasets import load_dataset
 from torch.optim import Adam
-from transformers import BloomTokenizerFast
+from transformers import AutoTokenizer, BloomTokenizerFast
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
 from colossalai.nn.optimizer import HybridAdam
 
@@ -27,11 +28,30 @@ def train(args):
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
     # configure model
-    tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
-    tokenizer.pad_token = tokenizer.eos_token
     with strategy.model_init_context():
-        model = BLOOMRM(pretrained=args.pretrain).cuda()
-    max_len = 1024
+        if args.model == 'bloom':
+            model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        elif args.model == 'opt':
+            model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        elif args.model == 'gpt2':
+            model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        else:
+            raise ValueError(f'Unsupported model "{args.model}"')
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+    tokenizer.pad_token = tokenizer.eos_token
+
+    max_len = 512
 
     # configure optimizer
     if args.strategy.startswith('colossalai'):
@@ -58,10 +78,10 @@ def train(args):
 
     trainer.fit(use_lora=args.lora_rank)
 
-    if args.lora_rank > 0:
-        torch.save({'model_state_dict': lora.lora_state_dict(trainer.model)}, args.save_path)
-    else:
-        torch.save(trainer.model, args.save_path)
+    # save model checkpoint after fitting on only rank0
+    strategy.save_model(model, 'rm_checkpoint.pt', only_rank0=True)
+    # save optimizer checkpoint on all ranks
+    strategy.save_optimizer(optim, 'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()), only_rank0=False)
 
 
 if __name__ == '__main__':
@@ -69,6 +89,7 @@ def train(args):
     parser.add_argument('--strategy',
                         choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
                         default='naive')
+    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt'], default='bloom')
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--dataset', type=str, default='Dahoas/rm-static')
     parser.add_argument('--save_path', type=str, default='rm_ckpt.pth')
diff --git a/applications/ChatGPT/examples/train_rm.sh b/applications/ChatGPT/examples/train_rm.sh
index ed91deee2c59..6e11a148bfbe 100755
--- a/applications/ChatGPT/examples/train_rm.sh
+++ b/applications/ChatGPT/examples/train_rm.sh
@@ -15,4 +15,6 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 
 set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
-torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain '/data2/users/lczht/bloom-560m' --strategy colossalai_zero2
+# torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain 'bigscience/bloomz-560m' --model 'bloom' --strategy colossalai_zero2
+torchrun --standalone --nproc_per_node=2 train_reward_model.py  --model 'gpt2' --strategy colossalai_zero2
+# torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain "facebook/opt-350m" --model 'opt' --strategy colossalai_zero2

From 0f392d7403b432ae176a1af501c734ee9608b82b Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Thu, 23 Feb 2023 17:28:19 +0800
Subject: [PATCH 380/503] [autoparallel] find repeat blocks (#2854)

* [autoparallel] find repeat blocks

* polish

* polish

* polish
---
 .../tensor_shard/utils/factory.py             | 122 +++++++++++++++++-
 .../test_auto_parallel/test_pass/__init__.py  |   0
 .../test_find_repeat_block.py                 | 110 ++++++++++++++++
 3 files changed, 229 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_auto_parallel/test_pass/__init__.py
 create mode 100644 tests/test_auto_parallel/test_tensor_shard/test_find_repeat_block.py

diff --git a/colossalai/auto_parallel/tensor_shard/utils/factory.py b/colossalai/auto_parallel/tensor_shard/utils/factory.py
index fd3ba3d41c30..05331e560001 100644
--- a/colossalai/auto_parallel/tensor_shard/utils/factory.py
+++ b/colossalai/auto_parallel/tensor_shard/utils/factory.py
@@ -1,13 +1,16 @@
+import copy
 import operator
 import warnings
 from functools import reduce
 from typing import Dict, List, Optional, Union
 
 import torch
+from torch.fx.node import Node
+from torch.utils._pytree import tree_map
+
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
 from colossalai.tensor.sharding_spec import ShardingSpec
-from torch.fx.node import Node
 
 from ..constants import INFINITY_COST
 
@@ -18,7 +21,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
                            dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
     """
     Generate the sharding spec of the tensor based on the given dim_partition_dict.
-    
+
 
     Args:
         input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
@@ -59,7 +62,7 @@ def generate_resharding_costs(nodes: List[Node],
         nodes (List[Node]): a list of nodes
         sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes.
         count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
-        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None. 
+        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
     '''
     # The resharding_cost of weight is counted due to sharing weight cases.
     resharding_costs = {}
@@ -88,3 +91,116 @@ def generate_resharding_costs(nodes: List[Node],
                 resharding_cost = INFINITY_COST
             resharding_costs[input_node].append(resharding_cost)
     return resharding_costs
+
+
+def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_length_threshold: int = 20):
+    '''
+    Find the largest repeat blocks in the node list, whose length is at least the threshold.
+
+    Args:
+        node_list (List[torch.fx.Node]): the nodes to be analyzed; each one gets a `hash_key` attribute set.
+        root_module: the module used to look up the submodules targeted by call_module nodes.
+        common_length_threshold (int): the threshold of the repeat block length.
+    Returns: a list of node lists, one per repeat block; empty if no repeat block is found.
+    '''
+
+    def _process_args(args):
+        new_args = []
+        for arg in args:
+            if hasattr(arg, '_meta_data'):
+                meta_data = arg._meta_data
+            else:
+                meta_data = arg
+            # replace tensors by their size and slices by a (start, step, stop) tuple
+            def _process_arg(data):
+                if isinstance(data, torch.Tensor):
+                    data = data.size()
+                elif isinstance(data, slice):
+                    data = (data.start, data.step, data.stop)
+                return data
+
+            new_meta_data = tree_map(_process_arg, meta_data)
+            new_args.append(new_meta_data)
+
+        return new_args
+
+    def _all_equal(check_list, check_fn):
+        base_value = check_list[-1]
+        for e in check_list:
+            if not check_fn(e, base_value):
+                return False
+        return True
+
+    def _check_node_list_equal(l1, l2):
+        if len(l1) != len(l2):
+            return False
+        for node1, node2 in zip(l1, l2):
+            if hash(node1.hash_key) != hash(node2.hash_key):
+                return False
+        return True
+
+    def _check_node_equal(node1, node2):
+        if hash(node1.hash_key) == hash(node2.hash_key):
+            return True
+        return False
+
+    for index, node in enumerate(node_list):
+        if node.op == 'call_module':
+            target = node.target
+            submod = root_module.get_submodule(target)
+            submod_type = type(submod)
+            target = submod_type
+        else:
+            target = node.target
+
+        new_args = _process_args(node.args)
+        # get_attr nodes are keyed by op only; other nodes by op, target and processed args
+        if node.op != 'get_attr':
+            hash_key = (node.op, target, *new_args)
+        else:
+            hash_key = (node.op,)
+
+        setattr(node, 'hash_key', hash_key)
+
+    hash_value_to_node_dict = {}
+    # map each hash value to the indices of the nodes that share it
+    for index, node in enumerate(node_list):
+        hash_value = hash(node.hash_key)
+        if hash_value not in hash_value_to_node_dict:
+            hash_value_to_node_dict[hash_value] = []
+        hash_value_to_node_dict[hash_value].append(index)
+
+    # scan the nodes; a later match overwrites common_blocks_index and updates max_common_length
+
+    node_list_start = 0
+    max_common_length = common_length_threshold
+    common_blocks_index = []
+    for index, node in enumerate(node_list):
+        # the comparison will be triggered if a common node appears
+        if len(hash_value_to_node_dict[hash(node.hash_key)]) >= 2:
+            start_index_list = hash_value_to_node_dict[hash(node.hash_key)]
+            check_block_list = [node_list[start:start + max_common_length] for start in start_index_list]
+
+            common_label = True
+            if not _all_equal(check_block_list, _check_node_list_equal):
+                common_label = False
+
+            if common_label:
+                common_blocks_index = copy.deepcopy(start_index_list)
+                max_step = len(node_list) - common_blocks_index[-1] - max_common_length - 1
+
+                for i in range(max_step):
+                    # max_step bounds i so the lookups stay inside node_list; TODO: add an assertion
+                    next_node_list = [node_list[index + max_common_length + i] for index in start_index_list]
+                    if not _all_equal(next_node_list, _check_node_equal):
+                        max_step = i
+                        break
+                max_common_length += max_step
+                node_list_start += max_common_length
+
+    # recover common subgraph from the index
+    common_blocks = []
+    for start in common_blocks_index:
+        common_blocks.append(node_list[start:start + max_common_length])
+
+    return common_blocks
diff --git a/tests/test_auto_parallel/test_pass/__init__.py b/tests/test_auto_parallel/test_pass/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_find_repeat_block.py b/tests/test_auto_parallel/test_tensor_shard/test_find_repeat_block.py
new file mode 100644
index 000000000000..90301521f207
--- /dev/null
+++ b/tests/test_auto_parallel/test_tensor_shard/test_find_repeat_block.py
@@ -0,0 +1,110 @@
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch.fx import GraphModule
+from transformers.pytorch_utils import Conv1D
+
+from colossalai.auto_parallel.tensor_shard.utils.factory import find_repeat_blocks
+from colossalai.fx.tracer.tracer import ColoTracer
+from colossalai.testing import parameterize
+from colossalai.testing.pytest_wrapper import run_on_environment_flag
+
+NUM_REPEAT_BLOCKS = 4
+BATCH_SIZE = 1
+SEQ_LENGTH = 32
+HIDDEN_DIM = 384
+
+
+class RepeatBlock(nn.Module):
+    """Block that applies a Conv1D layer, a ReLU activation and a second Conv1D layer."""
+
+    def __init__(self, intermediate_size, hidden_size):
+        super().__init__()
+        self.c_fc = Conv1D(intermediate_size, hidden_size)
+        self.c_proj = Conv1D(hidden_size, intermediate_size)
+        self.act = torch.nn.ReLU()
+
+    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        # apply c_fc, the activation and c_proj in sequence
+        projected = self.c_fc(hidden_states)
+        activated = self.act(projected)
+        return self.c_proj(activated)
+
+
+class RepeatModel(nn.Module):
+    """Stack of num_layers RepeatBlock modules that are all built with the same sizes."""
+
+    def __init__(self, intermediate_size, hidden_size, num_layers):
+        super().__init__()
+        layers = [RepeatBlock(intermediate_size, hidden_size) for _ in range(num_layers)]
+        self.blocks = nn.ModuleList(layers)
+
+    def forward(self, x):
+        for layer in self.blocks:
+            x = layer(x)
+        return x
+
+
+class NonRepeatBlock(nn.Module):
+    """Conv1D, ReLU, Conv1D block whose intermediate size is floor-divided by (layer_index + 1)."""
+
+    def __init__(self, intermediate_size, hidden_size, layer_index):
+        super().__init__()
+        scaled_size = intermediate_size // (layer_index + 1)
+        self.c_fc = Conv1D(scaled_size, hidden_size)
+        self.c_proj = Conv1D(hidden_size, scaled_size)
+        self.act = torch.nn.ReLU()
+
+    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        # apply c_fc, the activation and c_proj in sequence
+        projected = self.c_fc(hidden_states)
+        activated = self.act(projected)
+        return self.c_proj(activated)
+
+
+class NonRepeatModel(nn.Module):
+    """Stack of num_layers NonRepeatBlock modules, each built with its own layer index."""
+
+    def __init__(self, intermediate_size, hidden_size, num_layers):
+        super().__init__()
+        layers = [NonRepeatBlock(intermediate_size, hidden_size, idx) for idx in range(num_layers)]
+        self.blocks = nn.ModuleList(layers)
+
+    def forward(self, x):
+        for layer in self.blocks:
+            x = layer(x)
+        return x
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+@parameterize('model_cls', [RepeatModel, NonRepeatModel])
+def test_repeat_blocks(model_cls):
+    """Check that find_repeat_blocks finds one block per layer for RepeatModel and none for NonRepeatModel."""
+    model = model_cls(4 * HIDDEN_DIM, HIDDEN_DIM, NUM_REPEAT_BLOCKS)
+
+    tracer = ColoTracer()
+    input_sample = {'x': torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_DIM).to('meta')}
+    graph = tracer.trace(root=model, meta_args=input_sample)
+
+    gm = GraphModule(model, graph, model.__class__.__name__)
+    gm.recompile()
+
+    node_list = list(graph.nodes)
+    root_module = graph.owning_module
+    common_blocks = find_repeat_blocks(node_list, root_module, common_length_threshold=10)
+
+    total_num_nodes = len(list(graph.nodes))
+    # remove the input placeholder node and the output node
+    num_repeat_nodes_per_block = (total_num_nodes - 2) // NUM_REPEAT_BLOCKS
+    for common_block in common_blocks:
+        print(common_block)
+    if model_cls == RepeatModel:
+        assert len(common_blocks) == NUM_REPEAT_BLOCKS
+        assert len(common_blocks[0]) == num_repeat_nodes_per_block
+    elif model_cls == NonRepeatModel:
+        assert len(common_blocks) == 0
+
+
+if __name__ == '__main__':
+    test_repeat_blocks()

From 819e25d8b1bbb8d38e5972c694331b23ebce6d88 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Thu, 23 Feb 2023 17:28:36 +0800
Subject: [PATCH 381/503] [hotfix] fix autoparallel compatibility test issues
 (#2754)

---
 .../auto_parallel/tensor_shard/initialize.py     |  1 +
 .../test_compatibility_with_ddp.py               | 15 +++++++++++----
 .../test_compatibility_with_gemini.py            | 16 +++++++++++-----
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/initialize.py b/colossalai/auto_parallel/tensor_shard/initialize.py
index 012b0ff43c5d..4affa37897bf 100644
--- a/colossalai/auto_parallel/tensor_shard/initialize.py
+++ b/colossalai/auto_parallel/tensor_shard/initialize.py
@@ -330,6 +330,7 @@ def autoparallelize(model: nn.Module,
                                      device_mesh,
                                      solver_preference=solver_preference,
                                      dataloader_option=dataloader_option,
+                                     shard_option=shard_option,
                                      save_solver_solution=save_solver_solution,
                                      load_solver_solution=load_solver_solution,
                                      solution_path=solver_solution_path,
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py
index 365981f105f0..e4982a5d7f5a 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_ddp.py
@@ -33,11 +33,15 @@ def check_compatibility_with_ddp(rank, world_size, port):
     disable_existing_loggers()
     launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
     model = MLP(4).cuda()
-    input = torch.rand(4, 4).cuda()
-    output_compare = model(input)
+    if rank in [0, 1]:
+        input = torch.arange(0, 16, dtype=torch.float).reshape(4, 4).cuda()
+    elif rank in [2, 3]:
+        input = torch.arange(16, 32, dtype=torch.float).reshape(4, 4).cuda()
+    input_compare = torch.arange(0, 32, dtype=torch.float).reshape(8, 4).cuda()
+    output_compare = model(input_compare)
     loss_compare = output_compare.sum()
     loss_compare.backward()
-    grad_compare = copy.deepcopy(model.linear_1.weight.grad)
+    grad_compare = copy.deepcopy(model.linear_1.weight.grad / 2)
 
     physical_mesh_id = torch.arange(0, 4)
     mesh_shape = (2, 2)
@@ -70,7 +74,10 @@ def check_compatibility_with_ddp(rank, world_size, port):
     gm = DDP(gm, process_group=dp_process_group)
     output = gm(input)
 
-    assert_close(output, output_compare)
+    if rank in (0, 1):
+        assert_close(output, output_compare.narrow(0, 0, 4))
+    else:
+        assert_close(output, output_compare.narrow(0, 4, 4))
     print(f'output on rank{rank} is correct')
     loss = output.sum()
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
index b4080c54599a..760401c3f2c2 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
@@ -37,12 +37,15 @@ def check_auto_parallel_with_gemini(rank, world_size, port):
     disable_existing_loggers()
     launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
     model = MLP(4).half().cuda()
-
-    input = torch.rand(4, 4).half().cuda()
-    output_compare = model(input)
+    if rank in [0, 1]:
+        input = torch.arange(0, 16).reshape(4, 4).half().cuda()
+    elif rank in [2, 3]:
+        input = torch.arange(16, 32).reshape(4, 4).half().cuda()
+    input_compare = torch.arange(0, 32).reshape(8, 4).half().cuda()
+    output_compare = model(input_compare)
     loss_compare = output_compare.sum()
     loss_compare.backward()
-    grad_compare = copy.deepcopy(model.linear_1.weight.grad)
+    grad_compare = copy.deepcopy(model.linear_1.weight.grad / 2)
 
     physical_mesh_id = torch.arange(0, 4)
     mesh_shape = (2, 2)
@@ -79,7 +82,10 @@ def check_auto_parallel_with_gemini(rank, world_size, port):
     optimizer = HybridAdam(gm.parameters(), betas=(0, 0))
     optimizer = zero_optim_wrapper(gm, optimizer, initial_scale=1)
     output = gm(input)
-    assert_close(output, output_compare)
+    if rank in (0, 1):
+        assert_close(output, output_compare.narrow(0, 0, 4))
+    else:
+        assert_close(output, output_compare.narrow(0, 4, 4))
     print(f'output on rank{rank} is correct')
     loss = output.sum()
     optimizer.zero_grad()

From 8c8a39be950af86a89f1cbc186f5cb24324018ec Mon Sep 17 00:00:00 2001
From: "Jiatong (Julius) Han" <59948448+JThh@users.noreply.github.com>
Date: Thu, 23 Feb 2023 23:56:15 +0800
Subject: [PATCH 382/503] [hotfix]: Remove math.prod dependency (#2837)

* Remove math.prod dependency

* Fix style

* Fix style

---------

Co-authored-by: Jiatong Han <jiatong.han@u.nus.edu>
---
 colossalai/nn/_ops/view.py       | 193 +++++++++++++++----------------
 colossalai/tensor/colo_tensor.py |   6 +-
 2 files changed, 99 insertions(+), 100 deletions(-)

diff --git a/colossalai/nn/_ops/view.py b/colossalai/nn/_ops/view.py
index 3197e7568d6f..3c0bc52337ce 100644
--- a/colossalai/nn/_ops/view.py
+++ b/colossalai/nn/_ops/view.py
@@ -1,97 +1,96 @@
-import math
-import torch
-from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec
-from typing import Optional, Union
-
-
-def _all_int(my_iter):
-    return all(isinstance(i, int) for i in my_iter)
-
-
-def _get_valid_shape(shape):
-    if isinstance(shape, list):
-        if _all_int(shape):
-            return tuple(shape)
-        else:
-            raise RuntimeError("expects type(int) but finds an other type")
-    elif isinstance(shape, tuple):
-        if _all_int(shape):
-            return shape
-        else:
-            return _get_valid_shape(shape[0])
-    else:
-        raise RuntimeError("expects an iterable array but finds '{}'".format(type(shape)))
-
-
-def _shape_infer(org_sp, tgt_sp):
-    cnt = 0
-    pos = 0
-    for idx, dim in enumerate(tgt_sp):
-        if dim < -1:
-            raise RuntimeError("invalid shape dimension {}".format(dim))
-        elif dim == -1:
-            cnt += 1
-            pos = idx
-
-    if cnt > 1:
-        raise RuntimeError("only one dimension can be inferred")
-
-    org_prod = math.prod(org_sp)
-    tgt_prod = math.prod(tgt_sp)
-
-    if cnt == 0:
-        if org_prod != tgt_prod:
-            raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod))
-        else:
-            return tgt_sp
-    elif org_prod % tgt_prod != 0:
-        raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod))
-
-    infer_dim = -(org_prod // tgt_prod)
-    return tgt_sp[: pos] + (infer_dim,) + tgt_sp[pos + 1:]
-
-
-@colo_op_impl(torch.Tensor.view)
-def colo_view(self: ColoTensor, *shape) -> 'ColoTensor':
-    """Handles ``__torch_function__`` dispatch for ``torch.Tensor.view``.
-    Changes the shape of the current tensor.
-    """
-    assert isinstance(self, ColoTensor)
-    # apply original `view` function for replicated colo tensors
-    if self.is_replicate():
-        return self.view(*shape)
-
-    cur_sp = self.size()
-    org_sp = self.size_global()
-    # parse the passed arguments
-    tgt_sp = _get_valid_shape(shape)
-    # get the correct shape from inference
-    inf_sp = _shape_infer(org_sp, tgt_sp)
-
-    if self.is_shard_1drow() and org_sp[0] == inf_sp[0]:
-        new_shape = (cur_sp[0],) + tgt_sp[1:]
-        res = self.view(*new_shape)
-    elif self.is_shard_1dcol() and org_sp[-1] == inf_sp[-1]:
-        new_shape = tgt_sp[:-1] + (cur_sp[-1],)
-        res = self.view(*new_shape)
-    else:
-        replicated_t = self.redistribute(dist_spec=ReplicaSpec())
-        return ColoTensor.from_torch_tensor(
-            tensor=replicated_t.view(*shape),
-            spec=ColoTensorSpec(self.get_process_group()))
-
-    return ColoTensor.from_torch_tensor(
-        tensor=res,
-        spec=ColoTensorSpec(
-            pg=self.get_process_group(),
-            dist_attr=self.dist_spec))
-
-
-@colo_op_impl(torch.Tensor.size)
-def colo_size(self: ColoTensor, dim: Optional[int] = None) -> Union[torch.Size, int]:
-    size = self.size_global()
-    if dim is None:
-        return size
-    else:
-        return size[dim]
+import operator
+from functools import reduce
+from typing import Optional, Union
+
+import torch
+
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec
+from colossalai.tensor.op_wrapper import colo_op_impl
+
+
+def _all_int(my_iter):
+    return all(isinstance(i, int) for i in my_iter)
+
+
+def _get_valid_shape(shape):
+    if isinstance(shape, list):
+        if _all_int(shape):
+            return tuple(shape)
+        else:
+            raise RuntimeError("expects type(int) but finds an other type")
+    elif isinstance(shape, tuple):
+        if _all_int(shape):
+            return shape
+        else:
+            return _get_valid_shape(shape[0])
+    else:
+        raise RuntimeError("expects an iterable array but finds '{}'".format(type(shape)))
+
+
+def _shape_infer(org_sp, tgt_sp):
+    cnt = 0
+    pos = 0
+    for idx, dim in enumerate(tgt_sp):
+        if dim < -1:
+            raise RuntimeError("invalid shape dimension {}".format(dim))
+        elif dim == -1:
+            cnt += 1
+            pos = idx
+
+    if cnt > 1:
+        raise RuntimeError("only one dimension can be inferred")
+
+    org_prod = reduce(operator.mul, org_sp, 1)
+    tgt_prod = reduce(operator.mul, tgt_sp, 1)
+
+    if cnt == 0:
+        if org_prod != tgt_prod:
+            raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod))
+        else:
+            return tgt_sp
+    elif org_prod % tgt_prod != 0:
+        raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod))
+
+    infer_dim = -(org_prod // tgt_prod)
+    return tgt_sp[:pos] + (infer_dim,) + tgt_sp[pos + 1:]
+
+
+@colo_op_impl(torch.Tensor.view)
+def colo_view(self: ColoTensor, *shape) -> 'ColoTensor':
+    """Handles ``__torch_function__`` dispatch for ``torch.Tensor.view``.
+    Changes the shape of the current tensor.
+    """
+    assert isinstance(self, ColoTensor)
+    # apply original `view` function for replicated colo tensors
+    if self.is_replicate():
+        return self.view(*shape)
+
+    cur_sp = self.size()
+    org_sp = self.size_global()
+    # parse the passed arguments
+    tgt_sp = _get_valid_shape(shape)
+    # get the correct shape from inference
+    inf_sp = _shape_infer(org_sp, tgt_sp)
+
+    if self.is_shard_1drow() and org_sp[0] == inf_sp[0]:
+        new_shape = (cur_sp[0],) + tgt_sp[1:]
+        res = self.view(*new_shape)
+    elif self.is_shard_1dcol() and org_sp[-1] == inf_sp[-1]:
+        new_shape = tgt_sp[:-1] + (cur_sp[-1],)
+        res = self.view(*new_shape)
+    else:
+        replicated_t = self.redistribute(dist_spec=ReplicaSpec())
+        return ColoTensor.from_torch_tensor(tensor=replicated_t.view(*shape),
+                                            spec=ColoTensorSpec(self.get_process_group()))
+
+    return ColoTensor.from_torch_tensor(tensor=res,
+                                        spec=ColoTensorSpec(pg=self.get_process_group(), dist_attr=self.dist_spec))
+
+
+@colo_op_impl(torch.Tensor.size)
+def colo_size(self: ColoTensor, dim: Optional[int] = None) -> Union[torch.Size, int]:
+    size = self.size_global()
+    if dim is None:
+        return size
+    else:
+        return size[dim]
diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py
index 474dc7a1e9bf..bbed8847abbc 100644
--- a/colossalai/tensor/colo_tensor.py
+++ b/colossalai/tensor/colo_tensor.py
@@ -1,6 +1,6 @@
-import math
+import operator
 from copy import copy
-from functools import lru_cache
+from functools import lru_cache, reduce
 from typing import Callable, Optional, Set
 
 import torch
@@ -312,7 +312,7 @@ def size_global(self, *args) -> torch.Size:
     def numel_global(self):
         """Returns the number of elements in the tensor when it's replicated.
         """
-        return math.prod(self.size_global())
+        return reduce(operator.mul, self.size_global(), 1)
 
     # Some API for dist spec check
 

From e33c043dec37d0e78b18f6df36fbc3890f0b53f8 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 24 Feb 2023 14:41:33 +0800
Subject: [PATCH 383/503] [workflow] moved pre-commit to post-commit (#2895)

---
 .github/workflows/README.md                   | 10 ++-
 .../{pre_commit.yml => post_commit.yml}       | 50 +++++++++-----
 .../workflows/report_precommit_failure.yml    | 67 -------------------
 3 files changed, 39 insertions(+), 88 deletions(-)
 rename .github/workflows/{pre_commit.yml => post_commit.yml} (50%)
 delete mode 100644 .github/workflows/report_precommit_failure.yml

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 3bf535343d6d..9634b84b8ff8 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -35,10 +35,9 @@ I will provide the details of each workflow below.
 
 ### Code Style Check
 
-| Workflow Name               | File name                      | Description                                                                                                |
-| --------------------------- | ------------------------------ | ---------------------------------------------------------------------------------------------------------- |
-| `Pre-commit`                | `pre_commit.yml`               | This workflow runs pre-commit checks for code style consistency for PRs.                                   |
-| `Report pre-commit failure` | `report_precommit_failure.yml` | This PR will put up a comment in the PR to explain the precommit failure and remedy if `Pre-commit` fails. |
+| Workflow Name | File name         | Description                                                                                                    |
+| ------------- | ----------------- | -------------------------------------------------------------------------------------------------------------- |
+| `post-commit` | `post_commit.yml` | This workflow runs pre-commit checks for changed files to achieve code style consistency after a PR is merged. |
 
 ### Unit Test
 
@@ -130,8 +129,7 @@ This file controls which CUDA versions will be checked against CUDA extenson bui
 ## Progress Log
 
 - [x] Code style check
-  - [x] pre-commit check
-  - [x] pre-commit failure report
+  - [x] post-commit check
 - [x] unit testing
   - [x] test on PR
   - [x] report test coverage
diff --git a/.github/workflows/pre_commit.yml b/.github/workflows/post_commit.yml
similarity index 50%
rename from .github/workflows/pre_commit.yml
rename to .github/workflows/post_commit.yml
index 3e71be2fc611..765a4d4281e9 100644
--- a/.github/workflows/pre_commit.yml
+++ b/.github/workflows/post_commit.yml
@@ -1,11 +1,17 @@
-name: pre-commit
+name: post-commit
 
 on:
   pull_request:
+    types:
+        - closed
 
 jobs:
+  # this job will run after a PR is merged to run pre-commit on any changed file
+  # so that the user does not need to learn pre-commit and pre-commit can still
+  # be auto-executed by the workflow
   pre-commit:
     runs-on: ubuntu-latest
+    if: github.event.pull_request.merged == true && github.repository == 'hpcaitech/ColossalAI'
     steps:
     - uses: actions/checkout@v2
       with:
@@ -36,6 +42,11 @@ jobs:
           echo "$file was changed"
         done
 
+    # check out the main branch
+    - uses: actions/checkout@v2
+      with:
+        ref: 'main'
+
     - uses: actions/setup-python@v3
 
     - name: Cache pre-commit hooks
@@ -49,23 +60,32 @@ jobs:
         pip install pre-commit
         pre-commit install
 
-    - name: Run pre-commit on Changed Files
-      id: precommit
+    # run pre-commit on changed files
+    - name: Run Pre-commit
       run: |
         for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
-          echo "======= running pre-commit on ${file} ======="
-          pre-commit run --files $file
+          pre-commit run --files $file || true
         done
 
-    - name: Save PR number
-      if: always()
-      env:
-        PR_NUMBER: ${{ github.event.number }}
+    # create commit for pre-commit
+    - name: Create commits
       run: |
-        mkdir -p ./pr
-        echo $PR_NUMBER > ./pr/pr_number
-    - uses: actions/upload-artifact@v3
-      if: always()
+        git config --global user.name 'github-actions'
+        git config --global user.email 'github-actions@github.com'
+        git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
+        git add -A
+        git commit -am "[format] applied code formatting on changed files in pull request ${{ github.event.pull_request.number }}"
+
+    # create pull request
+    - name: Create Pull Request
+      id: cpr
+      uses: peter-evans/create-pull-request@v4
+      with:
+        branch: pre-commit-${{ github.event.pull_request.number }}
+        title: "[format] applied code formatting on changed files in PR ${{ github.event.pull_request.number }}"
+
+    - name: Enable Auto-merge for the New PR
+      uses: peter-evans/enable-pull-request-automerge@v2
       with:
-        name: pr_number
-        path: pr/
+        pull-request-number: ${{ steps.cpr.outputs.pull-request-number }}
+        merge-method: squash
diff --git a/.github/workflows/report_precommit_failure.yml b/.github/workflows/report_precommit_failure.yml
deleted file mode 100644
index e6ca7b01bcc1..000000000000
--- a/.github/workflows/report_precommit_failure.yml
+++ /dev/null
@@ -1,67 +0,0 @@
-name: Report Precommit Failure
-
-on:
-  workflow_run:
-    workflows: [pre-commit]
-    types:
-      - completed
-
-jobs:
-  # comment with a message on how to do pre-commit
-  # if the pre-commit check was not passed
-  report-precommit-failure:
-    runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'failure' }}
-    steps:
-      - name: 'Download artifact'
-        uses: actions/github-script@v6
-        with:
-          script: |
-            let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
-               owner: context.repo.owner,
-               repo: context.repo.repo,
-               run_id: context.payload.workflow_run.id,
-            });
-            let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => {
-              return artifact.name == "pr_number"
-            })[0];
-            let download = await github.rest.actions.downloadArtifact({
-               owner: context.repo.owner,
-               repo: context.repo.repo,
-               artifact_id: matchArtifact.id,
-               archive_format: 'zip',
-            });
-            let fs = require('fs');
-            fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/pr_number.zip`, Buffer.from(download.data));
-
-      - name: 'Unzip artifact'
-        run: unzip pr_number.zip
-
-      - name: 'Comment on PR'
-        uses: actions/github-script@v6
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            let fs = require('fs');
-            let issue_number = Number(fs.readFileSync('./pr_number'));
-            let owner = context.repo.owner;
-            let repo = context.repo.repo;
-            let run_id = context.payload.workflow_run.id;
-            let run_url = `https://github.com/${owner}/${repo}/actions/runs/${run_id}`
-            let body = `
-            Your pre-commit check failed, follow the steps to run pre-commit on your file for code style consistency.
-
-            1. install pre-commit via "pip install pre-commit"
-            2. install pre-commit hooks via "pre-commit install"
-            3. run pre-commit on file with format error via "pre-commit run --files path" by replacing "path" with the actual file path
-            4. commit and push to your branch
-
-            View your job at ${run_url}.
-            Read our "CONTRIBUTING.md" for more reference to the code style.
-            `;
-            await github.rest.issues.createComment({
-              owner: owner,
-              repo: repo,
-              issue_number: issue_number,
-              body: body
-            });

From dbc01b9c0479a6fd3fb04450b9dc01b5162d8c0d Mon Sep 17 00:00:00 2001
From: fastalgo <youyang@cs.berkeley.edu>
Date: Sat, 25 Feb 2023 12:27:10 +0800
Subject: [PATCH 384/503] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ca92508527b1..65763325e4e0 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
    [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)
 
-   Colossal-AI: Making big AI models cheaper, easier, and more scalable
+   Colossal-AI: Making large AI models cheaper, faster and more accessible
 
    <h3> <a href="https://arxiv.org/abs/2110.14883"> Paper </a> |
    <a href="https://www.colossalai.org/"> Documentation </a> |

From 7b13f7db18999c611db21a17b7388709af75eda1 Mon Sep 17 00:00:00 2001
From: YH <100389977+yhna940@users.noreply.github.com>
Date: Mon, 27 Feb 2023 15:04:53 +0900
Subject: [PATCH 385/503] [zero] trivial zero optimizer refactoring (#2869)

* Fix minor grad store interface

* Apply lint
---
 .../bookkeeping/gradient_store.py             | 11 +++++---
 .../zero/sharded_optim/low_level_optim.py     | 25 +++++++------------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py b/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
index b166752cc400..942d7186e55f 100644
--- a/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
@@ -6,6 +6,7 @@
 
 
 class GradientStore(BaseStore):
+
     def __init__(self, *args):
         super().__init__(*args)
         # bookkeeping data structures
@@ -56,9 +57,7 @@ def append_average_gradient_by_group(self, group_id: int, tensor: Tensor) -> Non
         else:
             self._averaged_gradients[group_id] = [tensor]
 
-    def add_average_gradient_by_group(
-        self, group_id: int, tensor_idx: int, tensor: Tensor
-    ) -> None:
+    def add_average_gradient_by_group(self, group_id: int, tensor_idx: int, tensor: Tensor) -> None:
         """
         Add an average gradient to the list of averaged gradients of a parameter group
 
@@ -81,3 +80,9 @@ def reset_average_gradients_by_group(self, group_id: int) -> None:
         """
 
         self._averaged_gradients[group_id] = []
+
+    def reset_all_average_gradients(self) -> None:
+        """
+        Reset the bookkeeping data structure for averaged gradients to an empty dict
+        """
+        self._averaged_gradients = dict()
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index f5e03ce28532..502b1c4d9f4c 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -416,7 +416,7 @@ def zero_grad(self, set_to_none=True):
         :param set_to_none: Whether set the gradient to None. Default value is True.
         :type set_to_none: bool
         """
-        for group_id, param_group in self._fp16_param_groups.items():
+        for _, param_group in self._fp16_param_groups.items():
             for param in param_group:
                 if set_to_none:
                     param.grad = None
@@ -438,7 +438,7 @@ def step(self, closure=None):
 
         # update loss scale if overflow occurs
         if found_inf:
-            self._grad_store._averaged_gradients = dict()
+            self._grad_store.reset_all_average_gradients()
             self.zero_grad()
             return
 
@@ -448,7 +448,7 @@ def step(self, closure=None):
 
         for group_id in range(self.num_param_groups):
             # compute norm
-            norm_group = compute_norm(gradients=self._grad_store._averaged_gradients[group_id],
+            norm_group = compute_norm(gradients=self._grad_store.get_averaged_gradients_by_group(group_id),
                                       params=self._param_store.get_fp16_params_by_rank_group(group_id=group_id,
                                                                                              rank=self._local_rank),
                                       dp_group=self._dp_torch_group,
@@ -469,8 +469,7 @@ def step(self, closure=None):
             single_grad_partition_groups.append(flat_fp32_avg_grads)
             device = self._fp32_flat_param_groups_of_current_rank[group_id].device
             self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
-            self._grad_store._averaged_gradients[group_id] = []
-            self._grad_store._averaged_gradients[group_id] = []
+            self._grad_store.reset_average_gradients_by_group(group_id)
 
         # unscale and clip grads
         global_norm = calculate_global_norm_from_list(norm_list=norm_groups)
@@ -546,28 +545,22 @@ def _unscale_and_clip_grads(self, grad_groups_flat, total_norm):
     def _sync_grad(self):
         # update param already reduced flag
         reduction_states = self._param_store.get_param_reduction_states()
-        for tensor, state in reduction_states.items():
+        for tensor, _ in reduction_states.items():
             reduction_states[tensor] = False
 
         # accumulate gradient
         for group_id in range(self.num_param_groups):
             param_group = self._param_store.get_fp16_params_by_rank_group(self._local_rank, group_id)
 
-            avg_gradients_group = self._grad_store.get_averaged_gradients_by_group(
-                group_id
-            )
+            avg_gradients_group = self._grad_store.get_averaged_gradients_by_group(group_id)
 
             param_idx = 0
             for param in param_group:
                 if param.grad is not None:
                     if len(avg_gradients_group) == param_idx:
-                        self._grad_store.append_average_gradient_by_group(
-                            group_id, param.grad
-                        )
+                        self._grad_store.append_average_gradient_by_group(group_id, param.grad)
                     else:
-                        self._grad_store.add_average_gradient_by_group(
-                            group_id, param_idx, param.grad
-                        )
+                        self._grad_store.add_average_gradient_by_group(group_id, param_idx, param.grad)
                     param_idx += 1
 
         # the gradients needed are stored in the avg_gradients buffer
@@ -594,4 +587,4 @@ def _reduce_grad_stage2(self):
         # only need to reduce the gradients
         # left in the communication bucket
         for reduce_rank in range(self._world_size):
-            self._run_reduction(reduce_rank)
\ No newline at end of file
+            self._run_reduction(reduce_rank)

From 0afb55fc5ba5f517fda2a7872d2ac2170b5ee983 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Mon, 27 Feb 2023 14:59:27 +0800
Subject: [PATCH 386/503] [doc] add os scope, update tutorial install and tips
 (#2914)

---
 README-zh-Hans.md                               |  1 +
 README.md                                       |  1 +
 docs/source/en/get_started/installation.md      |  1 +
 docs/source/zh-Hans/get_started/installation.md |  1 +
 examples/tutorial/README.md                     | 13 +++----------
 5 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 1ef8ade8520b..54d97af82efa 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -273,6 +273,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 
 ## 安装
+> Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试
 
 ### 从PyPI安装
 
diff --git a/README.md b/README.md
index 65763325e4e0..bb1b0e2164de 100644
--- a/README.md
+++ b/README.md
@@ -275,6 +275,7 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 <p align="right">(<a href="#top">back to top</a>)</p>
 
 ## Installation
+> Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
 
 ### Install from PyPI
 
diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md
index b4285a40e194..1757b4241ed2 100644
--- a/docs/source/en/get_started/installation.md
+++ b/docs/source/en/get_started/installation.md
@@ -1,4 +1,5 @@
 # Setup
+> Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
 
 ## Download From PyPI
 
diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md
index bcb2112bb2e8..5f2351ffe446 100755
--- a/docs/source/zh-Hans/get_started/installation.md
+++ b/docs/source/zh-Hans/get_started/installation.md
@@ -1,4 +1,5 @@
 # 安装
+> Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试
 
 ## 从PyPI上安装
 
diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
index 1da77e831c23..000cf2117335 100644
--- a/examples/tutorial/README.md
+++ b/examples/tutorial/README.md
@@ -1,5 +1,7 @@
 # Colossal-AI Tutorial Hands-on
 
+> This path is an abbreviated tutorial prepared for specific activities and may not be maintained in real time. For use of Colossal-AI, please refer to other [examples](https://github.com/hpcaitech/ColossalAI/tree/main/examples) and [documents](https://www.colossalai.org/).
+
 ## Introduction
 
 Welcome to the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) tutorial, which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), etc.
@@ -38,16 +40,7 @@ If you encounter any problem while running these tutorials, you may want to rais
 
 ## 🛠️ Setup environment
 [[video]](https://www.youtube.com/watch?v=dpMYj974ZIc) You should use `conda` to create a virtual environment, we recommend **python 3.8**, e.g. `conda create -n colossal python=3.8`. This installation commands are for CUDA 11.3, if you have a different version of CUDA, please download PyTorch and Colossal-AI accordingly.
-
-```
-# install torch
-# visit https://pytorch.org/get-started/locally/ to download other versions
-pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
-
-# install latest ColossalAI
-# visit https://colossalai.org/download to download corresponding version of Colossal-AI
-pip install colossalai==0.1.11rc3+torch1.12cu11.3 -f https://release.colossalai.org
-```
+You can refer to the [Installation](https://github.com/hpcaitech/ColossalAI#installation) to set up your environment.
 
 You can run `colossalai check -i` to verify if you have correctly set up your environment 🕹️.
 ![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/colossalai%20check%20-i.png)

From 12bafe057fb3020326a28d28d1f9f2734bb55658 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Mon, 27 Feb 2023 18:28:34 +0800
Subject: [PATCH 387/503] [doc] update installation for GPT (#2922)

---
 examples/language/gpt/README.md | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index fe7b23beb41b..a35408152a86 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -19,11 +19,8 @@ conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit
 pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113
 ```
 
-### Install [Colossal-AI v0.1.12](https://colossalai.org/download/) From Official Website
+### [Install Colossal-AI](https://github.com/hpcaitech/ColossalAI#installation) 
 
-```bash
-pip install colossalai==0.1.12+torch1.12cu11.3 -f https://release.colossalai.org
-```
 
 ### Install requirements
 
@@ -31,7 +28,7 @@ pip install colossalai==0.1.12+torch1.12cu11.3 -f https://release.colossalai.org
 pip install -r requirements.txt
 ```
 
-This is just an example that we download PyTorch=1.12.0, CUDA=11.6 and colossalai=0.1.12+torch1.12cu11.3. You can download another version of PyTorch and its corresponding ColossalAI version. Just make sure that the version of ColossalAI is at least 0.1.10, PyTorch is at least 1.8.1 and transformers is at least 4.231.
+This is just an example that we download PyTorch=1.12.0, CUDA=11.6 and colossalai. You can download another version of PyTorch and its corresponding ColossalAI version. Just make sure that the version of ColossalAI is at least 0.1.10, PyTorch is at least 1.8.1 and transformers is at least 4.231.
 If you want to test ZeRO1 and ZeRO2 in Colossal-AI, you need to ensure Colossal-AI>=0.1.12.
 
 ## Dataset

From da056285f213e51bc4d76c02bff00aa49a351aa2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 27 Feb 2023 19:29:06 +0800
Subject: [PATCH 388/503] [format] applied code formatting on changed files in
 pull request 2922 (#2923)

Co-authored-by: github-actions <github-actions@github.com>
---
 examples/language/gpt/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index a35408152a86..10d6c2ddd5d7 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -19,7 +19,7 @@ conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit
 pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113
 ```
 
-### [Install Colossal-AI](https://github.com/hpcaitech/ColossalAI#installation) 
+### [Install Colossal-AI](https://github.com/hpcaitech/ColossalAI#installation)
 
 
 ### Install requirements

From eb5cf94332b2f6f230f486a6c9746fe79f771a96 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 28 Feb 2023 10:35:23 +0800
Subject: [PATCH 389/503] Automated submodule synchronization (#2927)

Co-authored-by: github-actions <github-actions@github.com>
---
 inference | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference b/inference
index cde4c8f4e726..83c48efaf152 160000
--- a/inference
+++ b/inference
@@ -1 +1 @@
-Subproject commit cde4c8f4e7269decb82b1b225ada278694e10f6a
+Subproject commit 83c48efaf152ecc104b1c311c80df9e5c59a09e0

From 61e687831d19ab5ad5e3255ac980807046bb8dc3 Mon Sep 17 00:00:00 2001
From: zbian <kurisusnowdeng@gmail.com>
Date: Mon, 27 Feb 2023 17:52:16 +0800
Subject: [PATCH 390/503] fixed using zero with tp cannot access weight
 correctly

---
 .../nn/layer/colossalai_layer/_utils.py       | 79 ++++++++++---------
 .../nn/layer/colossalai_layer/dropout.py      | 61 +++++++-------
 2 files changed, 72 insertions(+), 68 deletions(-)

diff --git a/colossalai/nn/layer/colossalai_layer/_utils.py b/colossalai/nn/layer/colossalai_layer/_utils.py
index 4283e5fe09b5..677cb0e7ac42 100644
--- a/colossalai/nn/layer/colossalai_layer/_utils.py
+++ b/colossalai/nn/layer/colossalai_layer/_utils.py
@@ -1,38 +1,41 @@
-import torch.nn as nn
-from torch import Tensor
-
-from ..parallel_2d._operation import split_batch_2d
-from ..parallel_2p5d._operation import split_batch_2p5d
-from ..parallel_3d._operation import split_batch_3d
-from ..utils import get_tensor_parallel_mode
-
-_parallel_split_batch = {'2d': split_batch_2d, '2.5d': split_batch_2p5d, '3d': split_batch_3d}
-
-
-def partition_batch(input_) -> Tensor:
-    tensor_parallel_mode = get_tensor_parallel_mode()
-    if tensor_parallel_mode in _parallel_split_batch:
-        if isinstance(input_, dict):
-            return {k: _parallel_split_batch[tensor_parallel_mode](v) for k, v in input_.items()}
-        else:
-            return _parallel_split_batch[tensor_parallel_mode](input_)
-    else:
-        return input_
-
-
-class ColossalaiModule(nn.Module):
-
-    def __init__(self, module: nn.Module, **kwargs):
-        super().__init__()
-        # copy values
-        self.__dict__ = module.__dict__.copy()
-        # copy methods
-        for name, attr in module.__class__.__dict__.items():
-            if name not in ['__init__', 'forward'] and callable(attr):
-                setattr(self, name, getattr(module, name))
-        self._forward_func = module.forward
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-
-    def forward(self, *args):
-        return self._forward_func(*args)
+import torch.nn as nn
+from torch import Tensor
+
+from ..parallel_2d._operation import split_batch_2d
+from ..parallel_2p5d._operation import split_batch_2p5d
+from ..parallel_3d._operation import split_batch_3d
+from ..utils import get_tensor_parallel_mode
+
+_parallel_split_batch = {'2d': split_batch_2d, '2.5d': split_batch_2p5d, '3d': split_batch_3d}
+
+
+def partition_batch(input_) -> Tensor:    # NOTE(review): a dict input yields a dict, not a Tensor
+    tensor_parallel_mode = get_tensor_parallel_mode()
+    if tensor_parallel_mode in _parallel_split_batch:    # '2d' / '2.5d' / '3d': use that mode's split function
+        if isinstance(input_, dict):    # dicts are split value by value, keys are kept
+            return {k: _parallel_split_batch[tensor_parallel_mode](v) for k, v in input_.items()}
+        else:
+            return _parallel_split_batch[tensor_parallel_mode](input_)
+    else:    # any other mode: hand the input back unchanged
+        return input_
+
+
+class ColossalaiModule(nn.Module):
+    """Wrapper that keeps ``module`` as an attribute and delegates attribute lookups and ``forward`` to it."""
+    def __init__(self, module: nn.Module, **kwargs):
+        super().__init__()
+        self.module = module
+        for k, v in kwargs.items():    # extra keyword arguments become attributes of the wrapper
+            setattr(self, k, v)
+
+    def __getattr__(self, name: str):
+        if name == 'module':    # resolve the wrapped module via nn.Module; the branches below access self.module
+            return super().__getattr__(name)
+        elif hasattr(self.module, name):    # otherwise delegate the lookup to the wrapped module
+            return getattr(self.module, name)
+        elif name in self.__dict__:
+            return self.__dict__[name]
+        raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, name))
+
+    def forward(self, *args):
+        return self.module(*args)
diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/nn/layer/colossalai_layer/dropout.py
index cc2d9a0a70fd..0c049cb3f408 100644
--- a/colossalai/nn/layer/colossalai_layer/dropout.py
+++ b/colossalai/nn/layer/colossalai_layer/dropout.py
@@ -1,30 +1,31 @@
-import torch.nn as nn
-from colossalai.context import ParallelMode, seed
-
-from ..parallel_1d import *
-from ..utils import get_tensor_parallel_mode
-from ._utils import ColossalaiModule
-
-
-class Dropout(ColossalaiModule):
-    """Dropout layer of colossalai.
-
-    Args:
-        p (float, optional): probability of an element to be zeroed, defaults 0.5.
-        inplace (bool, optional): whether to do dropout in-place, default to be False.
-    """
-
-    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
-        tensor_parallel = get_tensor_parallel_mode()
-        if tensor_parallel == "1d":
-            drop = Dropout1D(p, inplace)
-        else:
-            drop = nn.Dropout(p, inplace)
-        super().__init__(drop, tensor_parallel=tensor_parallel)
-
-    def forward(self, *args):
-        if self.tensor_parallel in [None, '1d']:
-            return self._forward_func(*args)
-        else:
-            with seed(ParallelMode.TENSOR):
-                return self._forward_func(*args)
+import torch.nn as nn
+
+from colossalai.context import ParallelMode, seed
+
+from ..parallel_1d import *
+from ..utils import get_tensor_parallel_mode
+from ._utils import ColossalaiModule
+
+
+class Dropout(ColossalaiModule):
+    """Dropout layer of colossalai.
+
+    Args:
+        p (float, optional): probability of an element to be zeroed, defaults to 0.5.
+        inplace (bool, optional): whether to do dropout in-place, defaults to False.
+    """
+
+    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
+        tensor_parallel = get_tensor_parallel_mode()
+        if tensor_parallel == "1d":    # 1D tensor parallelism wraps Dropout1D, every other mode wraps nn.Dropout
+            drop = Dropout1D(p, inplace)
+        else:
+            drop = nn.Dropout(p, inplace)
+        super().__init__(drop, tensor_parallel=tensor_parallel)
+
+    def forward(self, *args):
+        if self.tensor_parallel in [None, '1d']:    # these modes call the wrapped layer without a seed context
+            return super().forward(*args)
+        else:
+            with seed(ParallelMode.TENSOR):    # other modes run the wrapped dropout inside the TENSOR seed context
+                return super().forward(*args)

From a848091141898047bec4adf858336419d8f65a16 Mon Sep 17 00:00:00 2001
From: YH <100389977+yhna940@users.noreply.github.com>
Date: Tue, 28 Feb 2023 12:00:43 +0900
Subject: [PATCH 391/503] Fix port exception type (#2925)

---
 colossalai/utils/common.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index e35b29c2ac56..e15981140be1 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -50,16 +50,20 @@ def ensure_path_exists(filename: str):
         Path(dirpath).mkdir(parents=True, exist_ok=True)
 
 
-def free_port():
+def free_port() -> int:
+    """Get a free port on localhost.
+
+    Returns:
+        int: A free port on localhost.
+    """
     while True:
+        port = random.randint(20000, 65000)
         try:
-            sock = socket.socket()
-            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-            port = random.randint(20000, 65000)
-            sock.bind(('localhost', port))
-            sock.close()
-            return port
-        except Exception:
+            with socket.socket() as sock:
+                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+                sock.bind(("localhost", port))
+                return port
+        except OSError:
             continue
 
 
From 197d0bf4eded9da84bc176323fa9ad075c09d0c0 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Tue, 28 Feb 2023 11:03:30 +0800
Subject: [PATCH 392/503] [autoparallel] apply repeat block to reduce solving
 time (#2912)

---
 .../auto_parallel/tensor_shard/initialize.py  |  8 ++--
 .../tensor_shard/solver/solver.py             | 39 ++++++++++++-------
 .../solver/strategies_constructor.py          | 21 ++++++++++
 .../test_gpt/test_solver_with_gpt_module.py   |  8 ++--
 .../test_node_handler/utils.py                |  4 +-
 .../test_solver_with_resnet_v2.py             |  5 +--
 6 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/colossalai/auto_parallel/tensor_shard/initialize.py b/colossalai/auto_parallel/tensor_shard/initialize.py
index 4affa37897bf..60472eee52ca 100644
--- a/colossalai/auto_parallel/tensor_shard/initialize.py
+++ b/colossalai/auto_parallel/tensor_shard/initialize.py
@@ -112,11 +112,13 @@ def solve_solution(gm: ColoGraphModule, strategy_constructor: StrategiesConstruc
     This method is used to solve the best solution for the given graph.
     The solution is a list of integers, each integer represents the best strategy index of the corresponding node.
     '''
-    graph_analyser = GraphAnalyser(gm)
-    liveness_list = graph_analyser.liveness_analysis()
+    # temporarily we use all nodes as liveness list, we count the backward memory cost together with
+    # forward memory cost into the node memory cost, and no activation checkpoint is used in this phase.
+    # graph_analyser = GraphAnalyser(gm)
+    # liveness_list = graph_analyser.liveness_analysis()
     cost_graph = CostGraph(strategy_constructor.leaf_strategies)
     cost_graph.simplify_graph()
-    solver = Solver(gm.graph, strategy_constructor, cost_graph, graph_analyser, memory_budget=memory_budget)
+    solver = Solver(gm.graph, strategy_constructor, cost_graph, memory_budget=memory_budget)
     ret = solver.call_solver_serialized_args()
     solution = list(ret[0])
 
diff --git a/colossalai/auto_parallel/tensor_shard/solver/solver.py b/colossalai/auto_parallel/tensor_shard/solver/solver.py
index 5449fb5a149d..f5c6663dce80 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/solver.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py
@@ -32,7 +32,7 @@ def __init__(self,
                  graph: Graph,
                  strategies_constructor: StrategiesConstructor,
                  cost_graph: CostGraph,
-                 graph_analyser: GraphAnalyser,
+                 graph_analyser: GraphAnalyser = None,
                  memory_budget: float = -1.0,
                  solution_numbers: int = 1,
                  forward_only: bool = False,
@@ -63,7 +63,10 @@ def __init__(self,
             self.memory_increasing_coefficient = memory_increasing_coefficient
         else:
             self.memory_increasing_coefficient = 1
-        self.liveness_list = self.graph_analyser.liveness_analysis()
+        # temporarily we use all nodes as liveness list, we count the backward memory cost together with
+        # forward memory cost into the node memory cost, and no activation checkpoint is used in this phase.
+        # self.liveness_list = self.graph_analyser.liveness_analysis()
+        self.liveness_list = self.nodes
         self.node_index_dict = self._generate_node_index_dict()
         # The last solution vector of auto sharding.
         self.last_s_val = None
@@ -140,7 +143,7 @@ def _prepare_data_for_solver(self):
         liveness_set = self.liveness_list
 
         # omit alias_set now
-        alias_set = None
+        alias_set = self.strategies_constructor.alias_set
         alias_convert_costs = None
 
         # prepare compute_costs, communication_costs and memory_costs
@@ -230,6 +233,7 @@ def get_non_zero_index(binary_vector):
 
         # 0. Unpack flatten numpy arrays
         s_follow = following_nodes
+        s_alias = alias_set
 
         E = edge_pairs.reshape((-1, 2))    # noqa
         r = []
@@ -294,8 +298,11 @@ def get_non_zero_index(binary_vector):
                 if strategies_len[i] == 1:
                     s.append([1])
                 else:
-                    num_nodes += 1
-                    s.append(LpVariable.matrix(f"s[{i}]", (range(strategies_len[i]),), cat="Binary"))
+                    if i not in s_alias:
+                        num_nodes += 1
+                        s.append(LpVariable.matrix(f"s[{i}]", (range(strategies_len[i]),), cat="Binary"))
+                    else:
+                        s.append(s[s_alias[i]])
             else:
                 if s_follow[i] < len(s):
                     s.append(s[s_follow[i]])
@@ -311,15 +318,20 @@ def get_non_zero_index(binary_vector):
         #############################
         e = []
         num_edges = 0
+        map_edge_to_idx = {}
         for (idx, (i, j)) in enumerate(E):
             if len(s[i]) == 1:
                 e.append(s[j])
             elif len(s[j]) == 1:
                 e.append(s[i])
             else:
-                num_edges += 1
-                e.append(LpVariable.matrix(f"e[{i},{j}]", (range(len(s[i]) * len(s[j])),), cat="Binary"))
+                if i in s_alias and j in s_alias and (s_alias[i], s_alias[j]) in map_edge_to_idx:
+                    e.append(e[map_edge_to_idx[(s_alias[i], s_alias[j])]])
+                else:
+                    num_edges += 1
+                    e.append(LpVariable.matrix(f"e[{i},{j}]", (range(len(s[i]) * len(s[j])),), cat="Binary"))
             assert len(e[idx]) == len(r[idx])
+            map_edge_to_idx[(i, j)] = idx
         for element in s:
             assert len(element) > 0
         # 2. Set initial value
@@ -371,13 +383,12 @@ def get_non_zero_index(binary_vector):
         # compute memory consumption with liveness set  #
         #################################################
         if memory_budget > 0:
-            for liveness_stage in liveness_set:
-                mem = 0
-                for live_variable in liveness_stage.unique_live_vars:
-                    if live_variable.node not in self.node_index_dict:
-                        continue
-                    node_index = self.node_index_dict[live_variable.node]
-                    mem += lpSum(s[node_index][j] * m[node_index][j] for j in range(len(s[node_index])))
+            mem = 0
+            for node in liveness_set:
+                if node not in self.node_index_dict:
+                    continue
+                node_index = self.node_index_dict[node]
+                mem += lpSum(s[node_index][j] * m[node_index][j] for j in range(len(s[node_index])))
                 prob += mem <= memory_budget
 
         # (d). specified by `cat="Binary"`
diff --git a/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py b/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py
index 40741daca702..59ead1ca8fac 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/strategies_constructor.py
@@ -15,6 +15,7 @@
 )
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import StrategiesVector
 from colossalai.auto_parallel.tensor_shard.utils import generate_resharding_costs, generate_sharding_spec
+from colossalai.auto_parallel.tensor_shard.utils.factory import find_repeat_blocks
 from colossalai.device.device_mesh import DeviceMesh
 
 from ..options import DataloaderOption, SolverOptions
@@ -42,6 +43,7 @@ def __init__(self, graph: Graph, device_mesh: DeviceMesh, solver_options: Solver
         self.strategy_map = {}
         self.solver_options = solver_options
         self.no_strategy_nodes = []
+        self.alias_set = None
 
     def remove_duplicated_strategy(self, strategies_vector):
         '''
@@ -59,6 +61,22 @@ def remove_duplicated_strategy(self, strategies_vector):
         for strategy in remove_list:
             strategies_vector.remove(strategy)
 
+    def generate_alias_set(self):
+        """Map the leaf-strategy index of each node in a repeated block to that of its peer in the first block."""
+        node_list = [strategy_vector.node for strategy_vector in self.leaf_strategies]
+        common_blocks = find_repeat_blocks(node_list, self.root_module, common_length_threshold=10)
+
+        repeat_block_nums = len(common_blocks)
+        alias_set = {}
+        # no repeated block found: there is nothing to alias
+        if repeat_block_nums == 0:
+            return alias_set
+        # common_blocks[0] is the reference: node k of every later block is aliased to node k of the reference
+        for index, common_node in enumerate(common_blocks[0]):
+            for i in range(1, repeat_block_nums):
+                alias_set[node_list.index(common_blocks[i][index])] = node_list.index(common_node)
+        return alias_set
+
     def build_strategies_and_cost(self):
         """
         This method is to build the strategy vector for each node in the computation graph.
@@ -175,3 +193,6 @@ def _check_no_strategy_for_data(data):
                 self.leaf_strategies.remove(node.strategies_vector)
             if node in self.strategy_map:
                 self.strategy_map.pop(node)
+
+        alias_set = self.generate_alias_set()
+        self.alias_set = alias_set
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py
index a6be1928b547..4adb4fbaf047 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_gpt/test_solver_with_gpt_module.py
@@ -15,13 +15,13 @@
 
 BATCH_SIZE = 1
 SEQ_LENGTH = 32
-HIDDEN_DIM = 768
+HIDDEN_DIM = 384
 
 
 @run_on_environment_flag(name='AUTO_PARALLEL')
 @parameterize('model_cls', [GPT2Block, GPT2Attention, GPT2MLP, GPT2Model])
 def test_self_attention_block(model_cls):
-    config = transformers.GPT2Config(n_position=64, n_layer=4, n_head=16, n_embd=HIDDEN_DIM)
+    config = transformers.GPT2Config(n_position=64, n_layer=12, n_head=16, n_embd=HIDDEN_DIM)
     if model_cls == GPT2MLP:
         model = model_cls(intermediate_size=4 * config.hidden_size, config=config)
     else:
@@ -54,15 +54,13 @@ def test_self_attention_block(model_cls):
     gm = GraphModule(model, graph, model.__class__.__name__)
     print(gm.graph)
     gm.recompile()
-    graph_analyser = GraphAnalyser(gm)
-    liveness_list = graph_analyser.liveness_analysis()
     solver_options = SolverOptions()
     strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
     strategies_constructor.build_strategies_and_cost()
 
     cost_graph = CostGraph(strategies_constructor.leaf_strategies)
     cost_graph.simplify_graph()
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser, memory_budget=-1)
+    solver = Solver(gm.graph, strategies_constructor, cost_graph, memory_budget=-1)
     ret = solver.call_solver_serialized_args()
     strategies_list = solver.last_s_val
     nodes = [strategies_vector.node for strategies_vector in strategies_constructor.leaf_strategies]
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
index 14c8cb296949..0cdfdbc9d0cd 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/utils.py
@@ -9,7 +9,6 @@
 from colossalai.auto_parallel.tensor_shard.options import SolverOptions
 from colossalai.auto_parallel.tensor_shard.solver import StrategiesConstructor
 from colossalai.auto_parallel.tensor_shard.solver.cost_graph import CostGraph
-from colossalai.auto_parallel.tensor_shard.solver.graph_analysis import GraphAnalyser
 from colossalai.auto_parallel.tensor_shard.solver.solver import Solver
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
@@ -109,8 +108,7 @@ def numerical_test_for_node_strategy(model: torch.nn.Module,
             # solution construction
             cost_graph = CostGraph(strategies_constructor.leaf_strategies)
             cost_graph.simplify_graph()
-            graph_analyser = GraphAnalyser(gm)
-            solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser, verbose=False)
+            solver = Solver(gm.graph, strategies_constructor, cost_graph, verbose=False)
             ret = solver.call_solver_serialized_args()
             solution = list(ret[0])
         gm, sharding_spec_dict, origin_spec_dict, comm_actions_dict = runtime_preparation_pass(
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py b/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py
index 6f64acd525c2..bbfc3e1fcc14 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_solver_with_resnet_v2.py
@@ -51,15 +51,14 @@ def test_cost_graph():
     #     return fc
     gm = GraphModule(model, graph, model.__class__.__name__)
     gm.recompile()
-    graph_analyser = GraphAnalyser(gm)
-    liveness_list = graph_analyser.liveness_analysis()
+
     solver_options = SolverOptions()
     strategies_constructor = StrategiesConstructor(graph, device_mesh, solver_options)
     strategies_constructor.build_strategies_and_cost()
 
     cost_graph = CostGraph(strategies_constructor.leaf_strategies)
     cost_graph.simplify_graph()
-    solver = Solver(gm.graph, strategies_constructor, cost_graph, graph_analyser)
+    solver = Solver(gm.graph, strategies_constructor, cost_graph)
 
     ret = solver.call_solver_serialized_args()
     print(ret[0])

From 77b88a3849c7f2d9ca7f3e3f9ce4bc4718c92292 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 28 Feb 2023 11:10:38 +0800
Subject: [PATCH 393/503] [workflow] added auto doc test on PR (#2929)

* [workflow] added auto doc test on PR

* [workflow] added doc test workflow

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 .github/workflows/doc_check_on_pr.yml      |  1 +
 .github/workflows/doc_test_on_pr.yml       | 87 ++++++++++++++++++++++
 .github/workflows/doc_test_on_schedule.yml | 47 ++++++++++++
 docs/requirements-doc-test.txt             |  2 +
 docs/source/en/get_started/installation.md |  2 +
 5 files changed, 139 insertions(+)
 create mode 100644 .github/workflows/doc_test_on_pr.yml
 create mode 100644 .github/workflows/doc_test_on_schedule.yml
 create mode 100644 docs/requirements-doc-test.txt

diff --git a/.github/workflows/doc_check_on_pr.yml b/.github/workflows/doc_check_on_pr.yml
index 6e42053ddc08..6593ac50e168 100644
--- a/.github/workflows/doc_check_on_pr.yml
+++ b/.github/workflows/doc_check_on_pr.yml
@@ -64,5 +64,6 @@ jobs:
           mkdir $cache_dir
           mv ColossalAI $cache_dir
           cd ColossalAI-Documentation
+          pip install -v ./doc-build/third_party/hf-doc-builder
           pip install -v ./doc-build
           bash ./scripts/build.sh
diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml
new file mode 100644
index 000000000000..a0572766f093
--- /dev/null
+++ b/.github/workflows/doc_test_on_pr.yml
@@ -0,0 +1,87 @@
+name: Test Documentation on PR
+on:
+  pull_request:
+    # any change to a Markdown file under docs/source will trigger the documentation test.
+    paths:
+      - 'docs/source/**.md'
+
+jobs:
+  # Detect the changed documentation files and expose them (plus an any_changed flag) as job outputs.
+  detect-changed-doc:
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    outputs:
+      any_changed: ${{ steps.changed-files.outputs.any_changed }}
+      changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
+    name: Detect changed example files
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Locate base commit
+        id: locate-base-sha
+        run: |
+            curBranch=$(git rev-parse --abbrev-ref HEAD)
+            commonCommit=$(git merge-base origin/main $curBranch)
+            echo $commonCommit
+            echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
+
+      - name: Get all changed example files
+        id: changed-files
+        uses: tj-actions/changed-files@v35
+        with:
+          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
+          files: |
+            ./docs/source/**/*.md
+
+  # Run the doc test on every changed file; this job is skipped when no documentation file has changed.
+  check-changed-doc:
+    # Add this condition to avoid executing this job if the trigger event is workflow_dispatch.
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' &&
+        needs.detect-changed-doc.outputs.any_changed == 'true'
+    name: Test the changed Doc
+    needs: detect-changed-doc
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm
+    timeout-minutes: 20
+    steps:
+      - name: Checkout ColossalAI-Documentation
+        uses: actions/checkout@v2
+        with:
+          path: './ColossalAI-Documentation'
+          repository: 'hpcaitech/ColossalAI-Documentation'
+
+      - name: Install Docer
+        run: |
+          pip install -v ./ColossalAI-Documentation/doc-build/third_party/hf-doc-builder
+          pip install -v ./ColossalAI-Documentation/doc-build
+
+      - name: Checkout ColossalAI
+        uses: actions/checkout@v3
+
+      - name: Install ColossalAI
+        run: |
+          pip install -v .
+
+      - name: Install Doc Test Requirements
+        run: |
+          pip install -r docs/requirements-doc-test.txt
+
+      - name: Test the Doc    # the file list comes from the outputs of the detect-changed-doc job (see `needs`)
+        run: |
+          for file in ${{ needs.detect-changed-doc.outputs.changed_files }}; do
+            echo "Testing $file now..."
+            docer test -p $file
+          done
+        env:
+          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml
new file mode 100644
index 000000000000..6b4f5d1f908c
--- /dev/null
+++ b/.github/workflows/doc_test_on_schedule.yml
@@ -0,0 +1,47 @@
+name: Test Documentation on Schedule
+on:
+  # run at 07:00 of every Sunday(singapore time) so here is UTC time Saturday 23:00
+  schedule:
+    - cron:  '0 23 * * 6'
+  workflow_dispatch:
+
+jobs:
+  check-changed-doc:
+    # Only run when the workflow executes in the hpcaitech/ColossalAI repository (e.g. not in forks).
+    if: github.repository == 'hpcaitech/ColossalAI'
+    name: Test the changed Doc
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm
+    timeout-minutes: 60
+    steps:
+      - name: Checkout ColossalAI-Documentation
+        uses: actions/checkout@v2
+        with:
+          path: './ColossalAI-Documentation'
+          repository: 'hpcaitech/ColossalAI-Documentation'
+
+      - name: Install Docer
+        run: |
+          pip install -v ./ColossalAI-Documentation/doc-build/third_party/hf-doc-builder
+          pip install -v ./ColossalAI-Documentation/doc-build
+
+      - name: Checkout ColossalAI
+        uses: actions/checkout@v3
+
+      - name: Install ColossalAI
+        run: |
+          pip install -v .
+
+      - name: Install Doc Test Requirements
+        run: |
+          pip install -r docs/requirements-doc-test.txt
+
+      - name: Test the Doc
+        run: |
+          for file in $(find ./docs/source -name "*.md"); do
+            docer test -p $file
+          done
+        env:
+          NCCL_SHM_DISABLE: 1
diff --git a/docs/requirements-doc-test.txt b/docs/requirements-doc-test.txt
new file mode 100644
index 000000000000..b49a94554afb
--- /dev/null
+++ b/docs/requirements-doc-test.txt
@@ -0,0 +1,2 @@
+colossalai
+torch
diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md
index 1757b4241ed2..0e114696de6d 100644
--- a/docs/source/en/get_started/installation.md
+++ b/docs/source/en/get_started/installation.md
@@ -1,3 +1,5 @@
+<!-- doc-test-command: echo "installation.md does not need test" -->
+
 # Setup
 > Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
 

From 9e3b8b7affc0961d8d0e7e4a1ab67d1c4ada9ce4 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 28 Feb 2023 11:28:24 +0800
Subject: [PATCH 394/503] [doc] removed read-the-docs (#2932)

---
 .readthedocs.yaml                             |  30 ----
 docs/Makefile                                 |  26 ----
 docs/_static/css/rtd_theme.css                |   3 -
 docs/_templates/apidoc/module.rst_t           |   9 --
 docs/_templates/apidoc/package.rst_t          |  52 -------
 docs/_templates/apidoc/toc.rst_t              |   8 -
 docs/colossalai/colossalai.amp.amp_type.rst   |   5 -
 .../colossalai.amp.apex_amp.apex_amp.rst      |   5 -
 docs/colossalai/colossalai.amp.apex_amp.rst   |  11 --
 .../colossalai.amp.naive_amp.grad_scaler.rst  |   8 -
 .../colossalai.amp.naive_amp.naive_amp.rst    |   5 -
 docs/colossalai/colossalai.amp.naive_amp.rst  |  16 --
 docs/colossalai/colossalai.amp.rst            |  18 ---
 docs/colossalai/colossalai.amp.torch_amp.rst  |  11 --
 .../colossalai.amp.torch_amp.torch_amp.rst    |   5 -
 .../colossalai/colossalai.builder.builder.rst |   5 -
 docs/colossalai/colossalai.builder.rst        |  11 --
 .../colossalai.cli.benchmark.benchmark.rst    |   5 -
 .../colossalai.cli.benchmark.models.rst       |   5 -
 docs/colossalai/colossalai.cli.benchmark.rst  |  13 --
 .../colossalai.cli.benchmark.utils.rst        |   5 -
 ...olossalai.cli.check.check_installation.rst |   5 -
 docs/colossalai/colossalai.cli.check.rst      |  11 --
 docs/colossalai/colossalai.cli.cli.rst        |   5 -
 .../colossalai.cli.launcher.hostinfo.rst      |   5 -
 ...lossalai.cli.launcher.multinode_runner.rst |   5 -
 docs/colossalai/colossalai.cli.launcher.rst   |  13 --
 .../colossalai.cli.launcher.run.rst           |   5 -
 docs/colossalai/colossalai.cli.rst            |  18 ---
 .../colossalai.communication.collective.rst   |   5 -
 .../colossalai.communication.p2p.rst          |   5 -
 .../colossalai.communication.ring.rst         |   5 -
 docs/colossalai/colossalai.communication.rst  |  14 --
 .../colossalai.communication.utils.rst        |   5 -
 docs/colossalai/colossalai.constants.rst      |   5 -
 docs/colossalai/colossalai.context.config.rst |   5 -
 .../colossalai.context.moe_context.rst        |   5 -
 .../colossalai.context.parallel_context.rst   |   5 -
 .../colossalai.context.parallel_mode.rst      |   5 -
 ...ocess_group_initializer.initializer_1d.rst |   5 -
 ...ocess_group_initializer.initializer_2d.rst |   5 -
 ...ess_group_initializer.initializer_2p5d.rst |   5 -
 ...ocess_group_initializer.initializer_3d.rst |   5 -
 ...ess_group_initializer.initializer_data.rst |   5 -
 ...ss_group_initializer.initializer_model.rst |   5 -
 ...group_initializer.initializer_pipeline.rst |   5 -
 ...group_initializer.initializer_sequence.rst |   5 -
 ...s_group_initializer.initializer_tensor.rst |   5 -
 ..._initializer.process_group_initializer.rst |   5 -
 ...alai.context.process_group_initializer.rst |  20 ---
 docs/colossalai/colossalai.context.random.rst |  11 --
 ...colossalai.context.random.seed_manager.rst |   5 -
 docs/colossalai/colossalai.context.rst        |  21 ---
 .../colossalai.context.singleton_meta.rst     |   5 -
 docs/colossalai/colossalai.core.rst           |   5 -
 ...olossalai.engine.gradient_accumulation.rst |   5 -
 .../colossalai.engine.gradient_handler.rst    |  11 --
 ...lossalai.engine.gradient_handler.utils.rst |   5 -
 docs/colossalai/colossalai.engine.rst         |  12 --
 .../colossalai/colossalai.engine.schedule.rst |   5 -
 ...salai.fx.passes.adding_split_node_pass.rst |   5 -
 .../colossalai.fx.passes.meta_info_prop.rst   |   5 -
 docs/colossalai/colossalai.fx.passes.rst      |  15 --
 .../colossalai.fx.passes.shard_1d_pass.rst    |   5 -
 .../colossalai.fx.passes.split_module.rst     |   5 -
 .../colossalai/colossalai.fx.passes.utils.rst |   5 -
 docs/colossalai/colossalai.fx.proxy.rst       |   5 -
 docs/colossalai/colossalai.fx.rst             |  17 ---
 docs/colossalai/colossalai.fx.tracer.rst      |  11 --
 .../colossalai.fx.tracer.tracer.rst           |   5 -
 docs/colossalai/colossalai.gemini.chunk.rst   |   5 -
 .../colossalai.gemini.chunk_mgr.rst           |   5 -
 .../colossalai.gemini.gemini_context.rst      |   5 -
 .../colossalai.gemini.gemini_mgr.rst          |   5 -
 ...ai.gemini.memory_tracer.memory_monitor.rst |   5 -
 ...emini.memory_tracer.memstats_collector.rst |   5 -
 ...ini.memory_tracer.model_data_memtracer.rst |   5 -
 .../colossalai.gemini.memory_tracer.rst       |  13 --
 docs/colossalai/colossalai.gemini.ophooks.rst |  11 --
 .../colossalai.gemini.ophooks.utils.rst       |   5 -
 .../colossalai.gemini.paramhooks.rst          |   5 -
 .../colossalai.gemini.placement_policy.rst    |   5 -
 docs/colossalai/colossalai.gemini.rst         |  27 ----
 .../colossalai.gemini.stateful_tensor.rst     |   5 -
 ...salai.gemini.stateful_tensor_container.rst |   5 -
 .../colossalai.gemini.stateful_tensor_mgr.rst |   5 -
 ...ossalai.gemini.tensor_placement_policy.rst |   5 -
 .../colossalai.gemini.tensor_utils.rst        |   5 -
 .../colossalai.global_variables.rst           |   5 -
 docs/colossalai/colossalai.initialize.rst     |   5 -
 ...lossalai.kernel.cuda_native.layer_norm.rst |   5 -
 ...kernel.cuda_native.multihead_attention.rst |   5 -
 .../colossalai.kernel.cuda_native.rst         |  13 --
 ...alai.kernel.cuda_native.scaled_softmax.rst |   5 -
 ...colossalai.kernel.jit.bias_dropout_add.rst |   5 -
 .../colossalai.kernel.jit.bias_gelu.rst       |   5 -
 .../colossalai.kernel.jit.option.rst          |   5 -
 docs/colossalai/colossalai.kernel.jit.rst     |  13 --
 docs/colossalai/colossalai.kernel.rst         |  11 --
 docs/colossalai/colossalai.logging.logger.rst |   5 -
 docs/colossalai/colossalai.logging.rst        |  11 --
 .../colossalai.nn.graph.graph_node.rst        |   5 -
 docs/colossalai/colossalai.nn.graph.rst       |  12 --
 docs/colossalai/colossalai.nn.graph.utils.rst |   5 -
 docs/colossalai/colossalai.nn.init.rst        |   5 -
 .../colossalai.nn.layer.base_layer.rst        |   5 -
 ...alai.nn.layer.colossalai_layer.dropout.rst |   5 -
 ...ai.nn.layer.colossalai_layer.embedding.rst |   5 -
 ...salai.nn.layer.colossalai_layer.linear.rst |   5 -
 ...n.layer.colossalai_layer.normalization.rst |   5 -
 .../colossalai.nn.layer.colossalai_layer.rst  |  14 --
 .../colossalai.nn.layer.moe.experts.rst       |   5 -
 .../colossalai.nn.layer.moe.layers.rst        |   5 -
 docs/colossalai/colossalai.nn.layer.moe.rst   |  13 --
 .../colossalai.nn.layer.moe.utils.rst         |   5 -
 ...colossalai.nn.layer.parallel_1d.layers.rst |   5 -
 .../colossalai.nn.layer.parallel_1d.rst       |  11 --
 ...colossalai.nn.layer.parallel_2d.layers.rst |   5 -
 .../colossalai.nn.layer.parallel_2d.rst       |  11 --
 ...lossalai.nn.layer.parallel_2p5d.layers.rst |   5 -
 .../colossalai.nn.layer.parallel_2p5d.rst     |  11 --
 ...colossalai.nn.layer.parallel_3d.layers.rst |   5 -
 .../colossalai.nn.layer.parallel_3d.rst       |  11 --
 ...alai.nn.layer.parallel_sequence.layers.rst |   5 -
 .../colossalai.nn.layer.parallel_sequence.rst |  11 --
 docs/colossalai/colossalai.nn.layer.rst       |  25 ----
 .../colossalai.nn.layer.utils.common.rst      |   5 -
 docs/colossalai/colossalai.nn.layer.utils.rst |  11 --
 .../colossalai.nn.layer.vanilla.layers.rst    |   5 -
 .../colossalai.nn.layer.vanilla.rst           |  11 --
 ...alai.nn.layer.wrapper.pipeline_wrapper.rst |   5 -
 .../colossalai.nn.layer.wrapper.rst           |  11 --
 .../colossalai/colossalai.nn.loss.loss_1d.rst |   5 -
 .../colossalai/colossalai.nn.loss.loss_2d.rst |   5 -
 .../colossalai.nn.loss.loss_2p5d.rst          |   5 -
 .../colossalai/colossalai.nn.loss.loss_3d.rst |   5 -
 .../colossalai.nn.loss.loss_moe.rst           |   5 -
 docs/colossalai/colossalai.nn.loss.rst        |  15 --
 .../colossalai.nn.lr_scheduler.cosine.rst     |   5 -
 .../colossalai.nn.lr_scheduler.delayed.rst    |   5 -
 .../colossalai.nn.lr_scheduler.linear.rst     |   5 -
 .../colossalai.nn.lr_scheduler.multistep.rst  |   5 -
 .../colossalai.nn.lr_scheduler.onecycle.rst   |   5 -
 .../colossalai.nn.lr_scheduler.poly.rst       |   5 -
 .../colossalai/colossalai.nn.lr_scheduler.rst |  17 ---
 .../colossalai.nn.lr_scheduler.torch.rst      |   5 -
 .../colossalai.nn.metric.accuracy_2d.rst      |   5 -
 .../colossalai.nn.metric.accuracy_2p5d.rst    |   5 -
 .../colossalai.nn.metric.accuracy_3d.rst      |   5 -
 docs/colossalai/colossalai.nn.metric.rst      |  13 --
 ...alai.nn.optimizer.colossalai_optimizer.rst |   5 -
 .../colossalai.nn.optimizer.cpu_adam.rst      |   5 -
 .../colossalai.nn.optimizer.fused_adam.rst    |   5 -
 .../colossalai.nn.optimizer.fused_lamb.rst    |   5 -
 .../colossalai.nn.optimizer.fused_sgd.rst     |   5 -
 .../colossalai.nn.optimizer.hybrid_adam.rst   |   5 -
 .../colossalai.nn.optimizer.lamb.rst          |   5 -
 .../colossalai.nn.optimizer.lars.rst          |   5 -
 docs/colossalai/colossalai.nn.optimizer.rst   |  19 ---
 .../colossalai.nn.optimizer.utils.rst         |   5 -
 .../colossalai.nn.parallel.data_parallel.rst  |   5 -
 ...ossalai.nn.parallel.layers.colo_module.rst |   5 -
 ...olossalai.nn.parallel.layers.embedding.rst |   5 -
 .../colossalai.nn.parallel.layers.linear.rst  |   5 -
 ...ssalai.nn.parallel.layers.module_utils.rst |   5 -
 .../colossalai.nn.parallel.layers.rst         |  14 --
 .../colossalai.nn.parallel.reducer.rst        |   5 -
 docs/colossalai/colossalai.nn.parallel.rst    |  17 ---
 docs/colossalai/colossalai.nn.rst             |  22 ---
 .../colossalai.pipeline.layer_sepc.rst        |   5 -
 .../colossalai.pipeline.pipelinable.rst       |   5 -
 docs/colossalai/colossalai.pipeline.rst       |  13 --
 docs/colossalai/colossalai.pipeline.utils.rst |   5 -
 .../colossalai.registry.registry.rst          |   5 -
 docs/colossalai/colossalai.registry.rst       |  11 --
 docs/colossalai/colossalai.rst                |  36 -----
 .../colossalai.tensor.colo_parameter.rst      |   5 -
 .../colossalai.tensor.colo_tensor.rst         |   5 -
 .../colossalai.tensor.compute_spec.rst        |   5 -
 docs/colossalai/colossalai.tensor.const.rst   |   5 -
 .../colossalai.tensor.dist_spec_mgr.rst       |   5 -
 .../colossalai/colossalai.tensor.distspec.rst |   5 -
 .../colossalai.tensor.op_wrapper.rst          |   5 -
 .../colossalai.tensor.param_op_hook.rst       |   5 -
 .../colossalai.tensor.process_group.rst       |   5 -
 docs/colossalai/colossalai.tensor.rst         |  21 ---
 .../colossalai.tensor.tensor_spec.rst         |   5 -
 docs/colossalai/colossalai.tensor.utils.rst   |   5 -
 .../colossalai.testing.comparison.rst         |   5 -
 docs/colossalai/colossalai.testing.rst        |  12 --
 docs/colossalai/colossalai.testing.utils.rst  |   5 -
 docs/colossalai/colossalai.trainer.hooks.rst  |   5 -
 docs/colossalai/colossalai.trainer.rst        |  10 --
 ...colossalai.utils.activation_checkpoint.rst |   5 -
 ...lai.utils.checkpoint.module_checkpoint.rst |   5 -
 .../colossalai.utils.checkpoint.rst           |  12 --
 .../colossalai.utils.checkpoint.utils.rst     |   5 -
 .../colossalai.utils.checkpointing.rst        |   5 -
 docs/colossalai/colossalai.utils.common.rst   |   5 -
 docs/colossalai/colossalai.utils.cuda.rst     |   5 -
 ...ssalai.utils.data_sampler.base_sampler.rst |   5 -
 ...ils.data_sampler.data_parallel_sampler.rst |   5 -
 .../colossalai.utils.data_sampler.rst         |  12 --
 docs/colossalai/colossalai.utils.memory.rst   |   5 -
 ...lossalai.utils.model.colo_init_context.rst |   5 -
 ...lossalai.utils.model.lazy_init_context.rst |   5 -
 docs/colossalai/colossalai.utils.model.rst    |  13 --
 .../colossalai.utils.model.utils.rst          |   5 -
 docs/colossalai/colossalai.utils.moe.rst      |   5 -
 ....multi_tensor_apply.multi_tensor_apply.rst |   5 -
 .../colossalai.utils.multi_tensor_apply.rst   |  11 --
 .../colossalai.utils.profiler.extention.rst   |   5 -
 ...ai.utils.profiler.legacy.comm_profiler.rst |   5 -
 ...lai.utils.profiler.legacy.mem_profiler.rst |   5 -
 ...ai.utils.profiler.legacy.pcie_profiler.rst |   5 -
 ...salai.utils.profiler.legacy.prof_utils.rst |   5 -
 .../colossalai.utils.profiler.legacy.rst      |  14 --
 .../colossalai.utils.profiler.profiler.rst    |   5 -
 docs/colossalai/colossalai.utils.profiler.rst |  18 ---
 ...profiler.stateful_tensor_mem_extention.rst |   5 -
 docs/colossalai/colossalai.utils.rst          |  27 ----
 .../colossalai.utils.tensor_detector.rst      |  11 --
 ....utils.tensor_detector.tensor_detector.rst |   5 -
 docs/colossalai/colossalai.utils.timer.rst    |   5 -
 .../colossalai.zero.init_ctx.init_context.rst |   5 -
 docs/colossalai/colossalai.zero.init_ctx.rst  |  11 --
 docs/colossalai/colossalai.zero.rst           |  21 ---
 ...i.zero.shard_utils.base_shard_strategy.rst |   5 -
 ...ard_utils.bucket_tensor_shard_strategy.rst |   5 -
 .../colossalai.zero.shard_utils.commons.rst   |   5 -
 .../colossalai.zero.shard_utils.rst           |  14 --
 ...zero.shard_utils.tensor_shard_strategy.rst |   5 -
 ...alai.zero.sharded_model.reduce_scatter.rst |   5 -
 .../colossalai.zero.sharded_model.rst         |  13 --
 ...ai.zero.sharded_model.sharded_model_v2.rst |   5 -
 .../colossalai.zero.sharded_model.utils.rst   |   5 -
 .../colossalai.zero.sharded_optim.rst         |  11 --
 ...ai.zero.sharded_optim.sharded_optim_v2.rst |   5 -
 .../colossalai.zero.sharded_param.rst         |  12 --
 ...salai.zero.sharded_param.sharded_param.rst |   5 -
 ...alai.zero.sharded_param.sharded_tensor.rst |   5 -
 docs/colossalai/colossalai.zero.utils.rst     |  12 --
 .../colossalai.zero.utils.zero_hook.rst       |   5 -
 .../colossalai.zero.utils.zero_hook_v2.rst    |   5 -
 .../colossalai.zero.zero_optimizer.rst        |   5 -
 docs/conf.py                                  | 137 ------------------
 docs/index.rst                                |  27 ----
 docs/links/Colossalai Homepage.rst            |   6 -
 docs/links/Colossalai benchmarks.rst          |   6 -
 docs/links/Colossalai examples.rst            |   6 -
 docs/links/Colossalai tutorial.rst            |   7 -
 docs/make.bat                                 |  35 -----
 docs/requirements.txt                         |   5 -
 253 files changed, 2167 deletions(-)
 delete mode 100644 .readthedocs.yaml
 delete mode 100644 docs/Makefile
 delete mode 100644 docs/_static/css/rtd_theme.css
 delete mode 100644 docs/_templates/apidoc/module.rst_t
 delete mode 100644 docs/_templates/apidoc/package.rst_t
 delete mode 100644 docs/_templates/apidoc/toc.rst_t
 delete mode 100644 docs/colossalai/colossalai.amp.amp_type.rst
 delete mode 100644 docs/colossalai/colossalai.amp.apex_amp.apex_amp.rst
 delete mode 100644 docs/colossalai/colossalai.amp.apex_amp.rst
 delete mode 100644 docs/colossalai/colossalai.amp.naive_amp.grad_scaler.rst
 delete mode 100644 docs/colossalai/colossalai.amp.naive_amp.naive_amp.rst
 delete mode 100644 docs/colossalai/colossalai.amp.naive_amp.rst
 delete mode 100644 docs/colossalai/colossalai.amp.rst
 delete mode 100644 docs/colossalai/colossalai.amp.torch_amp.rst
 delete mode 100644 docs/colossalai/colossalai.amp.torch_amp.torch_amp.rst
 delete mode 100644 docs/colossalai/colossalai.builder.builder.rst
 delete mode 100644 docs/colossalai/colossalai.builder.rst
 delete mode 100644 docs/colossalai/colossalai.cli.benchmark.benchmark.rst
 delete mode 100644 docs/colossalai/colossalai.cli.benchmark.models.rst
 delete mode 100644 docs/colossalai/colossalai.cli.benchmark.rst
 delete mode 100644 docs/colossalai/colossalai.cli.benchmark.utils.rst
 delete mode 100644 docs/colossalai/colossalai.cli.check.check_installation.rst
 delete mode 100644 docs/colossalai/colossalai.cli.check.rst
 delete mode 100644 docs/colossalai/colossalai.cli.cli.rst
 delete mode 100644 docs/colossalai/colossalai.cli.launcher.hostinfo.rst
 delete mode 100644 docs/colossalai/colossalai.cli.launcher.multinode_runner.rst
 delete mode 100644 docs/colossalai/colossalai.cli.launcher.rst
 delete mode 100644 docs/colossalai/colossalai.cli.launcher.run.rst
 delete mode 100644 docs/colossalai/colossalai.cli.rst
 delete mode 100644 docs/colossalai/colossalai.communication.collective.rst
 delete mode 100644 docs/colossalai/colossalai.communication.p2p.rst
 delete mode 100644 docs/colossalai/colossalai.communication.ring.rst
 delete mode 100644 docs/colossalai/colossalai.communication.rst
 delete mode 100644 docs/colossalai/colossalai.communication.utils.rst
 delete mode 100644 docs/colossalai/colossalai.constants.rst
 delete mode 100644 docs/colossalai/colossalai.context.config.rst
 delete mode 100644 docs/colossalai/colossalai.context.moe_context.rst
 delete mode 100644 docs/colossalai/colossalai.context.parallel_context.rst
 delete mode 100644 docs/colossalai/colossalai.context.parallel_mode.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_1d.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_2d.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_2p5d.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_3d.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_data.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_model.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_pipeline.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_sequence.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.initializer_tensor.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.process_group_initializer.rst
 delete mode 100644 docs/colossalai/colossalai.context.process_group_initializer.rst
 delete mode 100644 docs/colossalai/colossalai.context.random.rst
 delete mode 100644 docs/colossalai/colossalai.context.random.seed_manager.rst
 delete mode 100644 docs/colossalai/colossalai.context.rst
 delete mode 100644 docs/colossalai/colossalai.context.singleton_meta.rst
 delete mode 100644 docs/colossalai/colossalai.core.rst
 delete mode 100644 docs/colossalai/colossalai.engine.gradient_accumulation.rst
 delete mode 100644 docs/colossalai/colossalai.engine.gradient_handler.rst
 delete mode 100644 docs/colossalai/colossalai.engine.gradient_handler.utils.rst
 delete mode 100644 docs/colossalai/colossalai.engine.rst
 delete mode 100644 docs/colossalai/colossalai.engine.schedule.rst
 delete mode 100644 docs/colossalai/colossalai.fx.passes.adding_split_node_pass.rst
 delete mode 100644 docs/colossalai/colossalai.fx.passes.meta_info_prop.rst
 delete mode 100644 docs/colossalai/colossalai.fx.passes.rst
 delete mode 100644 docs/colossalai/colossalai.fx.passes.shard_1d_pass.rst
 delete mode 100644 docs/colossalai/colossalai.fx.passes.split_module.rst
 delete mode 100644 docs/colossalai/colossalai.fx.passes.utils.rst
 delete mode 100644 docs/colossalai/colossalai.fx.proxy.rst
 delete mode 100644 docs/colossalai/colossalai.fx.rst
 delete mode 100644 docs/colossalai/colossalai.fx.tracer.rst
 delete mode 100644 docs/colossalai/colossalai.fx.tracer.tracer.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.chunk.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.chunk_mgr.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.gemini_context.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.gemini_mgr.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.memory_tracer.memory_monitor.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.memory_tracer.memstats_collector.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.memory_tracer.model_data_memtracer.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.memory_tracer.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.ophooks.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.ophooks.utils.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.paramhooks.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.placement_policy.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.stateful_tensor.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.stateful_tensor_container.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.stateful_tensor_mgr.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.tensor_placement_policy.rst
 delete mode 100644 docs/colossalai/colossalai.gemini.tensor_utils.rst
 delete mode 100644 docs/colossalai/colossalai.global_variables.rst
 delete mode 100644 docs/colossalai/colossalai.initialize.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.cuda_native.layer_norm.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.cuda_native.multihead_attention.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.cuda_native.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.cuda_native.scaled_softmax.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.jit.bias_dropout_add.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.jit.bias_gelu.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.jit.option.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.jit.rst
 delete mode 100644 docs/colossalai/colossalai.kernel.rst
 delete mode 100644 docs/colossalai/colossalai.logging.logger.rst
 delete mode 100644 docs/colossalai/colossalai.logging.rst
 delete mode 100644 docs/colossalai/colossalai.nn.graph.graph_node.rst
 delete mode 100644 docs/colossalai/colossalai.nn.graph.rst
 delete mode 100644 docs/colossalai/colossalai.nn.graph.utils.rst
 delete mode 100644 docs/colossalai/colossalai.nn.init.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.base_layer.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.colossalai_layer.dropout.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.colossalai_layer.embedding.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.colossalai_layer.linear.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.colossalai_layer.normalization.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.colossalai_layer.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.moe.experts.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.moe.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.moe.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.moe.utils.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_1d.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_1d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_2d.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_2d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_2p5d.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_3d.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_3d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_sequence.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.parallel_sequence.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.utils.common.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.utils.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.vanilla.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.vanilla.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.wrapper.pipeline_wrapper.rst
 delete mode 100644 docs/colossalai/colossalai.nn.layer.wrapper.rst
 delete mode 100644 docs/colossalai/colossalai.nn.loss.loss_1d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.loss.loss_2d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.loss.loss_2p5d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.loss.loss_3d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.loss.loss_moe.rst
 delete mode 100644 docs/colossalai/colossalai.nn.loss.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.cosine.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.delayed.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.linear.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.multistep.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.onecycle.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.poly.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.rst
 delete mode 100644 docs/colossalai/colossalai.nn.lr_scheduler.torch.rst
 delete mode 100644 docs/colossalai/colossalai.nn.metric.accuracy_2d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.metric.accuracy_2p5d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.metric.accuracy_3d.rst
 delete mode 100644 docs/colossalai/colossalai.nn.metric.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.colossalai_optimizer.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.cpu_adam.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.fused_adam.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.fused_lamb.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.fused_sgd.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.hybrid_adam.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.lamb.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.lars.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.rst
 delete mode 100644 docs/colossalai/colossalai.nn.optimizer.utils.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.data_parallel.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.layers.colo_module.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.layers.embedding.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.layers.linear.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.layers.module_utils.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.layers.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.reducer.rst
 delete mode 100644 docs/colossalai/colossalai.nn.parallel.rst
 delete mode 100644 docs/colossalai/colossalai.nn.rst
 delete mode 100644 docs/colossalai/colossalai.pipeline.layer_sepc.rst
 delete mode 100644 docs/colossalai/colossalai.pipeline.pipelinable.rst
 delete mode 100644 docs/colossalai/colossalai.pipeline.rst
 delete mode 100644 docs/colossalai/colossalai.pipeline.utils.rst
 delete mode 100644 docs/colossalai/colossalai.registry.registry.rst
 delete mode 100644 docs/colossalai/colossalai.registry.rst
 delete mode 100644 docs/colossalai/colossalai.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.colo_parameter.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.colo_tensor.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.compute_spec.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.const.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.dist_spec_mgr.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.distspec.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.op_wrapper.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.param_op_hook.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.process_group.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.tensor_spec.rst
 delete mode 100644 docs/colossalai/colossalai.tensor.utils.rst
 delete mode 100644 docs/colossalai/colossalai.testing.comparison.rst
 delete mode 100644 docs/colossalai/colossalai.testing.rst
 delete mode 100644 docs/colossalai/colossalai.testing.utils.rst
 delete mode 100644 docs/colossalai/colossalai.trainer.hooks.rst
 delete mode 100644 docs/colossalai/colossalai.trainer.rst
 delete mode 100644 docs/colossalai/colossalai.utils.activation_checkpoint.rst
 delete mode 100644 docs/colossalai/colossalai.utils.checkpoint.module_checkpoint.rst
 delete mode 100644 docs/colossalai/colossalai.utils.checkpoint.rst
 delete mode 100644 docs/colossalai/colossalai.utils.checkpoint.utils.rst
 delete mode 100644 docs/colossalai/colossalai.utils.checkpointing.rst
 delete mode 100644 docs/colossalai/colossalai.utils.common.rst
 delete mode 100644 docs/colossalai/colossalai.utils.cuda.rst
 delete mode 100644 docs/colossalai/colossalai.utils.data_sampler.base_sampler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.data_sampler.data_parallel_sampler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.data_sampler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.memory.rst
 delete mode 100644 docs/colossalai/colossalai.utils.model.colo_init_context.rst
 delete mode 100644 docs/colossalai/colossalai.utils.model.lazy_init_context.rst
 delete mode 100644 docs/colossalai/colossalai.utils.model.rst
 delete mode 100644 docs/colossalai/colossalai.utils.model.utils.rst
 delete mode 100644 docs/colossalai/colossalai.utils.moe.rst
 delete mode 100644 docs/colossalai/colossalai.utils.multi_tensor_apply.multi_tensor_apply.rst
 delete mode 100644 docs/colossalai/colossalai.utils.multi_tensor_apply.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.extention.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.legacy.comm_profiler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.legacy.mem_profiler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.legacy.pcie_profiler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.legacy.prof_utils.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.legacy.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.profiler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.rst
 delete mode 100644 docs/colossalai/colossalai.utils.profiler.stateful_tensor_mem_extention.rst
 delete mode 100644 docs/colossalai/colossalai.utils.rst
 delete mode 100644 docs/colossalai/colossalai.utils.tensor_detector.rst
 delete mode 100644 docs/colossalai/colossalai.utils.tensor_detector.tensor_detector.rst
 delete mode 100644 docs/colossalai/colossalai.utils.timer.rst
 delete mode 100644 docs/colossalai/colossalai.zero.init_ctx.init_context.rst
 delete mode 100644 docs/colossalai/colossalai.zero.init_ctx.rst
 delete mode 100644 docs/colossalai/colossalai.zero.rst
 delete mode 100644 docs/colossalai/colossalai.zero.shard_utils.base_shard_strategy.rst
 delete mode 100644 docs/colossalai/colossalai.zero.shard_utils.bucket_tensor_shard_strategy.rst
 delete mode 100644 docs/colossalai/colossalai.zero.shard_utils.commons.rst
 delete mode 100644 docs/colossalai/colossalai.zero.shard_utils.rst
 delete mode 100644 docs/colossalai/colossalai.zero.shard_utils.tensor_shard_strategy.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_model.reduce_scatter.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_model.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_model.sharded_model_v2.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_model.utils.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_optim.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_optim.sharded_optim_v2.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_param.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_param.sharded_param.rst
 delete mode 100644 docs/colossalai/colossalai.zero.sharded_param.sharded_tensor.rst
 delete mode 100644 docs/colossalai/colossalai.zero.utils.rst
 delete mode 100644 docs/colossalai/colossalai.zero.utils.zero_hook.rst
 delete mode 100644 docs/colossalai/colossalai.zero.utils.zero_hook_v2.rst
 delete mode 100644 docs/colossalai/colossalai.zero.zero_optimizer.rst
 delete mode 100644 docs/conf.py
 delete mode 100644 docs/index.rst
 delete mode 100644 docs/links/Colossalai Homepage.rst
 delete mode 100644 docs/links/Colossalai benchmarks.rst
 delete mode 100644 docs/links/Colossalai examples.rst
 delete mode 100644 docs/links/Colossalai tutorial.rst
 delete mode 100644 docs/make.bat
 delete mode 100644 docs/requirements.txt

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
deleted file mode 100644
index 98dd0cc4e979..000000000000
--- a/.readthedocs.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-# .readthedocs.yaml
-# Read the Docs configuration file
-# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
-
-# Required
-version: 2
-
-# Set the version of Python and other tools you might need
-build:
-  os: ubuntu-20.04
-  tools:
-    python: "3.9"
-    # You can also specify other tool versions:
-    # nodejs: "16"
-    # rust: "1.55"
-    # golang: "1.17"
-
-# Build documentation in the docs/ directory with Sphinx
-sphinx:
-   configuration: docs/conf.py
-
-# If using Sphinx, optionally build your docs in additional formats such as PDF
-# formats:
-#    - pdf
-
-# Optionally declare the Python requirements required to build your docs
-python:
-   install:
-   - requirements: requirements/requirements.txt
-   - requirements: docs/requirements.txt
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index 9f43a48d6420..000000000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = .
-BUILDDIR      = .build
-SPHINXAPIDOC  ?= sphinx-apidoc
-SPHINX_APIDOC_OPTIONS = members
-SPHINX_APIDOC_TEMPLATEDIR = _templates/apidoc
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile apidoc
-
-apidoc:
-	@SPHINX_APIDOC_OPTIONS=$(SPHINX_APIDOC_OPTIONS) $(SPHINXAPIDOC) -f -T -e -M -d 2 -t $(SPHINX_APIDOC_TEMPLATEDIR) -o ./colossalai ../colossalai
-# @$(SPHINXAPIDOC) -f -o ./model_zoo ../model_zoo
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_static/css/rtd_theme.css b/docs/_static/css/rtd_theme.css
deleted file mode 100644
index caf42dc5aaab..000000000000
--- a/docs/_static/css/rtd_theme.css
+++ /dev/null
@@ -1,3 +0,0 @@
-.wy-nav-content {
-    max-width: 80%;
-}
\ No newline at end of file
diff --git a/docs/_templates/apidoc/module.rst_t b/docs/_templates/apidoc/module.rst_t
deleted file mode 100644
index d9a50e6b9752..000000000000
--- a/docs/_templates/apidoc/module.rst_t
+++ /dev/null
@@ -1,9 +0,0 @@
-{%- if show_headings %}
-{{- basename | e | heading }}
-
-{% endif -%}
-.. automodule:: {{ qualname }}
-{%- for option in automodule_options %}
-   :{{ option }}:
-{%- endfor %}
-
diff --git a/docs/_templates/apidoc/package.rst_t b/docs/_templates/apidoc/package.rst_t
deleted file mode 100644
index 83742b3f7c66..000000000000
--- a/docs/_templates/apidoc/package.rst_t
+++ /dev/null
@@ -1,52 +0,0 @@
-{%- macro automodule(modname, options) -%}
-.. automodule:: {{ modname }}
-{%- for option in options %}
-   :{{ option }}:
-{%- endfor %}
-{%- endmacro %}
-
-{%- macro toctree(docnames) -%}
-.. toctree::
-   :maxdepth: {{ maxdepth }}
-{% for docname in docnames %}
-   {{ docname }}
-{%- endfor %}
-{%- endmacro %}
-
-{%- if is_namespace %}
-{{- pkgname | e | heading }}
-{% else %}
-{{- pkgname | e | heading }}
-{% endif %}
-
-{%- if is_namespace %}
-.. py:module:: {{ pkgname }}
-{% endif %}
-
-{%- if modulefirst and not is_namespace %}
-{{ automodule(pkgname, automodule_options) }}
-{% endif %}
-
-{%- if subpackages %}
-{{ toctree(subpackages) }}
-{% endif %}
-
-{%- if submodules %}
-{% if separatemodules %}
-{{ toctree(submodules) }}
-{% else %}
-{%- for submodule in submodules %}
-{% if show_headings %}
-{{- submodule | e | heading(2) }}
-{% endif %}
-{{ automodule(submodule, automodule_options) }}
-{% endfor %}
-{%- endif %}
-{%- endif %}
-
-{%- if not modulefirst and not is_namespace %}
-Module contents
----------------
-
-{{ automodule(pkgname, automodule_options) }}
-{% endif %}
diff --git a/docs/_templates/apidoc/toc.rst_t b/docs/_templates/apidoc/toc.rst_t
deleted file mode 100644
index f0877eeb2f85..000000000000
--- a/docs/_templates/apidoc/toc.rst_t
+++ /dev/null
@@ -1,8 +0,0 @@
-{{ header | heading }}
-
-.. toctree::
-   :maxdepth: {{ maxdepth }}
-{% for docname in docnames %}
-   {{ docname }}
-{%- endfor %}
-
diff --git a/docs/colossalai/colossalai.amp.amp_type.rst b/docs/colossalai/colossalai.amp.amp_type.rst
deleted file mode 100644
index 067af7d8c51a..000000000000
--- a/docs/colossalai/colossalai.amp.amp_type.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.amp.amp\_type
-========================
-
-.. automodule:: colossalai.amp.amp_type
-   :members:
diff --git a/docs/colossalai/colossalai.amp.apex_amp.apex_amp.rst b/docs/colossalai/colossalai.amp.apex_amp.apex_amp.rst
deleted file mode 100644
index cba7e00625a4..000000000000
--- a/docs/colossalai/colossalai.amp.apex_amp.apex_amp.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.amp.apex\_amp.apex\_amp
-==================================
-
-.. automodule:: colossalai.amp.apex_amp.apex_amp
-   :members:
diff --git a/docs/colossalai/colossalai.amp.apex_amp.rst b/docs/colossalai/colossalai.amp.apex_amp.rst
deleted file mode 100644
index 7116a538b4c1..000000000000
--- a/docs/colossalai/colossalai.amp.apex_amp.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.amp.apex\_amp
-========================
-
-.. automodule:: colossalai.amp.apex_amp
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.amp.apex_amp.apex_amp
diff --git a/docs/colossalai/colossalai.amp.naive_amp.grad_scaler.rst b/docs/colossalai/colossalai.amp.naive_amp.grad_scaler.rst
deleted file mode 100644
index 12d477825659..000000000000
--- a/docs/colossalai/colossalai.amp.naive_amp.grad_scaler.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-colossalai.amp.naive\_amp.grad\_scaler
-======================================
-
-.. automodule:: colossalai.amp.naive_amp.grad_scaler
-   :members:
-
-
-
diff --git a/docs/colossalai/colossalai.amp.naive_amp.naive_amp.rst b/docs/colossalai/colossalai.amp.naive_amp.naive_amp.rst
deleted file mode 100644
index e20f22b2e386..000000000000
--- a/docs/colossalai/colossalai.amp.naive_amp.naive_amp.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.amp.naive\_amp.naive\_amp
-====================================
-
-.. automodule:: colossalai.amp.naive_amp.naive_amp
-   :members:
diff --git a/docs/colossalai/colossalai.amp.naive_amp.rst b/docs/colossalai/colossalai.amp.naive_amp.rst
deleted file mode 100644
index fd364c05331c..000000000000
--- a/docs/colossalai/colossalai.amp.naive_amp.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-colossalai.amp.naive\_amp
-=========================
-
-.. automodule:: colossalai.amp.naive_amp
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.amp.naive_amp.grad_scaler
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.amp.naive_amp.naive_amp
diff --git a/docs/colossalai/colossalai.amp.rst b/docs/colossalai/colossalai.amp.rst
deleted file mode 100644
index 5ef4f36c13ac..000000000000
--- a/docs/colossalai/colossalai.amp.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-colossalai.amp
-==============
-
-.. automodule:: colossalai.amp
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.amp.apex_amp
-   colossalai.amp.naive_amp
-   colossalai.amp.torch_amp
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.amp.amp_type
diff --git a/docs/colossalai/colossalai.amp.torch_amp.rst b/docs/colossalai/colossalai.amp.torch_amp.rst
deleted file mode 100644
index f10095f136e0..000000000000
--- a/docs/colossalai/colossalai.amp.torch_amp.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.amp.torch\_amp
-=========================
-
-.. automodule:: colossalai.amp.torch_amp
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.amp.torch_amp.torch_amp
diff --git a/docs/colossalai/colossalai.amp.torch_amp.torch_amp.rst b/docs/colossalai/colossalai.amp.torch_amp.torch_amp.rst
deleted file mode 100644
index 5f1549cb8d48..000000000000
--- a/docs/colossalai/colossalai.amp.torch_amp.torch_amp.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.amp.torch\_amp.torch\_amp
-====================================
-
-.. automodule:: colossalai.amp.torch_amp.torch_amp
-   :members:
diff --git a/docs/colossalai/colossalai.builder.builder.rst b/docs/colossalai/colossalai.builder.builder.rst
deleted file mode 100644
index 85da78ab9e3d..000000000000
--- a/docs/colossalai/colossalai.builder.builder.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.builder.builder
-==========================
-
-.. automodule:: colossalai.builder.builder
-   :members:
diff --git a/docs/colossalai/colossalai.builder.rst b/docs/colossalai/colossalai.builder.rst
deleted file mode 100644
index 61163d7c1ea1..000000000000
--- a/docs/colossalai/colossalai.builder.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.builder
-==================
-
-.. automodule:: colossalai.builder
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.builder.builder
diff --git a/docs/colossalai/colossalai.cli.benchmark.benchmark.rst b/docs/colossalai/colossalai.cli.benchmark.benchmark.rst
deleted file mode 100644
index 94a4170c8590..000000000000
--- a/docs/colossalai/colossalai.cli.benchmark.benchmark.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.benchmark.benchmark
-==================================
-
-.. automodule:: colossalai.cli.benchmark.benchmark
-   :members:
diff --git a/docs/colossalai/colossalai.cli.benchmark.models.rst b/docs/colossalai/colossalai.cli.benchmark.models.rst
deleted file mode 100644
index 4e6290288d59..000000000000
--- a/docs/colossalai/colossalai.cli.benchmark.models.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.benchmark.models
-===============================
-
-.. automodule:: colossalai.cli.benchmark.models
-   :members:
diff --git a/docs/colossalai/colossalai.cli.benchmark.rst b/docs/colossalai/colossalai.cli.benchmark.rst
deleted file mode 100644
index 80fb43dde04b..000000000000
--- a/docs/colossalai/colossalai.cli.benchmark.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.cli.benchmark
-========================
-
-.. automodule:: colossalai.cli.benchmark
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.cli.benchmark.benchmark
-   colossalai.cli.benchmark.models
-   colossalai.cli.benchmark.utils
diff --git a/docs/colossalai/colossalai.cli.benchmark.utils.rst b/docs/colossalai/colossalai.cli.benchmark.utils.rst
deleted file mode 100644
index 12fbaf2270ec..000000000000
--- a/docs/colossalai/colossalai.cli.benchmark.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.benchmark.utils
-==============================
-
-.. automodule:: colossalai.cli.benchmark.utils
-   :members:
diff --git a/docs/colossalai/colossalai.cli.check.check_installation.rst b/docs/colossalai/colossalai.cli.check.check_installation.rst
deleted file mode 100644
index 95b2d02ca371..000000000000
--- a/docs/colossalai/colossalai.cli.check.check_installation.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.check.check\_installation
-========================================
-
-.. automodule:: colossalai.cli.check.check_installation
-   :members:
diff --git a/docs/colossalai/colossalai.cli.check.rst b/docs/colossalai/colossalai.cli.check.rst
deleted file mode 100644
index 262ae7ad31ba..000000000000
--- a/docs/colossalai/colossalai.cli.check.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.cli.check
-====================
-
-.. automodule:: colossalai.cli.check
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.cli.check.check_installation
diff --git a/docs/colossalai/colossalai.cli.cli.rst b/docs/colossalai/colossalai.cli.cli.rst
deleted file mode 100644
index 8f83973d5e0c..000000000000
--- a/docs/colossalai/colossalai.cli.cli.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.cli
-==================
-
-.. automodule:: colossalai.cli.cli
-   :members:
diff --git a/docs/colossalai/colossalai.cli.launcher.hostinfo.rst b/docs/colossalai/colossalai.cli.launcher.hostinfo.rst
deleted file mode 100644
index 5bcd9dd8cc4c..000000000000
--- a/docs/colossalai/colossalai.cli.launcher.hostinfo.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.launcher.hostinfo
-================================
-
-.. automodule:: colossalai.cli.launcher.hostinfo
-   :members:
diff --git a/docs/colossalai/colossalai.cli.launcher.multinode_runner.rst b/docs/colossalai/colossalai.cli.launcher.multinode_runner.rst
deleted file mode 100644
index 223b0deac1f1..000000000000
--- a/docs/colossalai/colossalai.cli.launcher.multinode_runner.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.launcher.multinode\_runner
-=========================================
-
-.. automodule:: colossalai.cli.launcher.multinode_runner
-   :members:
diff --git a/docs/colossalai/colossalai.cli.launcher.rst b/docs/colossalai/colossalai.cli.launcher.rst
deleted file mode 100644
index 38bef61c790d..000000000000
--- a/docs/colossalai/colossalai.cli.launcher.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.cli.launcher
-=======================
-
-.. automodule:: colossalai.cli.launcher
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.cli.launcher.hostinfo
-   colossalai.cli.launcher.multinode_runner
-   colossalai.cli.launcher.run
diff --git a/docs/colossalai/colossalai.cli.launcher.run.rst b/docs/colossalai/colossalai.cli.launcher.run.rst
deleted file mode 100644
index 8506fb9e3165..000000000000
--- a/docs/colossalai/colossalai.cli.launcher.run.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.cli.launcher.run
-===========================
-
-.. automodule:: colossalai.cli.launcher.run
-   :members:
diff --git a/docs/colossalai/colossalai.cli.rst b/docs/colossalai/colossalai.cli.rst
deleted file mode 100644
index 8cc0dcb04aed..000000000000
--- a/docs/colossalai/colossalai.cli.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-colossalai.cli
-==============
-
-.. automodule:: colossalai.cli
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.cli.benchmark
-   colossalai.cli.check
-   colossalai.cli.launcher
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.cli.cli
diff --git a/docs/colossalai/colossalai.communication.collective.rst b/docs/colossalai/colossalai.communication.collective.rst
deleted file mode 100644
index 5015edf98901..000000000000
--- a/docs/colossalai/colossalai.communication.collective.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.communication.collective
-===================================
-
-.. automodule:: colossalai.communication.collective
-   :members:
diff --git a/docs/colossalai/colossalai.communication.p2p.rst b/docs/colossalai/colossalai.communication.p2p.rst
deleted file mode 100644
index 79135bb8630f..000000000000
--- a/docs/colossalai/colossalai.communication.p2p.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.communication.p2p
-============================
-
-.. automodule:: colossalai.communication.p2p
-   :members:
diff --git a/docs/colossalai/colossalai.communication.ring.rst b/docs/colossalai/colossalai.communication.ring.rst
deleted file mode 100644
index c218d4bed350..000000000000
--- a/docs/colossalai/colossalai.communication.ring.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.communication.ring
-=============================
-
-.. automodule:: colossalai.communication.ring
-   :members:
diff --git a/docs/colossalai/colossalai.communication.rst b/docs/colossalai/colossalai.communication.rst
deleted file mode 100644
index 5086fa663ec7..000000000000
--- a/docs/colossalai/colossalai.communication.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-colossalai.communication
-========================
-
-.. automodule:: colossalai.communication
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.communication.collective
-   colossalai.communication.p2p
-   colossalai.communication.ring
-   colossalai.communication.utils
diff --git a/docs/colossalai/colossalai.communication.utils.rst b/docs/colossalai/colossalai.communication.utils.rst
deleted file mode 100644
index 19a36cc9ff6f..000000000000
--- a/docs/colossalai/colossalai.communication.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.communication.utils
-==============================
-
-.. automodule:: colossalai.communication.utils
-   :members:
diff --git a/docs/colossalai/colossalai.constants.rst b/docs/colossalai/colossalai.constants.rst
deleted file mode 100644
index 330b3e8668ec..000000000000
--- a/docs/colossalai/colossalai.constants.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.constants
-====================
-
-.. automodule:: colossalai.constants
-   :members:
diff --git a/docs/colossalai/colossalai.context.config.rst b/docs/colossalai/colossalai.context.config.rst
deleted file mode 100644
index 2fb1b99d3e7a..000000000000
--- a/docs/colossalai/colossalai.context.config.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.config
-=========================
-
-.. automodule:: colossalai.context.config
-   :members:
diff --git a/docs/colossalai/colossalai.context.moe_context.rst b/docs/colossalai/colossalai.context.moe_context.rst
deleted file mode 100644
index 9027d19ff023..000000000000
--- a/docs/colossalai/colossalai.context.moe_context.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.moe\_context
-===============================
-
-.. automodule:: colossalai.context.moe_context
-   :members:
diff --git a/docs/colossalai/colossalai.context.parallel_context.rst b/docs/colossalai/colossalai.context.parallel_context.rst
deleted file mode 100644
index d1c82c517845..000000000000
--- a/docs/colossalai/colossalai.context.parallel_context.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.parallel\_context
-====================================
-
-.. automodule:: colossalai.context.parallel_context
-   :members:
diff --git a/docs/colossalai/colossalai.context.parallel_mode.rst b/docs/colossalai/colossalai.context.parallel_mode.rst
deleted file mode 100644
index f7ac137493fb..000000000000
--- a/docs/colossalai/colossalai.context.parallel_mode.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.parallel\_mode
-=================================
-
-.. automodule:: colossalai.context.parallel_mode
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_1d.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_1d.rst
deleted file mode 100644
index 88cbf3ebadb3..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_1d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_1d
-==============================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_1d
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_2d.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_2d.rst
deleted file mode 100644
index d99a2e1c3177..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_2d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_2d
-==============================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_2d
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_2p5d.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_2p5d.rst
deleted file mode 100644
index 73d80e4431bb..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_2p5d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_2p5d
-================================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_2p5d
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_3d.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_3d.rst
deleted file mode 100644
index 5cfba5ce0870..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_3d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_3d
-==============================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_3d
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_data.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_data.rst
deleted file mode 100644
index 55ad05f32b14..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_data.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_data
-================================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_data
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_model.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_model.rst
deleted file mode 100644
index 8f2d79369915..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_model.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_model
-=================================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_model
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_pipeline.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_pipeline.rst
deleted file mode 100644
index 466d5143a02b..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_pipeline.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_pipeline
-====================================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_pipeline
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_sequence.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_sequence.rst
deleted file mode 100644
index dab71cc3c391..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_sequence.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_sequence
-====================================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_sequence
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.initializer_tensor.rst b/docs/colossalai/colossalai.context.process_group_initializer.initializer_tensor.rst
deleted file mode 100644
index 0c2d8d1e9daa..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.initializer_tensor.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.initializer\_tensor
-==================================================================
-
-.. automodule:: colossalai.context.process_group_initializer.initializer_tensor
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.process_group_initializer.rst b/docs/colossalai/colossalai.context.process_group_initializer.process_group_initializer.rst
deleted file mode 100644
index 3f98723c170b..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.process_group_initializer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.process\_group\_initializer.process\_group\_initializer
-==========================================================================
-
-.. automodule:: colossalai.context.process_group_initializer.process_group_initializer
-   :members:
diff --git a/docs/colossalai/colossalai.context.process_group_initializer.rst b/docs/colossalai/colossalai.context.process_group_initializer.rst
deleted file mode 100644
index 519337e9c71d..000000000000
--- a/docs/colossalai/colossalai.context.process_group_initializer.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-colossalai.context.process\_group\_initializer
-==============================================
-
-.. automodule:: colossalai.context.process_group_initializer
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.context.process_group_initializer.initializer_1d
-   colossalai.context.process_group_initializer.initializer_2d
-   colossalai.context.process_group_initializer.initializer_2p5d
-   colossalai.context.process_group_initializer.initializer_3d
-   colossalai.context.process_group_initializer.initializer_data
-   colossalai.context.process_group_initializer.initializer_model
-   colossalai.context.process_group_initializer.initializer_pipeline
-   colossalai.context.process_group_initializer.initializer_sequence
-   colossalai.context.process_group_initializer.initializer_tensor
-   colossalai.context.process_group_initializer.process_group_initializer
diff --git a/docs/colossalai/colossalai.context.random.rst b/docs/colossalai/colossalai.context.random.rst
deleted file mode 100644
index 8d4b9c56af3c..000000000000
--- a/docs/colossalai/colossalai.context.random.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.context.random
-=========================
-
-.. automodule:: colossalai.context.random
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.context.random.seed_manager
diff --git a/docs/colossalai/colossalai.context.random.seed_manager.rst b/docs/colossalai/colossalai.context.random.seed_manager.rst
deleted file mode 100644
index b71f35c2750c..000000000000
--- a/docs/colossalai/colossalai.context.random.seed_manager.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.random.seed\_manager
-=======================================
-
-.. automodule:: colossalai.context.random.seed_manager
-   :members:
diff --git a/docs/colossalai/colossalai.context.rst b/docs/colossalai/colossalai.context.rst
deleted file mode 100644
index 102a9e02eaa4..000000000000
--- a/docs/colossalai/colossalai.context.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-colossalai.context
-==================
-
-.. automodule:: colossalai.context
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.context.process_group_initializer
-   colossalai.context.random
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.context.config
-   colossalai.context.moe_context
-   colossalai.context.parallel_context
-   colossalai.context.parallel_mode
-   colossalai.context.singleton_meta
diff --git a/docs/colossalai/colossalai.context.singleton_meta.rst b/docs/colossalai/colossalai.context.singleton_meta.rst
deleted file mode 100644
index ae4ceb314f32..000000000000
--- a/docs/colossalai/colossalai.context.singleton_meta.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.context.singleton\_meta
-==================================
-
-.. automodule:: colossalai.context.singleton_meta
-   :members:
diff --git a/docs/colossalai/colossalai.core.rst b/docs/colossalai/colossalai.core.rst
deleted file mode 100644
index d9ddb76ed72a..000000000000
--- a/docs/colossalai/colossalai.core.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.core
-===============
-
-.. automodule:: colossalai.core
-   :members:
diff --git a/docs/colossalai/colossalai.engine.gradient_accumulation.rst b/docs/colossalai/colossalai.engine.gradient_accumulation.rst
deleted file mode 100644
index 75fc0e9a24eb..000000000000
--- a/docs/colossalai/colossalai.engine.gradient_accumulation.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.engine.gradient\_accumulation
-========================================
-
-.. automodule:: colossalai.engine.gradient_accumulation
-   :members:
diff --git a/docs/colossalai/colossalai.engine.gradient_handler.rst b/docs/colossalai/colossalai.engine.gradient_handler.rst
deleted file mode 100644
index 27eb2b56a29f..000000000000
--- a/docs/colossalai/colossalai.engine.gradient_handler.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.engine.gradient\_handler
-===================================
-
-.. automodule:: colossalai.engine.gradient_handler
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.engine.gradient_handler.utils
diff --git a/docs/colossalai/colossalai.engine.gradient_handler.utils.rst b/docs/colossalai/colossalai.engine.gradient_handler.utils.rst
deleted file mode 100644
index c8997e135b60..000000000000
--- a/docs/colossalai/colossalai.engine.gradient_handler.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.engine.gradient\_handler.utils
-=========================================
-
-.. automodule:: colossalai.engine.gradient_handler.utils
-   :members:
diff --git a/docs/colossalai/colossalai.engine.rst b/docs/colossalai/colossalai.engine.rst
deleted file mode 100644
index 3d194b70695e..000000000000
--- a/docs/colossalai/colossalai.engine.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-colossalai.engine
-=================
-
-.. automodule:: colossalai.engine
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.engine.gradient_accumulation
-   colossalai.engine.gradient_handler
-   colossalai.engine.schedule
diff --git a/docs/colossalai/colossalai.engine.schedule.rst b/docs/colossalai/colossalai.engine.schedule.rst
deleted file mode 100644
index 2909373f0002..000000000000
--- a/docs/colossalai/colossalai.engine.schedule.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.engine.schedule
-==========================
-
-.. automodule:: colossalai.engine.schedule
-   :members:
diff --git a/docs/colossalai/colossalai.fx.passes.adding_split_node_pass.rst b/docs/colossalai/colossalai.fx.passes.adding_split_node_pass.rst
deleted file mode 100644
index 6799fdc658cd..000000000000
--- a/docs/colossalai/colossalai.fx.passes.adding_split_node_pass.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.fx.passes.adding\_split\_node\_pass
-==============================================
-
-.. automodule:: colossalai.fx.passes.adding_split_node_pass
-   :members:
diff --git a/docs/colossalai/colossalai.fx.passes.meta_info_prop.rst b/docs/colossalai/colossalai.fx.passes.meta_info_prop.rst
deleted file mode 100644
index 4e51732ce83d..000000000000
--- a/docs/colossalai/colossalai.fx.passes.meta_info_prop.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.fx.passes.meta\_info\_prop
-=====================================
-
-.. automodule:: colossalai.fx.passes.meta_info_prop
-   :members:
diff --git a/docs/colossalai/colossalai.fx.passes.rst b/docs/colossalai/colossalai.fx.passes.rst
deleted file mode 100644
index fac10b768034..000000000000
--- a/docs/colossalai/colossalai.fx.passes.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-colossalai.fx.passes
-====================
-
-.. automodule:: colossalai.fx.passes
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.fx.passes.adding_split_node_pass
-   colossalai.fx.passes.meta_info_prop
-   colossalai.fx.passes.shard_1d_pass
-   colossalai.fx.passes.split_module
-   colossalai.fx.passes.utils
diff --git a/docs/colossalai/colossalai.fx.passes.shard_1d_pass.rst b/docs/colossalai/colossalai.fx.passes.shard_1d_pass.rst
deleted file mode 100644
index 0942e96d46dc..000000000000
--- a/docs/colossalai/colossalai.fx.passes.shard_1d_pass.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.fx.passes.shard\_1d\_pass
-====================================
-
-.. automodule:: colossalai.fx.passes.shard_1d_pass
-   :members:
diff --git a/docs/colossalai/colossalai.fx.passes.split_module.rst b/docs/colossalai/colossalai.fx.passes.split_module.rst
deleted file mode 100644
index 9e5e58259254..000000000000
--- a/docs/colossalai/colossalai.fx.passes.split_module.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.fx.passes.split\_module
-==================================
-
-.. automodule:: colossalai.fx.passes.split_module
-   :members:
diff --git a/docs/colossalai/colossalai.fx.passes.utils.rst b/docs/colossalai/colossalai.fx.passes.utils.rst
deleted file mode 100644
index 4afd9256322b..000000000000
--- a/docs/colossalai/colossalai.fx.passes.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.fx.passes.utils
-==========================
-
-.. automodule:: colossalai.fx.passes.utils
-   :members:
diff --git a/docs/colossalai/colossalai.fx.proxy.rst b/docs/colossalai/colossalai.fx.proxy.rst
deleted file mode 100644
index 4b92da41c794..000000000000
--- a/docs/colossalai/colossalai.fx.proxy.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.fx.proxy
-===================
-
-.. automodule:: colossalai.fx.proxy
-   :members:
diff --git a/docs/colossalai/colossalai.fx.rst b/docs/colossalai/colossalai.fx.rst
deleted file mode 100644
index 778d642c3a11..000000000000
--- a/docs/colossalai/colossalai.fx.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-colossalai.fx
-=============
-
-.. automodule:: colossalai.fx
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.fx.passes
-   colossalai.fx.tracer
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.fx.proxy
diff --git a/docs/colossalai/colossalai.fx.tracer.rst b/docs/colossalai/colossalai.fx.tracer.rst
deleted file mode 100644
index d2f743d67d55..000000000000
--- a/docs/colossalai/colossalai.fx.tracer.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.fx.tracer
-====================
-
-.. automodule:: colossalai.fx.tracer
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.fx.tracer.tracer
diff --git a/docs/colossalai/colossalai.fx.tracer.tracer.rst b/docs/colossalai/colossalai.fx.tracer.tracer.rst
deleted file mode 100644
index 83b98bafd825..000000000000
--- a/docs/colossalai/colossalai.fx.tracer.tracer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.fx.tracer.tracer
-===========================
-
-.. automodule:: colossalai.fx.tracer.tracer
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.chunk.rst b/docs/colossalai/colossalai.gemini.chunk.rst
deleted file mode 100644
index 9fe1c2b415d6..000000000000
--- a/docs/colossalai/colossalai.gemini.chunk.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.chunk
-=======================
-
-.. automodule:: colossalai.gemini.chunk
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.chunk_mgr.rst b/docs/colossalai/colossalai.gemini.chunk_mgr.rst
deleted file mode 100644
index acb554faf319..000000000000
--- a/docs/colossalai/colossalai.gemini.chunk_mgr.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.chunk\_mgr
-============================
-
-.. automodule:: colossalai.gemini.chunk_mgr
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.gemini_context.rst b/docs/colossalai/colossalai.gemini.gemini_context.rst
deleted file mode 100644
index be4884062253..000000000000
--- a/docs/colossalai/colossalai.gemini.gemini_context.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.gemini\_context
-=================================
-
-.. automodule:: colossalai.gemini.gemini_context
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.gemini_mgr.rst b/docs/colossalai/colossalai.gemini.gemini_mgr.rst
deleted file mode 100644
index 5d7f944f7a56..000000000000
--- a/docs/colossalai/colossalai.gemini.gemini_mgr.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.gemini\_mgr
-=============================
-
-.. automodule:: colossalai.gemini.gemini_mgr
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.memory_tracer.memory_monitor.rst b/docs/colossalai/colossalai.gemini.memory_tracer.memory_monitor.rst
deleted file mode 100644
index e8088a609f34..000000000000
--- a/docs/colossalai/colossalai.gemini.memory_tracer.memory_monitor.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.memory\_tracer.memory\_monitor
-================================================
-
-.. automodule:: colossalai.gemini.memory_tracer.memory_monitor
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.memory_tracer.memstats_collector.rst b/docs/colossalai/colossalai.gemini.memory_tracer.memstats_collector.rst
deleted file mode 100644
index e2682220c27b..000000000000
--- a/docs/colossalai/colossalai.gemini.memory_tracer.memstats_collector.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.memory\_tracer.memstats\_collector
-====================================================
-
-.. automodule:: colossalai.gemini.memory_tracer.memstats_collector
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.memory_tracer.model_data_memtracer.rst b/docs/colossalai/colossalai.gemini.memory_tracer.model_data_memtracer.rst
deleted file mode 100644
index ccdfe6682c3f..000000000000
--- a/docs/colossalai/colossalai.gemini.memory_tracer.model_data_memtracer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.memory\_tracer.model\_data\_memtracer
-=======================================================
-
-.. automodule:: colossalai.gemini.memory_tracer.model_data_memtracer
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.memory_tracer.rst b/docs/colossalai/colossalai.gemini.memory_tracer.rst
deleted file mode 100644
index f3d9c4d76dd8..000000000000
--- a/docs/colossalai/colossalai.gemini.memory_tracer.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.gemini.memory\_tracer
-================================
-
-.. automodule:: colossalai.gemini.memory_tracer
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.gemini.memory_tracer.memory_monitor
-   colossalai.gemini.memory_tracer.memstats_collector
-   colossalai.gemini.memory_tracer.model_data_memtracer
diff --git a/docs/colossalai/colossalai.gemini.ophooks.rst b/docs/colossalai/colossalai.gemini.ophooks.rst
deleted file mode 100644
index af87ab568ac0..000000000000
--- a/docs/colossalai/colossalai.gemini.ophooks.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.gemini.ophooks
-=========================
-
-.. automodule:: colossalai.gemini.ophooks
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.gemini.ophooks.utils
diff --git a/docs/colossalai/colossalai.gemini.ophooks.utils.rst b/docs/colossalai/colossalai.gemini.ophooks.utils.rst
deleted file mode 100644
index 5c5917047f44..000000000000
--- a/docs/colossalai/colossalai.gemini.ophooks.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.ophooks.utils
-===============================
-
-.. automodule:: colossalai.gemini.ophooks.utils
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.paramhooks.rst b/docs/colossalai/colossalai.gemini.paramhooks.rst
deleted file mode 100644
index 28a823d4e69c..000000000000
--- a/docs/colossalai/colossalai.gemini.paramhooks.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.paramhooks
-============================
-
-.. automodule:: colossalai.gemini.paramhooks
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.placement_policy.rst b/docs/colossalai/colossalai.gemini.placement_policy.rst
deleted file mode 100644
index 9de0ed52371b..000000000000
--- a/docs/colossalai/colossalai.gemini.placement_policy.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.placement\_policy
-===================================
-
-.. automodule:: colossalai.gemini.placement_policy
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.rst b/docs/colossalai/colossalai.gemini.rst
deleted file mode 100644
index 4f6efe386521..000000000000
--- a/docs/colossalai/colossalai.gemini.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-colossalai.gemini
-=================
-
-.. automodule:: colossalai.gemini
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.gemini.memory_tracer
-   colossalai.gemini.ophooks
-   colossalai.gemini.paramhooks
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.gemini.chunk
-   colossalai.gemini.chunk_mgr
-   colossalai.gemini.gemini_context
-   colossalai.gemini.gemini_mgr
-   colossalai.gemini.placement_policy
-   colossalai.gemini.stateful_tensor
-   colossalai.gemini.stateful_tensor_container
-   colossalai.gemini.stateful_tensor_mgr
-   colossalai.gemini.tensor_placement_policy
-   colossalai.gemini.tensor_utils
diff --git a/docs/colossalai/colossalai.gemini.stateful_tensor.rst b/docs/colossalai/colossalai.gemini.stateful_tensor.rst
deleted file mode 100644
index 02d526d1b4c8..000000000000
--- a/docs/colossalai/colossalai.gemini.stateful_tensor.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.stateful\_tensor
-==================================
-
-.. automodule:: colossalai.gemini.stateful_tensor
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.stateful_tensor_container.rst b/docs/colossalai/colossalai.gemini.stateful_tensor_container.rst
deleted file mode 100644
index be56c2aa8ed2..000000000000
--- a/docs/colossalai/colossalai.gemini.stateful_tensor_container.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.stateful\_tensor\_container
-=============================================
-
-.. automodule:: colossalai.gemini.stateful_tensor_container
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.stateful_tensor_mgr.rst b/docs/colossalai/colossalai.gemini.stateful_tensor_mgr.rst
deleted file mode 100644
index 3456192bd735..000000000000
--- a/docs/colossalai/colossalai.gemini.stateful_tensor_mgr.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.stateful\_tensor\_mgr
-=======================================
-
-.. automodule:: colossalai.gemini.stateful_tensor_mgr
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.tensor_placement_policy.rst b/docs/colossalai/colossalai.gemini.tensor_placement_policy.rst
deleted file mode 100644
index 81dcac339048..000000000000
--- a/docs/colossalai/colossalai.gemini.tensor_placement_policy.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.tensor\_placement\_policy
-===========================================
-
-.. automodule:: colossalai.gemini.tensor_placement_policy
-   :members:
diff --git a/docs/colossalai/colossalai.gemini.tensor_utils.rst b/docs/colossalai/colossalai.gemini.tensor_utils.rst
deleted file mode 100644
index 385baf4b50bb..000000000000
--- a/docs/colossalai/colossalai.gemini.tensor_utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.gemini.tensor\_utils
-===============================
-
-.. automodule:: colossalai.gemini.tensor_utils
-   :members:
diff --git a/docs/colossalai/colossalai.global_variables.rst b/docs/colossalai/colossalai.global_variables.rst
deleted file mode 100644
index 1900c88351ff..000000000000
--- a/docs/colossalai/colossalai.global_variables.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.global\_variables
-============================
-
-.. automodule:: colossalai.global_variables
-   :members:
diff --git a/docs/colossalai/colossalai.initialize.rst b/docs/colossalai/colossalai.initialize.rst
deleted file mode 100644
index d3f65076a795..000000000000
--- a/docs/colossalai/colossalai.initialize.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.initialize
-=====================
-
-.. automodule:: colossalai.initialize
-   :members:
diff --git a/docs/colossalai/colossalai.kernel.cuda_native.layer_norm.rst b/docs/colossalai/colossalai.kernel.cuda_native.layer_norm.rst
deleted file mode 100644
index b8bff51bef34..000000000000
--- a/docs/colossalai/colossalai.kernel.cuda_native.layer_norm.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.kernel.cuda\_native.layer\_norm
-==========================================
-
-.. automodule:: colossalai.kernel.cuda_native.layer_norm
-   :members:
diff --git a/docs/colossalai/colossalai.kernel.cuda_native.multihead_attention.rst b/docs/colossalai/colossalai.kernel.cuda_native.multihead_attention.rst
deleted file mode 100644
index de7577d195cd..000000000000
--- a/docs/colossalai/colossalai.kernel.cuda_native.multihead_attention.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.kernel.cuda\_native.multihead\_attention
-===================================================
-
-.. automodule:: colossalai.kernel.cuda_native.multihead_attention
-   :members:
diff --git a/docs/colossalai/colossalai.kernel.cuda_native.rst b/docs/colossalai/colossalai.kernel.cuda_native.rst
deleted file mode 100644
index d88e4cfdb761..000000000000
--- a/docs/colossalai/colossalai.kernel.cuda_native.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.kernel.cuda\_native
-==============================
-
-.. automodule:: colossalai.kernel.cuda_native
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.kernel.cuda_native.layer_norm
-   colossalai.kernel.cuda_native.multihead_attention
-   colossalai.kernel.cuda_native.scaled_softmax
diff --git a/docs/colossalai/colossalai.kernel.cuda_native.scaled_softmax.rst b/docs/colossalai/colossalai.kernel.cuda_native.scaled_softmax.rst
deleted file mode 100644
index 474fcd3349bd..000000000000
--- a/docs/colossalai/colossalai.kernel.cuda_native.scaled_softmax.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.kernel.cuda\_native.scaled\_softmax
-==============================================
-
-.. automodule:: colossalai.kernel.cuda_native.scaled_softmax
-   :members:
diff --git a/docs/colossalai/colossalai.kernel.jit.bias_dropout_add.rst b/docs/colossalai/colossalai.kernel.jit.bias_dropout_add.rst
deleted file mode 100644
index d61550928bc8..000000000000
--- a/docs/colossalai/colossalai.kernel.jit.bias_dropout_add.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.kernel.jit.bias\_dropout\_add
-========================================
-
-.. automodule:: colossalai.kernel.jit.bias_dropout_add
-   :members:
diff --git a/docs/colossalai/colossalai.kernel.jit.bias_gelu.rst b/docs/colossalai/colossalai.kernel.jit.bias_gelu.rst
deleted file mode 100644
index 7db184b4ce3b..000000000000
--- a/docs/colossalai/colossalai.kernel.jit.bias_gelu.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.kernel.jit.bias\_gelu
-================================
-
-.. automodule:: colossalai.kernel.jit.bias_gelu
-   :members:
diff --git a/docs/colossalai/colossalai.kernel.jit.option.rst b/docs/colossalai/colossalai.kernel.jit.option.rst
deleted file mode 100644
index 15ebfc83aa77..000000000000
--- a/docs/colossalai/colossalai.kernel.jit.option.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.kernel.jit.option
-============================
-
-.. automodule:: colossalai.kernel.jit.option
-   :members:
diff --git a/docs/colossalai/colossalai.kernel.jit.rst b/docs/colossalai/colossalai.kernel.jit.rst
deleted file mode 100644
index 8b2f728d34d5..000000000000
--- a/docs/colossalai/colossalai.kernel.jit.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.kernel.jit
-=====================
-
-.. automodule:: colossalai.kernel.jit
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.kernel.jit.bias_dropout_add
-   colossalai.kernel.jit.bias_gelu
-   colossalai.kernel.jit.option
diff --git a/docs/colossalai/colossalai.kernel.rst b/docs/colossalai/colossalai.kernel.rst
deleted file mode 100644
index dcbac8c1de76..000000000000
--- a/docs/colossalai/colossalai.kernel.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.kernel
-=================
-
-.. automodule:: colossalai.kernel
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.kernel.cuda_native
-   colossalai.kernel.jit
diff --git a/docs/colossalai/colossalai.logging.logger.rst b/docs/colossalai/colossalai.logging.logger.rst
deleted file mode 100644
index 047deb8a1d19..000000000000
--- a/docs/colossalai/colossalai.logging.logger.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.logging.logger
-=========================
-
-.. automodule:: colossalai.logging.logger
-   :members:
diff --git a/docs/colossalai/colossalai.logging.rst b/docs/colossalai/colossalai.logging.rst
deleted file mode 100644
index bc593fc81bf4..000000000000
--- a/docs/colossalai/colossalai.logging.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.logging
-==================
-
-.. automodule:: colossalai.logging
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.logging.logger
diff --git a/docs/colossalai/colossalai.nn.graph.graph_node.rst b/docs/colossalai/colossalai.nn.graph.graph_node.rst
deleted file mode 100644
index 335ecfe620fe..000000000000
--- a/docs/colossalai/colossalai.nn.graph.graph_node.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.graph.graph\_node
-===============================
-
-.. automodule:: colossalai.nn.graph.graph_node
-   :members:
diff --git a/docs/colossalai/colossalai.nn.graph.rst b/docs/colossalai/colossalai.nn.graph.rst
deleted file mode 100644
index 4510b3374f2a..000000000000
--- a/docs/colossalai/colossalai.nn.graph.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-colossalai.nn.graph
-===================
-
-.. automodule:: colossalai.nn.graph
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.graph.graph_node
-   colossalai.nn.graph.utils
diff --git a/docs/colossalai/colossalai.nn.graph.utils.rst b/docs/colossalai/colossalai.nn.graph.utils.rst
deleted file mode 100644
index 866a93cd9201..000000000000
--- a/docs/colossalai/colossalai.nn.graph.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.graph.utils
-=========================
-
-.. automodule:: colossalai.nn.graph.utils
-   :members:
diff --git a/docs/colossalai/colossalai.nn.init.rst b/docs/colossalai/colossalai.nn.init.rst
deleted file mode 100644
index d0ab993126d5..000000000000
--- a/docs/colossalai/colossalai.nn.init.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.init
-==================
-
-.. automodule:: colossalai.nn.init
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.base_layer.rst b/docs/colossalai/colossalai.nn.layer.base_layer.rst
deleted file mode 100644
index c2a22f04d3f3..000000000000
--- a/docs/colossalai/colossalai.nn.layer.base_layer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.base\_layer
-===============================
-
-.. automodule:: colossalai.nn.layer.base_layer
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.colossalai_layer.dropout.rst b/docs/colossalai/colossalai.nn.layer.colossalai_layer.dropout.rst
deleted file mode 100644
index ec1dfd395f17..000000000000
--- a/docs/colossalai/colossalai.nn.layer.colossalai_layer.dropout.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.colossalai\_layer.dropout
-=============================================
-
-.. automodule:: colossalai.nn.layer.colossalai_layer.dropout
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.colossalai_layer.embedding.rst b/docs/colossalai/colossalai.nn.layer.colossalai_layer.embedding.rst
deleted file mode 100644
index 8438b3a07787..000000000000
--- a/docs/colossalai/colossalai.nn.layer.colossalai_layer.embedding.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.colossalai\_layer.embedding
-===============================================
-
-.. automodule:: colossalai.nn.layer.colossalai_layer.embedding
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.colossalai_layer.linear.rst b/docs/colossalai/colossalai.nn.layer.colossalai_layer.linear.rst
deleted file mode 100644
index 3213282549ea..000000000000
--- a/docs/colossalai/colossalai.nn.layer.colossalai_layer.linear.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.colossalai\_layer.linear
-============================================
-
-.. automodule:: colossalai.nn.layer.colossalai_layer.linear
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.colossalai_layer.normalization.rst b/docs/colossalai/colossalai.nn.layer.colossalai_layer.normalization.rst
deleted file mode 100644
index f94dd27b86e4..000000000000
--- a/docs/colossalai/colossalai.nn.layer.colossalai_layer.normalization.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.colossalai\_layer.normalization
-===================================================
-
-.. automodule:: colossalai.nn.layer.colossalai_layer.normalization
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.colossalai_layer.rst b/docs/colossalai/colossalai.nn.layer.colossalai_layer.rst
deleted file mode 100644
index 0f685e6c2dc3..000000000000
--- a/docs/colossalai/colossalai.nn.layer.colossalai_layer.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-colossalai.nn.layer.colossalai\_layer
-=====================================
-
-.. automodule:: colossalai.nn.layer.colossalai_layer
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.colossalai_layer.dropout
-   colossalai.nn.layer.colossalai_layer.embedding
-   colossalai.nn.layer.colossalai_layer.linear
-   colossalai.nn.layer.colossalai_layer.normalization
diff --git a/docs/colossalai/colossalai.nn.layer.moe.experts.rst b/docs/colossalai/colossalai.nn.layer.moe.experts.rst
deleted file mode 100644
index c05e763d5723..000000000000
--- a/docs/colossalai/colossalai.nn.layer.moe.experts.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.moe.experts
-===============================
-
-.. automodule:: colossalai.nn.layer.moe.experts
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.moe.layers.rst b/docs/colossalai/colossalai.nn.layer.moe.layers.rst
deleted file mode 100644
index d109d47b8174..000000000000
--- a/docs/colossalai/colossalai.nn.layer.moe.layers.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.moe.layers
-==============================
-
-.. automodule:: colossalai.nn.layer.moe.layers
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.moe.rst b/docs/colossalai/colossalai.nn.layer.moe.rst
deleted file mode 100644
index f3106b98d405..000000000000
--- a/docs/colossalai/colossalai.nn.layer.moe.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.nn.layer.moe
-=======================
-
-.. automodule:: colossalai.nn.layer.moe
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.moe.experts
-   colossalai.nn.layer.moe.layers
-   colossalai.nn.layer.moe.utils
diff --git a/docs/colossalai/colossalai.nn.layer.moe.utils.rst b/docs/colossalai/colossalai.nn.layer.moe.utils.rst
deleted file mode 100644
index fc085d136bb4..000000000000
--- a/docs/colossalai/colossalai.nn.layer.moe.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.moe.utils
-=============================
-
-.. automodule:: colossalai.nn.layer.moe.utils
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_1d.layers.rst b/docs/colossalai/colossalai.nn.layer.parallel_1d.layers.rst
deleted file mode 100644
index 380f6bf8d134..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_1d.layers.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.parallel\_1d.layers
-=======================================
-
-.. automodule:: colossalai.nn.layer.parallel_1d.layers
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_1d.rst b/docs/colossalai/colossalai.nn.layer.parallel_1d.rst
deleted file mode 100644
index 3a8ed6206721..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_1d.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.parallel\_1d
-================================
-
-.. automodule:: colossalai.nn.layer.parallel_1d
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.parallel_1d.layers
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_2d.layers.rst b/docs/colossalai/colossalai.nn.layer.parallel_2d.layers.rst
deleted file mode 100644
index b64d402bdf3e..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_2d.layers.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.parallel\_2d.layers
-=======================================
-
-.. automodule:: colossalai.nn.layer.parallel_2d.layers
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_2d.rst b/docs/colossalai/colossalai.nn.layer.parallel_2d.rst
deleted file mode 100644
index f5ad41a1b450..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_2d.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.parallel\_2d
-================================
-
-.. automodule:: colossalai.nn.layer.parallel_2d
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.parallel_2d.layers
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_2p5d.layers.rst b/docs/colossalai/colossalai.nn.layer.parallel_2p5d.layers.rst
deleted file mode 100644
index ebc99d56ccdc..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_2p5d.layers.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.parallel\_2p5d.layers
-=========================================
-
-.. automodule:: colossalai.nn.layer.parallel_2p5d.layers
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst b/docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst
deleted file mode 100644
index 5869bdee9928..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.parallel\_2p5d
-==================================
-
-.. automodule:: colossalai.nn.layer.parallel_2p5d
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.parallel_2p5d.layers
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_3d.layers.rst b/docs/colossalai/colossalai.nn.layer.parallel_3d.layers.rst
deleted file mode 100644
index a1702f1fcf62..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_3d.layers.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.parallel\_3d.layers
-=======================================
-
-.. automodule:: colossalai.nn.layer.parallel_3d.layers
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_3d.rst b/docs/colossalai/colossalai.nn.layer.parallel_3d.rst
deleted file mode 100644
index bb55a63e507d..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_3d.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.parallel\_3d
-================================
-
-.. automodule:: colossalai.nn.layer.parallel_3d
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.parallel_3d.layers
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_sequence.layers.rst b/docs/colossalai/colossalai.nn.layer.parallel_sequence.layers.rst
deleted file mode 100644
index 54929d2e7169..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_sequence.layers.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.parallel\_sequence.layers
-=============================================
-
-.. automodule:: colossalai.nn.layer.parallel_sequence.layers
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.parallel_sequence.rst b/docs/colossalai/colossalai.nn.layer.parallel_sequence.rst
deleted file mode 100644
index 24e8941d4ec4..000000000000
--- a/docs/colossalai/colossalai.nn.layer.parallel_sequence.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.parallel\_sequence
-======================================
-
-.. automodule:: colossalai.nn.layer.parallel_sequence
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.parallel_sequence.layers
diff --git a/docs/colossalai/colossalai.nn.layer.rst b/docs/colossalai/colossalai.nn.layer.rst
deleted file mode 100644
index 32a93128f2a4..000000000000
--- a/docs/colossalai/colossalai.nn.layer.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-colossalai.nn.layer
-===================
-
-.. automodule:: colossalai.nn.layer
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.colossalai_layer
-   colossalai.nn.layer.moe
-   colossalai.nn.layer.parallel_1d
-   colossalai.nn.layer.parallel_2d
-   colossalai.nn.layer.parallel_2p5d
-   colossalai.nn.layer.parallel_3d
-   colossalai.nn.layer.parallel_sequence
-   colossalai.nn.layer.utils
-   colossalai.nn.layer.vanilla
-   colossalai.nn.layer.wrapper
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.base_layer
diff --git a/docs/colossalai/colossalai.nn.layer.utils.common.rst b/docs/colossalai/colossalai.nn.layer.utils.common.rst
deleted file mode 100644
index 6a552830f8f5..000000000000
--- a/docs/colossalai/colossalai.nn.layer.utils.common.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.utils.common
-================================
-
-.. automodule:: colossalai.nn.layer.utils.common
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.utils.rst b/docs/colossalai/colossalai.nn.layer.utils.rst
deleted file mode 100644
index 16c3d718286a..000000000000
--- a/docs/colossalai/colossalai.nn.layer.utils.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.utils
-=========================
-
-.. automodule:: colossalai.nn.layer.utils
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.utils.common
diff --git a/docs/colossalai/colossalai.nn.layer.vanilla.layers.rst b/docs/colossalai/colossalai.nn.layer.vanilla.layers.rst
deleted file mode 100644
index f993b1f50e5b..000000000000
--- a/docs/colossalai/colossalai.nn.layer.vanilla.layers.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.vanilla.layers
-==================================
-
-.. automodule:: colossalai.nn.layer.vanilla.layers
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.vanilla.rst b/docs/colossalai/colossalai.nn.layer.vanilla.rst
deleted file mode 100644
index fe1ea5c6c53e..000000000000
--- a/docs/colossalai/colossalai.nn.layer.vanilla.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.vanilla
-===========================
-
-.. automodule:: colossalai.nn.layer.vanilla
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.vanilla.layers
diff --git a/docs/colossalai/colossalai.nn.layer.wrapper.pipeline_wrapper.rst b/docs/colossalai/colossalai.nn.layer.wrapper.pipeline_wrapper.rst
deleted file mode 100644
index e5648873d34b..000000000000
--- a/docs/colossalai/colossalai.nn.layer.wrapper.pipeline_wrapper.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.layer.wrapper.pipeline\_wrapper
-=============================================
-
-.. automodule:: colossalai.nn.layer.wrapper.pipeline_wrapper
-   :members:
diff --git a/docs/colossalai/colossalai.nn.layer.wrapper.rst b/docs/colossalai/colossalai.nn.layer.wrapper.rst
deleted file mode 100644
index 761bf843af36..000000000000
--- a/docs/colossalai/colossalai.nn.layer.wrapper.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.nn.layer.wrapper
-===========================
-
-.. automodule:: colossalai.nn.layer.wrapper
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.layer.wrapper.pipeline_wrapper
diff --git a/docs/colossalai/colossalai.nn.loss.loss_1d.rst b/docs/colossalai/colossalai.nn.loss.loss_1d.rst
deleted file mode 100644
index d9ac2e67d317..000000000000
--- a/docs/colossalai/colossalai.nn.loss.loss_1d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.loss.loss\_1d
-===========================
-
-.. automodule:: colossalai.nn.loss.loss_1d
-   :members:
diff --git a/docs/colossalai/colossalai.nn.loss.loss_2d.rst b/docs/colossalai/colossalai.nn.loss.loss_2d.rst
deleted file mode 100644
index 14d1585e3e0f..000000000000
--- a/docs/colossalai/colossalai.nn.loss.loss_2d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.loss.loss\_2d
-===========================
-
-.. automodule:: colossalai.nn.loss.loss_2d
-   :members:
diff --git a/docs/colossalai/colossalai.nn.loss.loss_2p5d.rst b/docs/colossalai/colossalai.nn.loss.loss_2p5d.rst
deleted file mode 100644
index fc3714da3630..000000000000
--- a/docs/colossalai/colossalai.nn.loss.loss_2p5d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.loss.loss\_2p5d
-=============================
-
-.. automodule:: colossalai.nn.loss.loss_2p5d
-   :members:
diff --git a/docs/colossalai/colossalai.nn.loss.loss_3d.rst b/docs/colossalai/colossalai.nn.loss.loss_3d.rst
deleted file mode 100644
index a593324fb4f1..000000000000
--- a/docs/colossalai/colossalai.nn.loss.loss_3d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.loss.loss\_3d
-===========================
-
-.. automodule:: colossalai.nn.loss.loss_3d
-   :members:
diff --git a/docs/colossalai/colossalai.nn.loss.loss_moe.rst b/docs/colossalai/colossalai.nn.loss.loss_moe.rst
deleted file mode 100644
index ef2851ace83a..000000000000
--- a/docs/colossalai/colossalai.nn.loss.loss_moe.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.loss.loss\_moe
-============================
-
-.. automodule:: colossalai.nn.loss.loss_moe
-   :members:
diff --git a/docs/colossalai/colossalai.nn.loss.rst b/docs/colossalai/colossalai.nn.loss.rst
deleted file mode 100644
index 5df7d1ae3770..000000000000
--- a/docs/colossalai/colossalai.nn.loss.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-colossalai.nn.loss
-==================
-
-.. automodule:: colossalai.nn.loss
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.loss.loss_1d
-   colossalai.nn.loss.loss_2d
-   colossalai.nn.loss.loss_2p5d
-   colossalai.nn.loss.loss_3d
-   colossalai.nn.loss.loss_moe
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.cosine.rst b/docs/colossalai/colossalai.nn.lr_scheduler.cosine.rst
deleted file mode 100644
index a7c636ad3a36..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.cosine.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.lr\_scheduler.cosine
-==================================
-
-.. automodule:: colossalai.nn.lr_scheduler.cosine
-   :members:
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.delayed.rst b/docs/colossalai/colossalai.nn.lr_scheduler.delayed.rst
deleted file mode 100644
index 2a86c4b2a20c..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.delayed.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.lr\_scheduler.delayed
-===================================
-
-.. automodule:: colossalai.nn.lr_scheduler.delayed
-   :members:
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.linear.rst b/docs/colossalai/colossalai.nn.lr_scheduler.linear.rst
deleted file mode 100644
index 5e917edc2faf..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.linear.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.lr\_scheduler.linear
-==================================
-
-.. automodule:: colossalai.nn.lr_scheduler.linear
-   :members:
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.multistep.rst b/docs/colossalai/colossalai.nn.lr_scheduler.multistep.rst
deleted file mode 100644
index 4248a6386375..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.multistep.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.lr\_scheduler.multistep
-=====================================
-
-.. automodule:: colossalai.nn.lr_scheduler.multistep
-   :members:
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.onecycle.rst b/docs/colossalai/colossalai.nn.lr_scheduler.onecycle.rst
deleted file mode 100644
index 7f2fd47586fe..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.onecycle.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.lr\_scheduler.onecycle
-====================================
-
-.. automodule:: colossalai.nn.lr_scheduler.onecycle
-   :members:
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.poly.rst b/docs/colossalai/colossalai.nn.lr_scheduler.poly.rst
deleted file mode 100644
index c1618812aa0c..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.poly.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.lr\_scheduler.poly
-================================
-
-.. automodule:: colossalai.nn.lr_scheduler.poly
-   :members:
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.rst b/docs/colossalai/colossalai.nn.lr_scheduler.rst
deleted file mode 100644
index 427a3ee4529e..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-colossalai.nn.lr\_scheduler
-===========================
-
-.. automodule:: colossalai.nn.lr_scheduler
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.lr_scheduler.cosine
-   colossalai.nn.lr_scheduler.delayed
-   colossalai.nn.lr_scheduler.linear
-   colossalai.nn.lr_scheduler.multistep
-   colossalai.nn.lr_scheduler.onecycle
-   colossalai.nn.lr_scheduler.poly
-   colossalai.nn.lr_scheduler.torch
diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.torch.rst b/docs/colossalai/colossalai.nn.lr_scheduler.torch.rst
deleted file mode 100644
index f8d552bf1d62..000000000000
--- a/docs/colossalai/colossalai.nn.lr_scheduler.torch.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.lr\_scheduler.torch
-=================================
-
-.. automodule:: colossalai.nn.lr_scheduler.torch
-   :members:
diff --git a/docs/colossalai/colossalai.nn.metric.accuracy_2d.rst b/docs/colossalai/colossalai.nn.metric.accuracy_2d.rst
deleted file mode 100644
index 63bcb8349763..000000000000
--- a/docs/colossalai/colossalai.nn.metric.accuracy_2d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.metric.accuracy\_2d
-=================================
-
-.. automodule:: colossalai.nn.metric.accuracy_2d
-   :members:
diff --git a/docs/colossalai/colossalai.nn.metric.accuracy_2p5d.rst b/docs/colossalai/colossalai.nn.metric.accuracy_2p5d.rst
deleted file mode 100644
index dd4358fbff72..000000000000
--- a/docs/colossalai/colossalai.nn.metric.accuracy_2p5d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.metric.accuracy\_2p5d
-===================================
-
-.. automodule:: colossalai.nn.metric.accuracy_2p5d
-   :members:
diff --git a/docs/colossalai/colossalai.nn.metric.accuracy_3d.rst b/docs/colossalai/colossalai.nn.metric.accuracy_3d.rst
deleted file mode 100644
index 95143444b945..000000000000
--- a/docs/colossalai/colossalai.nn.metric.accuracy_3d.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.metric.accuracy\_3d
-=================================
-
-.. automodule:: colossalai.nn.metric.accuracy_3d
-   :members:
diff --git a/docs/colossalai/colossalai.nn.metric.rst b/docs/colossalai/colossalai.nn.metric.rst
deleted file mode 100644
index 28f5568eb846..000000000000
--- a/docs/colossalai/colossalai.nn.metric.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.nn.metric
-====================
-
-.. automodule:: colossalai.nn.metric
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.metric.accuracy_2d
-   colossalai.nn.metric.accuracy_2p5d
-   colossalai.nn.metric.accuracy_3d
diff --git a/docs/colossalai/colossalai.nn.optimizer.colossalai_optimizer.rst b/docs/colossalai/colossalai.nn.optimizer.colossalai_optimizer.rst
deleted file mode 100644
index 35515c374f33..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.colossalai_optimizer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.colossalai\_optimizer
-=============================================
-
-.. automodule:: colossalai.nn.optimizer.colossalai_optimizer
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.cpu_adam.rst b/docs/colossalai/colossalai.nn.optimizer.cpu_adam.rst
deleted file mode 100644
index 224dfab43ed0..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.cpu_adam.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.cpu\_adam
-=================================
-
-.. automodule:: colossalai.nn.optimizer.cpu_adam
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.fused_adam.rst b/docs/colossalai/colossalai.nn.optimizer.fused_adam.rst
deleted file mode 100644
index 60af624cb6c1..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.fused_adam.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.fused\_adam
-===================================
-
-.. automodule:: colossalai.nn.optimizer.fused_adam
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.fused_lamb.rst b/docs/colossalai/colossalai.nn.optimizer.fused_lamb.rst
deleted file mode 100644
index 66c0fa4ca1c7..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.fused_lamb.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.fused\_lamb
-===================================
-
-.. automodule:: colossalai.nn.optimizer.fused_lamb
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.fused_sgd.rst b/docs/colossalai/colossalai.nn.optimizer.fused_sgd.rst
deleted file mode 100644
index 2ecc77c33d88..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.fused_sgd.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.fused\_sgd
-==================================
-
-.. automodule:: colossalai.nn.optimizer.fused_sgd
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.hybrid_adam.rst b/docs/colossalai/colossalai.nn.optimizer.hybrid_adam.rst
deleted file mode 100644
index 20508d664701..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.hybrid_adam.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.hybrid\_adam
-====================================
-
-.. automodule:: colossalai.nn.optimizer.hybrid_adam
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.lamb.rst b/docs/colossalai/colossalai.nn.optimizer.lamb.rst
deleted file mode 100644
index 57199ea36951..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.lamb.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.lamb
-============================
-
-.. automodule:: colossalai.nn.optimizer.lamb
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.lars.rst b/docs/colossalai/colossalai.nn.optimizer.lars.rst
deleted file mode 100644
index f935950f8b5a..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.lars.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.lars
-============================
-
-.. automodule:: colossalai.nn.optimizer.lars
-   :members:
diff --git a/docs/colossalai/colossalai.nn.optimizer.rst b/docs/colossalai/colossalai.nn.optimizer.rst
deleted file mode 100644
index ede9cc496967..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-colossalai.nn.optimizer
-=======================
-
-.. automodule:: colossalai.nn.optimizer
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.optimizer.colossalai_optimizer
-   colossalai.nn.optimizer.cpu_adam
-   colossalai.nn.optimizer.fused_adam
-   colossalai.nn.optimizer.fused_lamb
-   colossalai.nn.optimizer.fused_sgd
-   colossalai.nn.optimizer.hybrid_adam
-   colossalai.nn.optimizer.lamb
-   colossalai.nn.optimizer.lars
-   colossalai.nn.optimizer.utils
diff --git a/docs/colossalai/colossalai.nn.optimizer.utils.rst b/docs/colossalai/colossalai.nn.optimizer.utils.rst
deleted file mode 100644
index 9b2bc2f016c4..000000000000
--- a/docs/colossalai/colossalai.nn.optimizer.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.optimizer.utils
-=============================
-
-.. automodule:: colossalai.nn.optimizer.utils
-   :members:
diff --git a/docs/colossalai/colossalai.nn.parallel.data_parallel.rst b/docs/colossalai/colossalai.nn.parallel.data_parallel.rst
deleted file mode 100644
index ba987c2ee2f3..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.data_parallel.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.parallel.data\_parallel
-=====================================
-
-.. automodule:: colossalai.nn.parallel.data_parallel
-   :members:
diff --git a/docs/colossalai/colossalai.nn.parallel.layers.colo_module.rst b/docs/colossalai/colossalai.nn.parallel.layers.colo_module.rst
deleted file mode 100644
index c80fff6d543a..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.layers.colo_module.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.parallel.layers.colo\_module
-==========================================
-
-.. automodule:: colossalai.nn.parallel.layers.colo_module
-   :members:
diff --git a/docs/colossalai/colossalai.nn.parallel.layers.embedding.rst b/docs/colossalai/colossalai.nn.parallel.layers.embedding.rst
deleted file mode 100644
index 1e7ecc50f478..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.layers.embedding.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.parallel.layers.embedding
-=======================================
-
-.. automodule:: colossalai.nn.parallel.layers.embedding
-   :members:
diff --git a/docs/colossalai/colossalai.nn.parallel.layers.linear.rst b/docs/colossalai/colossalai.nn.parallel.layers.linear.rst
deleted file mode 100644
index bbc5e32570e7..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.layers.linear.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.parallel.layers.linear
-====================================
-
-.. automodule:: colossalai.nn.parallel.layers.linear
-   :members:
diff --git a/docs/colossalai/colossalai.nn.parallel.layers.module_utils.rst b/docs/colossalai/colossalai.nn.parallel.layers.module_utils.rst
deleted file mode 100644
index 5190ab40345a..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.layers.module_utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.parallel.layers.module\_utils
-===========================================
-
-.. automodule:: colossalai.nn.parallel.layers.module_utils
-   :members:
diff --git a/docs/colossalai/colossalai.nn.parallel.layers.rst b/docs/colossalai/colossalai.nn.parallel.layers.rst
deleted file mode 100644
index 782a206e88d5..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.layers.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-colossalai.nn.parallel.layers
-=============================
-
-.. automodule:: colossalai.nn.parallel.layers
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.parallel.layers.colo_module
-   colossalai.nn.parallel.layers.embedding
-   colossalai.nn.parallel.layers.linear
-   colossalai.nn.parallel.layers.module_utils
diff --git a/docs/colossalai/colossalai.nn.parallel.reducer.rst b/docs/colossalai/colossalai.nn.parallel.reducer.rst
deleted file mode 100644
index d80841f6916e..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.reducer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.nn.parallel.reducer
-==============================
-
-.. automodule:: colossalai.nn.parallel.reducer
-   :members:
diff --git a/docs/colossalai/colossalai.nn.parallel.rst b/docs/colossalai/colossalai.nn.parallel.rst
deleted file mode 100644
index 19e9d1eef19b..000000000000
--- a/docs/colossalai/colossalai.nn.parallel.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-colossalai.nn.parallel
-======================
-
-.. automodule:: colossalai.nn.parallel
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.parallel.layers
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.parallel.data_parallel
-   colossalai.nn.parallel.reducer
diff --git a/docs/colossalai/colossalai.nn.rst b/docs/colossalai/colossalai.nn.rst
deleted file mode 100644
index 7e683952f3db..000000000000
--- a/docs/colossalai/colossalai.nn.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-colossalai.nn
-=============
-
-.. automodule:: colossalai.nn
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.graph
-   colossalai.nn.layer
-   colossalai.nn.loss
-   colossalai.nn.lr_scheduler
-   colossalai.nn.metric
-   colossalai.nn.optimizer
-   colossalai.nn.parallel
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.nn.init
diff --git a/docs/colossalai/colossalai.pipeline.layer_sepc.rst b/docs/colossalai/colossalai.pipeline.layer_sepc.rst
deleted file mode 100644
index 156660b5c00f..000000000000
--- a/docs/colossalai/colossalai.pipeline.layer_sepc.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.pipeline.layer\_sepc
-===============================
-
-.. automodule:: colossalai.pipeline.layer_spec
-   :members:
diff --git a/docs/colossalai/colossalai.pipeline.pipelinable.rst b/docs/colossalai/colossalai.pipeline.pipelinable.rst
deleted file mode 100644
index 5c2b02ba63e2..000000000000
--- a/docs/colossalai/colossalai.pipeline.pipelinable.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.pipeline.pipelinable
-===============================
-
-.. automodule:: colossalai.pipeline.pipelinable
-   :members:
diff --git a/docs/colossalai/colossalai.pipeline.rst b/docs/colossalai/colossalai.pipeline.rst
deleted file mode 100644
index 6f7652d492e0..000000000000
--- a/docs/colossalai/colossalai.pipeline.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.pipeline
-===================
-
-.. automodule:: colossalai.pipeline
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.pipeline.layer_spec
-   colossalai.pipeline.pipelinable
-   colossalai.pipeline.utils
diff --git a/docs/colossalai/colossalai.pipeline.utils.rst b/docs/colossalai/colossalai.pipeline.utils.rst
deleted file mode 100644
index a33bf42cfc2b..000000000000
--- a/docs/colossalai/colossalai.pipeline.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.pipeline.utils
-=========================
-
-.. automodule:: colossalai.pipeline.utils
-   :members:
diff --git a/docs/colossalai/colossalai.registry.registry.rst b/docs/colossalai/colossalai.registry.registry.rst
deleted file mode 100644
index e942d7969b60..000000000000
--- a/docs/colossalai/colossalai.registry.registry.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.registry.registry
-============================
-
-.. automodule:: colossalai.registry.registry
-   :members:
diff --git a/docs/colossalai/colossalai.registry.rst b/docs/colossalai/colossalai.registry.rst
deleted file mode 100644
index 0f294f6d15a7..000000000000
--- a/docs/colossalai/colossalai.registry.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.registry
-===================
-
-.. automodule:: colossalai.registry
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.registry.registry
diff --git a/docs/colossalai/colossalai.rst b/docs/colossalai/colossalai.rst
deleted file mode 100644
index 921f15a97f00..000000000000
--- a/docs/colossalai/colossalai.rst
+++ /dev/null
@@ -1,36 +0,0 @@
-colossalai
-==========
-
-.. automodule:: colossalai
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.amp
-   colossalai.builder
-   colossalai.cli
-   colossalai.communication
-   colossalai.context
-   colossalai.engine
-   colossalai.fx
-   colossalai.gemini
-   colossalai.kernel
-   colossalai.logging
-   colossalai.nn
-   colossalai.pipeline
-   colossalai.registry
-   colossalai.tensor
-   colossalai.testing
-   colossalai.trainer
-   colossalai.utils
-   colossalai.zero
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.constants
-   colossalai.core
-   colossalai.global_variables
-   colossalai.initialize
diff --git a/docs/colossalai/colossalai.tensor.colo_parameter.rst b/docs/colossalai/colossalai.tensor.colo_parameter.rst
deleted file mode 100644
index 9b65029dbbe4..000000000000
--- a/docs/colossalai/colossalai.tensor.colo_parameter.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.colo\_parameter
-=================================
-
-.. automodule:: colossalai.tensor.colo_parameter
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.colo_tensor.rst b/docs/colossalai/colossalai.tensor.colo_tensor.rst
deleted file mode 100644
index 9161ac22f665..000000000000
--- a/docs/colossalai/colossalai.tensor.colo_tensor.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.colo\_tensor
-==============================
-
-.. automodule:: colossalai.tensor.colo_tensor
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.compute_spec.rst b/docs/colossalai/colossalai.tensor.compute_spec.rst
deleted file mode 100644
index e2d7235d99c4..000000000000
--- a/docs/colossalai/colossalai.tensor.compute_spec.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.compute\_spec
-===============================
-
-.. automodule:: colossalai.tensor.compute_spec
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.const.rst b/docs/colossalai/colossalai.tensor.const.rst
deleted file mode 100644
index a22a2789349b..000000000000
--- a/docs/colossalai/colossalai.tensor.const.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.const
-=======================
-
-.. automodule:: colossalai.tensor.const
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.dist_spec_mgr.rst b/docs/colossalai/colossalai.tensor.dist_spec_mgr.rst
deleted file mode 100644
index 043cf22604a3..000000000000
--- a/docs/colossalai/colossalai.tensor.dist_spec_mgr.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.dist\_spec\_mgr
-=================================
-
-.. automodule:: colossalai.tensor.dist_spec_mgr
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.distspec.rst b/docs/colossalai/colossalai.tensor.distspec.rst
deleted file mode 100644
index 2b4b0e5fa266..000000000000
--- a/docs/colossalai/colossalai.tensor.distspec.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.distspec
-==========================
-
-.. automodule:: colossalai.tensor.distspec
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.op_wrapper.rst b/docs/colossalai/colossalai.tensor.op_wrapper.rst
deleted file mode 100644
index a246e0a6a548..000000000000
--- a/docs/colossalai/colossalai.tensor.op_wrapper.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.op\_wrapper
-=============================
-
-.. automodule:: colossalai.tensor.op_wrapper
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.param_op_hook.rst b/docs/colossalai/colossalai.tensor.param_op_hook.rst
deleted file mode 100644
index 475ada452bb2..000000000000
--- a/docs/colossalai/colossalai.tensor.param_op_hook.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.param\_op\_hook
-=================================
-
-.. automodule:: colossalai.tensor.param_op_hook
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.process_group.rst b/docs/colossalai/colossalai.tensor.process_group.rst
deleted file mode 100644
index b71409e3bd11..000000000000
--- a/docs/colossalai/colossalai.tensor.process_group.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.process\_group
-================================
-
-.. automodule:: colossalai.tensor.process_group
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.rst b/docs/colossalai/colossalai.tensor.rst
deleted file mode 100644
index 68e06552b873..000000000000
--- a/docs/colossalai/colossalai.tensor.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-colossalai.tensor
-=================
-
-.. automodule:: colossalai.tensor
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.tensor.colo_parameter
-   colossalai.tensor.colo_tensor
-   colossalai.tensor.compute_spec
-   colossalai.tensor.const
-   colossalai.tensor.dist_spec_mgr
-   colossalai.tensor.distspec
-   colossalai.tensor.op_wrapper
-   colossalai.tensor.param_op_hook
-   colossalai.tensor.process_group
-   colossalai.tensor.tensor_spec
-   colossalai.tensor.utils
diff --git a/docs/colossalai/colossalai.tensor.tensor_spec.rst b/docs/colossalai/colossalai.tensor.tensor_spec.rst
deleted file mode 100644
index 7125b9cbc28d..000000000000
--- a/docs/colossalai/colossalai.tensor.tensor_spec.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.tensor\_spec
-==============================
-
-.. automodule:: colossalai.tensor.tensor_spec
-   :members:
diff --git a/docs/colossalai/colossalai.tensor.utils.rst b/docs/colossalai/colossalai.tensor.utils.rst
deleted file mode 100644
index 5d9bd1b03038..000000000000
--- a/docs/colossalai/colossalai.tensor.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.tensor.utils
-=======================
-
-.. automodule:: colossalai.tensor.utils
-   :members:
diff --git a/docs/colossalai/colossalai.testing.comparison.rst b/docs/colossalai/colossalai.testing.comparison.rst
deleted file mode 100644
index bcfdf0598856..000000000000
--- a/docs/colossalai/colossalai.testing.comparison.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.testing.comparison
-=============================
-
-.. automodule:: colossalai.testing.comparison
-   :members:
diff --git a/docs/colossalai/colossalai.testing.rst b/docs/colossalai/colossalai.testing.rst
deleted file mode 100644
index 1127aa52c1ad..000000000000
--- a/docs/colossalai/colossalai.testing.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-colossalai.testing
-==================
-
-.. automodule:: colossalai.testing
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.testing.comparison
-   colossalai.testing.utils
diff --git a/docs/colossalai/colossalai.testing.utils.rst b/docs/colossalai/colossalai.testing.utils.rst
deleted file mode 100644
index d8c2edcce71c..000000000000
--- a/docs/colossalai/colossalai.testing.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.testing.utils
-========================
-
-.. automodule:: colossalai.testing.utils
-   :members:
diff --git a/docs/colossalai/colossalai.trainer.hooks.rst b/docs/colossalai/colossalai.trainer.hooks.rst
deleted file mode 100644
index 84cc6797b831..000000000000
--- a/docs/colossalai/colossalai.trainer.hooks.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.trainer.hooks
-========================
-
-.. automodule:: colossalai.trainer.hooks
-   :members:
diff --git a/docs/colossalai/colossalai.trainer.rst b/docs/colossalai/colossalai.trainer.rst
deleted file mode 100644
index abc636e62373..000000000000
--- a/docs/colossalai/colossalai.trainer.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-colossalai.trainer
-==================
-
-.. automodule:: colossalai.trainer
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.trainer.hooks
diff --git a/docs/colossalai/colossalai.utils.activation_checkpoint.rst b/docs/colossalai/colossalai.utils.activation_checkpoint.rst
deleted file mode 100644
index 671b5fe9e9c4..000000000000
--- a/docs/colossalai/colossalai.utils.activation_checkpoint.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.activation\_checkpoint
-=======================================
-
-.. automodule:: colossalai.utils.activation_checkpoint
-   :members:
diff --git a/docs/colossalai/colossalai.utils.checkpoint.module_checkpoint.rst b/docs/colossalai/colossalai.utils.checkpoint.module_checkpoint.rst
deleted file mode 100644
index 237ad380b301..000000000000
--- a/docs/colossalai/colossalai.utils.checkpoint.module_checkpoint.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.checkpoint.module\_checkpoint
-==============================================
-
-.. automodule:: colossalai.utils.checkpoint.module_checkpoint
-   :members:
diff --git a/docs/colossalai/colossalai.utils.checkpoint.rst b/docs/colossalai/colossalai.utils.checkpoint.rst
deleted file mode 100644
index 220c270f09b9..000000000000
--- a/docs/colossalai/colossalai.utils.checkpoint.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-colossalai.utils.checkpoint
-===========================
-
-.. automodule:: colossalai.utils.checkpoint
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.checkpoint.module_checkpoint
-   colossalai.utils.checkpoint.utils
diff --git a/docs/colossalai/colossalai.utils.checkpoint.utils.rst b/docs/colossalai/colossalai.utils.checkpoint.utils.rst
deleted file mode 100644
index 7fdeefd539fe..000000000000
--- a/docs/colossalai/colossalai.utils.checkpoint.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.checkpoint.utils
-=================================
-
-.. automodule:: colossalai.utils.checkpoint.utils
-   :members:
diff --git a/docs/colossalai/colossalai.utils.checkpointing.rst b/docs/colossalai/colossalai.utils.checkpointing.rst
deleted file mode 100644
index 534a581d5364..000000000000
--- a/docs/colossalai/colossalai.utils.checkpointing.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.checkpointing
-==============================
-
-.. automodule:: colossalai.utils.checkpointing
-   :members:
diff --git a/docs/colossalai/colossalai.utils.common.rst b/docs/colossalai/colossalai.utils.common.rst
deleted file mode 100644
index cb9f9c14ef4f..000000000000
--- a/docs/colossalai/colossalai.utils.common.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.common
-=======================
-
-.. automodule:: colossalai.utils.common
-   :members:
diff --git a/docs/colossalai/colossalai.utils.cuda.rst b/docs/colossalai/colossalai.utils.cuda.rst
deleted file mode 100644
index ec428c5ef6ea..000000000000
--- a/docs/colossalai/colossalai.utils.cuda.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.cuda
-=====================
-
-.. automodule:: colossalai.utils.cuda
-   :members:
diff --git a/docs/colossalai/colossalai.utils.data_sampler.base_sampler.rst b/docs/colossalai/colossalai.utils.data_sampler.base_sampler.rst
deleted file mode 100644
index 199e8fcf83c3..000000000000
--- a/docs/colossalai/colossalai.utils.data_sampler.base_sampler.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.data\_sampler.base\_sampler
-============================================
-
-.. automodule:: colossalai.utils.data_sampler.base_sampler
-   :members:
diff --git a/docs/colossalai/colossalai.utils.data_sampler.data_parallel_sampler.rst b/docs/colossalai/colossalai.utils.data_sampler.data_parallel_sampler.rst
deleted file mode 100644
index 85e1b121c682..000000000000
--- a/docs/colossalai/colossalai.utils.data_sampler.data_parallel_sampler.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.data\_sampler.data\_parallel\_sampler
-======================================================
-
-.. automodule:: colossalai.utils.data_sampler.data_parallel_sampler
-   :members:
diff --git a/docs/colossalai/colossalai.utils.data_sampler.rst b/docs/colossalai/colossalai.utils.data_sampler.rst
deleted file mode 100644
index 61dde070bad4..000000000000
--- a/docs/colossalai/colossalai.utils.data_sampler.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-colossalai.utils.data\_sampler
-==============================
-
-.. automodule:: colossalai.utils.data_sampler
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.data_sampler.base_sampler
-   colossalai.utils.data_sampler.data_parallel_sampler
diff --git a/docs/colossalai/colossalai.utils.memory.rst b/docs/colossalai/colossalai.utils.memory.rst
deleted file mode 100644
index 67c5d60022dd..000000000000
--- a/docs/colossalai/colossalai.utils.memory.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.memory
-=======================
-
-.. automodule:: colossalai.utils.memory
-   :members:
diff --git a/docs/colossalai/colossalai.utils.model.colo_init_context.rst b/docs/colossalai/colossalai.utils.model.colo_init_context.rst
deleted file mode 100644
index 33ee44915083..000000000000
--- a/docs/colossalai/colossalai.utils.model.colo_init_context.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.model.colo\_init\_context
-==========================================
-
-.. automodule:: colossalai.utils.model.colo_init_context
-   :members:
diff --git a/docs/colossalai/colossalai.utils.model.lazy_init_context.rst b/docs/colossalai/colossalai.utils.model.lazy_init_context.rst
deleted file mode 100644
index 27c9a32c6a7d..000000000000
--- a/docs/colossalai/colossalai.utils.model.lazy_init_context.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.model.lazy\_init\_context
-==========================================
-
-.. automodule:: colossalai.utils.model.lazy_init_context
-   :members:
diff --git a/docs/colossalai/colossalai.utils.model.rst b/docs/colossalai/colossalai.utils.model.rst
deleted file mode 100644
index 9adfd1450a47..000000000000
--- a/docs/colossalai/colossalai.utils.model.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.utils.model
-======================
-
-.. automodule:: colossalai.utils.model
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.model.colo_init_context
-   colossalai.utils.model.lazy_init_context
-   colossalai.utils.model.utils
diff --git a/docs/colossalai/colossalai.utils.model.utils.rst b/docs/colossalai/colossalai.utils.model.utils.rst
deleted file mode 100644
index 211106662dc3..000000000000
--- a/docs/colossalai/colossalai.utils.model.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.model.utils
-============================
-
-.. automodule:: colossalai.utils.model.utils
-   :members:
diff --git a/docs/colossalai/colossalai.utils.moe.rst b/docs/colossalai/colossalai.utils.moe.rst
deleted file mode 100644
index b66ccdc8ec2d..000000000000
--- a/docs/colossalai/colossalai.utils.moe.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.moe
-====================
-
-.. automodule:: colossalai.utils.moe
-   :members:
diff --git a/docs/colossalai/colossalai.utils.multi_tensor_apply.multi_tensor_apply.rst b/docs/colossalai/colossalai.utils.multi_tensor_apply.multi_tensor_apply.rst
deleted file mode 100644
index 493b9530e0f6..000000000000
--- a/docs/colossalai/colossalai.utils.multi_tensor_apply.multi_tensor_apply.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.multi\_tensor\_apply.multi\_tensor\_apply
-==========================================================
-
-.. automodule:: colossalai.utils.multi_tensor_apply.multi_tensor_apply
-   :members:
diff --git a/docs/colossalai/colossalai.utils.multi_tensor_apply.rst b/docs/colossalai/colossalai.utils.multi_tensor_apply.rst
deleted file mode 100644
index d5749cfa8801..000000000000
--- a/docs/colossalai/colossalai.utils.multi_tensor_apply.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.utils.multi\_tensor\_apply
-=====================================
-
-.. automodule:: colossalai.utils.multi_tensor_apply
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.multi_tensor_apply.multi_tensor_apply
diff --git a/docs/colossalai/colossalai.utils.profiler.extention.rst b/docs/colossalai/colossalai.utils.profiler.extention.rst
deleted file mode 100644
index 5c87692611a0..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.extention.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.profiler.extention
-===================================
-
-.. automodule:: colossalai.utils.profiler.extention
-   :members:
diff --git a/docs/colossalai/colossalai.utils.profiler.legacy.comm_profiler.rst b/docs/colossalai/colossalai.utils.profiler.legacy.comm_profiler.rst
deleted file mode 100644
index 4329a3d60da3..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.legacy.comm_profiler.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.profiler.legacy.comm\_profiler
-===============================================
-
-.. automodule:: colossalai.utils.profiler.legacy.comm_profiler
-   :members:
diff --git a/docs/colossalai/colossalai.utils.profiler.legacy.mem_profiler.rst b/docs/colossalai/colossalai.utils.profiler.legacy.mem_profiler.rst
deleted file mode 100644
index 35c665c71d3b..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.legacy.mem_profiler.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.profiler.legacy.mem\_profiler
-==============================================
-
-.. automodule:: colossalai.utils.profiler.legacy.mem_profiler
-   :members:
diff --git a/docs/colossalai/colossalai.utils.profiler.legacy.pcie_profiler.rst b/docs/colossalai/colossalai.utils.profiler.legacy.pcie_profiler.rst
deleted file mode 100644
index 7aa82b8f7a4f..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.legacy.pcie_profiler.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.profiler.legacy.pcie\_profiler
-===============================================
-
-.. automodule:: colossalai.utils.profiler.legacy.pcie_profiler
-   :members:
diff --git a/docs/colossalai/colossalai.utils.profiler.legacy.prof_utils.rst b/docs/colossalai/colossalai.utils.profiler.legacy.prof_utils.rst
deleted file mode 100644
index 93af82b2fabb..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.legacy.prof_utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.profiler.legacy.prof\_utils
-============================================
-
-.. automodule:: colossalai.utils.profiler.legacy.prof_utils
-   :members:
diff --git a/docs/colossalai/colossalai.utils.profiler.legacy.rst b/docs/colossalai/colossalai.utils.profiler.legacy.rst
deleted file mode 100644
index 37fcebde5a43..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.legacy.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-colossalai.utils.profiler.legacy
-================================
-
-.. automodule:: colossalai.utils.profiler.legacy
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.profiler.legacy.comm_profiler
-   colossalai.utils.profiler.legacy.mem_profiler
-   colossalai.utils.profiler.legacy.pcie_profiler
-   colossalai.utils.profiler.legacy.prof_utils
diff --git a/docs/colossalai/colossalai.utils.profiler.profiler.rst b/docs/colossalai/colossalai.utils.profiler.profiler.rst
deleted file mode 100644
index d35522837801..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.profiler.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.profiler.profiler
-==================================
-
-.. automodule:: colossalai.utils.profiler.profiler
-   :members:
diff --git a/docs/colossalai/colossalai.utils.profiler.rst b/docs/colossalai/colossalai.utils.profiler.rst
deleted file mode 100644
index 15681fcf2d82..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-colossalai.utils.profiler
-=========================
-
-.. automodule:: colossalai.utils.profiler
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.profiler.legacy
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.profiler.extention
-   colossalai.utils.profiler.profiler
-   colossalai.utils.profiler.stateful_tensor_mem_extention
diff --git a/docs/colossalai/colossalai.utils.profiler.stateful_tensor_mem_extention.rst b/docs/colossalai/colossalai.utils.profiler.stateful_tensor_mem_extention.rst
deleted file mode 100644
index 72a3fcceca18..000000000000
--- a/docs/colossalai/colossalai.utils.profiler.stateful_tensor_mem_extention.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.profiler.stateful\_tensor\_mem\_extention
-==========================================================
-
-.. automodule:: colossalai.utils.profiler.stateful_tensor_mem_extention
-   :members:
diff --git a/docs/colossalai/colossalai.utils.rst b/docs/colossalai/colossalai.utils.rst
deleted file mode 100644
index 8b232a12c245..000000000000
--- a/docs/colossalai/colossalai.utils.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-colossalai.utils
-================
-
-.. automodule:: colossalai.utils
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.checkpoint
-   colossalai.utils.data_sampler
-   colossalai.utils.model
-   colossalai.utils.multi_tensor_apply
-   colossalai.utils.profiler
-   colossalai.utils.tensor_detector
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.activation_checkpoint
-   colossalai.utils.checkpointing
-   colossalai.utils.common
-   colossalai.utils.cuda
-   colossalai.utils.memory
-   colossalai.utils.moe
-   colossalai.utils.timer
diff --git a/docs/colossalai/colossalai.utils.tensor_detector.rst b/docs/colossalai/colossalai.utils.tensor_detector.rst
deleted file mode 100644
index 807d67e3ad1e..000000000000
--- a/docs/colossalai/colossalai.utils.tensor_detector.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.utils.tensor\_detector
-=================================
-
-.. automodule:: colossalai.utils.tensor_detector
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.utils.tensor_detector.tensor_detector
diff --git a/docs/colossalai/colossalai.utils.tensor_detector.tensor_detector.rst b/docs/colossalai/colossalai.utils.tensor_detector.tensor_detector.rst
deleted file mode 100644
index 991cea3438b3..000000000000
--- a/docs/colossalai/colossalai.utils.tensor_detector.tensor_detector.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.tensor\_detector.tensor\_detector
-==================================================
-
-.. automodule:: colossalai.utils.tensor_detector.tensor_detector
-   :members:
diff --git a/docs/colossalai/colossalai.utils.timer.rst b/docs/colossalai/colossalai.utils.timer.rst
deleted file mode 100644
index 2014c85f548f..000000000000
--- a/docs/colossalai/colossalai.utils.timer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.utils.timer
-======================
-
-.. automodule:: colossalai.utils.timer
-   :members:
diff --git a/docs/colossalai/colossalai.zero.init_ctx.init_context.rst b/docs/colossalai/colossalai.zero.init_ctx.init_context.rst
deleted file mode 100644
index 1694074e83bf..000000000000
--- a/docs/colossalai/colossalai.zero.init_ctx.init_context.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.init\_ctx.init\_context
-=======================================
-
-.. automodule:: colossalai.zero.init_ctx.init_context
-   :members:
diff --git a/docs/colossalai/colossalai.zero.init_ctx.rst b/docs/colossalai/colossalai.zero.init_ctx.rst
deleted file mode 100644
index 88cf471df9d3..000000000000
--- a/docs/colossalai/colossalai.zero.init_ctx.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.zero.init\_ctx
-=========================
-
-.. automodule:: colossalai.zero.init_ctx
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.init_ctx.init_context
diff --git a/docs/colossalai/colossalai.zero.rst b/docs/colossalai/colossalai.zero.rst
deleted file mode 100644
index 3bcaffd28d05..000000000000
--- a/docs/colossalai/colossalai.zero.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-colossalai.zero
-===============
-
-.. automodule:: colossalai.zero
-   :members:
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.init_ctx
-   colossalai.zero.shard_utils
-   colossalai.zero.sharded_model
-   colossalai.zero.sharded_optim
-   colossalai.zero.sharded_param
-   colossalai.zero.utils
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.zero_optimizer
diff --git a/docs/colossalai/colossalai.zero.shard_utils.base_shard_strategy.rst b/docs/colossalai/colossalai.zero.shard_utils.base_shard_strategy.rst
deleted file mode 100644
index d5b59e06a517..000000000000
--- a/docs/colossalai/colossalai.zero.shard_utils.base_shard_strategy.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.shard\_utils.base\_shard\_strategy
-==================================================
-
-.. automodule:: colossalai.zero.shard_utils.base_shard_strategy
-   :members:
diff --git a/docs/colossalai/colossalai.zero.shard_utils.bucket_tensor_shard_strategy.rst b/docs/colossalai/colossalai.zero.shard_utils.bucket_tensor_shard_strategy.rst
deleted file mode 100644
index 952c5bbddf09..000000000000
--- a/docs/colossalai/colossalai.zero.shard_utils.bucket_tensor_shard_strategy.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.shard\_utils.bucket\_tensor\_shard\_strategy
-============================================================
-
-.. automodule:: colossalai.zero.shard_utils.bucket_tensor_shard_strategy
-   :members:
diff --git a/docs/colossalai/colossalai.zero.shard_utils.commons.rst b/docs/colossalai/colossalai.zero.shard_utils.commons.rst
deleted file mode 100644
index aa6682d79ff2..000000000000
--- a/docs/colossalai/colossalai.zero.shard_utils.commons.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.shard\_utils.commons
-====================================
-
-.. automodule:: colossalai.zero.shard_utils.commons
-   :members:
diff --git a/docs/colossalai/colossalai.zero.shard_utils.rst b/docs/colossalai/colossalai.zero.shard_utils.rst
deleted file mode 100644
index 580bfdab7d85..000000000000
--- a/docs/colossalai/colossalai.zero.shard_utils.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-colossalai.zero.shard\_utils
-============================
-
-.. automodule:: colossalai.zero.shard_utils
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.shard_utils.base_shard_strategy
-   colossalai.zero.shard_utils.bucket_tensor_shard_strategy
-   colossalai.zero.shard_utils.commons
-   colossalai.zero.shard_utils.tensor_shard_strategy
diff --git a/docs/colossalai/colossalai.zero.shard_utils.tensor_shard_strategy.rst b/docs/colossalai/colossalai.zero.shard_utils.tensor_shard_strategy.rst
deleted file mode 100644
index 571b7bd7a588..000000000000
--- a/docs/colossalai/colossalai.zero.shard_utils.tensor_shard_strategy.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.shard\_utils.tensor\_shard\_strategy
-====================================================
-
-.. automodule:: colossalai.zero.shard_utils.tensor_shard_strategy
-   :members:
diff --git a/docs/colossalai/colossalai.zero.sharded_model.reduce_scatter.rst b/docs/colossalai/colossalai.zero.sharded_model.reduce_scatter.rst
deleted file mode 100644
index cf861ee70aa0..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_model.reduce_scatter.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.sharded\_model.reduce\_scatter
-==============================================
-
-.. automodule:: colossalai.zero.sharded_model.reduce_scatter
-   :members:
diff --git a/docs/colossalai/colossalai.zero.sharded_model.rst b/docs/colossalai/colossalai.zero.sharded_model.rst
deleted file mode 100644
index fb3f5a8456d0..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_model.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-colossalai.zero.sharded\_model
-==============================
-
-.. automodule:: colossalai.zero.sharded_model
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.sharded_model.reduce_scatter
-   colossalai.zero.sharded_model.sharded_model_v2
-   colossalai.zero.sharded_model.utils
diff --git a/docs/colossalai/colossalai.zero.sharded_model.sharded_model_v2.rst b/docs/colossalai/colossalai.zero.sharded_model.sharded_model_v2.rst
deleted file mode 100644
index a0e191377914..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_model.sharded_model_v2.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.sharded\_model.sharded\_model\_v2
-=================================================
-
-.. automodule:: colossalai.zero.sharded_model.sharded_model_v2
-   :members:
diff --git a/docs/colossalai/colossalai.zero.sharded_model.utils.rst b/docs/colossalai/colossalai.zero.sharded_model.utils.rst
deleted file mode 100644
index 5e376774296f..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_model.utils.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.sharded\_model.utils
-====================================
-
-.. automodule:: colossalai.zero.sharded_model.utils
-   :members:
diff --git a/docs/colossalai/colossalai.zero.sharded_optim.rst b/docs/colossalai/colossalai.zero.sharded_optim.rst
deleted file mode 100644
index db3dfdddbab4..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_optim.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-colossalai.zero.sharded\_optim
-==============================
-
-.. automodule:: colossalai.zero.sharded_optim
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.sharded_optim.sharded_optim_v2
diff --git a/docs/colossalai/colossalai.zero.sharded_optim.sharded_optim_v2.rst b/docs/colossalai/colossalai.zero.sharded_optim.sharded_optim_v2.rst
deleted file mode 100644
index 01fbe0c4c031..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_optim.sharded_optim_v2.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.sharded\_optim.sharded\_optim\_v2
-=================================================
-
-.. automodule:: colossalai.zero.sharded_optim.sharded_optim_v2
-   :members:
diff --git a/docs/colossalai/colossalai.zero.sharded_param.rst b/docs/colossalai/colossalai.zero.sharded_param.rst
deleted file mode 100644
index 02e0fc6c29eb..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_param.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-colossalai.zero.sharded\_param
-==============================
-
-.. automodule:: colossalai.zero.sharded_param
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.sharded_param.sharded_param
-   colossalai.zero.sharded_param.sharded_tensor
diff --git a/docs/colossalai/colossalai.zero.sharded_param.sharded_param.rst b/docs/colossalai/colossalai.zero.sharded_param.sharded_param.rst
deleted file mode 100644
index efa2f0de379c..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_param.sharded_param.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.sharded\_param.sharded\_param
-=============================================
-
-.. automodule:: colossalai.zero.sharded_param.sharded_param
-   :members:
diff --git a/docs/colossalai/colossalai.zero.sharded_param.sharded_tensor.rst b/docs/colossalai/colossalai.zero.sharded_param.sharded_tensor.rst
deleted file mode 100644
index 930c28de4542..000000000000
--- a/docs/colossalai/colossalai.zero.sharded_param.sharded_tensor.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.sharded\_param.sharded\_tensor
-==============================================
-
-.. automodule:: colossalai.zero.sharded_param.sharded_tensor
-   :members:
diff --git a/docs/colossalai/colossalai.zero.utils.rst b/docs/colossalai/colossalai.zero.utils.rst
deleted file mode 100644
index 50ee9071e7d5..000000000000
--- a/docs/colossalai/colossalai.zero.utils.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-colossalai.zero.utils
-=====================
-
-.. automodule:: colossalai.zero.utils
-   :members:
-
-
-.. toctree::
-   :maxdepth: 2
-
-   colossalai.zero.utils.zero_hook
-   colossalai.zero.utils.gemini_hook
diff --git a/docs/colossalai/colossalai.zero.utils.zero_hook.rst b/docs/colossalai/colossalai.zero.utils.zero_hook.rst
deleted file mode 100644
index 424f466dd4f5..000000000000
--- a/docs/colossalai/colossalai.zero.utils.zero_hook.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.utils.zero\_hook
-================================
-
-.. automodule:: colossalai.zero.utils.zero_hook
-   :members:
diff --git a/docs/colossalai/colossalai.zero.utils.zero_hook_v2.rst b/docs/colossalai/colossalai.zero.utils.zero_hook_v2.rst
deleted file mode 100644
index e6d6673af131..000000000000
--- a/docs/colossalai/colossalai.zero.utils.zero_hook_v2.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.utils.zero\_hook\_v2
-====================================
-
-.. automodule:: colossalai.zero.utils.gemini_hook
-   :members:
diff --git a/docs/colossalai/colossalai.zero.zero_optimizer.rst b/docs/colossalai/colossalai.zero.zero_optimizer.rst
deleted file mode 100644
index b945b081c866..000000000000
--- a/docs/colossalai/colossalai.zero.zero_optimizer.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-colossalai.zero.zero\_optimizer
-===============================
-
-.. automodule:: colossalai.zero.zero_optimizer
-   :members:
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index 52e999f3b938..000000000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-import datetime
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath('..'))
-
-# -- Project information -----------------------------------------------------
-
-project = 'Colossal-AI'
-copyright = f'{datetime.datetime.now().year}, HPC-AI Tech'
-author = 'HPC-AI Technology Inc.'
-
-# The full version, including alpha/beta/rc tags
-# release = '0.0.1'
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.napoleon',
-    'sphinx.ext.linkcode',
-    'myst_parser',
-]
-
-# Disable docstring inheritance
-autodoc_inherit_docstrings = False
-
-# Disable displaying type annotations, these can be very verbose
-autodoc_typehints = 'none'
-
-# Enable overriding of function signatures in the first line of the docstring.
-autodoc_docstring_signature = True
-autodoc_default_options = {
-    'member-order': 'bysource',
-}
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['.build', 'Thumbs.db', '.DS_Store']
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'sphinx_book_theme'
-html_show_sourcelink = False
-html_theme_options = {
-    'navigation_depth': 3,
-}
-
-html_context = {
-    'display_github': True,
-    'github_user': 'hpcaitech',
-    'github_repo': 'ColossalAI',
-    #   'github_version': 'master/docs/',
-}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-html_css_files = [
-    'css/rtd_theme.css',
-]
-
-# -- Extension configuration -------------------------------------------------
-source_suffix = ['.rst', '.md', '.MD']
-
-import inspect
-
-import colossalai
-
-
-def linkcode_resolve(domain, info):
-    """
-    Determine the URL corresponding to Python object
-    """
-    if domain != 'py':
-        return None
-
-    modname = info['module']
-    fullname = info['fullname']
-
-    submod = sys.modules.get(modname)
-    if submod is None:
-        return None
-
-    obj = submod
-    for part in fullname.split('.'):
-        try:
-            obj = getattr(obj, part)
-        except Exception:
-            return None
-
-    try:
-        fn = inspect.getsourcefile(obj)
-    except Exception:
-        fn = None
-    if not fn:
-        return None
-
-    try:
-        source, lineno = inspect.findsource(obj)
-    except Exception:
-        lineno = None
-
-    if lineno:
-        linespec = "#L%d" % (lineno + 1)
-    else:
-        linespec = ""
-
-    fn = os.path.relpath(fn, start=os.path.dirname(colossalai.__file__))
-
-    github = "https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/{}{}"
-    return github.format(fn, linespec)
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index f275f7829403..000000000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-.. Colossal-AI documentation master file, created by
-   sphinx-quickstart on Mon Oct 11 17:05:05 2021.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Colossal-AI API documentation
-======================================
-
-.. toctree::
-   :maxdepth: 2
-   :caption: API REFERENCE
-
-   colossalai/colossalai
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Useful links for Colossal-AI
-
-   links/Colossalai examples
-   links/Colossalai benchmarks
-   links/Colossalai tutorial
-
-
-Indices and tables
---------------------
-
-* :ref:`genindex`
diff --git a/docs/links/Colossalai Homepage.rst b/docs/links/Colossalai Homepage.rst
deleted file mode 100644
index 38e223bd22c9..000000000000
--- a/docs/links/Colossalai Homepage.rst	
+++ /dev/null
@@ -1,6 +0,0 @@
-Colossal-AI Github Homepage
-==================================
-
-*If you are looking for the Git homepage of Colossal-AI, please check*
-`Colossal-AI Tutorial <https://github.com/hpcaitech/ColossalAI>`_
-*for our source code.*
\ No newline at end of file
diff --git a/docs/links/Colossalai benchmarks.rst b/docs/links/Colossalai benchmarks.rst
deleted file mode 100644
index 1835670a5f2a..000000000000
--- a/docs/links/Colossalai benchmarks.rst	
+++ /dev/null
@@ -1,6 +0,0 @@
-Colossal-AI Benchmarks
-==================================
-
-*If you are interested in the performance or the features of Colossal-AI, please check*
-`Colossal-AI Benchmark <https://github.com/hpcaitech/ColossalAI-Benchmark>`_.
-*to get more details about our performance on CIFAR10, ImageNet1K or GPT2 ZeRO.*
\ No newline at end of file
diff --git a/docs/links/Colossalai examples.rst b/docs/links/Colossalai examples.rst
deleted file mode 100644
index c375f007a3ff..000000000000
--- a/docs/links/Colossalai examples.rst	
+++ /dev/null
@@ -1,6 +0,0 @@
-Colossal-AI Examples
-==================================
-
-*If you are looking for the example code of using Colossal-AI in CV or NLP, please check*
-`Colossal-AI Example <https://github.com/hpcaitech/ColossalAI-Examples>`_
-*to get more details about using colossalai in Resnet, Moe, Vit, Bert and GPT*
\ No newline at end of file
diff --git a/docs/links/Colossalai tutorial.rst b/docs/links/Colossalai tutorial.rst
deleted file mode 100644
index a4ab7f5b906b..000000000000
--- a/docs/links/Colossalai tutorial.rst	
+++ /dev/null
@@ -1,7 +0,0 @@
-Colossal-AI Tutorial
-==================================
-
-*If you are looking for the tutorial of using Colossal-AI, please check*
-`Colossal-AI Tutorial <https://www.colossalai.org/docs/get_started/installation>`_
-*to get more details about getting started, using TP (tensor parallel), PP (pipeline parallel)
-and training with colossalai trainer or engine.*
\ No newline at end of file
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index cf73214110f2..000000000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=.build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index c93221495e2c..000000000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-tensorboard
-apex
-sphinx
-myst-parser
-sphinx-book-theme

From b8804aa60c603d00d9576a4b5ad2fa43d13687b7 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 28 Feb 2023 14:04:52 +0800
Subject: [PATCH 395/503] [doc] added readme for documentation (#2935)

---
 docs/README.md | 112 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 docs/README.md

diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000000..7261a6bc7c19
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,112 @@
+# 📕 Documentation
+
+## 🔗 Table of Contents
+
+- [📕 Documentation](#-documentation)
+  - [🔗 Table of Contents](#-table-of-contents)
+  - [📝 Overview](#-overview)
+  - [🗺 Module Structure](#-module-structure)
+  - [🧱 Our Documentation System](#-our-documentation-system)
+  - [🎊 Contribution](#-contribution)
+    - [🖊 Adding a New Documentation](#-adding-a-new-documentation)
+    - [🧹 Doc Testing](#-doc-testing)
+    - [💉 Auto Documentation](#-auto-documentation)
+
+## 📝 Overview
+
+We evaluated various existing solutions for documentation in the community and discussed their advantages and disadvantages in the [issue #2651](https://github.com/hpcaitech/ColossalAI/issues/2651). Therefore, we propose to build a more modern and robust documentation system by integrating the Sphinx [autodoc](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html) function and the [Docusaurus](https://docusaurus.io/) framework.
+
+## 🗺 Module Structure
+
+```text
+- docs
+    - source
+        - en
+        - zh-Hans
+    - sidebars.json
+    - versions.json
+    - requirements-doc-test.txt
+```
+
+The documentation module structure is shown above:
+1. source: This folder contains the multi-language documentation files.
+2. `sidebars.json`: The `sidebars.json` defines the table of content for the tutorials. You need to update this file when a new doc is added/deleted.
+3. `versions.json`: The `versions.json` in the **main branch** in the **latest commit** will be used to control the versions to be displayed on our website.
+
+## 🧱 Our Documentation System
+
+We believe that several advantages of the existing systems can be combined for simplicity, usability and maintainability:
+1. Support [Markdown](https://www.markdownguide.org/), which we believe is a more popular language for writing documentation compared to [RST](https://docutils.sourceforge.io/rst.html).
+2. Support Autodoc, which can automatically generate documentation from the docstrings in the source code provided by [Sphinx](https://www.sphinx-doc.org/en/master/).
+3. Support elegant and modern UI, which is provided by [Docusaurus](https://docusaurus.io/).
+4. Support MDX for more flexible and powerful documentation, which is provided by [Docusaurus](https://docusaurus.io/).
+5. Support hosting blogs/project home page/other pages besides the documentation, which is provided by [Docusaurus](https://docusaurus.io/).
+
+Therefore, we have built the [ColossalAI-Documentation](https://github.com/hpcaitech/ColossalAI-Documentation) repository to integrate the features above.
+
+## 🎊 Contribution
+
+You can contribute to the documentation by directly setting up a Pull Request towards the `docs/source` folder. There are several guidelines for documentation contribution.
+
+1. The documentation is written in Markdown. You can refer to the [Markdown Guide](https://www.markdownguide.org/) for the syntax.
+2. You must ensure that the documentation exists for all languages. You can refer to the [Adding a New Documentation](#-adding-a-new-documentation) for more details.
+3. You must provide a test command for your documentation, please see [Doc Testing](#-doc-testing) for more details.
+4. You can embed your docstring in your markdown, please see [Auto Documentation](#-auto-documentation) for more details.
+
+### 🖊 Adding a New Documentation
+
+You can add a Markdown file to the `docs/source` folder. You need to ensure that multi-language is supported in your PR.
+Let's assume that you want to add a file called `your_doc.md`, your file structure will look like this.
+
+```text
+- docs
+  - source
+    - en
+        - your_doc.md  # written in English
+    - zh-Hans
+        - your_doc.md  # written in Chinese
+  - sidebars.json  # add your documentation file name here
+```
+
+Meanwhile, you need to ensure the `sidebars.json` is updated such that it contains your documentation file. Our CI will check whether a documentation exists for all languages and can be used to build the website successfully.
+
+### 🧹 Doc Testing
+
+Every documentation is tested to ensure it works well. You need to add the following line to the top of your file and replace `$command` with the actual command. Do note that the markdown will be converted into a Python file. Assuming you have a `demo.md` file, the test file generated will be `demo.py`. Therefore, you should use `demo.py` in your command, e.g. `python demo.py`.
+
+```markdown
+<!-- doc-test-command: $command  -->
+```
+
+Meanwhile, only code labelled as a Python code block will be considered for testing.
+
+```markdown
+    ```python
+    print("hello world")
+    ```
+```
+
+Lastly, if you want to skip some code, you just need to add the following annotations to tell `docer` to discard the wrapped code for testing.
+
+```markdown
+<!--- doc-test-ignore-start -->
+
+    ```python
+    print("hello world")
+    ```
+
+<!--- doc-test-ignore-end -->
+```
+
+If you have any dependency required, please add it to `requirements-doc-test.txt`.
+
+
+### 💉 Auto Documentation
+
+Lastly, you may want to include the API documentation for a class/function in your documentation for reference.
+We support `autodoc` to extract the docstring and transform it into a Web element for elegant display.
+You just need to add `{{ autodoc:<mod-name> }}` in your markdown as a single line. An example is given below and you can see the outcome in [this PR](https://github.com/hpcaitech/ColossalAI-Documentation/pull/175).
+
+```markdown
+{{ autodoc:colossalai.amp.apex_amp.convert_to_apex_amp }}
+```

From 8264cd7ef11d1b40d1fc6069bd8c1b20dc78bd08 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Tue, 28 Feb 2023 15:39:51 +0800
Subject: [PATCH 396/503] [doc] add env scope (#2933)

---
 README-zh-Hans.md                               | 5 ++++-
 README.md                                       | 4 +++-
 applications/ChatGPT/README.md                  | 2 +-
 docs/source/en/get_started/installation.md      | 3 +++
 docs/source/zh-Hans/get_started/installation.md | 2 ++
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 54d97af82efa..0451d86099cb 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -273,7 +273,10 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 
 ## 安装
-> Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试
+> Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试。
+> 
+> 环境要求: PyTorch 1.10 ~ 1.12 (更新版本正在兼容中), Python >= 3.7, CUDA >= 11.0。如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
+
 
 ### 从PyPI安装
 
diff --git a/README.md b/README.md
index bb1b0e2164de..536511bc3ebb 100644
--- a/README.md
+++ b/README.md
@@ -276,6 +276,8 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 
 ## Installation
 > Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
+> 
+> Environment Requirement: PyTorch 1.10 ~ 1.12 (WIP higher version), Python >= 3.7, CUDA >= 11.0. If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 ### Install from PyPI
 
@@ -353,7 +355,7 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash
 
 Join the Colossal-AI community on [Forum](https://github.com/hpcaitech/ColossalAI/discussions),
 [Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
-and [WeChat](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your suggestions, feedback, and questions with our engineering team.
+and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your suggestions, feedback, and questions with our engineering team.
 
 ## Contributing
 
diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
index 0516991de288..dbd5eb7709b9 100644
--- a/applications/ChatGPT/README.md
+++ b/applications/ChatGPT/README.md
@@ -144,7 +144,7 @@ You may contact us or participate in the following ways:
 1. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) or submitting a [PR](https://github.com/hpcaitech/ColossalAI/pulls) on GitHub
 2. Join the Colossal-AI community on
 [Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
-and [WeChat](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
+and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
 3. Check out and fill in the [cooperation proposal](https://www.hpc-ai.tech/partners)
 4. Send your proposal to email contact@hpcaitech.com
 
diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md
index 0e114696de6d..68c2c6a9f094 100644
--- a/docs/source/en/get_started/installation.md
+++ b/docs/source/en/get_started/installation.md
@@ -2,6 +2,9 @@
 
 # Setup
 > Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
+> 
+> Environment Requirement: PyTorch 1.10 ~ 1.12 (WIP higher version), Python >= 3.7, CUDA >= 11.0. If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
+
 
 ## Download From PyPI
 
diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md
index 5f2351ffe446..2edeb639335b 100755
--- a/docs/source/zh-Hans/get_started/installation.md
+++ b/docs/source/zh-Hans/get_started/installation.md
@@ -1,5 +1,7 @@
 # 安装
 > Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试
+> 
+> 环境要求: PyTorch 1.10 ~ 1.12 (更新版本正在兼容中), Python >= 3.7, CUDA >= 11.0。如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
 
 ## 从PyPI上安装
 

From dca98937f834f5af2730f481bf6f5e5eee844742 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 28 Feb 2023 15:41:52 +0800
Subject: [PATCH 397/503] [format] applied code formatting on changed files in
 pull request 2933 (#2939)

Co-authored-by: github-actions <github-actions@github.com>
---
 README-zh-Hans.md                               | 2 +-
 README.md                                       | 2 +-
 docs/source/en/get_started/installation.md      | 2 +-
 docs/source/zh-Hans/get_started/installation.md | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 0451d86099cb..74cefa75cd12 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -274,7 +274,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 
 ## 安装
 > Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试。
-> 
+>
 > 环境要求: PyTorch 1.10 ~ 1.12 (更新版本正在兼容中), Python >= 3.7, CUDA >= 11.0。如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
 
 
diff --git a/README.md b/README.md
index 536511bc3ebb..2d4135dc5087 100644
--- a/README.md
+++ b/README.md
@@ -276,7 +276,7 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 
 ## Installation
 > Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
-> 
+>
 > Environment Requirement: PyTorch 1.10 ~ 1.12 (WIP higher version), Python >= 3.7, CUDA >= 11.0. If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 ### Install from PyPI
diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md
index 68c2c6a9f094..da30ab4e5507 100644
--- a/docs/source/en/get_started/installation.md
+++ b/docs/source/en/get_started/installation.md
@@ -2,7 +2,7 @@
 
 # Setup
 > Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
-> 
+>
 > Environment Requirement: PyTorch 1.10 ~ 1.12 (WIP higher version), Python >= 3.7, CUDA >= 11.0. If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 
diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md
index 2edeb639335b..2ceb0231f2f3 100755
--- a/docs/source/zh-Hans/get_started/installation.md
+++ b/docs/source/zh-Hans/get_started/installation.md
@@ -1,6 +1,6 @@
 # 安装
 > Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试
-> 
+>
 > 环境要求: PyTorch 1.10 ~ 1.12 (更新版本正在兼容中), Python >= 3.7, CUDA >= 11.0。如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
 
 ## 从PyPI上安装

From 090f14fd6b9ed71c2bee617daf1b4e7f399ac66e Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Tue, 28 Feb 2023 18:07:24 +0800
Subject: [PATCH 398/503] [misc] add reference (#2930)

* [misc] add reference

* [misc] add license
---
 LICENSE                                       |  97 ++++
 .../kernel/cuda_native/csrc/type_shim.h       | 467 ++++++++----------
 .../kernel/cuda_native/scaled_softmax.py      |   3 +
 colossalai/kernel/jit/bias_gelu.py            |   1 +
 4 files changed, 313 insertions(+), 255 deletions(-)

diff --git a/LICENSE b/LICENSE
index f05b54bd5dd4..394791da2771 100644
--- a/LICENSE
+++ b/LICENSE
@@ -229,3 +229,100 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    publish, distribute, sublicense, and/or sell copies of the Software,
    and to permit persons to whom the Software is furnished to do so,
    subject to the following conditions:
+
+   ---------------- LICENSE FOR Microsoft Deepspeed ----------------
+
+   MIT License
+
+   Copyright (c) Microsoft Corporation.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE
+
+   ---------------- LICENSE FOR NVIDIA Megatron-LM ----------------
+
+   Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of NVIDIA CORPORATION nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+   PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR NVIDIA Apex ----------------
+
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+   3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR Facebook Fairscale ----------------
+
+   Copyright (c) Facebook, Inc. and its affiliates
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+      and IDIAP Research Institute nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
diff --git a/colossalai/kernel/cuda_native/csrc/type_shim.h b/colossalai/kernel/cuda_native/csrc/type_shim.h
index cf83414af37f..b4011c5ba6c3 100644
--- a/colossalai/kernel/cuda_native/csrc/type_shim.h
+++ b/colossalai/kernel/cuda_native/csrc/type_shim.h
@@ -1,76 +1,64 @@
+/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
 #include <ATen/ATen.h>
-#include "compat.h"
-
-
-#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)			\
-  switch(TYPE)								\
-    {									\
-    case at::ScalarType::Half:						\
-      {									\
-	using scalar_t = at::Half;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    case at::ScalarType::BFloat16:					\
-      {									\
-	using scalar_t = at::BFloat16;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    default:								\
-      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
-      }
 
+#include "compat.h"
 
+#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)                     \
+  switch (TYPE) {                                                     \
+    case at::ScalarType::Half: {                                      \
+      using scalar_t = at::Half;                                      \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    case at::ScalarType::BFloat16: {                                  \
+      using scalar_t = at::BFloat16;                                  \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    default:                                                          \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+  }
 
 #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
-  switch(TYPEIN)							\
-    {									\
-    case at::ScalarType::Float:						\
-      {									\
-	using scalar_t_in = float;					\
-	switch(TYPEOUT)							\
-	  {								\
-	  case at::ScalarType::Float:					\
-	    {								\
-	      using scalar_t_out = float;				\
-	      __VA_ARGS__;						\
-	      break;							\
-	    }								\
-	  case at::ScalarType::Half:					\
-	    {								\
-	      using scalar_t_out = at::Half;				\
-	      __VA_ARGS__;						\
-	      break;							\
-	    }								\
-	  case at::ScalarType::BFloat16:				\
-	    {								\
-	      using scalar_t_out = at::BFloat16;			\
-	      __VA_ARGS__;						\
-	      break;							\
-	    }								\
-	  default:							\
-	    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
-	  }								\
-	break;								\
-      }									\
-    case at::ScalarType::Half:						\
-      {									\
-	using scalar_t_in = at::Half;					\
-	using scalar_t_out = at::Half;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    case at::ScalarType::BFloat16:					\
-      {									\
-	using scalar_t_in = at::BFloat16;				\
-	using scalar_t_out = at::BFloat16;				\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    default:								\
-      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");	\
-    }
+  switch (TYPEIN) {                                                            \
+    case at::ScalarType::Float: {                                              \
+      using scalar_t_in = float;                                               \
+      switch (TYPEOUT) {                                                       \
+        case at::ScalarType::Float: {                                          \
+          using scalar_t_out = float;                                          \
+          __VA_ARGS__;                                                         \
+          break;                                                               \
+        }                                                                      \
+        case at::ScalarType::Half: {                                           \
+          using scalar_t_out = at::Half;                                       \
+          __VA_ARGS__;                                                         \
+          break;                                                               \
+        }                                                                      \
+        case at::ScalarType::BFloat16: {                                       \
+          using scalar_t_out = at::BFloat16;                                   \
+          __VA_ARGS__;                                                         \
+          break;                                                               \
+        }                                                                      \
+        default:                                                               \
+          AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'");   \
+      }                                                                        \
+      break;                                                                   \
+    }                                                                          \
+    case at::ScalarType::Half: {                                               \
+      using scalar_t_in = at::Half;                                            \
+      using scalar_t_out = at::Half;                                           \
+      __VA_ARGS__;                                                             \
+      break;                                                                   \
+    }                                                                          \
+    case at::ScalarType::BFloat16: {                                           \
+      using scalar_t_in = at::BFloat16;                                        \
+      using scalar_t_out = at::BFloat16;                                       \
+      __VA_ARGS__;                                                             \
+      break;                                                                   \
+    }                                                                          \
+    default:                                                                   \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");        \
+  }
 
 // Forward/backward compatiblity hack around
 // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
@@ -81,222 +69,191 @@
 //   TypeShim(const at::Type& type) : payload(type) {}
 //   // Enable trivial conversion to a const at::Type& for pre-3aeb78
 //   operator const at::Type&(){ return payload; };
-//   // Enable dispatch switch statements to take *this directly for  post-3aeb78
+//   // Enable dispatch switch statements to take *this directly for post-3aeb78
 //   //operator at::ScalarType(){ return payload.; };
 // };
 
-#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)                 \
-    switch (TYPE)                                                       \
-    {                                                                   \
-    case at::ScalarType::Float:                                         \
-    {                                                                   \
-        using scalar_t_##LEVEL = float;                                 \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    case at::ScalarType::Half:                                          \
-    {                                                                   \
-        using scalar_t_##LEVEL = at::Half;                              \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    default:                                                            \
-        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
-    }
+#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)               \
+  switch (TYPE) {                                                     \
+    case at::ScalarType::Float: {                                     \
+      using scalar_t_##LEVEL = float;                                 \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    case at::ScalarType::Half: {                                      \
+      using scalar_t_##LEVEL = at::Half;                              \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    default:                                                          \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+  }
 
-#define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...)            \
-    switch (TYPE)                                                       \
-    {                                                                   \
-    case at::ScalarType::Float:                                         \
-    {                                                                   \
-        using scalar_t_##LEVEL = float;                                 \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    case at::ScalarType::Half:                                          \
-    {                                                                   \
-        using scalar_t_##LEVEL = at::Half;                              \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    case at::ScalarType::Byte:                                          \
-    {                                                                   \
-        using scalar_t_##LEVEL = uint8_t;                               \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    default:                                                            \
-        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
-    }
+#define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...)          \
+  switch (TYPE) {                                                     \
+    case at::ScalarType::Float: {                                     \
+      using scalar_t_##LEVEL = float;                                 \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    case at::ScalarType::Half: {                                      \
+      using scalar_t_##LEVEL = at::Half;                              \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    case at::ScalarType::Byte: {                                      \
+      using scalar_t_##LEVEL = uint8_t;                               \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    default:                                                          \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+  }
 
-#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)          \
-    switch (TYPE)                                                       \
-    {                                                                   \
-    case at::ScalarType::Double:                                        \
-    {                                                                   \
-        using scalar_t_##LEVEL = double;                                \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    case at::ScalarType::Float:                                         \
-    {                                                                   \
-        using scalar_t_##LEVEL = float;                                 \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    case at::ScalarType::Half:                                          \
-    {                                                                   \
-        using scalar_t_##LEVEL = at::Half;                              \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    default:                                                            \
-        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
-    }
+#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)        \
+  switch (TYPE) {                                                     \
+    case at::ScalarType::Double: {                                    \
+      using scalar_t_##LEVEL = double;                                \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    case at::ScalarType::Float: {                                     \
+      using scalar_t_##LEVEL = float;                                 \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    case at::ScalarType::Half: {                                      \
+      using scalar_t_##LEVEL = at::Half;                              \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    default:                                                          \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+  }
 
-#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...)               \
-    switch (TYPE)                                                       \
-    {                                                                   \
-    case at::ScalarType::Double:                                        \
-    {                                                                   \
-        using scalar_t_##LEVEL = double;                                \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    case at::ScalarType::Float:                                         \
-    {                                                                   \
-        using scalar_t_##LEVEL = float;                                 \
-        __VA_ARGS__;                                                    \
-        break;                                                          \
-    }                                                                   \
-    default:                                                            \
-        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
-    }
+#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...)             \
+  switch (TYPE) {                                                     \
+    case at::ScalarType::Double: {                                    \
+      using scalar_t_##LEVEL = double;                                \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    case at::ScalarType::Float: {                                     \
+      using scalar_t_##LEVEL = float;                                 \
+      __VA_ARGS__;                                                    \
+      break;                                                          \
+    }                                                                 \
+    default:                                                          \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+  }
 
-#define DISPATCH_FLOAT_AND_HALF_FOR_G_P(GTYPE, PTYPE, LEVEL, NAME, ...)                          \
-    if (GTYPE == at::ScalarType::Float && PTYPE == at::ScalarType::Float)                        \
-    {                                                                                            \
-        using g_scalar_t_##LEVEL = float;                                                        \
-        using p_scalar_t_##LEVEL = float;                                                        \
-        __VA_ARGS__;                                                                             \
-    }                                                                                            \
-    else if (GTYPE == at::ScalarType::Float && PTYPE == at::ScalarType::Half)                    \
-    {                                                                                            \
-        using g_scalar_t_##LEVEL = float;                                                        \
-        using p_scalar_t_##LEVEL = at::Half;                                                     \
-        __VA_ARGS__;                                                                             \
-    }                                                                                            \
-    else if (GTYPE == at::ScalarType::Half && PTYPE == at::ScalarType::Float)                    \
-    {                                                                                            \
-        using g_scalar_t_##LEVEL = at::Half;                                                     \
-        using p_scalar_t_##LEVEL = float;                                                        \
-        __VA_ARGS__;                                                                             \
-    }                                                                                            \
-    else if (GTYPE == at::ScalarType::Half && PTYPE == at::ScalarType::Half)                     \
-    {                                                                                            \
-        using g_scalar_t_##LEVEL = at::Half;                                                     \
-        using p_scalar_t_##LEVEL = at::Half;                                                     \
-        __VA_ARGS__;                                                                             \
-    }                                                                                            \
-    else                                                                                         \
-    {                                                                                            \
-       AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), "'");          \
-    }                                                                                            \
+#define DISPATCH_FLOAT_AND_HALF_FOR_G_P(GTYPE, PTYPE, LEVEL, NAME, ...)        \
+  if (GTYPE == at::ScalarType::Float && PTYPE == at::ScalarType::Float) {      \
+    using g_scalar_t_##LEVEL = float;                                          \
+    using p_scalar_t_##LEVEL = float;                                          \
+    __VA_ARGS__;                                                               \
+  } else if (GTYPE == at::ScalarType::Float &&                                 \
+             PTYPE == at::ScalarType::Half) {                                  \
+    using g_scalar_t_##LEVEL = float;                                          \
+    using p_scalar_t_##LEVEL = at::Half;                                       \
+    __VA_ARGS__;                                                               \
+  } else if (GTYPE == at::ScalarType::Half &&                                  \
+             PTYPE == at::ScalarType::Float) {                                 \
+    using g_scalar_t_##LEVEL = at::Half;                                       \
+    using p_scalar_t_##LEVEL = float;                                          \
+    __VA_ARGS__;                                                               \
+  } else if (GTYPE == at::ScalarType::Half && PTYPE == at::ScalarType::Half) { \
+    using g_scalar_t_##LEVEL = at::Half;                                       \
+    using p_scalar_t_##LEVEL = at::Half;                                       \
+    __VA_ARGS__;                                                               \
+  } else {                                                                     \
+    AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), \
+             "'");                                                             \
+  }
 
 template <typename T>
-__device__ __forceinline__ T reduce_block_into_lanes(T *x,
-                                                     T val,
-                                                     int lanes = 1,
-                                                     bool share_result = false) // lanes is intended to be <= 32.
+__device__ __forceinline__ T reduce_block_into_lanes(
+    T *x, T val, int lanes = 1,
+    bool share_result = false)  // lanes is intended to be <= 32.
 {
-    int tid = threadIdx.x + threadIdx.y * blockDim.x;
-    int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
+  int tid = threadIdx.x + threadIdx.y * blockDim.x;
+  int blockSize =
+      blockDim.x * blockDim.y;  // blockSize is intended to be a multiple of 32.
 
-    if (blockSize >= 64)
-    {
-        x[tid] = val;
-        __syncthreads();
-    }
+  if (blockSize >= 64) {
+    x[tid] = val;
+    __syncthreads();
+  }
 
 #pragma unroll
-    for (int i = (blockSize >> 1); i >= 64; i >>= 1)
-    {
-        if (tid < i)
-            x[tid] = x[tid] + x[tid + i];
-        __syncthreads();
-    }
+  for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
+    if (tid < i) x[tid] = x[tid] + x[tid + i];
+    __syncthreads();
+  }
 
-    T final;
+  T final;
 
-    if (tid < 32)
-    {
-        if (blockSize >= 64)
-            final = x[tid] + x[tid + 32];
-        else
-            final = val;
-            // __SYNCWARP();
+  if (tid < 32) {
+    if (blockSize >= 64)
+      final = x[tid] + x[tid + 32];
+    else
+      final = val;
+      // __SYNCWARP();
 
 #pragma unroll
-        for (int i = 16; i >= lanes; i >>= 1)
-            final = final + __shfl_down_sync(0xffffffff, final, i);
-    }
+    for (int i = 16; i >= lanes; i >>= 1)
+      final = final + __shfl_down_sync(0xffffffff, final, i);
+  }
 
-    if (share_result)
-    {
-        if (tid < lanes)
-            x[tid] = final; // EpilogueOp
-        // Make sure the smem result is visible to all warps.
-        __syncthreads();
-    }
+  if (share_result) {
+    if (tid < lanes) x[tid] = final;  // EpilogueOp
+    // Make sure the smem result is visible to all warps.
+    __syncthreads();
+  }
 
-    return final;
+  return final;
 }
 
 template <typename T>
-__device__ __forceinline__ T reduce_block_into_lanes_max_op(T *x,
-                                                            T val,
-                                                            int lanes = 1,
-                                                            bool share_result = false) // lanes is intended to be <= 32.
+__device__ __forceinline__ T reduce_block_into_lanes_max_op(
+    T *x, T val, int lanes = 1,
+    bool share_result = false)  // lanes is intended to be <= 32.
 {
-    int tid = threadIdx.x + threadIdx.y * blockDim.x;
-    int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
+  int tid = threadIdx.x + threadIdx.y * blockDim.x;
+  int blockSize =
+      blockDim.x * blockDim.y;  // blockSize is intended to be a multiple of 32.
 
-    if (blockSize >= 64)
-    {
-        x[tid] = val;
-        __syncthreads();
-    }
+  if (blockSize >= 64) {
+    x[tid] = val;
+    __syncthreads();
+  }
 
 #pragma unroll
-    for (int i = (blockSize >> 1); i >= 64; i >>= 1)
-    {
-        if (tid < i)
-            x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid + i]));
-        __syncthreads();
-    }
+  for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
+    if (tid < i) x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid + i]));
+    __syncthreads();
+  }
 
-    T final;
+  T final;
 
-    if (tid < 32)
-    {
-        if (blockSize >= 64)
-            final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32]));
-        else
-            final = val;
-            // __SYNCWARP();
+  if (tid < 32) {
+    if (blockSize >= 64)
+      final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32]));
+    else
+      final = val;
+      // __SYNCWARP();
 
 #pragma unroll
-        for (int i = 16; i >= lanes; i >>= 1)
-            final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
-    }
+    for (int i = 16; i >= lanes; i >>= 1)
+      final =
+          fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
+  }
 
-    if (share_result)
-    {
-        if (tid < lanes)
-            x[tid] = final; // EpilogueOp
-        // Make sure the smem result is visible to all warps.
-        __syncthreads();
-    }
+  if (share_result) {
+    if (tid < lanes) x[tid] = final;  // EpilogueOp
+    // Make sure the smem result is visible to all warps.
+    __syncthreads();
+  }
 
-    return final;
-}
\ No newline at end of file
+  return final;
+}
diff --git a/colossalai/kernel/cuda_native/scaled_softmax.py b/colossalai/kernel/cuda_native/scaled_softmax.py
index 580e5c81aabb..05c6ee35b8ce 100644
--- a/colossalai/kernel/cuda_native/scaled_softmax.py
+++ b/colossalai/kernel/cuda_native/scaled_softmax.py
@@ -1,3 +1,6 @@
+# This code from NVIDIA Megatron:
+#     with minor changes.
+
 import enum
 
 import torch
diff --git a/colossalai/kernel/jit/bias_gelu.py b/colossalai/kernel/jit/bias_gelu.py
index e6da70c40b42..33b4ac32b044 100644
--- a/colossalai/kernel/jit/bias_gelu.py
+++ b/colossalai/kernel/jit/bias_gelu.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 import torch
 
 ###### BIAS GELU FUSION/ NO AUTOGRAD ################

From 47fb214b3b24d53b4f00c562f9291fc4abc6eb90 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 1 Mar 2023 11:41:53 +0800
Subject: [PATCH 399/503] [hotfix] add shard dim to avoid backward
 communication error (#2954)

---
 .../tensor_shard/node_handler/strategy/reshape_generator.py     | 1 +
 colossalai/tensor/comm_spec.py                                  | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py
index 39983e918a96..24f75e352935 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/reshape_generator.py
@@ -343,6 +343,7 @@ def collate_strategies(self) -> List[ShardingStrategy]:
                     comm_type=CommType.BEFORE,
                     arg_index=0)
                 input_comm_action.comm_spec.gather_dim = total_mesh_dim_list
+                input_comm_action.comm_spec.shard_dim = total_mesh_dim_list
 
             elif len(total_mesh_dim_list) >= 2:
                 source_spec = sharding_spec_mapping["input"]
diff --git a/colossalai/tensor/comm_spec.py b/colossalai/tensor/comm_spec.py
index b31c06994190..0d8de1062d42 100644
--- a/colossalai/tensor/comm_spec.py
+++ b/colossalai/tensor/comm_spec.py
@@ -429,6 +429,7 @@ def __repr__(self):
         if self.comm_pattern == CollectiveCommPattern.GATHER_FWD_SPLIT_BWD:
             res_list.append(f"comm_pattern:GATHER_FWD_SPLIT_BWD, ")
             res_list.append(f"gather_dim:{self.gather_dim}, ")
+            res_list.append(f"shard_dim:{self.shard_dim}, ")
             res_list.append(f"logical_process_axis:{self.logical_process_axis})")
         elif self.comm_pattern == CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD:
             res_list.append(f"comm_pattern:ALL2ALL_FWD_ALL2ALL_BWD, ")
@@ -437,6 +438,7 @@ def __repr__(self):
             res_list.append(f"logical_process_axis: {self.logical_process_axis})")
         elif self.comm_pattern == CollectiveCommPattern.SPLIT_FWD_GATHER_BWD:
             res_list.append(f"comm_pattern:SPLIT_FWD_GATHER_BWD, ")
+            res_list.append(f"gather_dim:{self.gather_dim}, ")
             res_list.append(f"shard_dim:{self.shard_dim}, ")
             res_list.append(f"logical_process_axis:{self.logical_process_axis})")
         elif self.comm_pattern == CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD:

From 489a9566af24bdae4dd3d895a692c73775d6b1db Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Wed, 1 Mar 2023 13:39:39 +0800
Subject: [PATCH 400/503] [chatgpt] add inference example (#2944)

* [chatgpt] support inference example

* Create inference.sh

* Update README.md

* Delete inference.sh

* Update inference.py
---
 applications/ChatGPT/examples/README.md    | 37 ++++++++-------
 applications/ChatGPT/examples/inference.py | 52 ++++++++++++++++++++++
 2 files changed, 74 insertions(+), 15 deletions(-)
 create mode 100644 applications/ChatGPT/examples/inference.py

diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
index 5f9d8698d616..e5522f08770d 100644
--- a/applications/ChatGPT/examples/README.md
+++ b/applications/ChatGPT/examples/README.md
@@ -6,7 +6,21 @@
 pip install -r requirements.txt
 ```
 
-## Train with dummy prompt data
+## Train the reward model (Stage 2)
+We use [rm-static](https://huggingface.co/datasets/Dahoas/rm-static) as dataset to train our reward model. It is a dataset of chosen & rejected response of the same prompt.
+
+You can download the dataset from huggingface automatically.
+
+Use these code to train your reward model.
+
+```shell
+# Naive reward model training
+python train_reward_model.py --pretrain <your model path>
+# if to use LoRA
+python train_reward_model.py --pretrain <your model path> --lora_rank 16
+```
+
+## Train with dummy prompt data (Stage 3)
 
 This script supports 3 strategies:
 
@@ -33,7 +47,7 @@ torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy ddp
 torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy colossalai
 ```
 
-## Train with real prompt data
+## Train with real prompt data (Stage 3)
 
 We use [awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts) as example dataset. It is a small dataset with hundreds of prompts.
 
@@ -52,18 +66,11 @@ torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy
 torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai
 ```
 
-## Train the reward model
-We use [rm-static](https://huggingface.co/datasets/Dahoas/rm-static) as dataset to train our reward model. It is a dataset of chosen & rejected response of the same prompt.
-
-You can download the dataset from huggingface automatically.
-
-Use these code to train your reward model.
-
+## Inference example(After Stage3)
+We support naive inference demo after training.
 ```shell
-# Naive reward model training
-python train_reward_model.py --pretrain <your model path>
-# if to use LoRA
-python train_reward_model.py --pretrain <your model path> --lora_rank 16
+# inference
+python inference_actor.py --pretrain <your actor model path> --model <your model type>
 ```
 
 ## Support Model
@@ -91,8 +98,8 @@ python train_reward_model.py --pretrain <your model path> --lora_rank 16
 ### BLOOM
 - [x] [BLOOM-560m](https://huggingface.co/bigscience/bloom-560m)
 - [x] [BLOOM-1b1](https://huggingface.co/bigscience/bloom-1b1)
-- [ ] [BLOOM-3b](https://huggingface.co/bigscience/bloom-3b)
-- [ ] [BLOOM-7b](https://huggingface.co/bigscience/bloomz-7b1)
+- [x] [BLOOM-3b](https://huggingface.co/bigscience/bloom-3b)
+- [x] [BLOOM-7b](https://huggingface.co/bigscience/bloomz-7b1)
 - [ ] BLOOM-175b
 
 ### OPT
diff --git a/applications/ChatGPT/examples/inference.py b/applications/ChatGPT/examples/inference.py
new file mode 100644
index 000000000000..ba055d81fd15
--- /dev/null
+++ b/applications/ChatGPT/examples/inference.py
@@ -0,0 +1,52 @@
+import argparse
+import torch
+
+from chatgpt.nn import BLOOMActor, GPTActor, OPTActor
+from transformers import AutoTokenizer
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+
+def eval(args):
+    # configure model
+    if args.model == 'gpt2':
+        model = GPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+    elif args.model == 'bloom':
+        model = BLOOMActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+    elif args.model == 'opt':
+        model = OPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'bloom':
+        tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    model.eval()
+    input = args.input
+    input_ids = tokenizer.encode(input, return_tensors='pt').to(torch.cuda.current_device())
+    outputs = model.generate(input_ids,
+                             max_length=args.max_length,
+                             do_sample=True,
+                             top_k=50,
+                             top_p=0.95,
+                             num_return_sequences=1)
+    output = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
+    print(output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--input', type=str, default='Q: How are you ? A:')
+    parser.add_argument('--max_length', type=int, default=100)
+    args = parser.parse_args()
+    eval(args)

From e414e4092bd2e120f873e4ebf2961616281b3582 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 1 Mar 2023 16:34:58 +0800
Subject: [PATCH 401/503] [DTensor] implementation of dtensor (#2946)

* [DTensor] implementation of dtensor

* test layout convert

* polish
---
 colossalai/tensor/d_tensor/d_tensor.py | 158 +++++++++++++++++++++++++
 colossalai/tensor/d_tensor/layout.py   |  22 ++++
 tests/test_tensor/test_dtensor.py      | 104 ++++++++++++++++
 3 files changed, 284 insertions(+)
 create mode 100644 colossalai/tensor/d_tensor/d_tensor.py
 create mode 100644 colossalai/tensor/d_tensor/layout.py
 create mode 100644 tests/test_tensor/test_dtensor.py

diff --git a/colossalai/tensor/d_tensor/d_tensor.py b/colossalai/tensor/d_tensor/d_tensor.py
new file mode 100644
index 000000000000..e311eb3ba241
--- /dev/null
+++ b/colossalai/tensor/d_tensor/d_tensor.py
@@ -0,0 +1,158 @@
+from typing import Optional
+
+import torch
+from torch.utils._pytree import tree_map
+
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.tensor.d_tensor.layout import Layout
+from colossalai.tensor.shape_consistency import ShapeConsistencyManager, to_global
+from colossalai.tensor.sharding_spec import ShardingSpec
+
+shape_consistency_manager = ShapeConsistencyManager()
+
+
+class DTensor(torch.Tensor):
+
+    def __init__(self, local_tensor: torch.Tensor, dist_layout: Layout):
+        self.local_tensor = local_tensor
+        self.data_type = local_tensor.dtype
+        self.entire_shape = local_tensor.shape
+        if dist_layout.entire_shape is None:
+            dist_layout.entire_shape = self.entire_shape
+        self.dist_layout = dist_layout
+        self._apply_layout()
+
+    @staticmethod
+    def __new__(cls, local_tensor, layout):
+        return torch.Tensor._make_subclass(cls, local_tensor, local_tensor.requires_grad)
+
+    def __repr__(self):
+        return f"DTensor({self.to_global()}, {self.dist_layout})"
+
+    def __str__(self):
+        return self.__repr__()
+
+    def layout_convert(self, target_layout):
+        '''
+        Convert the layout of the tensor from source_spec to target_spec.
+        '''
+        source_spec = convert_layout_to_sharding_spec(self.dist_layout)
+        target_spec = convert_layout_to_sharding_spec(target_layout)
+        self.local_tensor = shape_consistency_manager.apply_for_autoparallel_runtime(
+            self.local_tensor, source_spec, target_spec)
+        self.dist_layout = target_layout
+
+    def _apply_layout(self):
+        '''
+        Apply the layout to the local tensor during initializing process.
+        '''
+        source_spec = construct_default_sharding_spec(self.local_tensor, self.device_mesh)
+        target_spec = convert_layout_to_sharding_spec(self.dist_layout)
+        self.local_tensor = shape_consistency_manager.apply_for_autoparallel_runtime(
+            self.local_tensor, source_spec, target_spec)
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        if kwargs is None:
+            kwargs = {}
+
+        def filter_arg(arg):
+            if isinstance(arg, DTensor):
+                return arg.local_tensor
+            else:
+                return arg
+
+        args = tree_map(filter_arg, args)
+        kwargs = tree_map(filter_arg, kwargs)
+        # if we want to convert the result into DTensor, we need to infer the layout of result from the layout of input tensors
+        # and op type.
+
+        return func(*args, **kwargs)
+
+    @property
+    def device_mesh(self):
+        '''
+        Return the device mesh of the tensor.
+        '''
+        return self.dist_layout.device_mesh
+
+    @property
+    def sharding_spec(self):
+        '''
+        Return the sharding specification of the tensor.
+        '''
+        return self.dist_layout.sharding_spec
+
+    def to(self, *args, **kwargs):
+        '''
+        Move the tensor to a new device or convert the tensor to a new dtype.
+        '''
+        self.local_tensor = self.local_tensor.to(*args, **kwargs)
+        self.data_type = self.local_tensor.dtype
+        self.dist_layout.device_type = self.local_tensor.device
+        # TODO: update the device mesh process groups or we should just cache
+        # both the cpu process groups and the cuda process groups?
+        return self
+
+    def to_local(self):
+        '''
+        Return the local tensor in this rank.
+        '''
+        return self.local_tensor
+
+    def to_global(self):
+        '''
+        Recover the global tensor from the distributed tensor.
+
+        Note: This function will all_gather the local tensor to the global tensor and it
+        will not change the layout of the DTensor. This function is mainly used for debugging or
+        check the correctness of the distributed tensor.
+        '''
+        return to_global(self.local_tensor, convert_layout_to_sharding_spec(self.dist_layout))
+
+
+def distribute_tensor(local_tensor: torch.Tensor, dist_layout: Layout) -> DTensor:
+    '''
+    Distribute the local tensor to the distributed tensor according to the dist_layout specified.
+
+    Args:
+        local_tensor: tensor to be distributed.
+        dist_layout: the layout specification of the distributed tensor.
+
+    Returns:
+        A 'DTensor' object.
+    '''
+    return DTensor(local_tensor, dist_layout)
+
+
+def distribute_module(module: torch.nn.Module, partition_fn: Optional[callable] = None) -> torch.nn.Module:
+    '''
+    This function converts all the parameters in the module to DTensor(DParam).
+
+    Note: This function is subject to future change as the DParam has not been implemented yet.
+    '''
+    for name, param in module.named_parameters():
+        if param is not None and not isinstance(param, DTensor):
+            # TODO: we could convert the parameter to DParam here,
+            # the type of the parameter could be an optional argument.
+            setattr(module, name, torch.nn.Parameter(partition_fn(name, param.data)))
+    return module
+
+
+def convert_layout_to_sharding_spec(layout: Layout) -> ShardingSpec:
+    '''
+    Convert the layout from Layout class to ShardingSpec class.
+    '''
+    return ShardingSpec(device_mesh=layout.device_mesh,
+                        entire_shape=layout.entire_shape,
+                        dim_partition_dict=layout.sharding_spec.dim_partition_dict)
+
+
+def construct_default_sharding_spec(
+    tensor: torch.Tensor,
+    device_mesh: DeviceMesh,
+) -> ShardingSpec:
+    '''
+    Construct the default sharding specification for the tensor.
+    '''
+    return ShardingSpec(device_mesh=device_mesh, entire_shape=tensor.shape, dim_partition_dict={})
diff --git a/colossalai/tensor/d_tensor/layout.py b/colossalai/tensor/d_tensor/layout.py
new file mode 100644
index 000000000000..9b72444aa3c6
--- /dev/null
+++ b/colossalai/tensor/d_tensor/layout.py
@@ -0,0 +1,22 @@
+from dataclasses import dataclass
+
+import torch
+
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.tensor.sharding_spec import ShardingSpec
+
+
+@dataclass
+class Layout:
+    """Layout of a tensor.
+
+    Attributes:
+        device_mesh: the device mesh to store the tensor distributedly.
+        device_type: the type of the device mesh, e.g. 'cpu' or 'cuda'.
+        sharding_spec: the sharding specification to describe how the tensor is sharded.
+        entire_shape: the entire shape of the global tensor.
+    """
+    device_mesh: DeviceMesh
+    device_type: torch.device
+    sharding_spec: ShardingSpec
+    entire_shape: torch.Size = None
diff --git a/tests/test_tensor/test_dtensor.py b/tests/test_tensor/test_dtensor.py
new file mode 100644
index 000000000000..1de9563a2eff
--- /dev/null
+++ b/tests/test_tensor/test_dtensor.py
@@ -0,0 +1,104 @@
+from functools import partial
+
+import torch
+import torch.multiprocessing as mp
+
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.fx.tracer import ColoTracer
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.tensor.d_tensor.d_tensor import DTensor, distribute_tensor
+from colossalai.tensor.d_tensor.layout import Layout
+from colossalai.tensor.sharding_spec import ShardingSpec
+from colossalai.utils import free_port
+
+
+class TestModel(torch.nn.Module):
+
+    def __init__(self, in_features, out_features):
+        super().__init__()
+        self.linear_1 = torch.nn.Linear(in_features, out_features)
+        self.linear_2 = torch.nn.Linear(out_features, in_features)
+
+    def forward(self, x):
+        x = self.linear_1(x)
+        x = self.linear_2(x)
+        return x
+
+
+def check_dtensor(rank, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    test_model = TestModel(8, 8).to('cuda')
+    original_tensor = torch.rand(4, 8).to('cuda')
+    compare_output = test_model(original_tensor)
+
+    device_mesh = DeviceMesh(torch.Tensor([0, 1, 2, 3]), (2, 2), init_process_group=True)
+    target_sharding_spec = ShardingSpec(device_mesh=device_mesh,
+                                        entire_shape=original_tensor.shape,
+                                        dim_partition_dict={0: [0]})
+    layout = Layout(device_mesh=device_mesh, device_type=torch.device('cuda'), sharding_spec=target_sharding_spec)
+    d_tensor = DTensor(original_tensor, layout)
+
+    assert d_tensor.entire_shape == original_tensor.shape
+    assert d_tensor.data_type == original_tensor.dtype
+
+    if rank in (0, 1):
+        assert d_tensor.to_local().equal(original_tensor.narrow(0, 0, 2))
+    elif rank in (2, 3):
+        assert d_tensor.to_local().equal(original_tensor.narrow(0, 2, 2))
+    else:
+        raise ValueError(f'rank {rank} is not in the device mesh')
+    assert d_tensor.to_global().equal(original_tensor)
+    output = test_model(d_tensor)
+
+    if rank in (0, 1):
+        assert output.equal(compare_output.narrow(0, 0, 2))
+    elif rank in (2, 3):
+        assert output.equal(compare_output.narrow(0, 2, 2))
+    else:
+        raise ValueError(f'rank {rank} is not in the device mesh')
+
+    new_sharding_spec = ShardingSpec(device_mesh=device_mesh,
+                                     entire_shape=original_tensor.shape,
+                                     dim_partition_dict={0: [0, 1]})
+    new_layout = Layout(device_mesh=device_mesh,
+                        device_type=torch.device('cuda'),
+                        sharding_spec=new_sharding_spec,
+                        entire_shape=original_tensor.shape)
+
+    d_tensor.layout_convert(new_layout)
+
+    if rank == 0:
+        assert d_tensor.local_tensor.equal(original_tensor.narrow(0, 0, 1))
+    elif rank == 1:
+        assert d_tensor.local_tensor.equal(original_tensor.narrow(0, 1, 1))
+    elif rank == 2:
+        assert d_tensor.local_tensor.equal(original_tensor.narrow(0, 2, 1))
+    elif rank == 3:
+        assert d_tensor.local_tensor.equal(original_tensor.narrow(0, 3, 1))
+    else:
+        raise ValueError(f'rank {rank} is not in the device mesh')
+
+    dtensor_from_local = distribute_tensor(original_tensor, new_layout)
+
+    if rank == 0:
+        assert dtensor_from_local.local_tensor.equal(original_tensor.narrow(0, 0, 1))
+    elif rank == 1:
+        assert dtensor_from_local.local_tensor.equal(original_tensor.narrow(0, 1, 1))
+    elif rank == 2:
+        assert dtensor_from_local.local_tensor.equal(original_tensor.narrow(0, 2, 1))
+    elif rank == 3:
+        assert dtensor_from_local.local_tensor.equal(original_tensor.narrow(0, 3, 1))
+    else:
+        raise ValueError(f'rank {rank} is not in the device mesh')
+
+
+def test_dtensor():
+    world_size = 4
+    run_func = partial(check_dtensor, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_dtensor()

From 0d07514988296cb38aa8ea2fbf981e5ba02b18c4 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 2 Mar 2023 09:15:21 +0800
Subject: [PATCH 402/503] Automated submodule synchronization (#2951)

Co-authored-by: github-actions <github-actions@github.com>
---
 inference | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference b/inference
index 83c48efaf152..84dfbda3fda2 160000
--- a/inference
+++ b/inference
@@ -1 +1 @@
-Subproject commit 83c48efaf152ecc104b1c311c80df9e5c59a09e0
+Subproject commit 84dfbda3fda29ab74ede2731d82bc4932469ab4d

From b0a8766381910921af60db5fcc03e57e349e1442 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Thu, 2 Mar 2023 11:22:08 +0800
Subject: [PATCH 403/503] [doc] fix chatgpt inference typo (#2964)

---
 applications/ChatGPT/examples/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
index e5522f08770d..0a5e504a020a 100644
--- a/applications/ChatGPT/examples/README.md
+++ b/applications/ChatGPT/examples/README.md
@@ -70,7 +70,7 @@ torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy
 We support naive inference demo after training.
 ```shell
 # inference
-python inference_actor.py --pretrain <your actor model path> --model <your model type>
+python inference.py --pretrain <your actor model path> --model <your model type>
 ```
 
 ## Support Model

From bbf9c827c38c56b6f921d248544ad62ceb133d85 Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Thu, 2 Mar 2023 15:00:05 +0800
Subject: [PATCH 404/503] [ChatGPT] fix README (#2966)

* Update README.md

* fix README

* Update README.md

* Update README.md

---------

Co-authored-by: fastalgo <youyang@cs.berkeley.edu>
Co-authored-by: BlueRum <70618399+ht-zhou@users.noreply.github.com>
---
 applications/ChatGPT/README.md          | 23 ++++++++++++++++++++---
 applications/ChatGPT/examples/README.md | 17 ++++++++++++-----
 applications/ChatGPT/requirements.txt   |  1 +
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
index dbd5eb7709b9..d26206144614 100644
--- a/applications/ChatGPT/README.md
+++ b/applications/ChatGPT/README.md
@@ -1,5 +1,13 @@
 # RLHF - Colossal-AI
 
+## Table of Contents
+
+- [What is RLHF - Colossal-AI?](#intro)
+- [How to Install?](#install)
+- [The Plan](#the-plan)
+- [How can you participate in open source?](#invitation-to-open-source-contribution)
+---
+## Intro
 Implementation of RLHF (Reinforcement Learning with Human Feedback) powered by Colossal-AI. It supports distributed training and offloading, which can fit extremly large models. More details can be found in the [blog](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt).
 
 <p align="center">
@@ -20,7 +28,6 @@ Implementation of RLHF (Reinforcement Learning with Human Feedback) powered by C
 pip install .
 ```
 
-
 ## Usage
 
 The main entrypoint is `Trainer`. We only support PPO trainer now. We support many training strategies:
@@ -128,14 +135,24 @@ To load optimizer checkpoint:
 strategy.load_optimizer(actor_optim, 'actor_optim_checkpoint.pt')
 ```
 
-## Todo
+## The Plan
 
 - [x] implement PPO fine-tuning
 - [x] implement training reward model
 - [x] support LoRA
+- [x] support inference
+- [ ] open source the reward model weight
+- [ ] support llama from [facebook](https://github.com/facebookresearch/llama)
+- [ ] support BoN(best of N sample)
 - [ ] implement PPO-ptx fine-tuning
 - [ ] integrate with Ray
-- [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL)
+- [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL)
+- [ ] support chain of thought by [langchain](https://github.com/hwchase17/langchain)
+
+### Real-time progress
+You can follow our progress on the GitHub project board:
+
+[Open ChatGPT](https://github.com/orgs/hpcaitech/projects/17/views/1)
 
 ## Invitation to open-source contribution
 Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build an ecosystem with Colossal-AI, making efforts towards the era of big AI models from the starting point of replicating ChatGPT!
diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
index 0a5e504a020a..c411c880b038 100644
--- a/applications/ChatGPT/examples/README.md
+++ b/applications/ChatGPT/examples/README.md
@@ -73,14 +73,21 @@ We support naive inference demo after training.
 python inference.py --pretrain <your actor model path> --model <your model type>
 ```
 
+#### data
+- [x] [rm-static](https://huggingface.co/datasets/Dahoas/rm-static)
+- [x] [hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [ ] [openai/summarize_from_feedback](https://huggingface.co/datasets/openai/summarize_from_feedback)
+- [ ] [openai/webgpt_comparisons](https://huggingface.co/datasets/openai/webgpt_comparisons)
+- [ ] [Dahoas/instruct-synthetic-prompt-responses](https://huggingface.co/datasets/Dahoas/instruct-synthetic-prompt-responses)
+
 ## Support Model
 
 ### GPT
-- [ ]  GPT2-S (s)
-- [ ]  GPT2-M (m)
-- [ ]  GPT2-L (l)
+- [x]  GPT2-S (s)
+- [x]  GPT2-M (m)
+- [x]  GPT2-L (l)
 - [ ]  GPT2-XL (xl)
-- [ ]  GPT2-4B (4b)
+- [x]  GPT2-4B (4b)
 - [ ]  GPT2-6B (6b)
 - [ ]  GPT2-8B (8b)
 - [ ]  GPT2-10B (10b)
@@ -99,7 +106,7 @@ python inference.py --pretrain <your actor model path> --model <your model type>
 - [x] [BLOOM-560m](https://huggingface.co/bigscience/bloom-560m)
 - [x] [BLOOM-1b1](https://huggingface.co/bigscience/bloom-1b1)
 - [x] [BLOOM-3b](https://huggingface.co/bigscience/bloom-3b)
-- [x] [BLOOM-7b](https://huggingface.co/bigscience/bloomz-7b1)
+- [x] [BLOOM-7b](https://huggingface.co/bigscience/bloom-7b1)
 - [ ] BLOOM-175b
 
 ### OPT
diff --git a/applications/ChatGPT/requirements.txt b/applications/ChatGPT/requirements.txt
index 87f6a52cc0e2..15a960c2c650 100644
--- a/applications/ChatGPT/requirements.txt
+++ b/applications/ChatGPT/requirements.txt
@@ -4,3 +4,4 @@ datasets
 loralib
 colossalai>=0.2.4
 torch
+langchain

From 82149e9d1b0d9e1e9eeb643af7c7e19fbf503ee4 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Thu, 2 Mar 2023 16:18:33 +0800
Subject: [PATCH 405/503] [chatgpt] fix inference demo loading bug (#2969)

* [chatgpt] fix inference demo loading bug

* polish
---
 applications/ChatGPT/examples/inference.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/applications/ChatGPT/examples/inference.py b/applications/ChatGPT/examples/inference.py
index ba055d81fd15..a2682277d18c 100644
--- a/applications/ChatGPT/examples/inference.py
+++ b/applications/ChatGPT/examples/inference.py
@@ -9,30 +9,34 @@
 def eval(args):
     # configure model
     if args.model == 'gpt2':
-        model = GPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+        actor = GPTActor().to(torch.cuda.current_device())
     elif args.model == 'bloom':
-        model = BLOOMActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+        actor = BLOOMActor().to(torch.cuda.current_device())
     elif args.model == 'opt':
-        model = OPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+        actor = OPTActor().to(torch.cuda.current_device())
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
+    state_dict = torch.load(args.pretrain)
+    actor.model.load_state_dict(state_dict)
+    
+    
     # configure tokenizer
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'bloom':
-        tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
+        tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m')
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'opt':
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+        tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
-    model.eval()
+    actor.eval()
     input = args.input
     input_ids = tokenizer.encode(input, return_tensors='pt').to(torch.cuda.current_device())
-    outputs = model.generate(input_ids,
+    outputs = actor.generate(input_ids,
                              max_length=args.max_length,
                              do_sample=True,
                              top_k=50,
@@ -46,7 +50,7 @@ def eval(args):
     parser = argparse.ArgumentParser()
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
     parser.add_argument('--pretrain', type=str, default=None)
-    parser.add_argument('--input', type=str, default='Q: How are you ? A:')
+    parser.add_argument('--input', type=str, default='Question: How are you ? Answer:')
     parser.add_argument('--max_length', type=int, default=100)
     args = parser.parse_args()
     eval(args)

From c9e27f0d1bee591b218920b5d927e080ee1efb51 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Thu, 2 Mar 2023 17:51:44 +0800
Subject: [PATCH 406/503] [chatgpt]fix lora bug (#2974)

* fix lora bug

* polish
---
 applications/ChatGPT/chatgpt/nn/reward_model.py     | 4 ++--
 applications/ChatGPT/chatgpt/trainer/rm.py          | 4 +++-
 applications/ChatGPT/examples/train_reward_model.py | 6 +++---
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/nn/reward_model.py b/applications/ChatGPT/chatgpt/nn/reward_model.py
index baaa8b768766..5108f61a6186 100644
--- a/applications/ChatGPT/chatgpt/nn/reward_model.py
+++ b/applications/ChatGPT/chatgpt/nn/reward_model.py
@@ -23,7 +23,7 @@ def __init__(self,
                  lora_rank: int = 0,
                  lora_train_bias: str = 'none') -> None:
         super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
-        self.body = model
+        self.model = model
         if value_head is not None:
             if value_head.out_features != 1:
                 raise ValueError("The value head of reward model's output dim should be 1!")
@@ -34,7 +34,7 @@ def __init__(self,
         self.convert_to_lora()
 
     def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        outputs = self.body(sequences, attention_mask=attention_mask)
+        outputs = self.model(sequences, attention_mask=attention_mask)
         last_hidden_states = outputs['last_hidden_state']
         values = self.value_head(last_hidden_states)[:, :-1]
         value = values.mean(dim=1).squeeze(1)    # ensure shape is (B)
diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
index f9000eb7efe5..3286b8d8d927 100644
--- a/applications/ChatGPT/chatgpt/trainer/rm.py
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -44,6 +44,8 @@ def __init__(
         self.eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)
 
         self.model = strategy.setup_model(model)
+        if "DDP" in str(self.strategy):
+            self.model = self.model.module
         self.loss_fn = PairWiseLoss()
         self.optimizer = strategy.setup_optimizer(optim, self.model)
 
@@ -56,7 +58,7 @@ def fit(self, use_lora):
             # train
             if use_lora > 0:
                 print("Using Lora")
-                lora.mark_only_lora_as_trainable(self.model.body)
+                lora.mark_only_lora_as_trainable(self.model.model)
 
             else:
                 self.model.train()
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index bf2071793b47..c17c6f393f41 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -61,8 +61,8 @@ def train(args):
 
     # prepare for data and dataset
     data = load_dataset(args.dataset)
-    train_data = data["train"].select(range(100))
-    eval_data = data['test'].select(range(5))
+    train_data = data["train"]
+    eval_data = data['test']
     train_dataset = RewardDataset(train_data, tokenizer, max_len)
     eval_dataset = RewardDataset(eval_data, tokenizer, max_len)
 
@@ -93,7 +93,7 @@ def train(args):
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--dataset', type=str, default='Dahoas/rm-static')
     parser.add_argument('--save_path', type=str, default='rm_ckpt.pth')
-    parser.add_argument('--max_epochs', type=int, default=10)
+    parser.add_argument('--max_epochs', type=int, default=1)
     parser.add_argument('--batch_size', type=int, default=4)
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
     args = parser.parse_args()

From 9b4ceefc212bcba79ed69bc2889193a650e093f6 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Fri, 3 Mar 2023 10:41:58 +0800
Subject: [PATCH 407/503] [doc] update news (#2983)

* [doc] update news

* [doc] update news
---
 README-zh-Hans.md | 3 ++-
 README.md         | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 74cefa75cd12..0f53491c0442 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -23,8 +23,9 @@
 </div>
 
 ## 新闻
+* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
 * [2023/02] [Open source solution replicates ChatGPT training process! Ready to go with only 1.6GB GPU memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
-* [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
+* [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
 * [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
 * [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
 * [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
diff --git a/README.md b/README.md
index 2d4135dc5087..bdd9fafc969e 100644
--- a/README.md
+++ b/README.md
@@ -24,8 +24,9 @@
 </div>
 
 ## Latest News
+* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
 * [2023/02] [Open source solution replicates ChatGPT training process! Ready to go with only 1.6GB GPU memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
-* [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
+* [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
 * [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
 * [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
 * [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)

From 827a0af8ccdfd0165fba4ff7665567793aaf2b91 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 3 Mar 2023 10:55:45 +0800
Subject: [PATCH 408/503] Automated submodule synchronization (#2982)

Co-authored-by: github-actions <github-actions@github.com>
---
 examples/tutorial/fastfold/FastFold | 2 +-
 inference                           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/tutorial/fastfold/FastFold b/examples/tutorial/fastfold/FastFold
index 03ff54e56157..c9309ecf2437 160000
--- a/examples/tutorial/fastfold/FastFold
+++ b/examples/tutorial/fastfold/FastFold
@@ -1 +1 @@
-Subproject commit 03ff54e561576d38118bf4cba8e73ef728f099e3
+Subproject commit c9309ecf2437ffd7f308ace7ea31042a380fb82c
diff --git a/inference b/inference
index 84dfbda3fda2..a2c9905d94f9 160000
--- a/inference
+++ b/inference
@@ -1 +1 @@
-Subproject commit 84dfbda3fda29ab74ede2731d82bc4932469ab4d
+Subproject commit a2c9905d94f926a3135acd7228ae6b4e7474217c

From 19ad49fb3b847ce0992c68f57ad9940c2f2b2c44 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 3 Mar 2023 15:51:19 +0800
Subject: [PATCH 409/503] [chatgpt] making experience support dp (#2971)

* [chatgpt] making experience support dp

* [chatgpt] update example test ci

* [chatgpt] update example test ci

* [chatgpt] update example test ci

* [chatgpt] update example test ci

* [chatgpt] update sampler

* [chatgpt] update example test ci

* [chatgpt] refactor sampler

* [chatgpt] update example test ci
---
 applications/ChatGPT/chatgpt/trainer/base.py  |  6 ++--
 .../chatgpt/trainer/strategies/base.py        |  8 ++++-
 .../ChatGPT/chatgpt/trainer/strategies/ddp.py | 32 ++++++++++++-------
 .../chatgpt/trainer/strategies/sampler.py     | 32 +++++++++++++++++++
 applications/ChatGPT/examples/test_ci.sh      |  6 ++--
 applications/ChatGPT/examples/train_dummy.py  |  4 ++-
 .../ChatGPT/examples/train_prompts.py         |  4 ++-
 applications/ChatGPT/tests/test_data.py       |  1 +
 8 files changed, 71 insertions(+), 22 deletions(-)
 create mode 100644 applications/ChatGPT/chatgpt/trainer/strategies/sampler.py

diff --git a/applications/ChatGPT/chatgpt/trainer/base.py b/applications/ChatGPT/chatgpt/trainer/base.py
index 42547af78cfb..a2419a35b6cd 100644
--- a/applications/ChatGPT/chatgpt/trainer/base.py
+++ b/applications/ChatGPT/chatgpt/trainer/base.py
@@ -1,4 +1,3 @@
-import random
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, List, Optional, Union
 
@@ -68,7 +67,7 @@ def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experien
 
     def _sample_prompts(self, prompts) -> list:
         indices = list(range(len(prompts)))
-        sampled_indices = random.sample(indices, self.experience_batch_size)
+        sampled_indices = self.strategy.experience_sampler.choice(indices, self.experience_batch_size, replace=False)
         return [prompts[i] for i in sampled_indices]
 
     def _learn(self):
@@ -98,6 +97,7 @@ def _learn(self):
 
     def fit(self, prompts, num_episodes: int = 50000, max_timesteps: int = 500, update_timesteps: int = 5000) -> None:
         time = 0
+        sampler = self.strategy.setup_sampler(prompts)
         self._on_fit_start()
         for episode in range(num_episodes):
             self._on_episode_start(episode)
@@ -105,7 +105,7 @@ def fit(self, prompts, num_episodes: int = 50000, max_timesteps: int = 500, upda
                                  desc=f'Episode [{episode+1}/{num_episodes}]',
                                  disable=not is_rank_0()):
                 time += 1
-                rand_prompts = self._sample_prompts(prompts)
+                rand_prompts = sampler.sample(self.experience_batch_size)
                 if self.tokenizer is not None:
                     inputs = self.tokenizer(rand_prompts)
                 else:
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/base.py b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
index 2c6aefcd969f..2a96078e98c1 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/base.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
@@ -2,13 +2,16 @@
 from contextlib import nullcontext
 from typing import Any, List, Tuple, Union
 
+import numpy as np
 import torch
 import torch.nn as nn
-from chatgpt.nn import Actor, Critic, RewardModel
+from chatgpt.nn import Actor
 from chatgpt.replay_buffer import ReplayBuffer
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 
+from .sampler import DistributedSampler
+
 ModelOptimPair = Tuple[nn.Module, Optimizer]
 ModelOrModelOptimPair = Union[nn.Module, ModelOptimPair]
 
@@ -123,3 +126,6 @@ def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = Fal
     @abstractmethod
     def load_optimizer(self, optimizer: Optimizer, path: str, map_location: Any = None) -> None:
         pass
+
+    def setup_sampler(self, dataset) -> DistributedSampler:
+        return DistributedSampler(dataset, 1, 0)
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
index 7ceb3a3ca2ba..66e99dd3977c 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
@@ -9,10 +9,11 @@
 from chatgpt.replay_buffer import ReplayBuffer
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
-from torch.utils.data import DataLoader, DistributedSampler
+from torch.utils.data import DataLoader
 
 from .base import Strategy
 from .naive import NaiveStrategy
+from .sampler import DistributedSampler
 
 
 class DDPStrategy(NaiveStrategy):
@@ -49,17 +50,21 @@ def setup_model(self, model: nn.Module) -> nn.Module:
         return DDP(model, device_ids=[device])
 
     def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
-        sampler = DistributedSampler(replay_buffer,
-                                     num_replicas=dist.get_world_size(),
-                                     rank=dist.get_rank(),
-                                     shuffle=True,
-                                     seed=self.seed,
-                                     drop_last=True)
-        return DataLoader(replay_buffer,
-                          batch_size=replay_buffer.sample_batch_size,
-                          sampler=sampler,
-                          pin_memory=pin_memory,
-                          collate_fn=replay_buffer.collate_fn)
+        # DDP only mode, replay buffers on each rank are different.
+        # sampler = DistributedSampler(replay_buffer,
+        #                              num_replicas=dist.get_world_size(),
+        #                              rank=dist.get_rank(),
+        #                              shuffle=True,
+        #                              seed=self.seed,
+        #                              drop_last=True)
+        return DataLoader(
+            replay_buffer,
+            batch_size=replay_buffer.sample_batch_size,
+        #   sampler=sampler,
+            shuffle=True,
+            drop_last=True,
+            pin_memory=pin_memory,
+            collate_fn=replay_buffer.collate_fn)
 
     @staticmethod
     def _unwrap_actor(actor: Actor) -> nn.Module:
@@ -75,3 +80,6 @@ def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = Fal
         if only_rank0 and dist.get_rank() != 0:
             return
         super().save_optimizer(optimizer, path, only_rank0)
+
+    def setup_sampler(self, dataset) -> DistributedSampler:
+        return DistributedSampler(dataset, dist.get_world_size(), dist.get_rank())
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/sampler.py b/applications/ChatGPT/chatgpt/trainer/strategies/sampler.py
new file mode 100644
index 000000000000..d726fa640fa2
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/sampler.py
@@ -0,0 +1,32 @@
+import math
+
+import numpy as np
+
+
+class DistributedSampler:
+
+    def __init__(self, dataset, num_replicas: int, rank: int) -> None:
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.rank = rank
+
+        if len(self.dataset) % self.num_replicas != 0:
+            self.num_samples = math.ceil(
+                (len(self.dataset) - self.num_replicas) / self.num_replicas    # type: ignore[arg-type]
+            )
+        else:
+            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
+
+        self.total_size = self.num_samples * self.num_replicas
+
+        indices = list(range(len(self.dataset)))
+        indices = indices[:self.total_size]
+        assert len(indices) == self.total_size
+        # subsample
+        indices = indices[self.rank:self.total_size:self.num_replicas]
+        assert len(indices) == self.num_samples
+        self.indices = indices
+
+    def sample(self, batch_size: int) -> list:
+        sampled_indices = np.random.choice(self.indices, batch_size, replace=False)
+        return [self.dataset[idx] for idx in sampled_indices]
diff --git a/applications/ChatGPT/examples/test_ci.sh b/applications/ChatGPT/examples/test_ci.sh
index c4a5ead1d1d3..8109db2260a0 100755
--- a/applications/ChatGPT/examples/test_ci.sh
+++ b/applications/ChatGPT/examples/test_ci.sh
@@ -15,13 +15,11 @@ export OMP_NUM_THREADS=8
 pip install -r ${BASE}/requirements.txt
 
 # train dummy
-python ${BASE}/train_dummy.py --strategy naive --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --train_batch_size 2
 for strategy in ddp colossalai_gemini colossalai_zero2; do
-    torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py --strategy ${strategy} --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --train_batch_size 2
+    torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py --strategy ${strategy} --num_episodes 2 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --experience_batch_size 4 --train_batch_size 4
 done
 
 # train prompts
-python ${BASE}/train_prompts.py $PROMPT_PATH --strategy naive --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3
 for strategy in ddp colossalai_gemini colossalai_zero2; do
-    torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH --strategy ${strategy} --num_episodes 3 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --train_batch_size 2
+    torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH --strategy ${strategy} --num_episodes 2 --max_timesteps 3 --update_timesteps 3 --max_epochs 3
 done
diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index a27d77a50fdf..35f6474910d3 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -25,7 +25,7 @@ def main(args):
     elif args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
     elif args.strategy == 'colossalai_zero2':
         strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
     else:
@@ -82,6 +82,7 @@ def main(args):
         critic_optim,
         max_epochs=args.max_epochs,
         train_batch_size=args.train_batch_size,
+        experience_batch_size=args.experience_batch_size,
         tokenizer=preprocess_batch,
         max_length=128,
         do_sample=True,
@@ -117,6 +118,7 @@ def main(args):
     parser.add_argument('--update_timesteps', type=int, default=10)
     parser.add_argument('--max_epochs', type=int, default=5)
     parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
     args = parser.parse_args()
     main(args)
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index 53aa150a06fd..db4c7d475aa7 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -20,7 +20,7 @@ def main(args):
     elif args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
     elif args.strategy == 'colossalai_zero2':
         strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
     else:
@@ -83,6 +83,7 @@ def tokenize_fn(texts):
         critic_optim,
         max_epochs=args.max_epochs,
         train_batch_size=args.train_batch_size,
+        experience_batch_size=args.experience_batch_size,
         tokenizer=tokenize_fn,
         max_length=128,
         do_sample=True,
@@ -117,6 +118,7 @@ def tokenize_fn(texts):
     parser.add_argument('--update_timesteps', type=int, default=10)
     parser.add_argument('--max_epochs', type=int, default=5)
     parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
     args = parser.parse_args()
     main(args)
diff --git a/applications/ChatGPT/tests/test_data.py b/applications/ChatGPT/tests/test_data.py
index 9571c2843e07..b5a84c4d0ef2 100644
--- a/applications/ChatGPT/tests/test_data.py
+++ b/applications/ChatGPT/tests/test_data.py
@@ -107,6 +107,7 @@ def run_dist(rank, world_size, port, strategy):
     run_test_data(strategy)
 
 
+@pytest.mark.skip
 @pytest.mark.dist
 @pytest.mark.parametrize('world_size', [2])
 @pytest.mark.parametrize('strategy', ['ddp', 'colossalai'])

From f5ca0397dd1c0d725c9b8d0c63784c55666245a7 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Fri, 3 Mar 2023 15:58:16 +0800
Subject: [PATCH 410/503] [chatgpt] fix lora gemini conflict in RM training
 (#2984)

* fix lora bug

* polish

* fix lora gemini
---
 applications/ChatGPT/chatgpt/nn/reward_model.py     | 4 ++--
 applications/ChatGPT/chatgpt/trainer/rm.py          | 7 +------
 applications/ChatGPT/examples/train_reward_model.py | 2 --
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/nn/reward_model.py b/applications/ChatGPT/chatgpt/nn/reward_model.py
index 5108f61a6186..27cd1ccaee93 100644
--- a/applications/ChatGPT/chatgpt/nn/reward_model.py
+++ b/applications/ChatGPT/chatgpt/nn/reward_model.py
@@ -24,14 +24,14 @@ def __init__(self,
                  lora_train_bias: str = 'none') -> None:
         super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
         self.model = model
+        self.convert_to_lora()
+
         if value_head is not None:
             if value_head.out_features != 1:
                 raise ValueError("The value head of reward model's output dim should be 1!")
             self.value_head = value_head
-
         else:
             self.value_head = nn.Linear(model.config.n_embd, 1)
-        self.convert_to_lora()
 
     def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
         outputs = self.model(sequences, attention_mask=attention_mask)
diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
index 3286b8d8d927..d44944aeeb35 100644
--- a/applications/ChatGPT/chatgpt/trainer/rm.py
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -56,12 +56,7 @@ def fit(self, use_lora):
                             desc='Train step of epoch %d' % epoch,
                             disable=not is_rank_0())
             # train
-            if use_lora > 0:
-                print("Using Lora")
-                lora.mark_only_lora_as_trainable(self.model.model)
-
-            else:
-                self.model.train()
+            self.model.train()
             for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
                 chosen_ids = chosen_ids.squeeze(1).cuda()
                 c_mask = c_mask.squeeze(1).cuda()
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index c17c6f393f41..44acba192245 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -66,8 +66,6 @@ def train(args):
     train_dataset = RewardDataset(train_data, tokenizer, max_len)
     eval_dataset = RewardDataset(eval_data, tokenizer, max_len)
 
-    # batch_size here is expected to be C(k,2), k means # response of each prompt
-    # be limited with the format of dataset 'Dahoas/rm-static', we'd better use batch_size as 1
     trainer = RewardModelTrainer(model=model,
                                  strategy=strategy,
                                  optim=optim,

From 0ff8406b009d15298e5b1d6ca884742630f6d91c Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 3 Mar 2023 16:27:59 +0800
Subject: [PATCH 411/503] [chatgpt] allow shard init and display warning
 (#2986)

---
 .../ChatGPT/chatgpt/trainer/strategies/colossalai.py         | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
index bf4ecdfdf336..b6ed1d451b78 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
@@ -79,8 +79,9 @@ def __init__(
         self.stage = stage
         # TODO(ver217): support shard_init when using from_pretrained()
         if shard_init:
-            warnings.warn(f'Shard init is not supported yet. Ignore.')
-            shard_init = False
+            warnings.warn(
+                f'Shard init is not supported model.from_pretrained() yet. Please load weights after strategy.prepare()'
+            )
         self.shard_init = shard_init
         self.gemini_config = dict(device=get_current_device(),
                                   placement_policy=placement_policy,

From 3a5d93bc2cbbec2e8e45c0ce2bd913857b2eb6db Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 3 Mar 2023 21:45:05 +0800
Subject: [PATCH 412/503] [kernel] cached the op kernel and fixed version check
 (#2886)

* [kernel] cached the op kernel and fixed version check

* polish code

* polish code
---
 op_builder/README.md  |   7 +-
 op_builder/builder.py |  83 +++++++++++------
 op_builder/utils.py   | 211 +++++++++++++++++++++++++++++++++++++++---
 setup.py              | 167 ++++++++++++++-------------------
 4 files changed, 328 insertions(+), 140 deletions(-)

diff --git a/op_builder/README.md b/op_builder/README.md
index 057da1038555..b7ac6107300c 100644
--- a/op_builder/README.md
+++ b/op_builder/README.md
@@ -15,17 +15,18 @@ Method 2 is good because it allows the user to only build the kernel they actual
 
 ## PyTorch Extensions in Colossal-AI
 
-As mentioned in the section above, our aim is to make these two methods coherently supported in Colossal-AI, meaning that for a kernel should be either built in `setup.py` or during runtime.
-There are mainly two functions used to build extensions.
+The project DeepSpeed (https://github.com/microsoft/DeepSpeed) has proposed a [solution](https://github.com/microsoft/DeepSpeed/tree/master/op_builder) to support kernel-build during either installation or runtime.
+We have adapted DeepSpeed's solution to build extensions. The extension build requires two main functions from PyTorch:
 
 1. `torch.utils.cpp_extension.CUDAExtension`: used to build extensions in `setup.py` during `pip install`.
 2. `torch.utils.cpp_extension.load`: used to build and load extension during runtime
 
 Please note that the extension build by `CUDAExtension` cannot be loaded by the `load` function and `load` will run its own build again (correct me if I am wrong).
 
-We have implemented the following conventions:
+Based on DeepSpeed's work, we have made several modifications and improvements:
 
 1. All pre-built kernels (those installed with `setup.py`) will be found in `colossalai._C`
 2. All runtime-built kernels will be found in the default torch extension path, i.e. ~/.cache/colossalai/torch_extensions. (If we put the built kernels in the installed site-package directory, this will make pip uninstall incomplete)
+3. Once a kernel is loaded, we will cache it in the builder to avoid repeated kernel loading.
 
 When loading the built kernel, we will first check if the pre-built one exists. If not, the runtime build will be triggered.
diff --git a/op_builder/builder.py b/op_builder/builder.py
index e2fdde3affa8..140a10c091b4 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -5,22 +5,7 @@
 from pathlib import Path
 from typing import List
 
-
-def print_rank_0(message):
-    """
-    Print on only one process to avoid spamming.
-    """
-    try:
-        import torch.distributed as dist
-        if not dist.is_initialized():
-            is_main_rank = True
-        else:
-            is_main_rank = dist.get_rank() == 0
-    except ImportError:
-        is_main_rank = True
-
-    if is_main_rank:
-        print(message)
+from .utils import check_cuda_availability, check_system_pytorch_cuda_match, print_rank_0
 
 
 class Builder(ABC):
@@ -37,6 +22,9 @@ def __init__(self, name: str, prebuilt_import_path: str):
         self.prebuilt_import_path = prebuilt_import_path
         self.version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
 
+        # we store the op as an attribute to avoid repeated building and loading
+        self.cached_op_module = None
+
         assert prebuilt_import_path.startswith('colossalai._C'), \
             f'The prebuilt_import_path should start with colossalai._C, but got {self.prebuilt_import_path}'
 
@@ -117,6 +105,35 @@ def import_op(self):
         """
         return importlib.import_module(self.prebuilt_import_path)
 
+    def check_runtime_build_environment(self):
+        """
+        Check whether the system environment is ready for extension compilation.
+        An exception describing the first unmet requirement is raised if it is not.
+        """
+        try:
+            import torch
+            from torch.utils.cpp_extension import CUDA_HOME
+            TORCH_AVAILABLE = True
+        except ImportError:
+            TORCH_AVAILABLE = False
+            CUDA_HOME = None
+
+        if not TORCH_AVAILABLE:
+            raise ModuleNotFoundError(
+                "PyTorch is not found. You need to install PyTorch first in order to build CUDA extensions")
+
+        if CUDA_HOME is None:
+            raise RuntimeError(
+                "CUDA_HOME is not found. You need to export CUDA_HOME environment variable or install CUDA Toolkit first in order to build CUDA extensions"
+            )
+
+        # make sure CUDA is available for compilation during runtime
+        if not check_cuda_availability():
+            raise RuntimeError("CUDA is not available on your system as torch.cuda.is_available() returns False.")
+
+        # make sure system CUDA and pytorch CUDA match, an error will be raised inside the function if not
+        check_system_pytorch_cuda_match(CUDA_HOME)
+
     def load(self, verbose=True):
         """
         load the kernel during runtime. If the kernel is not built during pip install, it will build the kernel.
@@ -128,16 +145,27 @@ def load(self, verbose=True):
         Args:
             verbose (bool, optional): show detailed info. Defaults to True.
         """
-        from torch.utils.cpp_extension import load
-        start_build = time.time()
+        # if the kernel has be compiled and cached, we directly use it
+        if self.cached_op_module is not None:
+            return self.cached_op_module
 
         try:
+            # if the kernel has been pre-built during installation
+            # we just directly import it
             op_module = self.import_op()
             if verbose:
-                print_rank_0(f"OP {self.prebuilt_import_path} already exists, skip building.")
+                print_rank_0(
+                    f"[extension] OP {self.prebuilt_import_path} has been compileed ahead of time, skip building.")
         except ImportError:
+            # check environment
+            self.check_runtime_build_environment()
+
+            # time the kernel compilation
+            start_build = time.time()
+
             # construct the build directory
             import torch
+            from torch.utils.cpp_extension import load
             torch_version_major = torch.__version__.split('.')[0]
             torch_version_minor = torch.__version__.split('.')[1]
             torch_cuda_version = torch.version.cuda
@@ -147,11 +175,7 @@ def load(self, verbose=True):
             Path(build_directory).mkdir(parents=True, exist_ok=True)
 
             if verbose:
-                print_rank_0(
-                    "=========================================================================================")
-                print_rank_0(f"No pre-built kernel is found, build and load the {self.name} kernel during runtime now")
-                print_rank_0(
-                    "=========================================================================================")
+                print_rank_0(f"[extension] Compiling or loading the JIT-built {self.name} kernel during runtime now")
 
             # load the kernel
             op_module = load(name=self.name,
@@ -163,9 +187,14 @@ def load(self, verbose=True):
                              build_directory=build_directory,
                              verbose=verbose)
 
-        build_duration = time.time() - start_build
-        if verbose:
-            print_rank_0(f"Time to load {self.name} op: {build_duration} seconds")
+            build_duration = time.time() - start_build
+
+            # log jit compilation time
+            if verbose:
+                print_rank_0(f"[extension] Time to compile or load {self.name} op: {build_duration} seconds")
+
+        # cache the built/loaded kernel
+        self.cached_op_module = op_module
 
         return op_module
 
diff --git a/op_builder/utils.py b/op_builder/utils.py
index b6bada99efe5..153590428cbe 100644
--- a/op_builder/utils.py
+++ b/op_builder/utils.py
@@ -1,29 +1,203 @@
+import os
 import re
 import subprocess
+import warnings
 from typing import List
 
 
-def get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
+def print_rank_0(message: str) -> None:
+    """
+    Print on only one process to avoid spamming.
+    """
+    try:
+        import torch.distributed as dist
+        if not dist.is_initialized():
+            is_main_rank = True
+        else:
+            is_main_rank = dist.get_rank() == 0
+    except ImportError:
+        is_main_rank = True
+
+    if is_main_rank:
+        print(message)
+
+
+def get_cuda_version_in_pytorch() -> List[int]:
+    """
+    This function returns the CUDA version in the PyTorch build.
+
+    Returns:
+        The CUDA version PyTorch was built with, as a tuple of strings (major, minor).
+    """
+    import torch
+
+    try:
+        # torch.version.cuda is None for CPU-only builds, which raises AttributeError here
+        torch_cuda_major, torch_cuda_minor = torch.version.cuda.split(".")[:2]
+    except (AttributeError, ValueError) as e:
+        raise ValueError(
+            "[extension] Cannot retrive the CUDA version in the PyTorch binary given by torch.version.cuda") from e
+    return torch_cuda_major, torch_cuda_minor
+
+
+def get_cuda_bare_metal_version(cuda_dir) -> List[int]:
+    """
+    Get the system CUDA version by parsing the output of ``nvcc -V``.
+
+    Args:
+        cuda_dir (str): the directory for CUDA Toolkit.
+
+    Returns:
+        The system CUDA version as a tuple of strings (major, minor).
+    """
+    # validate the argument first: os.path.join(None, ...) would raise a TypeError
+    if cuda_dir is None:
+        raise ValueError(
+            f"[extension] The argument cuda_dir is None, but expected to be a string. Please make sure your have exported the environment variable CUDA_HOME correctly."
+        )
+
+    # check for nvcc path
+    nvcc_path = os.path.join(cuda_dir, 'bin/nvcc')
+    if not os.path.exists(nvcc_path):
+        raise FileNotFoundError(
+            f"[extension] The nvcc compiler is not found in {nvcc_path}, please make sure you have set the correct value for CUDA_HOME."
+        )
+
+    # parse the nvcc -V output to obtain the system cuda version
+    raw_output = None    # pre-bound so that the error message below never hits a NameError
+    try:
+        raw_output = subprocess.check_output([nvcc_path, "-V"], universal_newlines=True)
+        output = raw_output.split()
+        release_idx = output.index("release") + 1
+        release = output[release_idx].split(".")
+        bare_metal_major = release[0]
+        bare_metal_minor = release[1][0]
+    except Exception as e:
+        raise ValueError(
+            f"[extension] Failed to parse the nvcc output to obtain the system CUDA bare metal version. The output for 'nvcc -v' is \n{raw_output}"
+        ) from e
+    return bare_metal_major, bare_metal_minor
+
+
+def check_system_pytorch_cuda_match(cuda_dir):
+    bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
+    torch_cuda_major, torch_cuda_minor = get_cuda_version_in_pytorch()
+
+    if bare_metal_major != torch_cuda_major:
+        raise Exception(
+            f'[extension] Failed to build PyTorch extension because the detected CUDA version ({bare_metal_major}.{bare_metal_minor}) '
+            f'mismatches the version that was used to compile PyTorch ({torch_cuda_major}.{torch_cuda_minor}).'
+            'Please make sure you have set the CUDA_HOME correctly and installed the correct PyTorch in https://pytorch.org/get-started/locally/ .'
+        )
+
+    # a major mismatch is fatal (above); a minor mismatch is tolerated but the user is warned
+    if bare_metal_minor != torch_cuda_minor:
+        warnings.warn(
+            f"[extension] The CUDA version on the system ({bare_metal_major}.{bare_metal_minor}) does not match with the version ({torch_cuda_major}.{torch_cuda_minor}) torch was compiled with. "
+            "The mismatch is found in the minor version. As the APIs are compatible, we will allow compilation to proceed. "
+            "If you encounter any issue when using the built kernel, please try to build it again with fully matched CUDA versions"
+        )
+    return True
+
+
+def get_pytorch_version() -> List[int]:
+    """
+    Find the version of the installed PyTorch.
+
+    Returns:
+        A tuple of integers in the form of (major, minor, patch).
+    """
+    import torch
+
+    # drop the local build tag ("+cu117") and keep only the leading digits of each
+    # component so that pre-release versions such as "2.0.0a0" can be parsed as well
+    parts = torch.__version__.split('+')[0].split('.')[:3]
+    return tuple(int(re.match(r'\d+', part).group()) for part in parts)
 
-    return raw_output, bare_metal_major, bare_metal_minor
 
-def get_cuda_cc_flag() -> List:
-    """get_cuda_cc_flag
+def check_pytorch_version(min_major_version, min_minor_version) -> bool:
+    """
+    Compare the current PyTorch version with the minimum required version.
+
+    Args:
+        min_major_version (int): the minimum major version of PyTorch required
+        min_minor_version (int): the minimum minor version of PyTorch required
+
+    Returns:
+        True if the current PyTorch version is acceptable; a RuntimeError is raised otherwise.
+    """
+    torch_major, torch_minor, _ = get_pytorch_version()
+
+    # tuple comparison rejects any installed version older than the required (major, minor)
+    if (torch_major, torch_minor) < (min_major_version, min_minor_version):
+        raise RuntimeError(
+            f"[extension] Colossal-AI requires Pytorch {min_major_version}.{min_minor_version} or newer.\n"
+            "The latest stable release can be obtained from https://pytorch.org/get-started/locally/")
+    return True
+
+
+def check_cuda_availability():
+    """
+    Check if CUDA is available on the system.
+
+    Returns:
+        A boolean value. True if CUDA is available and False otherwise.
+    """
+    import torch
+    return torch.cuda.is_available()
+
+
+def set_cuda_arch_list(cuda_dir):
+    """
+    This function sets the PyTorch TORCH_CUDA_ARCH_LIST variable for ahead-of-time extension compilation.
+    Ahead-of-time compilation occurs when CUDA_EXT=1 is set when running 'pip install'.
+    """
+    cuda_available = check_cuda_availability()
 
-    cc flag for your GPU arch
+    # we only need to set this when CUDA is not available for cross-compilation
+    if not cuda_available:
+        warnings.warn(
+            '\n[extension]  PyTorch did not find available GPUs on this system.\n',
+            'If your intention is to cross-compile, this is not an error.\n'
+            'By default, Colossal-AI will cross-compile for \n'
+            '1. Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
+            '2. Volta (compute capability 7.0)\n'
+            '3. Turing (compute capability 7.5),\n'
+            '4. Ampere (compute capability 8.0, 8.6)if the CUDA version is >= 11.0\n'
+            '\nIf you wish to cross-compile for a single specific architecture,\n'
+            'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
+
+        if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
+            bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
+
+            arch_list = ['6.0', '6.1', '6.2', '7.0', '7.5']
+
+            # Ampere: 8.0 needs CUDA >= 11.0 and 8.6 needs CUDA >= 11.1; newer majors (12+) qualify too
+            cuda_version = (int(bare_metal_major), int(bare_metal_minor))
+            if cuda_version >= (11, 0):
+                arch_list.append('8.0')
+            if cuda_version >= (11, 1):
+                arch_list.append('8.6')
+
+            arch_list_str = ';'.join(arch_list)
+            os.environ["TORCH_CUDA_ARCH_LIST"] = arch_list_str
+        return False
+    return True
+
+
+def get_cuda_cc_flag() -> List[str]:
+    """
+    This function produces the cc flags for your GPU arch
+
+    Returns:
+        The CUDA cc flags for compilation.
     """
 
     # only import torch when needed
     # this is to avoid importing torch when building on a machine without torch pre-installed
     # one case is to build wheel for pypi release
     import torch
-    
+
     cc_flag = []
     for arch in torch.cuda.get_arch_list():
         res = re.search(r'sm_(\d+)', arch)
@@ -31,12 +205,19 @@ def get_cuda_cc_flag() -> List:
             arch_cap = res[1]
             if int(arch_cap) >= 60:
                 cc_flag.extend(['-gencode', f'arch=compute_{arch_cap},code={arch}'])
-
     return cc_flag
 
-def append_nvcc_threads(nvcc_extra_args):
+
+def append_nvcc_threads(nvcc_extra_args: List[str]) -> List[str]:
+    """
+    This function appends the threads flag to your nvcc args.
+
+    Returns:
+        The nvcc compilation flags including the threads flag.
+    """
     from torch.utils.cpp_extension import CUDA_HOME
-    _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
+
+    bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
     if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2:
         return nvcc_extra_args + ["--threads", "4"]
     return nvcc_extra_args
diff --git a/setup.py b/setup.py
index 7cfbbe9b19a4..6c24cb504251 100644
--- a/setup.py
+++ b/setup.py
@@ -1,115 +1,87 @@
 import os
-import re
 from datetime import datetime
+from typing import List
 
 from setuptools import find_packages, setup
 
-from op_builder.utils import get_cuda_bare_metal_version
+from op_builder.utils import (
+    check_cuda_availability,
+    check_pytorch_version,
+    check_system_pytorch_cuda_match,
+    get_cuda_bare_metal_version,
+    get_pytorch_version,
+    set_cuda_arch_list,
+)
 
 try:
     import torch
-    from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension
-    print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
-    TORCH_MAJOR = int(torch.__version__.split('.')[0])
-    TORCH_MINOR = int(torch.__version__.split('.')[1])
-
-    if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 10):
-        raise RuntimeError("Colossal-AI requires Pytorch 1.10 or newer.\n"
-                           "The latest stable release can be obtained from https://pytorch.org/")
+    from torch.utils.cpp_extension import CUDA_HOME, BuildExtension
     TORCH_AVAILABLE = True
 except ImportError:
     TORCH_AVAILABLE = False
     CUDA_HOME = None
 
-# ninja build does not work unless include_dirs are abs path
-this_dir = os.path.dirname(os.path.abspath(__file__))
-build_cuda_ext = False
+# Some constants for installation checks
+MIN_PYTORCH_VERSION_MAJOR = 1
+MIN_PYTORCH_VERSION_MINOR = 10
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+BUILD_CUDA_EXT = int(os.environ.get('CUDA_EXT', '0')) == 1
+IS_NIGHTLY = int(os.environ.get('NIGHTLY', '0')) == 1
+
+# a variable to store the op builder
 ext_modules = []
-is_nightly = int(os.environ.get('NIGHTLY', '0')) == 1
 
-if int(os.environ.get('CUDA_EXT', '0')) == 1:
+
+# check for CUDA extension dependencies
+def environment_check_for_cuda_extension_build():
     if not TORCH_AVAILABLE:
         raise ModuleNotFoundError(
-            "PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions"
+            "[extension] PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions"
         )
 
     if not CUDA_HOME:
         raise RuntimeError(
-            "CUDA_HOME is not found while CUDA_EXT=1. You need to export CUDA_HOME environment vairable or install CUDA Toolkit first in order to build CUDA extensions"
+            "[extension] CUDA_HOME is not found while CUDA_EXT=1. You need to export CUDA_HOME environment vairable or install CUDA Toolkit first in order to build CUDA extensions"
         )
 
-    build_cuda_ext = True
-
-
-def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
-    raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
-    torch_binary_major = torch.version.cuda.split(".")[0]
-    torch_binary_minor = torch.version.cuda.split(".")[1]
-
-    print("\nCompiling cuda extensions with")
-    print(raw_output + "from " + cuda_dir + "/bin\n")
-
-    if bare_metal_major != torch_binary_major:
-        print(f'The detected CUDA version ({raw_output}) mismatches the version that was used to compile PyTorch '
-              f'({torch.version.cuda}). CUDA extension will not be installed.')
-        return False
-
-    if bare_metal_minor != torch_binary_minor:
-        print("\nWarning: Cuda extensions are being compiled with a version of Cuda that does "
-              "not match the version used to compile Pytorch binaries.  "
-              f"Pytorch binaries were compiled with Cuda {torch.version.cuda}.\n"
-              "In some cases, a minor-version mismatch will not cause later errors:  "
-              "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. ")
-    return True
-
-
-def check_cuda_availability(cuda_dir):
-    if not torch.cuda.is_available():
-        # https://github.com/NVIDIA/apex/issues/486
-        # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query
-        # torch.cuda.get_device_capability(), which will fail if you are compiling in an environment
-        # without visible GPUs (e.g. during an nvidia-docker build command).
-        print(
-            '\nWarning: Torch did not find available GPUs on this system.\n',
-            'If your intention is to cross-compile, this is not an error.\n'
-            'By default, Colossal-AI will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
-            'Volta (compute capability 7.0), Turing (compute capability 7.5),\n'
-            'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
-            'If you wish to cross-compile for a single specific architecture,\n'
-            'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
-        if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
-            _, bare_metal_major, _ = get_cuda_bare_metal_version(cuda_dir)
-            if int(bare_metal_major) == 11:
-                os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
-            else:
-                os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
-        return False
-
-    if cuda_dir is None:
-        print("nvcc was not found. CUDA extension will not be installed. If you're installing within a container from "
-              "https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
-        return False
-    return True
-
-
-def append_nvcc_threads(nvcc_extra_args):
-    _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
-    if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2:
-        return nvcc_extra_args + ["--threads", "4"]
-    return nvcc_extra_args
-
-
-def fetch_requirements(path):
+    check_system_pytorch_cuda_match(CUDA_HOME)
+    check_pytorch_version(MIN_PYTORCH_VERSION_MAJOR, MIN_PYTORCH_VERSION_MINOR)
+    check_cuda_availability()
+
+
+def fetch_requirements(path) -> List[str]:
+    """
+    This function reads the requirements file.
+
+    Args:
+        path (str): the path to the requirements file.
+
+    Returns:
+        The lines in the requirements file.
+    """
     with open(path, 'r') as fd:
         return [r.strip() for r in fd.readlines()]
 
 
-def fetch_readme():
+def fetch_readme() -> str:
+    """
+    This function reads the README.md file in the current directory.
+
+    Returns:
+        The lines in the README file.
+    """
     with open('README.md', encoding='utf-8') as f:
         return f.read()
 
 
-def get_version():
+def get_version() -> str:
+    """
+    This function reads the version.txt and generates the colossalai/version.py file.
+
+    Returns:
+        The library version stored in version.txt.
+    """
+
     setup_file_path = os.path.abspath(__file__)
     project_path = os.path.dirname(setup_file_path)
     version_txt_path = os.path.join(project_path, 'version.txt')
@@ -121,13 +93,17 @@ def get_version():
     # write version into version.py
     with open(version_py_path, 'w') as f:
         f.write(f"__version__ = '{version}'\n")
-        if build_cuda_ext:
-            torch_version = '.'.join(torch.__version__.split('.')[:2])
-            cuda_version = '.'.join(get_cuda_bare_metal_version(CUDA_HOME)[1:])
+
+        # look for pytorch and cuda version
+        if BUILD_CUDA_EXT:
+            torch_major, torch_minor, _ = get_pytorch_version()
+            torch_version = f'{torch_major}.{torch_minor}'
+            cuda_version = '.'.join(get_cuda_bare_metal_version(CUDA_HOME))
         else:
             torch_version = None
             cuda_version = None
 
+        # write the version into the python file
         if torch_version:
             f.write(f'torch = "{torch_version}"\n')
         else:
@@ -141,25 +117,26 @@ def get_version():
     return version
 
 
-if build_cuda_ext:
-    build_cuda_ext = check_cuda_availability(CUDA_HOME) and check_cuda_torch_binary_vs_bare_metal(CUDA_HOME)
-
-if build_cuda_ext:
-    # Set up macros for forward/backward compatibility hack around
-    # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
-    # and
-    # https://github.com/NVIDIA/apex/issues/456
-    # https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
+if BUILD_CUDA_EXT:
+    environment_check_for_cuda_extension_build()
+    set_cuda_arch_list(CUDA_HOME)
 
     from op_builder import ALL_OPS
+    op_names = []
+
+    # load all builders
     for name, builder_cls in ALL_OPS.items():
-        print(f'===== Building Extension {name} =====')
+        op_names.append(name)
         ext_modules.append(builder_cls().builder())
 
+    # show log
+    op_name_list = ', '.join(op_names)
+    print(f"[extension]  loaded builders for {op_name_list}")
+
 # always put not nightly branch as the if branch
 # otherwise github will treat colossalai-nightly as the project name
 # and it will mess up with the dependency graph insights
-if not is_nightly:
+if not IS_NIGHTLY:
     version = get_version()
     package_name = 'colossalai'
 else:

From 19fa0e57f622cc5177d54f373fe53ea155fb6ee9 Mon Sep 17 00:00:00 2001
From: Yasyf Mohamedali <yasyfm@gmail.com>
Date: Fri, 3 Mar 2023 22:44:06 -0800
Subject: [PATCH 413/503] Remove extraneous comma (#2993)

Prevents `TypeError: category must be a Warning subclass, not 'str'`.
---
 op_builder/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/op_builder/utils.py b/op_builder/utils.py
index 153590428cbe..4029703e4829 100644
--- a/op_builder/utils.py
+++ b/op_builder/utils.py
@@ -157,7 +157,7 @@ def set_cuda_arch_list(cuda_dir):
     # we only need to set this when CUDA is not available for cross-compilation
     if not cuda_available:
         warnings.warn(
-            '\n[extension]  PyTorch did not find available GPUs on this system.\n',
+            '\n[extension]  PyTorch did not find available GPUs on this system.\n'
             'If your intention is to cross-compile, this is not an error.\n'
             'By default, Colossal-AI will cross-compile for \n'
             '1. Pascal (compute capabilities 6.0, 6.1, 6.2),\n'

From e0a1c1321ce6751686e184476d520e173c1d6b8e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Sat, 4 Mar 2023 17:32:22 +0800
Subject: [PATCH 414/503] [doc] added reference to related works (#2994)

* [doc] added reference to related works

* polish code
---
 README-zh-Hans.md                             |  4 ++
 README.md                                     |  4 ++
 REFERENCE.md                                  | 38 +++++++++++++++++++
 .../en/concepts/paradigms_of_parallelism.md   |  1 +
 docs/source/en/features/nvme_offload.md       |  5 +++
 docs/source/en/features/zero_with_chunk.md    |  3 ++
 .../concepts/paradigms_of_parallelism.md      |  1 +
 docs/source/zh-Hans/features/nvme_offload.md  |  4 ++
 .../zh-Hans/features/zero_with_chunk.md       |  4 ++
 9 files changed, 64 insertions(+)
 create mode 100644 REFERENCE.md

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 0f53491c0442..3b331734f3be 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -376,6 +376,10 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash
 
 ## 引用我们
 
+Colossal-AI项目受一些相关的项目启发而成立，一些项目是我们的开发者的科研项目，另一些来自于其他组织的科研工作。我们希望在[参考文献列表](./REFERENCE.md)中列出这些令人称赞的项目，以向开源社区和研究项目致谢。
+
+你可以通过以下格式引用这个项目。
+
 ```
 @article{bian2021colossal,
   title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
diff --git a/README.md b/README.md
index bdd9fafc969e..31af801d1c4f 100644
--- a/README.md
+++ b/README.md
@@ -378,6 +378,10 @@ We leverage the power of [GitHub Actions](https://github.com/features/actions) t
 
 ## Cite Us
 
+This project is inspired by some related projects (some by our team and some by other organizations). We would like to credit these amazing projects as listed in the [Reference List](./REFERENCE.md).
+
+To cite this project, you can use the following BibTeX citation.
+
 ```
 @article{bian2021colossal,
   title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
diff --git a/REFERENCE.md b/REFERENCE.md
new file mode 100644
index 000000000000..2681198191cb
--- /dev/null
+++ b/REFERENCE.md
@@ -0,0 +1,38 @@
+# References
+
+The Colossal-AI project aims to provide a wide array of parallelism techniques for the machine learning community in the big-model era. This project is inspired by quite a few research works: some were conducted by our own developers, and the others are research projects open-sourced by other organizations. We would like to credit these amazing projects below in the IEEE citation format.
+
+## By Our Team
+
+- Q. Xu, S. Li, C. Gong, and Y. You, ‘An Efficient 2D Method for Training Super-Large Deep Learning Models’. arXiv, 2021.
+
+- Z. Bian, Q. Xu, B. Wang, and Y. You, ‘Maximizing Parallelism in Distributed Training for Huge Neural Networks’. arXiv, 2021.
+
+- S. Li, F. Xue, C. Baranwal, Y. Li, and Y. You, ‘Sequence Parallelism: Long Sequence Training from System Perspective’. arXiv, 2021.
+
+- S. Li et al., ‘Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training’. arXiv, 2021.
+
+- B. Wang, Q. Xu, Z. Bian, and Y. You, ‘Tesseract: Parallelize the Tensor Parallelism Efficiently’, in Proceedings of the 51st International Conference on Parallel Processing, 2022.
+
+- J. Fang et al., ‘A Frequency-aware Software Cache for Large Recommendation System Embeddings’. arXiv, 2022.
+
+- J. Fang et al., ‘Parallel Training of Pre-Trained Models via Chunk-Based Dynamic Memory Management’, IEEE Transactions on Parallel and Distributed Systems, vol. 34, no. 1, pp. 304–315, 2023.
+
+- Y. Liu, S. Li, J. Fang, Y. Shao, B. Yao, and Y. You, ‘Colossal-Auto: Unified Automation of Parallelization and Activation Checkpoint for Large-scale Models’. arXiv, 2023.
+
+
+## By Other Organizations
+
+- M. Shoeybi, M. Patwary, R. Puri, P. LeGresley, J. Casper, and B. Catanzaro, ‘Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism’. arXiv, 2019.
+
+- S. Rajbhandari, J. Rasley, O. Ruwase, and Y. He, ‘ZeRO: Memory Optimizations toward Training Trillion Parameter Models’, in Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, 2020.
+
+- J. Rasley, S. Rajbhandari, O. Ruwase, and Y. He, ‘DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters’, in Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, Virtual Event, CA, USA, 2020, pp. 3505–3506.
+
+- D. Narayanan et al., ‘Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM’, in Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, St. Louis, Missouri, 2021.
+
+- J. Ren, S. Rajbhandari, R. Y. Aminabadi, O. Ruwase, S. Yang, M. Zhang, D. Li, and Y. He, ‘ZeRO-Offload: Democratizing Billion-Scale Model Training’, in 2021 USENIX Annual Technical Conference (USENIX ATC 21), 2021. arXiv:2101.06840.
+
+- S. Rajbhandari, O. Ruwase, J. Rasley, S. Smith, and Y. He, ‘ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning’, in Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, St. Louis, Missouri, 2021.
+
+- L. Zheng et al., ‘Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning’, in 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), 2022, pp. 559–578.
diff --git a/docs/source/en/concepts/paradigms_of_parallelism.md b/docs/source/en/concepts/paradigms_of_parallelism.md
index ced7a544a7b0..1a5dab7a76f7 100644
--- a/docs/source/en/concepts/paradigms_of_parallelism.md
+++ b/docs/source/en/concepts/paradigms_of_parallelism.md
@@ -119,5 +119,6 @@ model on a single machine.
 </figure>
 
 Related paper:
+- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md
index fb491b063c03..8c0fd2053f8b 100644
--- a/docs/source/en/features/nvme_offload.md
+++ b/docs/source/en/features/nvme_offload.md
@@ -5,6 +5,11 @@ Author: Hongxin Liu
 **Prerequisite:**
 - [Zero Redundancy Optimizer with chunk-based memory management](../features/zero_with_chunk.md)
 
+**Related Paper**
+
+- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
+- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+
 ## Introduction
 
 If a model has `N` parameters, when using Adam, it has `8N` optimizer states. For billion-scale models, optimizer states take at least 32 GB memory. GPU memory limits the model scale we can train, which is called GPU memory wall. If we offload optimizer states to the disk, we can break through GPU memory wall.
diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md
index 8492631bc0d3..6b0a9585af85 100644
--- a/docs/source/en/features/zero_with_chunk.md
+++ b/docs/source/en/features/zero_with_chunk.md
@@ -1,6 +1,7 @@
 # Zero Redundancy Optimizer with chunk-based memory management
 
 Author: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY)
+
 **Prerequisite:**
 - [Define Your Configuration](../basics/define_your_config.md)
 
@@ -9,9 +10,11 @@ Author: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.c
 - [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt)
 
 **Related Paper**
+
 - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
 - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+- [DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters](https://dl.acm.org/doi/10.1145/3394486.3406703)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
 
 ## Introduction
diff --git a/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md b/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
index 0d6d58fd281c..8f52d28ecdf4 100755
--- a/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
+++ b/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
@@ -87,5 +87,6 @@
 </figure>
 
 相关文章:
+- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
diff --git a/docs/source/zh-Hans/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md
index 0ced6031de63..6f3280fe19d4 100644
--- a/docs/source/zh-Hans/features/nvme_offload.md
+++ b/docs/source/zh-Hans/features/nvme_offload.md
@@ -5,6 +5,10 @@
 **前置教程:**
 - [基于Chunk内存管理的零冗余优化器 (ZeRO)](../features/zero_with_chunk.md)
 
+**相关论文**
+
+- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
+- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
 ## 引言
 
 如果模型具有`N`个参数，在使用 Adam 时，优化器状态具有`8N`个参数。对于十亿规模的模型，优化器状态至少需要 32 GB 内存。 GPU显存限制了我们可以训练的模型规模，这称为GPU显存墙。如果我们将优化器状态 offload 到磁盘，我们可以突破 GPU 内存墙。
diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md
index 13dd1cd20130..72403bf610a4 100644
--- a/docs/source/zh-Hans/features/zero_with_chunk.md
+++ b/docs/source/zh-Hans/features/zero_with_chunk.md
@@ -3,9 +3,11 @@
 作者: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY)
 
 **前置教程:**
+
 - [定义配置文件](../basics/define_your_config.md)
 
 **示例代码**
+
 - [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt)
 
 **相关论文**
@@ -13,8 +15,10 @@
 - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
 - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+- [DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters](https://dl.acm.org/doi/10.1145/3394486.3406703)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
 
+
 ## 引言
 
 零冗余优化器 (ZeRO) 通过对三个模型状态（优化器状态、梯度和参数）进行划分而不是复制他们，消除了数据并行进程中的内存冗余。该方法与传统的数据并行相比，内存效率得到了极大的提高，而计算粒度和通信效率得到了保留。

From 823f3b9cf461e030bbc0236f6dd1d9360cc8f191 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Sat, 4 Mar 2023 20:08:11 +0800
Subject: [PATCH 415/503] [doc] add deepspeed citation and copyright (#2996)

* [doc] add deepspeed citation and copyright

* [doc] add deepspeed citation and copyright

* [doc] add deepspeed citation and copyright
---
 colossalai/engine/_base_engine.py             |  14 +-
 .../profiler_function/arithmetic.py           |   5 +
 .../profiler_module/convolution.py            |   7 +-
 .../profiler_module/normalization.py          |   5 +
 colossalai/gemini/ophooks/utils.py            |   6 +-
 .../csrc/kernels/cublas_wrappers.cu           |   1 +
 .../csrc/kernels/include/cublas_wrappers.h    |   1 +
 .../csrc/kernels/include/feed_forward.h       | 137 ++++++------
 .../csrc/kernels/include/strided_batch_gemm.h | 199 ++++++++---------
 .../cuda_native/csrc/multi_tensor_adam.cu     |   5 +
 .../cuda_native/csrc/multi_tensor_apply.cuh   | 207 +++++++++---------
 .../kernel/cuda_native/csrc/type_shim.h       |   5 +
 colossalai/nn/optimizer/fused_adam.py         |   7 +
 colossalai/nn/optimizer/zero_optimizer.py     |   1 +
 colossalai/utils/model/utils.py               |   7 +-
 .../zero/sharded_model/sharded_model_v2.py    |   1 +
 .../zero/sharded_optim/low_level_optim.py     |   1 +
 .../zero/sharded_optim/sharded_optim_v2.py    |  24 +-
 op_builder/builder.py                         |   4 +
 19 files changed, 342 insertions(+), 295 deletions(-)

diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py
index 146a29669227..59d8e1058652 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 
-from typing import List, Iterable
+from typing import Iterable, List, Optional, Type
+
+from torch import Tensor
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
 
-from colossalai.logging import get_dist_logger
-from torch import Tensor
-from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
-from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
-from typing import Optional, Type
 from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+from colossalai.gemini.ophooks import BaseOpHook, register_ophooks_recursively
 from colossalai.logging import get_dist_logger
 
 
@@ -93,7 +93,7 @@ def __init__(self,
         if self.uses_pipeline:
             self._schedule.pre_processing(self)
 
-        #register hook if any
+        # register hook if any
         if len(self._ophook_list) > 0:
             register_ophooks_recursively(self._model, self._ophook_list)
 
diff --git a/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py b/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
index 2cf50133d3bd..8d1c8a8c6877 100644
--- a/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
+++ b/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
@@ -1,7 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 import operator
 from functools import reduce
 from typing import Any, Optional, Tuple, Union
+
 import torch
+
 from ..registry import meta_profiler_function
 
 
diff --git a/colossalai/fx/profiler/experimental/profiler_module/convolution.py b/colossalai/fx/profiler/experimental/profiler_module/convolution.py
index 3193489fee5e..a4c15b91e611 100644
--- a/colossalai/fx/profiler/experimental/profiler_module/convolution.py
+++ b/colossalai/fx/profiler/experimental/profiler_module/convolution.py
@@ -1,8 +1,13 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
+import math
 import operator
 from functools import reduce
-import math
 from typing import Tuple
+
 import torch
+
 from ..registry import meta_profiler_module
 
 
diff --git a/colossalai/fx/profiler/experimental/profiler_module/normalization.py b/colossalai/fx/profiler/experimental/profiler_module/normalization.py
index e9939da7b1c4..49e5e6fa5384 100644
--- a/colossalai/fx/profiler/experimental/profiler_module/normalization.py
+++ b/colossalai/fx/profiler/experimental/profiler_module/normalization.py
@@ -1,5 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 from typing import Tuple, Union
+
 import torch
+
 from ..registry import meta_profiler_module
 
 
diff --git a/colossalai/gemini/ophooks/utils.py b/colossalai/gemini/ophooks/utils.py
index fe08405c82bf..84e8298c1d51 100644
--- a/colossalai/gemini/ophooks/utils.py
+++ b/colossalai/gemini/ophooks/utils.py
@@ -1,7 +1,7 @@
-import torch
-from typing import List, Callable, Optional
-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from abc import ABC, abstractmethod
+from typing import Callable, List, Optional
+
 import torch
 
 
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu b/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
index 68be1f6d7a22..09f34763f9b2 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
+++ b/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 #include "cublas_wrappers.h"
 
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h b/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
index 7ebb9ce48ed3..90255152b2c8 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 #pragma once
 
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h b/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
index ec963259f738..8186da1eed5f 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
@@ -1,68 +1,69 @@
-#pragma once
-
-/* Copyright 2021 The LightSeq Team
-   Copyright Microsoft DeepSpeed
-   This file is adapted from Microsoft DeepSpeed
-*/
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-
-#include <array>
-
-#include "cublas_wrappers.h"
-#include "kernels.h"
-
-template <typename T>
-class FeedForward {
- public:
-  struct Config {
-    int outputSize;
-    int inputSize;
-    std::array<int, 3> gemm_algos;
-    Config(int outputs, int inputs)
-        : outputSize(outputs),
-          inputSize(inputs),
-          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
-  };
-
-  FeedForward(Config config) : config_(config) {}
-
-  ~FeedForward() {}
-
-  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
-               cublasHandle_t &_cublasHandle) {
-    float alpha = T(1.);
-    float beta = T(0.);
-
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
-                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
-                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
-  }
-  void Backward(int bsz, const T *out_grad, const T *input_ptr,
-                const T *weights, T *weights_grad, T *bias_grad,
-                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
-                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
-                bool compute_bias = true) {
-    float alpha = (T)1.0, beta = (T)0.0;
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
-                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
-                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
-
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
-                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
-                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
-    if (compute_bias) {
-      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
-                                           config_.outputSize, stream);
-    }
-  }
-
-  void reset_size(int outputSize, int inputSize) {
-    config_.outputSize = outputSize;
-    config_.inputSize = inputSize;
-  }
-
- private:
-  Config config_;
-};
+#pragma once
+
+/* Copyright 2021 The LightSeq Team
+   Copyright Microsoft DeepSpeed
+   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
+*/
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <stdio.h>
+
+#include <array>
+
+#include "cublas_wrappers.h"
+#include "kernels.h"
+
+template <typename T>
+class FeedForward {
+ public:
+  struct Config {
+    int outputSize;
+    int inputSize;
+    std::array<int, 3> gemm_algos;
+    Config(int outputs, int inputs)
+        : outputSize(outputs),
+          inputSize(inputs),
+          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
+  };
+
+  FeedForward(Config config) : config_(config) {}
+
+  ~FeedForward() {}
+
+  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
+               cublasHandle_t &_cublasHandle) {
+    float alpha = T(1.);
+    float beta = T(0.);
+
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
+                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
+                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
+  }
+  void Backward(int bsz, const T *out_grad, const T *input_ptr,
+                const T *weights, T *weights_grad, T *bias_grad,
+                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
+                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
+                bool compute_bias = true) {
+    float alpha = (T)1.0, beta = (T)0.0;
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
+                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
+                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
+
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
+                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
+                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
+    if (compute_bias) {
+      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
+                                           config_.outputSize, stream);
+    }
+  }
+
+  void reset_size(int outputSize, int inputSize) {
+    config_.outputSize = outputSize;
+    config_.inputSize = inputSize;
+  }
+
+ private:
+  Config config_;
+};
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h b/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
index 3120660b98be..d386650e8235 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
@@ -1,99 +1,100 @@
-/* Copyright 2021 The LightSeq Team
-   Copyright Microsoft DeepSpeed
-   This file is adapted from Microsoft DeepSpeed
-*/
-#pragma once
-
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-
-#include <array>
-
-#include "cublas_wrappers.h"
-
-template <typename T>
-class StridedBatchGemm {
- public:
-  struct Config {
-    int m;
-    int n;
-    int k;
-    float alpha;
-    float beta;
-    cublasOperation_t op_A;
-    cublasOperation_t op_B;
-    std::array<int, 3> gemm_algos;
-
-    Config(float param_alpha, float param_beta, cublasOperation_t opA,
-           cublasOperation_t opB)
-        : alpha(param_alpha),
-          beta(param_beta),
-          op_A(opA),
-          op_B(opB),
-          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
-    void SetConfig(int mm, int nn, int kk) {
-      m = mm;
-      n = nn;
-      k = kk;
-    }
-  };
-
-  StridedBatchGemm(const Config &config) : _config(config) {}
-
-  virtual ~StridedBatchGemm() {}
-
-  void Forward(int bsz, T *output, const T *_buffer_a, const T *_buffer_b,
-               cublasHandle_t handle) {
-    int stride_a = _config.m * _config.k;
-    int stride_b = _config.n * _config.k;
-    int stride_c = _config.m * _config.n;
-
-    cublas_strided_batched_gemm(
-        handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
-        _buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
-        stride_b, stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[0]));
-  }
-
-  void Backward(int bsz, const T *d_output, const T *_buffer_a,
-                const T *_buffer_b, cublasHandle_t handle,
-                T *inpGradA = nullptr, T *inpGradB = nullptr) {
-    int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
-    int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
-
-    int stride_a = mb * _config.n;
-    int stride_b = _config.n * kb;
-    int stride_c = _config.m * _config.k;
-
-    // B need to transpose.
-    cublasOperation_t op_b =
-        (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
-
-    // Calculate d_A.
-    cublas_strided_batched_gemm(
-        handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
-        (_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
-        (_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), inpGradA,
-        CUBLAS_OP_N, op_b, stride_a, stride_b, stride_c, bsz,
-        cublasGemmAlgo_t(_config.gemm_algos[1]));
-
-    // A need to transpose.
-    cublasOperation_t op_a =
-        (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
-
-    stride_a = _config.m * _config.k;
-    stride_b = _config.m * _config.n;
-    stride_c = _config.n * _config.k;
-
-    // Calculate d_B.
-    cublas_strided_batched_gemm(
-        handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
-        _buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
-        stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[2]));
-  }
-
-  inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
-
- private:
-  Config _config;
-};
+/* Copyright 2021 The LightSeq Team
+   Copyright Microsoft DeepSpeed
+   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
+*/
+#pragma once
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <stdio.h>
+
+#include <array>
+
+#include "cublas_wrappers.h"
+
+template <typename T>
+class StridedBatchGemm {
+ public:
+  struct Config {
+    int m;
+    int n;
+    int k;
+    float alpha;
+    float beta;
+    cublasOperation_t op_A;
+    cublasOperation_t op_B;
+    std::array<int, 3> gemm_algos;
+
+    Config(float param_alpha, float param_beta, cublasOperation_t opA,
+           cublasOperation_t opB)
+        : alpha(param_alpha),
+          beta(param_beta),
+          op_A(opA),
+          op_B(opB),
+          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
+    void SetConfig(int mm, int nn, int kk) {
+      m = mm;
+      n = nn;
+      k = kk;
+    }
+  };
+
+  StridedBatchGemm(const Config &config) : _config(config) {}
+
+  virtual ~StridedBatchGemm() {}
+
+  void Forward(int bsz, T *output, const T *_buffer_a, const T *_buffer_b,
+               cublasHandle_t handle) {
+    int stride_a = _config.m * _config.k;
+    int stride_b = _config.n * _config.k;
+    int stride_c = _config.m * _config.n;
+
+    cublas_strided_batched_gemm(
+        handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
+        _buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
+        stride_b, stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[0]));
+  }
+
+  void Backward(int bsz, const T *d_output, const T *_buffer_a,
+                const T *_buffer_b, cublasHandle_t handle,
+                T *inpGradA = nullptr, T *inpGradB = nullptr) {
+    int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
+    int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
+
+    int stride_a = mb * _config.n;
+    int stride_b = _config.n * kb;
+    int stride_c = _config.m * _config.k;
+
+    // B need to transpose.
+    cublasOperation_t op_b =
+        (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
+
+    // Calculate d_A.
+    cublas_strided_batched_gemm(
+        handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
+        (_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
+        (_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), inpGradA,
+        CUBLAS_OP_N, op_b, stride_a, stride_b, stride_c, bsz,
+        cublasGemmAlgo_t(_config.gemm_algos[1]));
+
+    // A need to transpose.
+    cublasOperation_t op_a =
+        (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
+
+    stride_a = _config.m * _config.k;
+    stride_b = _config.m * _config.n;
+    stride_c = _config.n * _config.k;
+
+    // Calculate d_B.
+    cublas_strided_batched_gemm(
+        handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
+        _buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
+        stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[2]));
+  }
+
+  inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
+
+ private:
+  Config _config;
+};
diff --git a/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu b/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
index afd34bb96352..9cc3ae1eac10 100644
--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
@@ -1,5 +1,10 @@
 // modified from
 // https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_adam.cu
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
diff --git a/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh b/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
index 9ce41191133e..ec55dd320b40 100644
--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
@@ -1,12 +1,18 @@
-// modified from https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+// modified from
+// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
+#include <assert.h>
 #include <c10/cuda/CUDAGuard.h>
-#include "compat.h"
 
-#include <assert.h>
+#include "compat.h"
 
 // #include <iostream>
 
@@ -17,117 +23,108 @@ constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
 
 template <int n>
-struct TensorListMetadata
-{
-    void *addresses[n][depth_to_max_tensors[n - 1]];
-    int sizes[depth_to_max_tensors[n - 1]];
-    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-    int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
-    int start_tensor_this_launch;
+struct TensorListMetadata {
+  void *addresses[n][depth_to_max_tensors[n - 1]];
+  int sizes[depth_to_max_tensors[n - 1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a
+                                                   // full int.
+  int start_tensor_this_launch;
 };
 
 template <typename T, typename U, typename... ArgTypes>
-__global__ void multi_tensor_apply_kernel(
-    int chunk_size,
-    volatile int *noop_flag,
-    T tl,
-    U callable,
-    ArgTypes... args)
-{
-    // Hand the chunk information to the user-supplied functor to process however it likes.
-    callable(chunk_size, noop_flag, tl, args...);
+__global__ void multi_tensor_apply_kernel(int chunk_size,
+                                          volatile int *noop_flag, T tl,
+                                          U callable, ArgTypes... args) {
+  // Hand the chunk information to the user-supplied functor to process however
+  // it likes.
+  callable(chunk_size, noop_flag, tl, args...);
 }
 
 template <int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
-    int block_size,
-    int chunk_size,
-    const at::Tensor &noop_flag,
-    const std::vector<std::vector<at::Tensor>> &tensor_lists,
-    T callable,
-    ArgTypes... args)
-{
-    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
-    int len0 = tensor_lists[0].size();
-    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
-    auto ref_device = tensor_lists[0][0].device();
-    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-    for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
-    {
-        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
-        for (int t = 0; t < tensor_lists[l].size(); t++)
-        {
-            // TODO:  Print which tensor fails.
-            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
+    int block_size, int chunk_size, const at::Tensor &noop_flag,
+    const std::vector<std::vector<at::Tensor>> &tensor_lists, T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
+  int len0 = tensor_lists[0].size();
+  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
+  auto ref_device = tensor_lists[0][0].device();
+  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
+  for (int l = 0; l < tensor_lists.size();
+       l++)  // No range-based for because I need indices
+  {
+    TORCH_CHECK(tensor_lists[l].size() == len0,
+                "Size mismatch among tensor lists");
+    for (int t = 0; t < tensor_lists[l].size(); t++) {
+      // TODO:  Print which tensor fails.
+      bool contiguous_memory = tensor_lists[l][t].is_contiguous();
 #ifdef VERSION_GE_1_5
-            contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
+      contiguous_memory =
+          (contiguous_memory ||
+           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
-            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
-            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
-        }
+      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
+      TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
+                  "A tensor was not on the same device as the first tensor");
+      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
+                  "Size mismatch");
     }
-
-    int ntensors = tensor_lists[0].size();
-
-    TensorListMetadata<depth> tl;
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
-    auto stream = at::cuda::getCurrentCUDAStream();
-
-    tl.start_tensor_this_launch = 0;
-    int loc_block_info = 0;
-    int loc_tensor_info = 0;
-    for (int t = 0; t < ntensors; t++)
-    {
-        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-        for (int d = 0; d < depth; d++)
-            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
-        loc_tensor_info++;
-
-        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
-
-        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
-        {
-            // std::cout << chunks_this_tensor << std::endl;
-            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
-            tl.block_to_chunk[loc_block_info] = chunk;
-            loc_block_info++;
-
-            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
-                                 chunk == chunks_this_tensor - 1);
-            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
-            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-            if (tensors_full || blocks_full || last_chunk)
-            {
-                // using accscalar_t = acc_type<scalar_t, true>;
-                multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
-                    chunk_size,
-                    noop_flag.DATA_PTR<int>(),
-                    tl,
-                    callable,
-                    args...);
-
-                AT_CUDA_CHECK(cudaGetLastError());
-
-                // Reset.  The control flow possibilities here make my brain hurt.
-                loc_block_info = 0;
-                if (chunk == chunks_this_tensor - 1)
-                {
-                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
-                    loc_tensor_info = 0;
-                    tl.start_tensor_this_launch = t + 1;
-                }
-                else
-                {
-                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
-                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
-                    for (int d = 0; d < depth; d++)
-                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
-                    loc_tensor_info = 1;
-                    tl.start_tensor_this_launch = t;
-                }
-            }
+  }
+
+  int ntensors = tensor_lists[0].size();
+
+  TensorListMetadata<depth> tl;
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  tl.start_tensor_this_launch = 0;
+  int loc_block_info = 0;
+  int loc_tensor_info = 0;
+  for (int t = 0; t < ntensors; t++) {
+    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++)
+      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+    loc_tensor_info++;
+
+    int chunks_this_tensor =
+        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
+
+    for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
+      // std::cout << chunks_this_tensor << std::endl;
+      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tl.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
+                           chunk == chunks_this_tensor - 1);
+      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
+      bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
+      if (tensors_full || blocks_full || last_chunk) {
+        // using accscalar_t = acc_type<scalar_t, true>;
+        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
+            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
+
+        AT_CUDA_CHECK(cudaGetLastError());
+
+        // Reset.  The control flow possibilities here make my brain hurt.
+        loc_block_info = 0;
+        if (chunk == chunks_this_tensor - 1) {
+          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
+          loc_tensor_info = 0;
+          tl.start_tensor_this_launch = t + 1;
+        } else {
+          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
+          tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
+          for (int d = 0; d < depth; d++)
+            tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
+          loc_tensor_info = 1;
+          tl.start_tensor_this_launch = t;
         }
+      }
     }
-}
\ No newline at end of file
+  }
+}
diff --git a/colossalai/kernel/cuda_native/csrc/type_shim.h b/colossalai/kernel/cuda_native/csrc/type_shim.h
index b4011c5ba6c3..2f180a7783ec 100644
--- a/colossalai/kernel/cuda_native/csrc/type_shim.h
+++ b/colossalai/kernel/cuda_native/csrc/type_shim.h
@@ -1,4 +1,9 @@
 /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 
 #include "compat.h"
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index 941866d557ff..987af8a968b7 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -1,4 +1,11 @@
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+Licensed under the MIT License.
+'''
 import torch
 
 from colossalai.registry import OPTIMIZERS
diff --git a/colossalai/nn/optimizer/zero_optimizer.py b/colossalai/nn/optimizer/zero_optimizer.py
index 712daed06400..422ebb7a3944 100644
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import math
 import warnings
 from enum import Enum
diff --git a/colossalai/utils/model/utils.py b/colossalai/utils/model/utils.py
index 75bb18df66c1..f49607376439 100644
--- a/colossalai/utils/model/utils.py
+++ b/colossalai/utils/model/utils.py
@@ -1,7 +1,12 @@
-import torch
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
 import functools
 from typing import Optional
 
+import torch
+
 
 def substitute_init_recursively(cls, func, visited: set):
     for subcls in cls.__subclasses__():
diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py
index ae3a619980ac..094f7d76a86d 100644
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import functools
 import itertools
 from collections import OrderedDict
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index 502b1c4d9f4c..49fb8b54b7d2 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from functools import partial
 from typing import Optional
 
diff --git a/colossalai/zero/sharded_optim/sharded_optim_v2.py b/colossalai/zero/sharded_optim/sharded_optim_v2.py
index 401ff988df4a..43a0b7d76107 100644
--- a/colossalai/zero/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/zero/sharded_optim/sharded_optim_v2.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from enum import Enum
 from os import stat
 from typing import Dict, Optional, Tuple
@@ -5,20 +6,21 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch import Tensor
+from torch.distributed import ProcessGroup
+from torch.nn.parameter import Parameter
+from torch.optim import Optimizer
+
 from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_tensor_mem_usage)
 from colossalai.zero.sharded_model import ShardedModelV2
 from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
-from torch import Tensor
-from torch.distributed import ProcessGroup
-from torch.nn.parameter import Parameter
-from torch.optim import Optimizer
-from colossalai.gemini.stateful_tensor import (StatefulTensor, TensorState)
-from colossalai.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
 
 
 class OptimState(Enum):
@@ -36,9 +38,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
     `PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
 
     GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
-    which is detected by a runtime memory tracer. 
+    which is detected by a runtime memory tracer.
 
-    We place as many OS chunks in the margin space as possible. 
+    We place as many OS chunks in the margin space as possible.
 
     The size of margin space can be controlled by ``gpu_margin_mem_ratio``.
     If it is set as ``0.0``, it is the same as classical ZeRO optimizer.
@@ -54,8 +56,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
             shard strategy provided by sharded model to shard param fp32 tensors.
         optimizer (Optimizer): An Optimizer instance.
-        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward) 
-            which will be used when using hybrid CPU optimizer. 
+        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+            which will be used when using hybrid CPU optimizer.
             This argument is meaningless when `tensor_placement_policy` of `ShardedModelV2` is not "auto".
             Defaults to 0.0.
         initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 140a10c091b4..b9f44decc119 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -1,3 +1,7 @@
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 import importlib
 import os
 import time

From 35c8f4ce479e7dc7aab59e03bf00cba2d777ddb0 Mon Sep 17 00:00:00 2001
From: Saurav Maheshkar <sauravvmaheshkar@gmail.com>
Date: Sun, 5 Mar 2023 12:29:34 +0000
Subject: [PATCH 416/503] [refactor] restructure configuration files (#2977)

* gh: move CONTRIBUTING to .github

* chore: move isort config to pyproject

* chore: move pytest config to pyproject

* chore: move yapf config to pyproject

* chore: move clang-format config to pre-commit
---
 .clang-format                              |  1 -
 CONTRIBUTING.md => .github/CONTRIBUTING.md |  2 +-
 .isort.cfg                                 |  5 -----
 .pre-commit-config.yaml                    |  3 ++-
 .style.yapf                                |  5 -----
 pyproject.toml                             | 19 +++++++++++++++++++
 pytest.ini                                 |  6 ------
 7 files changed, 22 insertions(+), 19 deletions(-)
 delete mode 100644 .clang-format
 rename CONTRIBUTING.md => .github/CONTRIBUTING.md (99%)
 delete mode 100644 .isort.cfg
 delete mode 100644 .style.yapf
 create mode 100644 pyproject.toml
 delete mode 100644 pytest.ini

diff --git a/.clang-format b/.clang-format
deleted file mode 100644
index f6cb8ad931f5..000000000000
--- a/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-BasedOnStyle: Google
diff --git a/CONTRIBUTING.md b/.github/CONTRIBUTING.md
similarity index 99%
rename from CONTRIBUTING.md
rename to .github/CONTRIBUTING.md
index 00abcf650158..915c43174c6a 100644
--- a/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -138,4 +138,4 @@ You can now create a pull request on the GitHub webpage of your repository. The
 
 Do write clearly the description of your pull request and [link the pull request to your target issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). This will automatically close the issue when the pull request is approved.
 
-In case of code conflict, you should rebase your branch and resolve the conflicts manually.
\ No newline at end of file
+In case of code conflict, you should rebase your branch and resolve the conflicts manually.
diff --git a/.isort.cfg b/.isort.cfg
deleted file mode 100644
index 090aa28e39f3..000000000000
--- a/.isort.cfg
+++ /dev/null
@@ -1,5 +0,0 @@
-[settings]
-line_length = 120
-multi_line_output=3
-include_trailing_comma = true
-ignore_comments = true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 725d266375ef..b98edb6c9a8b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,13 +11,14 @@ repos:
     hooks:
     - id: yapf
       name: yapf formatter
-      args: ['--style=.style.yapf', '--parallel', '--in-place']
+      args: ['--style=pyproject.toml', '--parallel', '--in-place']
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v13.0.1
     hooks:
     - id: clang-format
       name: clang formatter
+      args: [--style, "{BasedOnStyle: Google}"]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
diff --git a/.style.yapf b/.style.yapf
deleted file mode 100644
index 05be0dc6a3a5..000000000000
--- a/.style.yapf
+++ /dev/null
@@ -1,5 +0,0 @@
-[style]
-based_on_style = google
-spaces_before_comment = 4
-split_before_logical_operator = true
-column_limit = 120
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000000..5d918a5784ea
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,19 @@
+[tool.isort]
+line_length = 120
+multi_line_output = 3
+include_trailing_comma = true
+ignore_comments = true
+
+[tool.yapf]
+based_on_style = "google"
+spaces_before_comment = 4
+split_before_logical_operator = true
+column_limit = 120
+
+[tool.pytest.ini_options]
+markers = [
+    "cpu: tests which can run on CPU",
+    "gpu: tests which requires a single GPU",
+    "dist: tests which are run in a multi-GPU or multi-machine environment",
+    "experiment: tests for experimental features",
+]
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index ac31ace4bfae..000000000000
--- a/pytest.ini
+++ /dev/null
@@ -1,6 +0,0 @@
-[pytest]
-markers =
-    cpu: tests which can run on CPU
-    gpu: tests which requires a single GPU
-    dist: tests which are run in a multi-GPU or multi-machine environment
-    experiment: tests for experimental features
\ No newline at end of file

From 52a5078988c250884b6d7d57a497cc0be21689a7 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Mon, 6 Mar 2023 10:36:38 +0800
Subject: [PATCH 417/503] [doc] add ISC tutorial (#2997)

* [doc] add ISC tutorial

* [doc] add ISC tutorial

* [doc] add ISC tutorial

* [doc] add ISC tutorial
---
 README-zh-Hans.md                 | 2 +-
 README.md                         | 2 +-
 colossalai/nn/optimizer/README.md | 2 +-
 examples/tutorial/README.md       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 3b331734f3be..8ff25c64f5d4 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -389,6 +389,6 @@ Colossal-AI项目受一些相关的项目启发而成立，一些项目是我们
 }
 ```
 
-Colossal-AI 已被 [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/)等顶级会议录取为官方教程。
+Colossal-AI 已被 [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/)等顶级会议录取为官方教程。
 
 <p align="right">(<a href="#top">返回顶端</a>)</p>
diff --git a/README.md b/README.md
index 31af801d1c4f..10d59e34c8f8 100644
--- a/README.md
+++ b/README.md
@@ -391,6 +391,6 @@ To cite this project, you can use the following BibTeX citation.
 }
 ```
 
-Colossal-AI has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), etc.
+Colossal-AI has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), etc.
 
 <p align="right">(<a href="#top">back to top</a>)</p>
diff --git a/colossalai/nn/optimizer/README.md b/colossalai/nn/optimizer/README.md
index e2fc30bc5d4f..752ffa46d584 100644
--- a/colossalai/nn/optimizer/README.md
+++ b/colossalai/nn/optimizer/README.md
@@ -3,7 +3,7 @@
 ## Introduction
 
 Welcome to the large-scale deep learning optimization techniques of [Colossal-AI](https://github.com/hpcaitech/ColossalAI), 
-which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), etc.
+which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), etc.
 
 
 [Colossal-AI](https://github.com/hpcaitech/ColossalAI), a unified deep learning system for the big model era, integrates
diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
index 000cf2117335..f4843331fd54 100644
--- a/examples/tutorial/README.md
+++ b/examples/tutorial/README.md
@@ -4,7 +4,7 @@
 
 ## Introduction
 
-Welcome to the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) tutorial, which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), etc.
+Welcome to the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) tutorial, which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), etc.
 
 
 [Colossal-AI](https://github.com/hpcaitech/ColossalAI), a unified deep learning system for the big model era, integrates

From 82503a96f20c2e61016af78445f9d1697ac07dd5 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 6 Mar 2023 10:42:22 +0800
Subject: [PATCH 418/503] [format] applied code formatting on changed files in
 pull request 2997 (#3008)

Co-authored-by: github-actions <github-actions@github.com>
---
 colossalai/nn/optimizer/README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/colossalai/nn/optimizer/README.md b/colossalai/nn/optimizer/README.md
index 752ffa46d584..09395d08b93e 100644
--- a/colossalai/nn/optimizer/README.md
+++ b/colossalai/nn/optimizer/README.md
@@ -2,30 +2,30 @@
 
 ## Introduction
 
-Welcome to the large-scale deep learning optimization techniques of [Colossal-AI](https://github.com/hpcaitech/ColossalAI), 
+Welcome to the large-scale deep learning optimization techniques of [Colossal-AI](https://github.com/hpcaitech/ColossalAI),
 which has been accepted as official tutorials by top conference [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), etc.
 
 
 [Colossal-AI](https://github.com/hpcaitech/ColossalAI), a unified deep learning system for the big model era, integrates
 many advanced technologies such as multi-dimensional tensor parallelism, sequence parallelism, heterogeneous memory management,
-large-scale optimization, adaptive task scheduling, etc. By using Colossal-AI, we could help users to efficiently and 
+large-scale optimization, adaptive task scheduling, etc. By using Colossal-AI, we could help users to efficiently and
 quickly deploy large AI model training and inference, reducing large AI model training budgets and scaling down the labor cost of learning and deployment.
 
 ### 🚀 Quick Links
 
 [**Colossal-AI**](https://github.com/hpcaitech/ColossalAI) |
-[**Paper**](https://arxiv.org/abs/2110.14883) | 
-[**Documentation**](https://www.colossalai.org/) | 
-[**Forum**](https://github.com/hpcaitech/ColossalAI/discussions) | 
+[**Paper**](https://arxiv.org/abs/2110.14883) |
+[**Documentation**](https://www.colossalai.org/) |
+[**Forum**](https://github.com/hpcaitech/ColossalAI/discussions) |
 [**Slack**](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w)
 
 
 ## Table of Content
 
-Large transformer models display promising performance on a wide spectrum of AI applications. 
+Large transformer models display promising performance on a wide spectrum of AI applications.
 Both academia and industry are scaling DL training on larger clusters. However, degrading generalization performance, non-negligible communication overhead, and increasing model size prevent DL researchers and engineers from exploring large-scale AI models.
 
-We aim to provide a clear sketch of the optimizations for large-scale deep learning with regard to model accuracy and model efficiency. 
+We aim to provide a clear sketch of the optimizations for large-scale deep learning with regard to model accuracy and model efficiency.
 One way to achieve the goal of maintaining or improving the model accuracy in the large-scale setting while maintaining compute efficiency is to design algorithms that
 are less communication and memory hungry. Notably, they are not mutually exclusive but can
 be optimized jointly to further speed up training.
@@ -51,7 +51,7 @@ be optimized jointly to further speed up training.
     - Memory Efficiency
       - Mix-Precision Training
       - Memory-Efficient Methods, e.g. ZeRO, Gemini, etc.
-      
+
 Some of the above are still under development. **If you wish to make a contribution to this repository, please read the `Contributing` section below.**
 
 ## Discussion
@@ -63,7 +63,7 @@ If you encounter any problem while running these optimizers, you may want to rai
 
 ## Contributing
 
-This project welcomes constructive ideas and implementations from the community. 
+This project welcomes constructive ideas and implementations from the community.
 
 ### Update an Optimizer
 

From e58870345482b1f45caa8641340f8ac538f13ee9 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Tue, 7 Mar 2023 09:17:52 +0800
Subject: [PATCH 419/503] [chatgpt]fix inference model load (#2988)

* fix lora bug

* polish

* fix lora gemini

* fix inference load model bug
---
 applications/ChatGPT/examples/README.md    |  7 +++++--
 applications/ChatGPT/examples/inference.py | 15 ++++++++-------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
index c411c880b038..bf3daf5ec3f5 100644
--- a/applications/ChatGPT/examples/README.md
+++ b/applications/ChatGPT/examples/README.md
@@ -69,10 +69,13 @@ torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy
 ## Inference example(After Stage3)
 We support naive inference demo after training.
 ```shell
-# inference
-python inference.py --pretrain <your actor model path> --model <your model type>
+# inference, using pretrain path to configure model
+python inference.py --model_path <your actor model path> --model <your model type> --pretrain <your pretrain model name/path>
+# example
+python inference.py --model_path ./actor_checkpoint_prompts.pt --pretrain bigscience/bloom-560m --model bloom
 ```
 
+
 #### data
 - [x] [rm-static](https://huggingface.co/datasets/Dahoas/rm-static)
 - [x] [hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf)
diff --git a/applications/ChatGPT/examples/inference.py b/applications/ChatGPT/examples/inference.py
index a2682277d18c..239b6e19b282 100644
--- a/applications/ChatGPT/examples/inference.py
+++ b/applications/ChatGPT/examples/inference.py
@@ -1,6 +1,6 @@
 import argparse
-import torch
 
+import torch
 from chatgpt.nn import BLOOMActor, GPTActor, OPTActor
 from transformers import AutoTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
@@ -9,18 +9,17 @@
 def eval(args):
     # configure model
     if args.model == 'gpt2':
-        actor = GPTActor().to(torch.cuda.current_device())
+        actor = GPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
     elif args.model == 'bloom':
-        actor = BLOOMActor().to(torch.cuda.current_device())
+        actor = BLOOMActor(pretrained=args.pretrain).to(torch.cuda.current_device())
     elif args.model == 'opt':
-        actor = OPTActor().to(torch.cuda.current_device())
+        actor = OPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
-    state_dict = torch.load(args.pretrain)
+    state_dict = torch.load(args.model_path)
     actor.model.load_state_dict(state_dict)
-    
-    
+
     # configure tokenizer
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
@@ -49,7 +48,9 @@ def eval(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
+    # We suggest using a pretrained model from HuggingFace; --pretrain is used to configure the model
     parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--model_path', type=str, default=None)
     parser.add_argument('--input', type=str, default='Question: How are you ? Answer:')
     parser.add_argument('--max_length', type=int, default=100)
     args = parser.parse_args()

From 287d60499ea372e03ee19e6c91a70970eebf0876 Mon Sep 17 00:00:00 2001
From: LuGY <74758262+Gy-Lu@users.noreply.github.com>
Date: Tue, 7 Mar 2023 10:13:25 +0800
Subject: [PATCH 420/503] [chatgpt] Add saving ckpt callback for PPO (#2880)

* add checkpoint callback for chatgpt

* add save ckpt callbacks for ppo

---------

Co-authored-by: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
---
 .../chatgpt/trainer/callbacks/__init__.py     |  3 +-
 .../trainer/callbacks/save_checkpoint.py      | 75 +++++++++++++++++++
 applications/ChatGPT/examples/train_dummy.py  | 56 +++++++++-----
 3 files changed, 114 insertions(+), 20 deletions(-)
 create mode 100644 applications/ChatGPT/chatgpt/trainer/callbacks/save_checkpoint.py

diff --git a/applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py b/applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py
index 79ea9ffcdf61..9ed0ee6f7640 100644
--- a/applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py
+++ b/applications/ChatGPT/chatgpt/trainer/callbacks/__init__.py
@@ -1,4 +1,5 @@
 from .base import Callback
 from .performance_evaluator import PerformanceEvaluator
+from .save_checkpoint import SaveCheckpoint
 
-__all__ = ['Callback', 'PerformanceEvaluator']
+__all__ = ['Callback', 'PerformanceEvaluator', 'SaveCheckpoint']
diff --git a/applications/ChatGPT/chatgpt/trainer/callbacks/save_checkpoint.py b/applications/ChatGPT/chatgpt/trainer/callbacks/save_checkpoint.py
new file mode 100644
index 000000000000..8f2beb12db22
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/callbacks/save_checkpoint.py
@@ -0,0 +1,75 @@
+import os
+
+import torch.distributed as dist
+from chatgpt.trainer.strategies import ColossalAIStrategy, Strategy
+from chatgpt.trainer.utils import is_rank_0
+from torch import nn
+from torch.optim import Optimizer
+
+from .base import Callback
+
+
+class SaveCheckpoint(Callback):
+    """
+        The callback for saving checkpoints for chatgpt.
+
+        Only supports saving the actor and critic models.
+        A typical architecture of the saved checkpoint would be:
+            - checkpoint
+                - episode_x
+                    - actor.pt
+                    - actor-optim-rank-0.pt
+                    - actor-optim-rank-1.pt
+                    - critic.pt
+                    - critic-optim-rank-0.pt
+                    - critic-optim-rank-1.pt
+                - ...
+
+    Args:
+        path(str): the base directory for checkpoints; they are saved under `path/checkpoint`
+        interval(int): the number of episodes between two checkpoint saves
+        strategy(Strategy): the strategy used to train
+        actor(nn.Module): the actor model
+        critic(nn.Module): the critic model
+        actor_optim(Optimizer): the optimizer of actor
+        critic_optim(Optimizer): the optimizer of critic
+
+    """
+
+    def __init__(self,
+                 path: str,
+                 interval: int,
+                 strategy: Strategy,
+                 actor: nn.Module = None,
+                 critic: nn.Module = None,
+                 actor_optim: Optimizer = None,
+                 critic_optim: Optimizer = None) -> None:
+        super().__init__()
+        self.path = os.path.join(path, 'checkpoint')
+        self.interval = interval
+        self.strategy = strategy
+        self.model_dict = {'actor': [actor, actor_optim], 'critic': [critic, critic_optim]}
+
+    def on_episode_end(self, episode: int) -> None:
+        if (episode + 1) % self.interval != 0:
+            return
+        base_path = os.path.join(self.path, f'episode_{episode}')
+        if not os.path.exists(base_path):
+            os.makedirs(base_path)
+
+        for model in self.model_dict.keys():
+
+            # save model
+            if self.model_dict[model][0] is None:
+                # saving only the optimizer states is meaningless, so this entry is skipped
+                continue
+            model_path = os.path.join(base_path, f'{model}.pt')
+            self.strategy.save_model(model=self.model_dict[model][0], path=model_path, only_rank0=True)
+
+            # save optimizer
+            if self.model_dict[model][1] is None:
+                continue
+            only_rank0 = not isinstance(self.strategy, ColossalAIStrategy)
+            rank = 0 if is_rank_0() else dist.get_rank()
+            optim_path = os.path.join(base_path, f'{model}-optim-rank-{rank}.pt')
+            self.strategy.save_optimizer(optimizer=self.model_dict[model][1], path=optim_path, only_rank0=only_rank0)
diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index 35f6474910d3..df64515a1ce8 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -4,6 +4,7 @@
 import torch
 from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
 from chatgpt.trainer import PPOTrainer
+from chatgpt.trainer.callbacks import SaveCheckpoint
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from torch.optim import Adam
 from transformers import AutoTokenizer, BloomTokenizerFast
@@ -71,26 +72,38 @@ def main(args):
     (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
         (actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
 
+    callbacks = []
+    if args.save_ckpt_path:
+        ckpt_callback = SaveCheckpoint(
+            args.save_ckpt_path,
+            args.save_ckpt_interval,
+            strategy,
+            actor,
+            critic,
+            actor_optim,
+            critic_optim,
+        )
+        callbacks.append(ckpt_callback)
+
     # configure trainer
-    trainer = PPOTrainer(
-        strategy,
-        actor,
-        critic,
-        reward_model,
-        initial_model,
-        actor_optim,
-        critic_optim,
-        max_epochs=args.max_epochs,
-        train_batch_size=args.train_batch_size,
-        experience_batch_size=args.experience_batch_size,
-        tokenizer=preprocess_batch,
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-    )
+
+    trainer = PPOTrainer(strategy,
+                         actor,
+                         critic,
+                         reward_model,
+                         initial_model,
+                         actor_optim,
+                         critic_optim,
+                         max_epochs=args.max_epochs,
+                         train_batch_size=args.train_batch_size,
+                         tokenizer=preprocess_batch,
+                         max_length=128,
+                         do_sample=True,
+                         temperature=1.0,
+                         top_k=50,
+                         pad_token_id=tokenizer.pad_token_id,
+                         eos_token_id=tokenizer.eos_token_id,
+                         callbacks=callbacks)
 
     random_prompts = torch.randint(tokenizer.vocab_size, (1000, 64), device=torch.cuda.current_device())
     trainer.fit(random_prompts,
@@ -120,5 +133,10 @@ def main(args):
     parser.add_argument('--train_batch_size', type=int, default=8)
     parser.add_argument('--experience_batch_size', type=int, default=8)
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument('--save_ckpt_path',
+                        type=str,
+                        default=None,
+                        help="path to save checkpoint, None means not to save")
+    parser.add_argument('--save_ckpt_interval', type=int, default=1, help="the interval of episode to save checkpoint")
     args = parser.parse_args()
     main(args)

From 55dcd3051adfe7e1b12743a0ce4797e2bc0ac005 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Tue, 7 Mar 2023 10:21:25 +0800
Subject: [PATCH 421/503] [chatgpt] fix readme (#3025)

---
 applications/ChatGPT/examples/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
index bf3daf5ec3f5..39a76911027f 100644
--- a/applications/ChatGPT/examples/README.md
+++ b/applications/ChatGPT/examples/README.md
@@ -15,9 +15,9 @@ Use these code to train your reward model.
 
 ```shell
 # Naive reward model training
-python train_reward_model.py --pretrain <your model path>
-# if to use LoRA
-python train_reward_model.py --pretrain <your model path> --lora_rank 16
+python train_reward_model.py --pretrain <your model path> --model <your model type> --strategy naive 
+# use colossalai_zero2
+torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain <your model path> --model <your model type> --strategy colossalai_zero2 
 ```
 
 ## Train with dummy prompt data (Stage 3)
@@ -44,7 +44,7 @@ DDP strategy and ColossalAI strategy support multi GPUs training:
 # run DDP on 2 GPUs
 torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy ddp
 # run ColossalAI on 2 GPUs
-torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy colossalai
+torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy colossalai_zero2
 ```
 
 ## Train with real prompt data (Stage 3)
@@ -63,7 +63,7 @@ python train_prompts.py prompts.csv --strategy naive
 # run DDP on 2 GPUs
 torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy ddp
 # run ColossalAI on 2 GPUs
-torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai
+torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai_zero2
 ```
 
 ## Inference example(After Stage3)

From b42d3d28ed03b281ba0192b7aacf9a661f29f3c9 Mon Sep 17 00:00:00 2001
From: Super Daniel <78588128+super-dainiu@users.noreply.github.com>
Date: Tue, 7 Mar 2023 10:30:35 +0800
Subject: [PATCH 422/503] [fx] remove depreciated algorithms. (#2312) (#2313)

---
 colossalai/fx/passes/algorithms/__init__.py   |   4 -
 .../fx/passes/algorithms/build_c_ext.py       |  15 -
 .../fx/passes/algorithms/ckpt_solver_chen.py  |  98 ----
 .../fx/passes/algorithms/ckpt_solver_pofo.py  | 537 ------------------
 .../fx/passes/algorithms/ckpt_solver_rotor.py | 436 --------------
 .../fx/passes/algorithms/dynamic_programs.c   | 516 -----------------
 colossalai/fx/passes/algorithms/linearize.py  |  94 ---
 colossalai/fx/passes/algorithms/operation.py  | 270 ---------
 8 files changed, 1970 deletions(-)
 delete mode 100644 colossalai/fx/passes/algorithms/__init__.py
 delete mode 100644 colossalai/fx/passes/algorithms/build_c_ext.py
 delete mode 100644 colossalai/fx/passes/algorithms/ckpt_solver_chen.py
 delete mode 100644 colossalai/fx/passes/algorithms/ckpt_solver_pofo.py
 delete mode 100644 colossalai/fx/passes/algorithms/ckpt_solver_rotor.py
 delete mode 100644 colossalai/fx/passes/algorithms/dynamic_programs.c
 delete mode 100644 colossalai/fx/passes/algorithms/linearize.py
 delete mode 100644 colossalai/fx/passes/algorithms/operation.py

diff --git a/colossalai/fx/passes/algorithms/__init__.py b/colossalai/fx/passes/algorithms/__init__.py
deleted file mode 100644
index 9ccf135d0911..000000000000
--- a/colossalai/fx/passes/algorithms/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .ckpt_solver_chen import chen_greedy
-from .linearize import linearize
-from .ckpt_solver_rotor import solver_rotor
-from .ckpt_solver_pofo import solver_pofo
diff --git a/colossalai/fx/passes/algorithms/build_c_ext.py b/colossalai/fx/passes/algorithms/build_c_ext.py
deleted file mode 100644
index cb360cb20340..000000000000
--- a/colossalai/fx/passes/algorithms/build_c_ext.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from setuptools import setup, Extension
-import os
-
-this_dir = os.path.dirname(os.path.abspath(__file__))
-ext_modules = [Extension(
-    'dynamic_programs_C_version',
-    sources=[os.path.join(this_dir, 'dynamic_programs.c')],
-)]
-
-setup(
-    name='rotor c extension',
-    version='0.1',
-    description='rotor c extension for faster dp computing',
-    ext_modules=ext_modules,
-)
diff --git a/colossalai/fx/passes/algorithms/ckpt_solver_chen.py b/colossalai/fx/passes/algorithms/ckpt_solver_chen.py
deleted file mode 100644
index 52000ebe5364..000000000000
--- a/colossalai/fx/passes/algorithms/ckpt_solver_chen.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import math
-from typing import List, Set, Tuple
-
-import torch
-from torch.fx import GraphModule, Node
-
-from colossalai.fx.profiler import calculate_fwd_in, calculate_fwd_tmp
-
-__all__ = ['chen_greedy']
-CKPT_OP = ['call_module', 'call_method', 'call_function', 'get_attr']
-
-
-def _all_potential_ckpt_nodes(gm: GraphModule) -> List:
-    """
-    In most existing frameworks of activation checkpoint, the forward graph is assumed to be linearized.
-    """
-
-    def is_sink():
-        """
-        If we can free all memories when executing a certain node, it is a sink.
-        """
-        return not sum((v for k, v in deps.items()))
-
-    deps = {}
-    ckpt_nodes = []
-    for n in gm.graph.nodes:
-        for n_par in n._input_nodes:
-            deps[n_par] -= 1    # free memory and dependencies
-
-        # We can only put act_ckpt on these nodes
-        if n.op in CKPT_OP and is_sink():
-            ckpt_nodes.append(n)
-        deps[n] = len(n.users)    # add dependencies for future executions
-    return ckpt_nodes
-
-
-def chen_greedy(gm: GraphModule) -> GraphModule:
-    """
-    This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
-    Note that this algorithm targets at memory optimization only, using techniques in appendix A.
-
-    Usage:
-        model = resnet18()
-        input_sample = torch.rand(4, 3, 224, 224)
-        gm = symbolic_trace(model)
-        MetaInfoProp(gm).run(input_sample)
-        gm = chen_greedy(gm)
-
-    Args:
-        gm (GraphModule): The module to add checkpoints
-    """
-
-    def grid_search(num_grids: int = 6) -> Set:
-        """
-        Search ckpt strategy with b = 0, then run the allocation algorithm again with b = √xy.
-        Grid search over [√2/2 b, √2 b] for ckpt_opt over num_grids as in appendix A.
-        """
-        _, b_approx = run_chen_greedy(0)
-        b_min, b_max = math.floor(b_approx / math.sqrt(2)), math.ceil(b_approx * math.sqrt(2))
-        b_opt = math.inf
-        for b in range(b_min, b_max, (b_max - b_min) // num_grids):
-            ckpt_intv, b_approx = run_chen_greedy(b)
-            if b_approx < b_opt:
-                b_opt = b_approx
-                ckpt_opt = ckpt_intv
-        return ckpt_opt
-
-    def run_chen_greedy(b: int = 0) -> Tuple[Set, int]:
-        """
-        This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
-        """
-        ckpt_nodes = _all_potential_ckpt_nodes(gm)
-        ckpt_intv = []
-        temp = 0
-        x = 0
-        y = 0
-        prev_idx = 2
-        for (idx, n) in enumerate(gm.graph.nodes):
-            n: Node
-            temp += calculate_fwd_in(n) + calculate_fwd_tmp(n)
-            y = max(y, temp)
-            if temp > b and n in ckpt_nodes:
-                x += calculate_fwd_in(n)
-                temp = 0
-                ckpt_intv.append((prev_idx, idx + 1))
-                prev_idx = idx + 1
-        return ckpt_intv, math.floor(math.sqrt(x * y))
-
-    gm.graph.lint()    # make sure nodes are in topological order
-    ckpt = grid_search(num_grids=6)
-    node_list = list(gm.graph.nodes)
-    for i, seg in enumerate(ckpt):
-        for idx in range(*seg):
-            n = node_list[idx]
-            if n.op in CKPT_OP:
-                setattr(n, 'activation_checkpoint', i)
-    gm.recompile()
-    return gm
diff --git a/colossalai/fx/passes/algorithms/ckpt_solver_pofo.py b/colossalai/fx/passes/algorithms/ckpt_solver_pofo.py
deleted file mode 100644
index 69e4e9f2cce8..000000000000
--- a/colossalai/fx/passes/algorithms/ckpt_solver_pofo.py
+++ /dev/null
@@ -1,537 +0,0 @@
-import copy
-import math
-from typing import List, Tuple
-
-import torch
-from colossalai.fx import is_compatible_with_meta
-from colossalai.fx.codegen.activation_checkpoint_codegen import \
-    _find_nested_ckpt_regions
-from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.algorithms.ckpt_solver_rotor import (_compute_table, _construct_chain, _rec)
-from colossalai.fx.passes.meta_info_prop import MetaInfoProp
-from colossalai.fx.profiler import parameter_size
-from torch.fx import GraphModule, Node
-
-from .linearize import linearize
-from .operation import (Backward, Chain, ForwardCheck, ForwardEnable, ForwardNograd, Function, Loss, Offload, Prefetch,
-                        Sequence)
-
-INF = float("inf")
-
-
-def _normalize_flops(chain: Chain, flops) -> Chain:
-    """
-    Normalize flops
-    """
-    for i in range(chain.length):
-        chain.fweight[i] /= flops
-        chain.bweight[i] /= flops
-
-    return chain
-
-
-class PofoTable:
-    """PofoTable
-    The PofoTable contains the necessary components to store intermediate results
-    of dynamic programming and the operations alone the way.
-    """
-
-    def __init__(self, chain_length: int, mem_slots: int):
-        """Init pofo table
-        The pofo table contains two tables, opt and what, indicating values and
-        operations.
-
-        Args:
-            chain_length (int): chain length
-            mem_slots (int): number of memory slots
-        """
-
-        self.length = chain_length
-        self.mem_slots = mem_slots
-
-        # initializing tables
-        # the first bool indicates whether the input has bar
-        # opt table is for value, opt[True/False][i][A][(df, db)] = OCx(i, A, df, db)
-        # what table is for decision, what[True/False][i][A][(df, db)] = (is_enable, is_offload, index)
-        # where is_enable indicates whether we enable the gradient, is_offload indicates whether we
-        # offload the input, index indicates the end of F_\empty sequence if is_enable = False
-        self.opt = {
-            False: [[{} for _ in range(mem_slots + 1)] for _ in range(self.length + 1)],
-            True: [[{} for _ in range(mem_slots + 1)] for _ in range(self.length + 1)]
-        }
-        self.what = {
-            False: [[{} for _ in range(mem_slots + 1)] for _ in range(self.length + 1)],
-            True: [[{} for _ in range(mem_slots + 1)] for _ in range(self.length + 1)]
-        }
-
-    def _get_value(self, state, table, default):
-        i, act_size, df, db, input_has_bar = state
-        if act_size + df > self.mem_slots or act_size + db > self.mem_slots:
-            return default
-
-        try:
-            return table[input_has_bar][i][act_size][(df, db)]
-        except KeyError:
-            print(f"state not found {state}")
-
-    def get_opt(self, state):
-        return self._get_value(state, self.opt, INF)
-
-    def get_what(self, state):
-        return self._get_value(state, self.what, INF)
-
-    def set_value(self, state, opt, what):
-        i, act_size, df, db, input_has_bar = state
-        self.opt[input_has_bar][i][act_size][(df, db)] = opt
-        self.what[input_has_bar][i][act_size][(df, db)] = what
-
-
-class PofoSolver:
-    """PofoSolver that executes algorithm mentioned in https://proceedings.neurips.cc/paper/2021/hash/c8461bf13fca8a2b9912ab2eb1668e4b-Abstract.html
-    The new pofo solver is based on paper Efficient Combination of Rematerialization and Offloading for Training DNNs 
-    and it's code given in the supplemental. Currently we doesn't use the whole set up in the original paper and reuse 
-    rotor solver for the backward sequence as suggested in supplemental. The solver now is able to find strategy with offload. 
-    """
-
-    def __init__(self, chain: Chain, max_memory: int, bandwidth, mem_slots: int) -> None:
-        self.chain = chain
-        self.length = chain.length
-        self.max_memory = max_memory
-        self.mem_slots = mem_slots
-        self.mem_unit = max_memory / mem_slots
-        self.bandwidth = bandwidth
-
-        self.disc_chain = copy.deepcopy(self.chain)
-        self.disc_chain._discretize(self.mem_unit)
-
-        self.rotor_table = _compute_table(self.disc_chain, mem_slots)
-        self._compute_pofo_table()
-
-    def _discretize(self, *values) -> Tuple:
-        return tuple(math.ceil(value / self.mem_unit) for value in values)
-
-    def _undiscretize(self, *discrete_values) -> Tuple:
-        if len(discrete_values) == 1:
-            return discrete_values[0] * self.mem_unit
-        else:
-            return tuple(d * self.mem_unit for d in discrete_values)
-
-    def _mmax_all(self, idx: int):
-        """
-        Calculate the maximum memory usage of Fi_all
-        """
-
-        return self.chain.cbweight[idx + 1] + self.chain.fwd_mem_tmp[idx]
-
-    def _mmax_b(self, idx: int):
-        """
-        Calculate the maximum memory usage of Bi
-        """
-
-        return self.chain.cbweight[idx +
-                                   1] + self.chain.cweight[idx +
-                                                           1] + self.chain.cweight[idx] + self.chain.bwd_mem_tmp[idx]
-
-    def _mmax_ng(self, i: int, j: int):
-        """
-        Calculate the maximum memory usage of CF_i, F_i+1\empty, ... F_j\empty
-        """
-
-        res = self.chain.cweight[j + 1] + self.chain.fwd_mem_tmp[j]
-        if j > i:
-            res += self.chain.cweight[j]
-        return res
-
-    def _rotor_estimated_bwd(self, i, j, m, delta):
-        compute = self.rotor_table[0][math.floor((m - self.chain.cweight[i]) / self.mem_unit)][i][j]
-        comm = delta / self.bandwidth
-        return (max(compute, comm) + compute + comm) / 2
-
-    def _rotor_estimated_bwd_sequence(self, i, j, m, delta):
-        return _rec(self.disc_chain, i, j, math.floor((m - self.chain.cweight[i]) / self.mem_unit), self.rotor_table)
-
-    def _common_values_enable(self, state: Tuple):
-
-        idx, act_size, df, db, input_has_bar = state
-        input_size = self.chain.cbweight[idx] if input_has_bar else self.chain.cweight[idx]
-        mf = act_size + df + input_size
-        mb = act_size + db + input_size
-        mem_avail = self.max_memory - act_size - input_size
-        f_usage = self._mmax_all(idx)
-        b_usage = self._mmax_b(idx)
-
-        # infeasible
-        if f_usage > mem_avail or b_usage > mem_avail:
-            return None
-
-        # calculate idle time
-        eps_f_beta = max(0, f_usage - self.max_memory + mf)
-        eps_b_beta = max(0, b_usage - self.max_memory + mb)
-        idle_time = (eps_f_beta + eps_b_beta) / self.bandwidth
-
-        # calculate offload and prefetch data
-        offload_data = self.chain.fweight[idx] * self.bandwidth + eps_f_beta
-        prefetch_data = self.chain.bweight[idx] * self.bandwidth + eps_b_beta
-
-        # total_time
-        total_time = self.chain.fweight[idx] + self.chain.bweight[idx] + idle_time
-
-        return (offload_data, prefetch_data, total_time, idle_time)
-
-    def _common_values_nograd(self, state: Tuple, j: int, iterative: bool = False):
-
-        i, act_size, df, db, input_has_bar = state
-
-        # compute new epsilon_tmp and sum_fwds
-        if iterative:
-            self.epsilon_tmp = max(self.epsilon_tmp, self._mmax_ng(i, j) - self.bandwidth * self.sum_fwds)
-            self.sum_fwds += self.chain.fweight[j]
-        else:
-            self.epsilon_tmp = max(
-                self._mmax_ng(i, k) - self.bandwidth * sum(self.chain.fweight[i:k]) for k in range(i, j + 1))
-            self.sum_fwds = sum(self.chain.fweight[i:j + 1])
-
-        input_size = self.chain.cbweight[i] if input_has_bar else self.chain.cweight[i]
-        mf = act_size + df + input_size
-        mem_avail = self.max_memory - act_size - input_size
-
-        # if infeasible
-        if max(self._mmax_ng(i, k) for k in range(i, self.length)) > mem_avail:
-            return None
-
-        eps_f_beta = max(0, self.epsilon_tmp - self.max_memory + mf)
-        offload_data = self.sum_fwds * self.bandwidth + eps_f_beta
-
-        # TODO: Implement the precise backward recompute sequence mentioned in the paper
-        # currently we will use an approximate way to get the backward time
-        time_backward = self._rotor_estimated_bwd(i, j, mem_avail, db)
-
-        prefetch_data = time_backward * self.bandwidth
-        idle_time = eps_f_beta / self.bandwidth
-        total_time = self.sum_fwds + idle_time + time_backward
-
-        return (offload_data, prefetch_data, total_time, idle_time)
-
-    def _new_values(self, state: Tuple, do_offload: bool, common_values: Tuple) -> Tuple:
-        """Generate new values for next state
-
-        Args:
-            state (Tuple): undiscretized states
-            do_offload (bool): bool type indicates whether we need to do offload
-            common_values (Tuple): common values (offload_data, prefetch_data, total_time, idle_time)
-
-        Returns:
-            Tuple: (new_act_size, new_df, new_db)
-        """
-        idx, act_size, df, db, input_has_bar = state
-        offload_data, prefetch_data, *_ = common_values
-        input_size = self.chain.cbweight[idx] if input_has_bar else self.chain.cweight[idx]
-        if do_offload:
-            new_act_size = act_size
-            new_df = max(0, df + input_size - offload_data)
-            new_db = max(0, db - prefetch_data) + input_size
-        else:
-            new_act_size = act_size + input_size
-            new_df = max(0, df - offload_data)
-            new_db = max(0, db - prefetch_data)
-
-        return (new_act_size, new_df, new_db)
-
-    def _compute_pofo_table(self):
-        self.table = PofoTable(self.length, self.mem_slots)
-
-        # initializing the loss
-        for act_size in range(self.mem_slots + 1):
-            for df in range(self.mem_slots - act_size + 1):
-                for db in range(self.mem_slots - act_size + 1):
-                    # undiscretize for idle time calculation
-                    origin_values = self._undiscretize(act_size, df, db)
-
-                    for input_has_bar in (False, True):
-                        disc_state = (self.length, act_size, df, db, input_has_bar)
-                        state = (self.length, *origin_values, input_has_bar)
-                        common_values = self._common_values_enable(state)
-
-                        # if no feasible choice
-                        if common_values is None:
-                            self.table.set_value(disc_state, INF, None)
-                            continue
-
-                        # if there is feasible choice
-                        new_act_size, new_df, new_db = self._new_values(state, False, common_values)
-                        eps_g = (new_df + new_db) / self.bandwidth
-                        total_time = common_values[2] + eps_g
-                        self.table.set_value(disc_state, total_time, (True, False))
-
-        # main loop
-        for i in reversed(range(self.length)):
-            for act_size in range(self.mem_slots + 1):
-                for df in range(self.mem_slots - act_size + 1):
-                    for db in range(self.mem_slots - act_size + 1):
-                        # undiscretize for idle time calculation
-                        origin_values = self._undiscretize(act_size, df, db)
-
-                        for input_has_bar in (False, True):
-                            best_result = INF
-                            best_choice = None
-                            disc_state = (i, act_size, df, db, input_has_bar)
-                            state = (i, *origin_values, input_has_bar)
-
-                            # case 1: start with F_all
-                            vals_enable = self._common_values_enable(state)
-                            if vals_enable is not None:
-                                for do_offload in (True, False):
-                                    new_state = self._new_values(state, do_offload, vals_enable)
-                                    new_state = (i + 1, *self._discretize(*new_state), True)
-                                    total_time = vals_enable[2]
-                                    results_all = self.table.get_opt(new_state) + total_time
-                                    if results_all < best_result:
-                                        best_result = results_all
-                                        best_choice = (True, do_offload)
-
-                            # case 2: start with F_ck
-                            self.sum_fwds = 0
-                            self.epsilon_tmp = 0
-                            for j in range(i, self.length):
-                                vals_nograd = self._common_values_nograd(state, j, True)
-
-                                # if infeasible
-                                if vals_nograd is None:
-                                    continue
-
-                                for do_offload in (True, False):
-                                    new_state = self._new_values(state, do_offload, vals_nograd)
-                                    new_state = (j + 1, *self._discretize(*new_state), False)
-                                    total_time = vals_nograd[2]
-                                    result_nograd = total_time + self.table.get_opt(new_state)
-                                    if result_nograd < best_result:
-                                        best_result = result_nograd
-                                        best_choice = (False, do_offload, j)
-
-                            self.table.set_value(disc_state, best_result, best_choice)
-
-    def pofo_rec(self, disc_state):
-        i, act_size, df, db, input_has_bar = disc_state
-        result = Sequence(Function("pofo", *disc_state))
-        what = self.table.get_what(disc_state)
-        state = self._undiscretize(act_size, df, db)
-        state = (i, *state, input_has_bar)
-        i, act_size, df, db, input_has_bar = state
-
-        if what is None:
-            return None
-
-        # if loss
-        if i == self.length:
-            result.insert(Loss())
-            return result
-
-        if what[0]:
-            do_offload = what[1]
-            values = self._common_values_enable(state)
-            new_state = self._discretize(*self._new_values(state, do_offload, values))
-            new_state = (i + 1, *new_state, True)
-            if do_offload:
-                result.insert(Offload(i, input_has_bar))
-            result.insert(ForwardEnable(i))
-            result.insert_sequence(self.pofo_rec(new_state))
-            if do_offload:
-                result.insert(Prefetch(i, input_has_bar))
-            result.insert(Backward(i))
-
-        else:
-            _, do_offload, j = what
-            values = self._common_values_nograd(state, j)
-            new_state = self._discretize(*self._new_values(state, do_offload, values))
-            new_state = (j + 1, *new_state, False)
-            if do_offload:
-                result.insert(Offload(i, input_has_bar))
-            result.insert(ForwardCheck(i))
-            for k in range(i + 1, j + 1):
-                result.insert(ForwardNograd(k))
-            result.insert_sequence(self.pofo_rec(new_state))
-            if do_offload:
-                result.insert(Prefetch(i, input_has_bar))
-            m = self.max_memory - act_size - (self.chain.cbweight[i] if input_has_bar else self.chain.cweight[i])
-
-            #TODO: Implement the precise backward recompute sequence mentioned in the paper
-            result.insert_sequence(self._rotor_estimated_bwd_sequence(i, j, m, db))
-
-        return result
-
-
-def _annotate_from_pofo_sequence(sequence: Sequence, node_list: List[List[Node]]):
-    op_list = sequence.list_operations()
-    loss_op = next(op for op in op_list if isinstance(op, Loss))
-    fwd_list = op_list[:op_list.index(loss_op)]
-    bwd_list = op_list[op_list.index(loss_op) + 1:]
-    ckpt_idx = 0
-    in_ckpt = False
-    ckpt_region = []
-
-    # forward annotation
-    for op in fwd_list:
-        if in_ckpt:
-            if isinstance(op, ForwardNograd):
-                ckpt_region.append(op.index)
-
-            elif isinstance(op, ForwardEnable):
-                in_ckpt = False
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        setattr(n, "activation_checkpoint", [ckpt_idx])
-
-                ckpt_idx += 1
-                ckpt_region = []
-
-            elif isinstance(op, ForwardCheck):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        setattr(n, "activation_checkpoint", [ckpt_idx])
-
-                ckpt_idx += 1
-                ckpt_region = [op.index]
-
-        else:
-            if isinstance(op, ForwardCheck):
-                in_ckpt = True
-                ckpt_region.append(op.index)
-
-    # annotate the backward if there is any nested activation checkpoint
-    in_recompute = False
-    for op in bwd_list:
-        if in_recompute:
-            if isinstance(op, ForwardNograd):
-                ckpt_region.append(op.index)
-
-            elif isinstance(op, ForwardEnable):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        n.activation_checkpoint.append(ckpt_idx)
-
-                ckpt_idx += 1
-                ckpt_region = []
-
-            elif isinstance(op, ForwardCheck):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        n.activation_checkpoint.append(ckpt_idx)
-
-                ckpt_idx += 1
-                ckpt_region = [op.index]
-
-            elif isinstance(op, Backward):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        n.activation_checkpoint.append(ckpt_idx)
-
-                in_recompute = False
-
-        else:
-            if not isinstance(op, Backward):
-                in_recompute = True
-                ckpt_idx = 0
-                ckpt_region = []
-                if isinstance(op, ForwardCheck):
-                    ckpt_region.append(op.index)
-
-    # postprocess, make sure every activation checkpoint label in the
-    # same activation checkpoint region (level = 0) has the same length
-    op_list = []
-    for node in node_list:
-        op_list += node
-    ckpt_regions = _find_nested_ckpt_regions(op_list)
-    for (start_idx, end_idx) in ckpt_regions:
-        nested_length = max(len(op_list[idx].activation_checkpoint) for idx in range(start_idx, end_idx + 1))
-        for idx in range(start_idx, end_idx + 1):
-            op_list[idx].activation_checkpoint += [None] * (nested_length - len(op_list[idx].activation_checkpoint))
-
-    # annotate the offload
-    offload_idx = 0
-    for idx, op in enumerate(fwd_list):
-        if isinstance(op, Offload):
-            # corner case: offload input
-            if op.index == 0:
-                if isinstance(fwd_list[idx + 1], ForwardCheck):
-                    for n in node_list[op.index]:
-                        setattr(n, "activation_offload", True)
-                else:
-                    for n in node_list[op.index]:
-                        setattr(n, "activation_offload", (offload_idx, True, False))
-                    offload_idx += 1
-
-            else:
-                if op.has_bar:
-                    # annotate previous node
-                    if hasattr(node_list[op.index - 1][0], "activation_offload"):
-                        for n in node_list[op.index - 1]:
-                            n.activation_offload[-1] = True
-                    else:
-                        for n in node_list[op.index - 1]:
-                            setattr(n, "activation_offload", [offload_idx, False, True])
-
-                        offload_idx += 1
-
-                # annotate this node
-                if isinstance(fwd_list[idx + 1], ForwardCheck):
-                    for n in node_list[op.index]:
-                        setattr(n, "activation_offload", True)
-                else:
-                    for n in node_list[op.index]:
-                        setattr(n, "activation_offload", [offload_idx, True, False])
-
-                    offload_idx += 1
-
-
-def solver_pofo(gm: ColoGraphModule,
-                data,
-                bandwidth,
-                flops,
-                mem_limit: int,
-                mem_slots: int = 50,
-                cnode: List[str] = None,
-                eps: float = 0.0) -> ColoGraphModule:
-    """Solver that combine offload and activation checkpoint
-    Reference: https://proceedings.neurips.cc/paper/2021/hash/c8461bf13fca8a2b9912ab2eb1668e4b-Abstract.html
-
-    Args:
-        gm (ColoGraphModule): ColoGraphModule derived from tracer
-        data: input of the model
-        bandwidth: offload bandwidth, unit Byte/s
-        flops: FLOPS of device, unit FLOPs/s
-        mem_limit (int): memory limit, unit Byte
-        mem_slots (int, optional): number of memory slots. Defaults to 500.
-        cnode (List[str], optional): common node for linearize. Defaults to None.
-        eps (float, optional): epsilon for memory decay. Defaults to 0.02.
-
-    Returns:
-        ColoGraphModule: annotated graph module
-    """
-
-    node_list = linearize(gm, cnode)
-    mem_limit -= parameter_size(gm)
-
-    # prepare data
-    if is_compatible_with_meta():
-        from colossalai.fx.profiler import MetaTensor
-        data = MetaTensor(data, fake_device=next(gm.parameters()).device)
-    MetaInfoProp(gm).run(data)
-    chain: Chain = _construct_chain(node_list, data)
-    chain = _normalize_flops(chain, flops)
-    # currently we view loss as an op without expense
-    chain.cbweight.append(0)
-    chain.cweight.append(0)
-    chain.fwd_mem_tmp.append(0)
-    chain.bwd_mem_tmp.append(0)
-    chain.fweight.append(0)
-    chain.bweight.append(0)
-
-    solver = PofoSolver(chain, mem_limit, bandwidth, mem_slots)
-    first_state = (0, 0, 0, 0, False)
-    sequence = solver.pofo_rec(first_state)
-    if sequence == None:
-        raise ValueError(f"Cannot solve sequence with {mem_limit} Bytes memory")
-
-    _annotate_from_pofo_sequence(sequence, node_list)
-    setattr(gm, "__sequence__", sequence)
-    return gm
diff --git a/colossalai/fx/passes/algorithms/ckpt_solver_rotor.py b/colossalai/fx/passes/algorithms/ckpt_solver_rotor.py
deleted file mode 100644
index 5b8d0da9ffe6..000000000000
--- a/colossalai/fx/passes/algorithms/ckpt_solver_rotor.py
+++ /dev/null
@@ -1,436 +0,0 @@
-import math
-import sys
-from typing import List, Tuple
-
-from torch.fx import Node
-
-from colossalai.fx.codegen.activation_checkpoint_codegen import _find_nested_ckpt_regions
-from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.profiler import activation_size, calculate_fwd_out, calculate_fwd_tmp, parameter_size
-from colossalai.logging import get_dist_logger
-
-from .linearize import linearize
-from .operation import Backward, Chain, ForwardCheck, ForwardEnable, ForwardNograd, Function, Loss, Sequence
-
-# global vairable to indicate whether the solver is failed
-SOLVER_FAILED = False
-
-
-# this is the python compute table code from rotor
-# https://gitlab.inria.fr/hiepacs/rotor
-# paper link: https://hal.inria.fr/hal-02352969
-def _compute_table(chain: Chain, mmax) -> Tuple:
-    """Returns the optimal table: a tuple containing:
-    Opt[m][lmin][lmax] with lmin = 0...chain.length
-         and lmax = lmin...chain.length (lmax is not included) and m = 0...mmax
-    what[m][lmin][lmax] is (True,) if the optimal choice is a chain checkpoint
-                           (False, j) if the optimal choice is a leaf checkpoint of length j
-    The computation uses dynamic programming"""
-
-    fw = chain.fweight + [0]    ## forward time
-    bw = chain.bweight    ## backward time, not used
-    cw = chain.cweight + [0]    ## size of x (and of y)
-    cbw = chain.cbweight + [0]    ## size of xbar
-    fwd_mem_tmp = chain.fwd_mem_tmp + [0]
-    bwd_mem_tmp = chain.bwd_mem_tmp + [0]
-
-    # Build table
-    opt = [[{} for _ in range(chain.length + 1)] for _ in range(mmax + 1)]
-    what = [[{} for _ in range(chain.length + 1)] for _ in range(mmax + 1)]
-    # Last one is a dict because its indices go from i to l. Renumbering will wait for C implementation
-
-    # Initialize borders of the tables for lmax-lmin = 0
-    for m in range(mmax + 1):
-        for i in range(chain.length + 1):
-            #lmax-lmin = 0
-            limit = max(cw[i + 1] + cbw[i + 1] + fwd_mem_tmp[i], cw[i + 1] + cbw[i + 1] + bwd_mem_tmp[i])
-            if m >= limit:    ## Equation (1)
-                opt[m][i][i] = fw[i] + bw[i]
-            else:
-                opt[m][i][i] = float("inf")
-
-    # Compute everything
-    for m in range(mmax + 1):
-        for d in range(1, chain.length + 1):
-            for i in range(chain.length + 1 - d):
-                # for idx in range(i+1, chain.length + 1):
-                idx = i + d
-                mmin = cw[idx + 1] + cw[i + 1] + fwd_mem_tmp[i]
-                if idx > i + 1:
-                    mmin = max(mmin, cw[idx + 1] + max(cw[j] + cw[j + 1] + fwd_mem_tmp[j] for j in range(i + 1, idx)))
-                if m < mmin:
-                    opt[m][i][idx] = float("inf")
-                else:
-                    leaf_checkpoints = [(j, sum(fw[i:j]) + opt[m - cw[j]][j][idx] + opt[m][i][j - 1])
-                                        for j in range(i + 1, idx + 1)
-                                        if m >= cw[j]]
-                    if leaf_checkpoints:
-                        best_leaf = min(leaf_checkpoints, key=lambda t: t[1])
-                    else:
-                        best_leaf = None
-                    if m >= cbw[i + 1]:
-                        chain_checkpoint = opt[m][i][i] + opt[m - cbw[i + 1]][i + 1][idx]
-                    else:
-                        chain_checkpoint = float("inf")
-                    if best_leaf and best_leaf[1] <= chain_checkpoint:
-                        opt[m][i][idx] = best_leaf[1]
-                        what[m][i][idx] = (False, best_leaf[0])
-                    else:
-                        opt[m][i][idx] = chain_checkpoint
-                        what[m][i][idx] = (True,)
-    return (opt, what)
-
-
-def _rec(chain: Chain, lmin, lmax, cmem, opt_table):
-    """ chain : the class describing the AC graph
-        lmin : index of the first forward to execute
-        lmax : upper bound index of the last forward to execute (not included)
-        cmem : number of available memory slots
-        Return the optimal sequence of makespan Opt_hete[cmem][lmin][lmax-lmin]"""
-    if cmem <= 0:
-        raise ValueError("Can not process a chain with negative memory {cmem}".format(cmem=cmem))
-    opt, what = opt_table
-    sequence = Sequence(Function("Persistent", lmax - lmin, cmem))
-    if opt[cmem][lmin][lmax] == float("inf"):
-        # using logger to annonce that the solver is failed
-        logger = get_dist_logger()
-        logger.info("Can not process this chain from index {lmin} to {lmax} with memory {cmem}".format(lmin=lmin,
-                                                                                                       lmax=lmax,
-                                                                                                       cmem=cmem))
-
-        # set global indicater SOLVER_FAILED to True
-        global SOLVER_FAILED
-        SOLVER_FAILED = True
-        return sequence
-
-    if lmin == lmax:
-        if lmin == chain.length:
-            sequence.insert(Loss())
-        else:
-            sequence.insert(ForwardEnable(lmin))
-            sequence.insert(Backward(lmin))
-        return sequence
-
-    if what[cmem][lmin][lmax][0]:
-        sequence.insert(ForwardEnable(lmin))
-        sequence.insert_sequence(_rec(chain, lmin + 1, lmax, cmem - chain.cbweight[lmin + 1], opt_table))
-        sequence.insert(Backward(lmin))
-    else:
-        j = what[cmem][lmin][lmax][1]
-        sequence.insert(ForwardCheck(lmin))
-        for k in range(lmin + 1, j):
-            sequence.insert(ForwardNograd(k))
-        sequence.insert_sequence(_rec(chain, j, lmax, cmem - chain.cweight[j], opt_table))
-        sequence.insert_sequence(_rec(chain, lmin, j - 1, cmem, opt_table))
-    return sequence
-
-
-def _fwd_xbar(node: List[Node]) -> int:
-    """Get the forward xbar of a node
-
-    Args:
-        node (List[Node]): List of torch.fx Node,
-        indicates a node in linearized graph
-
-    Returns:
-        int: xbar size, unit Byte
-    """
-
-    xbar = 0
-    for n in node:
-        xbar += calculate_fwd_tmp(n) + calculate_fwd_out(n)
-    return xbar
-
-
-def _fwd_time(node: List[Node]) -> int:
-    """Get the foward time of a node
-
-    Args:
-        node (List[Node]): List of torch.fx Node,
-        indicates a node in linearized graph
-
-    Returns:
-        int: foward time, extimated by flops count
-    """
-
-    fwd_time = 0
-    for n in node:
-        # minimum flop count is needed
-        fwd_time += max(n.meta['fwd_flop'], 1)
-    return fwd_time
-
-
-def _bwd_time(node: List[Node]) -> int:
-    """Get the backward time of a node
-
-    Args:
-        node (List[Node]): List of torch.fx Node,
-        indicates a node in linearized graph
-
-    Returns:
-        int: backward time, extimated by flops count
-    """
-
-    bwd_time = 0
-    for n in node:
-        # minimum flop count is needed
-        bwd_time += max(n.meta['bwd_flop'], 1)
-    return bwd_time
-
-
-def _get_fwd_mem_tmp(node: List[Node]) -> int:
-    """Get the forward temp memory of a node
-    This could be done by subtracting the saved activation from all output of a node
-
-    Args:
-        node (List[Node]): List of torch.fx Node,
-        indicates a node in linearized graph
-
-    Returns:
-        int: forward temp memory, unit Byte
-    """
-    n = node[-1]
-    return activation_size(n.meta['fwd_out']) - calculate_fwd_out(n)
-
-
-def _get_bwd_mem_tmp(node: List[Node]) -> int:
-    """Get the backward temp memory of a node
-
-    Args:
-        node (List[Node]): List of torch.fx Node,
-        indicates a node in linearized graph
-
-    Returns:
-        int: backward temp memory, unit Byte
-    """
-
-    def _get_deps_size():
-        deps_size = 0
-        for k, v in deps.items():
-            k: Node
-            if v > 0:
-                deps_size += k.meta['bwd_mem_out']
-            if v == float('-inf'):
-                deps_size -= calculate_fwd_tmp(k) + calculate_fwd_out(k)
-
-        return deps_size
-
-    bwd_mem_tmp = 0
-    deps = {}
-
-    for n in reversed(node):
-        deps[n] = len(n.all_input_nodes)
-        bwd_mem_tmp = max(bwd_mem_tmp, _get_deps_size() + n.meta['bwd_mem_tmp'])
-
-        for child in n.users:
-            if child in deps:
-                deps[child] -= 1
-                if deps[child] <= 0:
-                    deps[child] = float('-inf')    # free
-
-    return bwd_mem_tmp
-
-
-def _construct_chain(node_list: List[List[Node]], input) -> Chain:
-
-    fwd_time = []
-    bwd_time = []
-    xbar_sizes = [activation_size(input)]
-    x_sizes = [activation_size(input)]
-    tmp_fwd = []
-    tmp_bwd = []
-
-    for idx, node in enumerate(node_list):
-        fwd_time.append(_fwd_time(node))
-        bwd_time.append(_bwd_time(node))
-        x_sizes.append(calculate_fwd_out(node[-1]))
-        xbar_sizes.append(max(x_sizes[-1], _fwd_xbar(node)))
-        tmp_fwd.append(_get_fwd_mem_tmp(node))
-        tmp_bwd.append(_get_bwd_mem_tmp(node))
-
-    bwd_time.append(0)
-
-    # currently we view loss backward temp as zero
-    tmp_bwd.append(0)
-
-    return Chain(fwd_time, bwd_time, x_sizes, xbar_sizes, tmp_fwd, tmp_bwd)
-
-
-def _annotate_from_sequence(sequence: Sequence, node_list: List[List[Node]]):
-    op_list = sequence.list_operations()
-    loss_op = next(op for op in op_list if isinstance(op, Loss))
-    fwd_list = op_list[:op_list.index(loss_op)]
-    bwd_list = op_list[op_list.index(loss_op) + 1:]
-    ckpt_idx = 0
-    in_ckpt = False
-    ckpt_region = []
-
-    # forward annotation
-    for idx, op in enumerate(fwd_list, 0):
-        if in_ckpt:
-            if isinstance(op, ForwardNograd):
-                ckpt_region.append(idx)
-
-            elif isinstance(op, ForwardEnable):
-                in_ckpt = False
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        setattr(n, "activation_checkpoint", [ckpt_idx])
-
-                ckpt_idx += 1
-                ckpt_region = []
-
-            elif isinstance(op, ForwardCheck):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        setattr(n, "activation_checkpoint", [ckpt_idx])
-
-                ckpt_idx += 1
-                ckpt_region = [idx]
-
-        else:
-            if isinstance(op, ForwardCheck):
-                in_ckpt = True
-                ckpt_region.append(idx)
-
-    # annotate the backward if there is any nested activation checkpoint
-    in_recompute = False
-    for op in bwd_list:
-        if in_recompute:
-            if isinstance(op, ForwardNograd):
-                ckpt_region.append(op.index)
-
-            elif isinstance(op, ForwardEnable):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        n.activation_checkpoint.append(ckpt_idx)
-
-                ckpt_idx += 1
-                ckpt_region = []
-
-            elif isinstance(op, ForwardCheck):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        n.activation_checkpoint.append(ckpt_idx)
-
-                ckpt_idx += 1
-                ckpt_region = [op.index]
-
-            elif isinstance(op, Backward):
-                for node_idx in ckpt_region:
-                    for n in node_list[node_idx]:
-                        n.activation_checkpoint.append(ckpt_idx)
-
-                in_recompute = False
-
-        else:
-            if not isinstance(op, Backward):
-                in_recompute = True
-                ckpt_idx = 0
-                ckpt_region = []
-                if isinstance(op, ForwardCheck):
-                    ckpt_region.append(op.index)
-
-    # postprocess, make sure every activation checkpoint label in the
-    # same activation checkpoint region (level = 0) has the same length
-    op_list = []
-    for node in node_list:
-        op_list += node
-    ckpt_regions = _find_nested_ckpt_regions(op_list)
-    for (start_idx, end_idx) in ckpt_regions:
-        nested_length = max(len(op_list[idx].activation_checkpoint) for idx in range(start_idx, end_idx + 1))
-        for idx in range(start_idx, end_idx + 1):
-            op_list[idx].activation_checkpoint += [None] * (nested_length - len(op_list[idx].activation_checkpoint))
-
-
-def solver_rotor(gm: ColoGraphModule,
-                 data,
-                 mem_limit: int,
-                 mem_slots: int = 500,
-                 cnode: List[str] = None,
-                 eps: float = 0.0,
-                 force_python: bool = False) -> ColoGraphModule:
-    """solver that automatically find activation checkpoint in rotor's manner
-
-    Args:
-        gm (ColoGraphModule): ColoGraphModule generated by tracing model and MetaInfoProp.
-        data (torch.Tensor): input data.
-        mem_limit (int): memory budget in Byte.
-        mem_slots (int, optional): number of slots for discretizing memory budget. Defaults to 500.
-        cnode (List[Node], optional): common node list for linearize. Defaults to None.
-        eps (float): epsilon for memory decay. Defaults to 0.0
-        force_python (bool): force to use python version of dynamic programs
-
-    Returns:
-        ColoGraphModule: annotated ColoGraphModuled with __sequence__ attribute
-    """
-
-    # try to import C version solver if force_python is not set
-    logger = get_dist_logger()
-    if not force_python:
-        try:
-            from .dynamic_programs_C_version import persistent_compute_table
-            CVERSION = True
-
-        # build module if module not found
-        except ModuleNotFoundError:
-            import os
-            import subprocess
-            logger.info("dynamic_programs_C_version hasn't been built! Building library...", ranks=[0])
-            this_dir = os.path.dirname(os.path.abspath(__file__))
-            result = subprocess.Popen(
-                [
-                    f"{sys.executable}", f"{os.path.join(this_dir, 'build_c_ext.py')}", "build_ext",
-                    f"--build-lib={this_dir}"
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-            )
-            if result.wait() == 0:
-                logger.info("dynamic_programs_C_version has been built!", ranks=[0])
-                from .dynamic_programs_C_version import persistent_compute_table
-                CVERSION = True
-            else:
-                logger.info("dynamic_programs_C_version built failed! Using python version!", ranks=[0])
-                CVERSION = False
-    else:
-        CVERSION = False
-
-    # check if metainfoprop is done
-    if any(len(node.meta) == 0 for node in gm.graph.nodes):
-        raise RuntimeError(
-            "Nodes meta information hasn't been prepared! Please run MetaInfoProp before calling solver!")
-
-    # linearize the graph
-    node_list = linearize(gm, cnode)
-
-    # construct chain
-    mem_unit = mem_limit * (1.0 - eps) // mem_slots
-    chain: Chain = _construct_chain(node_list, data)
-    chain._discretize(mem_unit)
-
-    # use C version if possible
-    if CVERSION and not force_python:
-        logger.info("Using C version rotor solver!", ranks=[0])
-        opt_table = persistent_compute_table(chain, mem_slots)
-    else:
-        opt_table = _compute_table(chain, mem_slots)
-        logger.info("Using python version rotor solver!", ranks=[0])
-
-    # found sequence
-    sequence = _rec(chain, 0, chain.length, mem_slots - chain.cweight[0], opt_table)
-
-    # if solver failed, we don't need to annotate the graph
-    if not SOLVER_FAILED:
-        _annotate_from_sequence(sequence, node_list)
-
-    # set __sequence__ attribute to GraphModule
-    if SOLVER_FAILED:
-        setattr(gm, "__sequence__", None)
-    else:
-        setattr(gm, "__sequence__", sequence)
-
-    # set __opttable__ attribute to GraphModule
-    setattr(gm, "__opttable__", opt_table[0])
-    gm.recompile()
-    return gm
diff --git a/colossalai/fx/passes/algorithms/dynamic_programs.c b/colossalai/fx/passes/algorithms/dynamic_programs.c
deleted file mode 100644
index 3efad58400fa..000000000000
--- a/colossalai/fx/passes/algorithms/dynamic_programs.c
+++ /dev/null
@@ -1,516 +0,0 @@
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-
-long* PySequenceToLongArray(PyObject* pylist) {
-  if (!(pylist && PySequence_Check(pylist))) return NULL;
-  Py_ssize_t len = PySequence_Size(pylist);
-  long* result = (long*)calloc(len + 1, sizeof(long));
-  for (Py_ssize_t i = 0; i < len; ++i) {
-    PyObject* item = PySequence_GetItem(pylist, i);
-    result[i] = PyLong_AsLong(item);
-    Py_DECREF(item);
-  }
-  result[len] = 0;
-  return result;
-}
-
-double* PySequenceToDoubleArray(PyObject* pylist) {
-  if (!(pylist && PySequence_Check(pylist))) return NULL;
-  Py_ssize_t len = PySequence_Size(pylist);
-  double* result = (double*)calloc(len + 1, sizeof(double));
-  for (Py_ssize_t i = 0; i < len; ++i) {
-    PyObject* item = PySequence_GetItem(pylist, i);
-    result[i] = PyFloat_AsDouble(item);
-    Py_DECREF(item);
-  }
-  result[len] = 0;
-  return result;
-}
-
-long* getLongArray(PyObject* container, const char* attributeName) {
-  PyObject* sequence = PyObject_GetAttrString(container, attributeName);
-  long* result = PySequenceToLongArray(sequence);
-  Py_DECREF(sequence);
-  return result;
-}
-
-double* getDoubleArray(PyObject* container, const char* attributeName) {
-  PyObject* sequence = PyObject_GetAttrString(container, attributeName);
-  double* result = PySequenceToDoubleArray(sequence);
-  Py_DECREF(sequence);
-  return result;
-}
-
-static PyObject* persistent_compute_table(PyObject* self, PyObject* args) {
-  PyObject* chain_param;
-  int mmax;
-
-  if (!PyArg_ParseTuple(args, "Oi", &chain_param, &mmax)) return NULL;
-
-  double* fw = getDoubleArray(chain_param, "fweight");
-  if (!fw) return NULL;
-
-  double* bw = getDoubleArray(chain_param, "bweight");
-  if (!bw) return NULL;
-
-  long* cw = getLongArray(chain_param, "cweight");
-  if (!cw) return NULL;
-
-  long* cbw = getLongArray(chain_param, "cbweight");
-  if (!cbw) return NULL;
-
-  long* fwd_tmp = getLongArray(chain_param, "fwd_mem_tmp");
-  if (!cbw) return NULL;
-
-  long* bwd_tmp = getLongArray(chain_param, "bwd_mem_tmp");
-  if (!cbw) return NULL;
-
-  PyObject* chain_length_param = PyObject_GetAttrString(chain_param, "length");
-  if (!chain_length_param) return NULL;
-  long chain_length = PyLong_AsLong(chain_length_param);
-  Py_DECREF(chain_length_param);
-
-  // TODO: Can be optimized by only allocating memory for l >= i
-  // TODO: float / int instead of double / long ?
-#define OPT(m, i, l)                                  \
-  opt[(m) * (chain_length + 1) * (chain_length + 1) + \
-      (i) * (chain_length + 1) + (l)]
-  double* opt = (double*)calloc(
-      (mmax + 1) * (chain_length + 1) * (chain_length + 1), sizeof(double));
-
-#define WHAT(m, i, l)                                  \
-  what[(m) * (chain_length + 1) * (chain_length + 1) + \
-       (i) * (chain_length + 1) + (l)]
-  long* what = (long*)calloc(
-      (mmax + 1) * (chain_length + 1) * (chain_length + 1), sizeof(long));
-
-  for (long m = 0; m <= mmax; ++m)
-    for (long i = 0; i <= chain_length; ++i)
-      // TODO: Can be optimized to remove the IF by reordering loops
-      if ((m >= cw[i + 1] + cbw[i + 1] + bwd_tmp[i]) &&
-          (m >= cw[i + 1] + cbw[i + 1] + fwd_tmp[i]))
-        OPT(m, i, i) = fw[i] + bw[i];
-      else
-        OPT(m, i, i) = INFINITY;
-
-  for (long m = 0; m <= mmax; ++m)
-    for (long d = 1; d <= chain_length; ++d) {
-      for (long i = 0; i <= chain_length - d; ++i) {
-        long idx = i + d;
-        long mmin = cw[idx + 1] + cw[i + 1] + fwd_tmp[i];
-        if (idx > i + 1) {
-          long maxCostFWD = 0;
-          for (long j = i + 1; j < idx; j++) {
-            maxCostFWD = fmaxl(maxCostFWD, cw[j] + cw[j + 1] + fwd_tmp[j]);
-          }
-          mmin = fmaxl(mmin, cw[idx + 1] + maxCostFWD);
-        }
-        if ((m >= mmin)) {
-          long bestLeaf = -1;
-          double sumFw = 0;
-          double bestLeafCost = INFINITY;
-          /// sumFw + OPT(m-cw[i+1], i+1, l) + OPT(m, i, i); // Value for j =
-          /// i+1
-          for (long j = i + 1; j <= idx; ++j) {
-            sumFw += fw[j - 1];
-            if (m >= cw[j]) {
-              double cost = sumFw + OPT(m - cw[j], j, idx) + OPT(m, i, j - 1);
-              if (cost < bestLeafCost) {
-                bestLeafCost = cost;
-                bestLeaf = j;
-              }
-            }
-          }
-          double chainCost = INFINITY;
-          if (m >= cbw[i + 1])
-            chainCost = OPT(m, i, i) + OPT(m - cbw[i + 1], i + 1, idx);
-          if (bestLeafCost <= chainCost) {
-            OPT(m, i, idx) = bestLeafCost;
-            WHAT(m, i, idx) = bestLeaf;
-          } else {
-            OPT(m, i, idx) = chainCost;
-            WHAT(m, i, idx) = -1;
-          }
-        } else
-          OPT(m, i, idx) = INFINITY;
-      }
-    }
-
-  free(fw);
-  free(bw);
-  free(cw);
-  free(cbw);
-  free(fwd_tmp);
-  free(bwd_tmp);
-
-  PyObject* res_opt = PyList_New(mmax + 1);
-  PyObject* res_what = PyList_New(mmax + 1);
-
-  // Convert the result into Python world
-  for (long m = 0; m <= mmax; ++m) {
-    PyObject* res_opt_m = PyList_New(chain_length + 1);
-    PyList_SET_ITEM(res_opt, m, res_opt_m);
-    PyObject* res_what_m = PyList_New(chain_length + 1);
-    PyList_SET_ITEM(res_what, m, res_what_m);
-    for (long i = 0; i <= chain_length; ++i) {
-      PyObject* res_opt_m_i = PyDict_New();
-      PyList_SET_ITEM(res_opt_m, i, res_opt_m_i);
-      PyObject* res_what_m_i = PyDict_New();
-      PyList_SET_ITEM(res_what_m, i, res_what_m_i);
-      for (long l = i; l <= chain_length; ++l) {
-        PyObject* res_l = PyLong_FromLong(l);
-        PyObject* res_opt_m_i_l = PyFloat_FromDouble(OPT(m, i, l));
-        PyDict_SetItem(res_opt_m_i, res_l, res_opt_m_i_l);
-        Py_DECREF(res_opt_m_i_l);
-        PyObject* res_what_m_i_l;
-        long what_m_i_l = WHAT(m, i, l);
-        if (what_m_i_l < 0)
-          res_what_m_i_l = Py_BuildValue("(O)", Py_True);
-        else
-          res_what_m_i_l = Py_BuildValue("(Ol)", Py_False, what_m_i_l);
-        PyDict_SetItem(res_what_m_i, res_l, res_what_m_i_l);
-        Py_DECREF(res_what_m_i_l);
-        Py_DECREF(res_l);
-      }
-    }
-  }
-
-  free(opt);
-  free(what);
-
-  PyObject* result = PyTuple_Pack(2, res_opt, res_what);
-  Py_DECREF(res_opt);
-  Py_DECREF(res_what);
-  return result;
-}
-
-//  long i = L - s, j = t - s, k = l - t
-inline long floating_index_in_array(long m_factor, long m, long i, long j,
-                                    long k) {
-  return m * m_factor + (i * (i + 1) * (2 * i + 4)) / 12 + (i + 1) * j -
-         (j * (j - 1)) / 2 + k;
-}
-
-typedef struct {
-  long sp;
-  long r;
-  long tp;
-} index_t;
-
-static PyObject* floating_compute_table(PyObject* self, PyObject* args) {
-  PyObject* chain_param;
-  int mmax;
-
-  if (!PyArg_ParseTuple(args, "Oi", &chain_param, &mmax)) return NULL;
-
-  double* fw = getDoubleArray(chain_param, "fweigth");
-  if (!fw) return NULL;
-
-  double* bw = getDoubleArray(chain_param, "bweigth");
-  if (!bw) return NULL;
-
-  long* cw = getLongArray(chain_param, "cweigth");
-  if (!cw) return NULL;
-
-  long* cbw = getLongArray(chain_param, "cbweigth");
-  if (!cbw) return NULL;
-
-  long* fwd_tmp = getLongArray(chain_param, "fwd_tmp");
-  if (!fwd_tmp) return NULL;
-
-  long* bwd_tmp = getLongArray(chain_param, "bwd_tmp");
-  if (!bwd_tmp) return NULL;
-
-  PyObject* chain_length_param = PyObject_GetAttrString(chain_param, "length");
-  if (!chain_length_param) return NULL;
-  long chain_length = PyLong_AsLong(chain_length_param);
-  Py_DECREF(chain_length_param);
-
-  const long m_factor =
-      (chain_length + 1) * (chain_length + 2) * (2 * chain_length + 6) / 12;
-
-  // Defined for 0 <= s <= t <= l <= chain_length, for all m
-#undef OPT
-#define OPT(m, s, t, l)                                                     \
-  opt[floating_index_in_array(m_factor, (m), chain_length - (s), (t) - (s), \
-                              (l) - (t))]
-  double* opt = (double*)calloc((mmax + 1) * m_factor, sizeof(double));
-
-#undef WHAT
-#define WHAT(m, s, t, l)                                                     \
-  what[floating_index_in_array(m_factor, (m), chain_length - (s), (t) - (s), \
-                               (l) - (t))]
-  index_t* what = (index_t*)calloc((mmax + 1) * m_factor, sizeof(index_t));
-
-  double* partialSumsFW = (double*)calloc(chain_length + 1, sizeof(double));
-  double total = 0;
-  for (long i = 0; i < chain_length; ++i) {
-    partialSumsFW[i] = total;
-    total += fw[i];
-  }
-  partialSumsFW[chain_length] = total;
-
-  for (long m = 0; m <= mmax; ++m)
-    for (long i = 0; i <= chain_length; ++i) {
-      // TODO: Can be optimized to remove the IF by reordering loops
-      if ((m >= cw[i] + cw[i + 1] + cbw[i + 1] + bwd_tmp[i]) &&
-          (m >= cw[i + 1] + cbw[i + 1] + fwd_tmp[i]))
-        OPT(m, i, i, i) = fw[i] + bw[i];
-      else
-        OPT(m, i, i, i) = INFINITY;
-    }
-
-  for (long m = 0; m <= mmax; ++m)
-    for (long d = 1; d <= chain_length; ++d) {  // d = l - s
-      for (long s = 0; s <= chain_length - d; ++s) {
-        long l = s + d;
-        long memNullFirst = cw[l + 1] + cw[s + 1] + fwd_tmp[s];
-        long memNullSecond = 0;
-        for (long j = s + 1; j < l; ++j) {
-          long val = cw[j] + cw[j + 1] + fwd_tmp[j];
-          if (val > memNullSecond) memNullSecond = val;
-        }
-        for (long t = s; t <= l; ++t) {
-          double chainCost = INFINITY;
-          if ((s == t) && (m >= cw[l + 1] + cbw[s + 1] + fwd_tmp[s]) &&
-              (m >= cw[s] + cw[s + 1] + cbw[s + 1] + bwd_tmp[s])) {
-            chainCost = OPT(m, s, s, s) + OPT(m - cbw[s + 1], s + 1, s + 1, l);
-          }
-          double bestLeafCost = INFINITY;
-          index_t bestLeaf = {.sp = -1, .r = -1, .tp = -1};
-          if (m >= memNullFirst && m >= cw[l + 1] + memNullSecond) {
-            for (long r = s; r <= t; ++r)
-              if (cw[s] <= cw[r])
-                for (long tp = t + 1; tp <= l; ++tp)
-                  for (long sp = r + 1; sp <= tp; ++sp) {
-                    long mp = m - cw[r] + cw[s];
-                    assert(mp >= 0);
-                    if (mp >= cw[sp]) {
-                      double value = partialSumsFW[sp] - partialSumsFW[s] +
-                                     OPT(mp - cw[sp], sp, tp, l) +
-                                     OPT(mp, r, t, tp - 1);
-                      if (value < bestLeafCost) {
-                        bestLeafCost = value;
-                        bestLeaf.sp = sp;
-                        bestLeaf.r = r;
-                        bestLeaf.tp = tp;
-                      }
-                    }
-                  }
-          }
-          if (bestLeaf.sp >= 0 && bestLeafCost <= chainCost) {
-            OPT(m, s, t, l) = bestLeafCost;
-            WHAT(m, s, t, l).sp = bestLeaf.sp;
-            WHAT(m, s, t, l).r = bestLeaf.r;
-            WHAT(m, s, t, l).tp = bestLeaf.tp;
-          } else {
-            OPT(m, s, t, l) = chainCost;
-            WHAT(m, s, t, l).sp = -1;
-          }
-        }
-      }
-    }
-
-  free(fw);
-  free(bw);
-  free(cw);
-  free(cbw);
-  free(fwd_tmp);
-  free(bwd_tmp);
-
-  PyObject* res_opt = PyList_New(mmax + 1);
-  PyObject* res_what = PyList_New(mmax + 1);
-
-  // Convert the result into Python world
-  PyObject* true_tuple = Py_BuildValue("(O)", Py_True);
-  for (long m = 0; m <= mmax; ++m) {
-    PyObject* res_opt_m = PyDict_New();
-    PyList_SET_ITEM(res_opt, m, res_opt_m);
-    PyObject* res_what_m = PyDict_New();
-    PyList_SET_ITEM(res_what, m, res_what_m);
-    for (long s = 0; s <= chain_length; ++s)
-      for (long t = s; t <= chain_length; ++t)
-        for (long l = t; l <= chain_length; ++l) {
-          PyObject* key = Py_BuildValue("(lll)", s, t, l);
-          PyObject* value_opt = PyFloat_FromDouble(OPT(m, s, t, l));
-          PyDict_SetItem(res_opt_m, key, value_opt);
-          PyObject* value_what = true_tuple;
-          index_t* idx_what = &WHAT(m, s, t, l);
-          if (idx_what->sp >= 0)
-            value_what = Py_BuildValue("(O(lll))", Py_False, idx_what->sp,
-                                       idx_what->r, idx_what->tp);
-          PyDict_SetItem(res_what_m, key, value_what);
-          if (value_what != true_tuple) Py_DECREF(value_what);
-          Py_DECREF(key);
-          Py_DECREF(value_opt);
-        }
-  }
-
-  Py_DECREF(true_tuple);
-
-  free(opt);
-  free(what);
-
-  PyObject* result = PyTuple_Pack(2, res_opt, res_what);
-  Py_DECREF(res_opt);
-  Py_DECREF(res_what);
-  return result;
-}
-
-static PyObject* griewank_heterogeneous_compute_table(PyObject* self,
-                                                      PyObject* args) {
-  PyObject* chain_param;
-  int mmax;
-
-  if (!PyArg_ParseTuple(args, "Oi", &chain_param, &mmax)) return NULL;
-
-  double* fw = getDoubleArray(chain_param, "fweigth");
-  if (!fw) return NULL;
-
-  double* bw = getDoubleArray(chain_param, "bweigth");
-  if (!bw) return NULL;
-
-  long* cw = getLongArray(chain_param, "cweigth");
-  if (!cw) return NULL;
-
-  long* cbw = getLongArray(chain_param, "cbweigth");
-  if (!cbw) return NULL;
-
-  PyObject* chain_length_param = PyObject_GetAttrString(chain_param, "length");
-  if (!chain_length_param) return NULL;
-  long chain_length = PyLong_AsLong(chain_length_param);
-  Py_DECREF(chain_length_param);
-
-  // TODO: Can be optimized by only allocating memory for l >= i
-  // TODO: float / int instead of double / long ?
-#undef OPT
-#define OPT(m, i, l)                                  \
-  opt[(m) * (chain_length + 1) * (chain_length + 1) + \
-      (i) * (chain_length + 1) + (l)]
-  double* opt = (double*)calloc(
-      (mmax + 1) * (chain_length + 1) * (chain_length + 1), sizeof(double));
-
-  // Compute partial sums
-  double* sumfw = (double*)calloc(chain_length, sizeof(double));
-  double* sumbw = (double*)calloc(chain_length + 1, sizeof(double));
-  double* sumsumfw = (double*)calloc(chain_length, sizeof(double));
-
-  double total = 0;
-  for (long i = 0; i < chain_length; ++i) {
-    total += fw[i];
-    sumfw[i] = total;
-  }
-
-  total = 0;
-  for (long i = 0; i < chain_length + 1; ++i) {
-    total += bw[i];
-    sumbw[i] = total;
-  }
-
-  total = 0;
-  for (long i = 0; i < chain_length; ++i) {
-    total += sumfw[i];
-    sumsumfw[i] = total;
-  }
-
-  for (long m = 0; m <= mmax; ++m)
-    for (long i = 0; i <= chain_length; ++i) {
-      // TODO: Can be optimized to remove the IF by reordering loops
-      if ((m >= cbw[i]) && (m >= cw[i] + cbw[i + 1]))
-        OPT(m, i, i) = bw[i];
-      else
-        OPT(m, i, i) = INFINITY;
-
-      if (i < chain_length) {
-        long maxC = fmaxl(cw[i], cw[i + 1]);
-        long maxCB = fmaxl(cbw[i + 1], cbw[i + 2] + maxC);
-        if ((m >= cbw[i]) && (m >= cw[i] + maxCB))
-          OPT(m, i, i + 1) = fw[i] + bw[i] + bw[i + 1];
-        else
-          OPT(m, i, i + 1) = INFINITY;
-      }
-    }
-
-  for (long m = 0; m <= mmax; ++m)
-    for (long i = 0; i + 2 <= chain_length; ++i) {
-      long mminCst = fmaxl(cbw[i], cbw[i + 1] + cw[i]);
-      long maxCW_il = fmax(fmax(cw[i], cw[i + 1]), cw[i + 2]);
-      long maxCostFWD = cw[i] + cbw[i + 2] + maxCW_il;
-      for (long l = i + 2; l <= chain_length; ++l) {
-        maxCW_il = fmax(maxCW_il, cw[l + 1]);
-        maxCostFWD = fmaxl(maxCostFWD, cw[i] + cw[l + 1] + maxCW_il);
-        long mmin = fmaxl(mminCst, maxCostFWD);
-        if ((m >= mmin)) {
-          double noCheckpointCost = sumbw[l] - (i > 0 ? sumbw[i - 1] : 0);
-          noCheckpointCost +=
-              sumsumfw[l - 1] -
-              (i > 0 ? sumsumfw[i - 1] + (l - i) * sumfw[i - 1] : 0);
-
-          double valueCost = INFINITY;
-          if (m >= cw[i]) {
-            double sumFwds = 0;
-            for (long j = i + 1; j < l; ++j) {
-              sumFwds += fw[j - 1];
-              valueCost = fmin(
-                  valueCost, sumFwds + OPT(m - cw[i], j, l) + OPT(m, i, j - 1));
-            }
-          }
-          OPT(m, i, l) = fmin(noCheckpointCost, valueCost);
-        } else
-          OPT(m, i, l) = INFINITY;
-      }
-    }
-
-  free(sumfw);
-  free(sumbw);
-  free(sumsumfw);
-  free(fw);
-  free(bw);
-  free(cw);
-  free(cbw);
-
-  PyObject* res_opt = PyList_New(mmax + 1);
-
-  // Convert the result into Python world
-  for (long m = 0; m <= mmax; ++m) {
-    PyObject* res_opt_m = PyList_New(chain_length + 1);
-    PyList_SET_ITEM(res_opt, m, res_opt_m);
-    for (long i = 0; i <= chain_length; ++i) {
-      PyObject* res_opt_m_i = PyDict_New();
-      PyList_SET_ITEM(res_opt_m, i, res_opt_m_i);
-      for (long l = i; l <= chain_length; ++l) {
-        PyObject* res_l = PyLong_FromLong(l - i);
-        PyObject* res_opt_m_i_l = PyFloat_FromDouble(OPT(m, i, l));
-        PyDict_SetItem(res_opt_m_i, res_l, res_opt_m_i_l);
-        Py_DECREF(res_opt_m_i_l);
-        Py_DECREF(res_l);
-      }
-    }
-  }
-
-  free(opt);
-
-  return res_opt;
-}
-
-static PyMethodDef dynamic_programs_methods[] = {
-    {"persistent_compute_table", persistent_compute_table, METH_VARARGS,
-     "Compute the optimal table with the persistent algorithm."},
-    {"floating_compute_table", floating_compute_table, METH_VARARGS,
-     "Compute the optimal table with the floating algorithm."},
-    {"griewank_heterogeneous_compute_table",
-     griewank_heterogeneous_compute_table, METH_VARARGS,
-     "Compute the optimal table for the Griewank Heterogeneous Model."},
-    {NULL, NULL, 0, NULL} /* Sentinel */
-};
-
-static struct PyModuleDef dynamic_programs_module = {
-    PyModuleDef_HEAD_INIT, "dynamic_programs_C_version", /* name of module */
-    NULL, /* module documentation, may be NULL */
-    -1,   /* size of per-interpreter state of the module,
-                     or -1 if the module keeps state in global variables. */
-    dynamic_programs_methods};
-
-PyMODINIT_FUNC PyInit_dynamic_programs_C_version(void) {
-  return PyModule_Create(&dynamic_programs_module);
-}
diff --git a/colossalai/fx/passes/algorithms/linearize.py b/colossalai/fx/passes/algorithms/linearize.py
deleted file mode 100644
index 1a49364f5a7c..000000000000
--- a/colossalai/fx/passes/algorithms/linearize.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from typing import List, Any
-from torch.fx import GraphModule, Node
-from colossalai.fx.profiler import is_inplace
-
-# Common nodes are type of nodes that could be seen as attributes and remain
-# unchanged throughout the whole model, it will be used several times by
-# different blocks of model, so that it is hard for us to linearize the graph
-# when we encounter those kinds of nodes. We let users to annotate some of the
-# input as common node, such as attention mask, and the followings are some of
-# the ops that could actually be seen as common nodes. With our common node prop,
-# we could find some of the "real" common nodes (e.g. the real attention mask
-# used in BERT and GPT), the rule is simple, for node who's parents are all common
-# nodes or it's op belongs to the following operations, we view this node as a
-# newly born common node.
-# List of target name that could be seen as common node
-COPS = ["getattr", "getitem", "size"]
-
-
-def _is_cop(target: Any) -> bool:
-    """Check if an op could be seen as common node
-
-    Args:
-        target (Any): node target
-
-    Returns:
-        bool
-    """
-
-    if isinstance(target, str):
-        return target in COPS
-    else:
-        return target.__name__ in COPS
-
-
-def linearize(gm: GraphModule, cnode: List[str] = None) -> List[List[Node]]:
-    """Linearizing the graph
-
-    Args:
-        gm (GraphModule): GraphModule derived by tracing
-        cnode (List[str], optional): common node List, should be the subset of input. Default to None.
-
-    Returns:
-        List[List[Node]]: List of list, each inside list of Node presents
-        the actual 'node' in linearized manner.
-
-    Remarks:
-        We merge the inplace ops into the previous node.
-    """
-
-    def _is_sink() -> bool:
-        """Check if we can free all dependencies
-
-        Returns:
-            bool
-        """
-
-        return not sum([v for _, v in deps.items()]) and not any(map(is_inplace, n.users))
-
-    # make sure that item in cnode is valid
-    if cnode:
-        for name in cnode:
-            try:
-                assert next(node for node in gm.graph.nodes if node.name == name).op == "placeholder", \
-                f"common node {name} is not an input of the model"
-            except StopIteration:
-                raise ValueError(f"common node name {name} not in graph")
-
-    else:
-        cnode = []
-
-    deps = {}
-    linearized_nodes = []
-    region = []
-
-    for n in gm.graph.nodes:
-        if n.op != "placeholder" and n.op != "output":
-            for n_par in n._input_nodes:
-                if n_par.op != "placeholder" and n_par.name not in cnode:
-                    deps[n_par] -= 1
-            region.append(n)
-
-            # if the node could free all dependencies in graph
-            # we could begin a new node
-            if _is_sink():
-                linearized_nodes.append(region)
-                region = []
-
-            # propagate common node attr if possible
-            if len(n._input_nodes) == len([node for node in n._input_nodes if node.name in cnode]) or _is_cop(n.target):
-                cnode.append(n.name)
-            else:
-                deps[n] = len([user for user in n.users if user.op != "output"])
-
-    return linearized_nodes
diff --git a/colossalai/fx/passes/algorithms/operation.py b/colossalai/fx/passes/algorithms/operation.py
deleted file mode 100644
index 8bfa3452ba64..000000000000
--- a/colossalai/fx/passes/algorithms/operation.py
+++ /dev/null
@@ -1,270 +0,0 @@
-import math
-
-
-def _discretize(mem_unit, values):
-    return [math.ceil(value / mem_unit) for value in values]
-
-
-class Chain:
-
-    def __init__(self, fw, bw, cw, cbw, ftmp, btmp, check=True):
-        self.fweight = fw
-        self.bweight = bw
-        self.cweight = cw
-        self.cbweight = cbw
-        self.fwd_mem_tmp = ftmp
-        self.bwd_mem_tmp = btmp
-        self.length = len(fw)
-        if check and not self.check_lengths():
-            raise AttributeError("In Chain, input lists do not have consistent lengths")
-
-    def check_lengths(self):
-        return ((len(self.fweight) == self.length) and (len(self.bweight) == self.length + 1)
-                and (len(self.cweight) == self.length + 1) and (len(self.fwd_mem_tmp) == self.length)
-                and (len(self.bwd_mem_tmp) == self.length + 1) and (len(self.cbweight) == self.length + 1))
-
-    def __repr__(self):
-        chain_list = []
-        for i in range(self.length):
-            chain_list.append((self.fweight[i], self.bweight[i], self.cweight[i], self.cbweight[i], self.fwd_mem_tmp[i],
-                               self.bwd_mem_tmp[i]))
-        i = self.length
-        chain_list.append((None, self.bweight[i], self.cweight[i], self.cbweight[i], None, self.bwd_mem_tmp[i]))
-        return chain_list.__repr__()
-
-    def _discretize(self, mem_unit):
-        self.cweight = _discretize(mem_unit, self.cweight)
-        self.cbweight = _discretize(mem_unit, self.cbweight)
-        self.fwd_mem_tmp = _discretize(mem_unit, self.fwd_mem_tmp)
-        self.bwd_mem_tmp = _discretize(mem_unit, self.bwd_mem_tmp)
-
-
-class Operation:
-
-    def shift(self, value):
-        if type(self.index) is tuple:
-            self.index = tuple(x + value for x in self.index)
-        else:
-            self.index += value
-
-
-class Offload(Operation):
-
-    def __init__(self, index, has_bar=False) -> None:
-        super().__init__()
-        self.index = index
-        self.name = "Off"
-        self.has_bar = has_bar
-        if self.has_bar:
-            self.name += "wBar"
-
-    def __repr__(self):
-        return f"{self.name}_{self.index}"
-
-
-class Prefetch(Operation):
-
-    def __init__(self, index, has_bar=False) -> None:
-        super().__init__()
-        self.index = index
-        self.name = "Pre"
-        self.has_bar = has_bar
-        if self.has_bar:
-            self.name += "wBar"
-
-    def __repr__(self):
-        return f"{self.name}_{self.index}"
-
-
-class Forward(Operation):
-
-    def __init__(self, index):
-        self.index = index
-        self.name = "F"
-
-    def __repr__(self):
-        return "{n}_{i}".format(n=self.name, i=self.index)
-
-    def cost(self, chain: Chain):
-        if chain is not None:
-            return chain.fweight[self.index]
-        else:
-            return 1
-
-
-class ForwardEnable(Forward):
-
-    def __init__(self, index):
-        super().__init__(index)
-        self.name = "Fe"
-
-
-class ForwardNograd(Forward):
-
-    def __init__(self, index):
-        super().__init__(index)
-        self.name = "Fn"
-
-
-class ForwardCheck(Forward):
-
-    def __init__(self, index):
-        super().__init__(index)
-        self.name = "CF"
-
-
-class Forwards(Operation):
-
-    def __init__(self, start, end):
-        self.index = (start, end)
-
-    def __repr__(self):
-        return "F_{i}->{j}".format(i=self.index[0], j=self.index[1])
-
-    def cost(self, chain: Chain):
-        if chain is not None:
-            return sum(chain.fweight[self.index[0]:self.index[1] + 1])
-        else:
-            return (self.index[1] - self.index[0] + 1)
-
-
-def isForward(op):
-    return type(op) is Forward or type(op) is Forwards
-
-
-class Backward(Operation):
-
-    def __init__(self, index):
-        self.index = index
-
-    def __repr__(self):
-        return "B_{i}".format(i=self.index)
-
-    def cost(self, chain: Chain):
-        if chain is not None:
-            return chain.bweight[self.index]
-        else:
-            return 1
-
-
-class Loss(Operation):
-
-    def __init__(self):
-        pass
-
-    def __repr__(self):
-        return "L"
-
-    def cost(self, chain):
-        return 0
-
-
-class MemoryAccess(Operation):
-
-    def __init__(self, index):
-        self.index = index
-
-    def __repr__(self):
-        return "{n}_{i}".format(n=self.name, i=self.index)
-
-    def cost(self, chain: Chain):
-        return 0
-
-
-class WriteMemory(MemoryAccess):
-
-    def __init__(self, index):
-        super().__init__(index)
-        self.name = "WM"
-
-
-class ReadMemory(MemoryAccess):
-
-    def __init__(self, index):
-        super().__init__(index)
-        self.name = "RM"
-
-
-class DiscardMemory(MemoryAccess):
-
-    def __init__(self, index):
-        super().__init__(index)
-        self.name = "DM"
-
-
-class Function:
-
-    def __init__(self, name, *args):
-        self.name = name
-        self.args = args
-        self.str_args = ','.join(str(v) for v in self.args)
-
-    def __repr__(self):
-        return "{n}({args})".format(n=self.name, args=self.str_args)
-
-
-class Sequence:
-
-    def __init__(self, function):
-        self.sequence = []    #List of Operation and Sequence
-        self.function = function    #Description the function (name and parameters)
-
-    def __repr__(self):
-        return repr(self.list_operations())
-
-    def list_operations(self):
-        op_list = []
-        for x in self.sequence:
-            if isinstance(x, Operation):
-                op_list.append(x)
-            else:
-                assert isinstance(x, Sequence)
-                op_list += x.list_operations()
-        return op_list
-
-    def insert(self, operation):
-        self.sequence.append(operation)
-
-    def remove(self, operation_index):
-        del self.sequence[operation_index]
-
-    def insert_sequence(self, sequence):
-        self.sequence.append(sequence)
-
-    def shift(self, value):
-        for x in self.sequence:
-            x.shift(value)
-        return self
-
-    def remove_useless_write(self):
-        if self.sequence:
-            if isinstance(self.sequence[0], WriteMemory):
-                self.remove(0)
-        return self
-
-    def get_makespan(self, chain):
-        return sum(op.cost(chain) for op in self.list_operations())
-
-    def without_suffix(self):
-        ops = self.list_operations()
-        end_of_first_phase = [i for i in range(len(ops)) if type(ops[i]) is Loss][0]
-        try:
-            last_idx = max(i for i in range(end_of_first_phase) if not type(ops[i]) is ForwardEnable)
-        except ValueError:
-            last_idx = -1
-        if last_idx == end_of_first_phase - 1:
-            return (self, None)
-        chain_length = ops[end_of_first_phase -
-                           1].index    ## Some assumption here about the sequence (finishes with Forward_L
-        start_of_fwd_enable_chain = ops[last_idx + 1].index    ## And starts with B_L), but should be fine in practice
-        result = Sequence(Function("Strip", self.function.name, *self.function.args, start_of_fwd_enable_chain))
-        for i in range(last_idx + 1):
-            result.insert(ops[i])
-        result.insert(Loss())
-        for i in range(chain_length, start_of_fwd_enable_chain - 1, -1):
-            position = end_of_first_phase + 1 + (chain_length - i)
-            assert type(ops[position]) is Backward
-            assert ops[position].index == i
-        for i in range(end_of_first_phase + 1 + 1 + chain_length - start_of_fwd_enable_chain, len(ops)):
-            result.insert(ops[i])
-        return (result, start_of_fwd_enable_chain)

From 400f63012eb288b849253efd438622d6898f4233 Mon Sep 17 00:00:00 2001
From: Ziyue Jiang <ziyue.jiang97@gmail.com>
Date: Tue, 7 Mar 2023 10:34:31 +0800
Subject: [PATCH 423/503] [pipeline] Add Simplified Alpa DP Partition (#2507)

* add alpa dp split

* add alpa dp split

* use fwd+bwd instead of fwd only

---------

Co-authored-by: Ziyue Jiang <ziyue.jiang@gmail.com>
---
 .../fx/passes/adding_split_node_pass.py       | 161 ++++++++++++++++++
 colossalai/fx/passes/meta_info_prop.py        |   1 +
 colossalai/pipeline/rpc/_pipeline_base.py     |   3 +-
 .../pipeline_parallel/train_gpt_pp.py         |  47 +++--
 4 files changed, 197 insertions(+), 15 deletions(-)

diff --git a/colossalai/fx/passes/adding_split_node_pass.py b/colossalai/fx/passes/adding_split_node_pass.py
index 0499769d884d..2c7b842b530c 100644
--- a/colossalai/fx/passes/adding_split_node_pass.py
+++ b/colossalai/fx/passes/adding_split_node_pass.py
@@ -1,4 +1,6 @@
+import numpy as np
 import torch
+import tqdm
 from torch.fx import symbolic_trace
 from torch.fx.node import Node
 
@@ -9,6 +11,165 @@ def pipe_split():
     pass
 
 
+def block_split():
+    pass
+
+
+# Construct blocks: insert a block_split marker after each run of nodes whose fwd+bwd flop reaches limit * total flop.
+def construct_blocks(gm: torch.fx.GraphModule, limit=0.01):
+    total_fwd_flop = 0
+    total_bwd_flop = 0
+    for node in gm.graph.nodes:
+        total_fwd_flop += node.fwd_flop
+        total_bwd_flop += node.bwd_flop
+
+    total_flop = total_fwd_flop + total_bwd_flop
+    per_block_flop = total_flop * limit
+    accumulate_fwd_flop = 0
+    accumulate_bwd_flop = 0
+    block_nodes = []
+    for node in gm.graph.nodes:
+        if 'block_split' in node.name:
+            continue
+        accumulate_fwd_flop += node.fwd_flop
+        accumulate_bwd_flop += node.bwd_flop
+        if accumulate_fwd_flop + accumulate_bwd_flop >= per_block_flop:
+            with gm.graph.inserting_after(node):
+                block_node = gm.graph.create_node('call_function', block_split)
+                setattr(block_node, 'fwd_flop', accumulate_fwd_flop)
+                setattr(block_node, 'bwd_flop', accumulate_bwd_flop)
+            accumulate_fwd_flop = 0
+            accumulate_bwd_flop = 0
+            block_nodes.append(block_node)
+
+    return block_nodes
+
+
+def remove_blocks(gm: torch.fx.GraphModule):
+    for node in gm.graph.nodes:
+        if (node.op, node.target) == ('call_function', block_split):
+            gm.graph.erase_node(node)
+
+
+def get_compute_costs(node_list):
+    num_nodes = len(node_list)
+    all_compute_cost = np.full((num_nodes, num_nodes), np.inf, dtype=np.float64)
+
+    for start in tqdm.tqdm(range(num_nodes), desc='start pos', position=0):
+        for end in tqdm.tqdm(range(start, num_nodes), desc='end pos', position=1, leave=False):
+            selected_flops = [(node_list[i].fwd_flop + node_list[i].bwd_flop) for i in range(start, end + 1)]
+            all_compute_cost[start, end] = sum(selected_flops)
+
+    return all_compute_cost
+
+
+def do_dp_split_gpipe_impl(num_nodes, num_stages, num_microbatches, compute_costs, max_compute_cost):
+    """Run the DP for one per-stage cost cap; return (total_cost, stage ranges) or (np.inf, None) if infeasible."""
+    # Adapted from Alpa's DP formulation; node IDs start from 0.
+    # f[s, i] is the minimum summed stage cost of splitting nodes i..num_nodes-1 into exactly s
+    # consecutive stages, each costing at most max_compute_cost (inf when no such split exists).
+    # Each returned range is (first node of the stage, first node of the next stage).
+
+    # summed stage cost (currently measured in fwd + bwd flop)
+    f = np.full((num_stages + 1, num_nodes + 1), np.inf, dtype=np.float32)
+
+    # largest single-stage cost within the partition that achieves f[s, i]
+    f_stage_max = np.full((num_stages + 1, num_nodes + 1), 0.0, dtype=np.float32)
+    # index of the node that starts the next stage in that partition (-1 when unset)
+    f_argmin = np.full((num_stages + 1, num_nodes + 1), -1, dtype=np.int32)
+    f[0, num_nodes] = 0
+    for s in tqdm.tqdm(range(1, num_stages + 1), desc='stage', position=2, leave=False):    # pylint: disable=too-many-nested-blocks
+        for i in tqdm.tqdm(range(num_nodes - 1, -1, -1), desc='start node', position=3, leave=False):
+            for k in tqdm.tqdm(range(num_nodes, i, -1), desc='mid node', position=4, leave=False):
+                stage_cost = compute_costs[i, k - 1]
+                new_cost = f[s - 1, k] + stage_cost
+                if (stage_cost <= max_compute_cost and new_cost < f[s, i]):
+                    f[s, i] = new_cost
+                    f_stage_max[s, i] = max(f_stage_max[s - 1, k], stage_cost)
+                    f_argmin[s, i] = k
+
+    best_total_cost = f[num_stages, 0]
+    if np.isinf(best_total_cost):
+        return np.inf, None
+
+    total_cost = f[num_stages, 0] + (num_microbatches - 1) * f_stage_max[num_stages, 0]
+
+    current_s = num_stages
+    current_node = 0
+
+    res = []
+    while current_s > 0 and current_node < num_nodes:
+        next_start_node = f_argmin[current_s, current_node]
+        res.append((current_node, next_start_node))
+        current_s -= 1
+        current_node = next_start_node
+
+    return total_cost, res
+
+
+def do_dp_split_gpipe(node_list, compute_costs, num_stages: int, num_microbatches: int):
+    # Try increasing per-stage cost caps and keep the cheapest plan; Alpa's memory cost profiling is ignored.
+    max_compute_costs = np.sort(np.unique(compute_costs))
+    best_cost = np.inf
+    best_solution = None
+    last_max_compute_cost = 0.0    # NOTE(review): starts at 0, so every cap smaller than `gap` is skipped
+    gap = 1e6    # temporary magic number, unit: flops; minimum spacing between two caps that are tried
+
+    for max_compute_cost in tqdm.tqdm(max_compute_costs):
+        # Pruning to reduce search space: stop once cap * num_microbatches reaches the best cost so far.
+        if max_compute_cost * num_microbatches >= best_cost:
+            break
+        if max_compute_cost - last_max_compute_cost < gap:
+            continue
+
+        cost, solution = do_dp_split_gpipe_impl(len(node_list), num_stages, num_microbatches, compute_costs,
+                                                max_compute_cost)
+
+        if cost < best_cost:
+            best_cost = cost
+            best_solution = solution
+        last_max_compute_cost = max_compute_cost
+    return best_cost, best_solution
+
+
+# Auto DP partition based on Alpa.
+# Adapted to Gpipe Scheduler
+# split_mode:
+#   'node': fx_node
+#   'block': many fx_nodes construct a block
+def gpipe_dp_split_pass(gm: torch.fx.GraphModule, pp_size: int, num_microbatches: int, mode='block', block_limit=0.01):
+    """Insert ``pipe_split`` markers into ``gm`` at the stage boundaries chosen by the DP search.
+
+    ``mode='node'`` partitions over every fx node; ``mode='block'`` partitions over the ``block_split``
+    markers made by ``construct_blocks``, which are erased again before returning. Raises ``RuntimeError``
+    when the search yields no solution. Returns the recompiled ``gm``.
+    """
+    assert mode in ['node', 'block']
+
+    # nodes or blocks will be used in partition.
+    if mode == 'node':
+        node_list = list(gm.graph.nodes)
+    else:
+        node_list = construct_blocks(gm, limit=block_limit)
+
+    compute_costs = get_compute_costs(node_list)
+    _, best_solution = do_dp_split_gpipe(node_list, compute_costs, pp_size, num_microbatches)
+    if best_solution is None:
+        raise RuntimeError(f'gpipe_dp_split_pass: no feasible partition into {pp_size} stages was found')
+
+    # only the first pp_size - 1 stage boundaries get a marker; the last stage needs none after it
+    for _, next_start_node in best_solution[:max(pp_size - 1, 0)]:
+        with gm.graph.inserting_before(node_list[next_start_node]):
+            gm.graph.create_node('call_function', pipe_split)
+
+    # remove block node if possible
+    if mode == 'block':
+        remove_blocks(gm)
+
+    gm.recompile()
+    return gm
+
+
 def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):
     """
     In avgcompute_split_pass, we split module by the fwd flops.
diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py
index 281cae41f77d..c2394a13c697 100644
--- a/colossalai/fx/passes/meta_info_prop.py
+++ b/colossalai/fx/passes/meta_info_prop.py
@@ -114,6 +114,7 @@ def extract_tensor_meta(obj):
         # TODO: the attribute node_size should be removed in the future
         setattr(n, 'node_size', activation_size(n.meta.get('fwd_out', 0)) + activation_size(n.meta.get('fwd_tmp', 0)))
         setattr(n, 'fwd_flop', n.meta.get('fwd_flop', 0))
+        setattr(n, 'bwd_flop', n.meta.get('bwd_flop', 0))
         n.meta['type'] = type(result)
 
         # retain the autograd graph
diff --git a/colossalai/pipeline/rpc/_pipeline_base.py b/colossalai/pipeline/rpc/_pipeline_base.py
index 1edc1ac70d20..2d7e25c82e7b 100644
--- a/colossalai/pipeline/rpc/_pipeline_base.py
+++ b/colossalai/pipeline/rpc/_pipeline_base.py
@@ -1115,7 +1115,8 @@ def _init_worker(self) -> None:
         # let each worker know global worker rref (include itself)
         sync_futs = []
         for pp_rank in self.pp_rank_to_worker_rref:
-            fut = self.pp_rank_to_worker_rref[pp_rank].rpc_async().sync_global_worker_rrefs(self.pp_rank_to_worker_rref)
+            fut = self.pp_rank_to_worker_rref[pp_rank].rpc_async(timeout=0).sync_global_worker_rrefs(
+                self.pp_rank_to_worker_rref)
             sync_futs.append(fut)
 
         for fut in sync_futs:
diff --git a/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py b/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
index c3451c18db8f..ad69888b8cc8 100644
--- a/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
+++ b/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
@@ -8,11 +8,16 @@
 from tqdm import tqdm
 
 from colossalai.fx import ColoTracer
-from colossalai.fx.passes.adding_split_node_pass import avgnode_split_pass, split_with_split_nodes_pass
+from colossalai.fx.passes.adding_split_node_pass import (
+    avgnode_split_pass,
+    gpipe_dp_split_pass,
+    split_with_split_nodes_pass,
+)
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.pipeline.middleware.adaptor import get_fx_topology
-from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
+from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
 from colossalai.pipeline.rpc.utils import rpc_run
 
 
@@ -55,13 +60,25 @@ def get_tflops(model_numel, batch_size, seq_len, step_time):
     return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
 
 
-def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
+# Create an annotated model in which the split points between pipeline stages are marked.
+def get_annotated_model(model, data_kwargs, num_stages, num_microbatches):
     tracer = ColoTracer()
     meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
     graph = tracer.trace(root=model, meta_args=meta_args)
     gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
-    annotated_model = avgnode_split_pass(gm, stage_num)
 
+    interp_meta_args = tuple([v.to('meta') for k, v in data_kwargs.items()])
+    interp = MetaInfoProp(gm)
+    interp.run(*interp_meta_args)
+
+    #annotated_model = avgnode_split_pass(gm, num_stages)
+    annotated_model = gpipe_dp_split_pass(gm, num_stages, num_microbatches, mode='block', block_limit=0.01)
+
+    return annotated_model
+
+
+def create_partition_module(pp_rank: int, num_stages: int, model, data_kwargs, num_microbatches):
+    annotated_model = get_annotated_model(model, data_kwargs, num_stages, num_microbatches)
     top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
     topo = get_fx_topology(top_module)
     for submodule in split_submodules:
@@ -70,8 +87,8 @@ def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
     return split_submodules[pp_rank + 1]
 
 
-def partition(model, data_kwargs, pp_rank: int, chunk: int, stage_num: int):
-    module = create_partition_module(pp_rank, stage_num, model, data_kwargs)
+def partition(model, data_kwargs, num_microbatches, pp_rank: int, chunk: int, stage_num: int):
+    module = create_partition_module(pp_rank, stage_num, model, data_kwargs, num_microbatches)
     return module
 
 
@@ -103,17 +120,19 @@ def run_master(args):
     warmup_data_kwargs = {'input_ids': input_ids, 'attention_mask': attn_mask}
 
     # create model
+    logger.info(f'start model_builder')
     model = model_builder(model_type)(checkpoint=False)
+    logger.info(f'end model_builder')
 
     # set 1f1b pipeline engine
-    pp_engine = OneFOneBPipelineEngine(partition_fn=partial(partition, model, warmup_data_kwargs),
-                                       stage_num=stage_num,
-                                       num_microbatches=num_microbatches,
-                                       device=device,
-                                       chunk=1,
-                                       criterion=criterion,
-                                       metric=None,
-                                       checkpoint=False)
+    pp_engine = FillDrainPipelineEngine(partition_fn=partial(partition, model, warmup_data_kwargs, num_microbatches),
+                                        stage_num=stage_num,
+                                        num_microbatches=num_microbatches,
+                                        device=device,
+                                        chunk=1,
+                                        criterion=criterion,
+                                        metric=None,
+                                        checkpoint=False)
 
     partition_numels = pp_engine.remote_numels()
     for rank, numel in partition_numels.items():

From cd2b0eaa8dd4a7d8a67ce91b93459e07418bd741 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Tue, 7 Mar 2023 11:08:11 +0800
Subject: [PATCH 424/503] [DTensor] refactor sharding spec (#2987)

* [autoparallel] refactor sharding spec

* rename function name
---
 colossalai/tensor/d_tensor/__init__.py        |   0
 colossalai/tensor/d_tensor/layout.py          |  58 ++++-
 colossalai/tensor/d_tensor/misc.py            |  14 ++
 colossalai/tensor/d_tensor/sharding_spec.py   | 237 ++++++++++++++++++
 .../{ => test_dtensor}/test_dtensor.py        |   5 +-
 .../test_dtensor/test_sharding_spec.py        |  34 +++
 6 files changed, 341 insertions(+), 7 deletions(-)
 create mode 100644 colossalai/tensor/d_tensor/__init__.py
 create mode 100644 colossalai/tensor/d_tensor/misc.py
 create mode 100644 colossalai/tensor/d_tensor/sharding_spec.py
 rename tests/test_tensor/{ => test_dtensor}/test_dtensor.py (94%)
 create mode 100644 tests/test_tensor/test_dtensor/test_sharding_spec.py

diff --git a/colossalai/tensor/d_tensor/__init__.py b/colossalai/tensor/d_tensor/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/tensor/d_tensor/layout.py b/colossalai/tensor/d_tensor/layout.py
index 9b72444aa3c6..72a2694a1eaf 100644
--- a/colossalai/tensor/d_tensor/layout.py
+++ b/colossalai/tensor/d_tensor/layout.py
@@ -1,12 +1,15 @@
+import operator
 from dataclasses import dataclass
+from functools import reduce
 
 import torch
 
 from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.sharding_spec import ShardingSpec
+
+from .misc import DuplicatedShardingDimensionError, LayoutException, ShardingNotDivisibleError
+from .sharding_spec import ShardingSpec
 
 
-@dataclass
 class Layout:
     """Layout of a tensor.
 
@@ -16,7 +19,50 @@ class Layout:
         sharding_spec: the sharding specification to describe how the tensor is sharded.
         entire_shape: the entire shape of the global tensor.
     """
-    device_mesh: DeviceMesh
-    device_type: torch.device
-    sharding_spec: ShardingSpec
-    entire_shape: torch.Size = None
+
+    def __init__(self, device_mesh: DeviceMesh, device_type: torch.device, sharding_spec: ShardingSpec,
+                 entire_shape: torch.Size):
+        self.device_mesh = device_mesh
+        self.device_type = device_type
+        self.sharding_spec = sharding_spec
+        self.entire_shape = entire_shape
+        self._sanity_check()
+
+    def __hash__(self) -> int:
+        return hash(f'{self.sharding_spec}')
+
+    def get_sharded_shape_per_device(self):
+        sharded_shape = list(self.entire_shape)
+        for dim, shard_list in self.sharding_spec.dim_partition_dict.items():
+            mesh_list = [self.device_mesh.mesh_shape[mesh_dim] for mesh_dim in shard_list]
+            shard_partitions = reduce(operator.mul, mesh_list, 1)
+            assert sharded_shape[
+                dim] % shard_partitions == 0, f'Cannot shard dimension {dim} into {shard_partitions} partitions.'
+            sharded_shape[dim] //= shard_partitions
+        return torch.Size(sharded_shape)
+
+    def _sanity_check(self):
+        sharding_spec = self.sharding_spec
+
+        # make sure all axes in logical device mesh only be used once
+        dim_check_list = list(range(self.device_mesh.logical_mesh_id.dim()))
+        for dim, shard_list in sharding_spec.dim_partition_dict.items():
+            for element in shard_list:
+                if element in dim_check_list:
+                    dim_check_list.remove(element)
+                else:
+                    raise DuplicatedShardingDimensionError(
+                        f"find an invalid sharding axis {element} in dim_partition_dict in tensor dimension {dim}.")
+
+        # make sure that the sharding for a dimension is divisible by the number of devices
+        for dim, shard_list in sharding_spec.dim_partition_dict.items():
+            tensor_dim_size = self.entire_shape[dim]
+            num_devices = 1
+
+            for element in shard_list:
+                num_devices *= self.device_mesh.mesh_shape[element]
+
+            if tensor_dim_size % num_devices != 0:
+                raise ShardingNotDivisibleError(
+                    f'The size of dimension at index {dim} is {tensor_dim_size}, it cannot be sharded over {num_devices} devices.'
+                )
diff --git a/colossalai/tensor/d_tensor/misc.py b/colossalai/tensor/d_tensor/misc.py
new file mode 100644
index 000000000000..3bb3f6f1961e
--- /dev/null
+++ b/colossalai/tensor/d_tensor/misc.py
@@ -0,0 +1,14 @@
+class LayoutException(Exception):
+    pass
+
+
+class DuplicatedShardingDimensionError(LayoutException):
+    pass
+
+
+class ShardingNotDivisibleError(LayoutException):
+    pass
+
+
+class ShardingOutOfIndexError(LayoutException):
+    pass
diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py
new file mode 100644
index 000000000000..b135c46d68ac
--- /dev/null
+++ b/colossalai/tensor/d_tensor/sharding_spec.py
@@ -0,0 +1,237 @@
+from copy import deepcopy
+from typing import Dict, List
+
+from ..utils import merge_same_dim_mesh_list
+from .misc import ShardingOutOfIndexError
+
+__all__ = ['DimSpec', 'ShardingSpec']
+
+ALLGATHER_COST = 20
+SHARD_COST = 5
+STEP_PENALTY = 6
+NAN = 'nan'
+
+
+class DimSpec:
+    '''
+    Sharding spec for a single dimension of the sharded tensor. It describes which axes of the logical
+    device mesh the dimension is sharded over and gives a method to compute the difference between two specs.
+    This class is used internally in ShardingSpec.
+
+    Argument:
+        shard_list(List[int]): if shard_list is empty, the dim spec will be 'R' type.
+            Otherwise, each element in shard_list is a logical device mesh axis the data is sharded over.
+    '''
+
+    def __init__(self, shard_list):
+        self.is_replica = len(shard_list) == 0
+        self.shard_list = shard_list
+        self.build_difference_2d_dict()
+
+    def __eq__(self, other):
+        return str(self) == str(other)
+
+    def __repr__(self):
+        if self.is_replica:
+            return 'R'
+        target = 'S'
+        for dim in self.shard_list:
+            target += str(dim)
+        return target
+
+    def _convert_str_to_shard_list(self, str_spec):
+        '''
+        Convert str_spec into shard_list. Any string other than 'R', 'S0', 'S1' or 'S01' yields None.
+
+        Argument:
+            str_spec(str): dim spec in str type.
+        '''
+
+        if str_spec == 'R':
+            return []
+        if str_spec == 'S0':
+            return [0]
+        if str_spec == 'S1':
+            return [1]
+        if str_spec == 'S01':
+            return [0, 1]
+
+    def build_difference_2d_dict(self):
+        '''
+        Build the difference mapping for the 2D device mesh case and store it in self.difference_dict. It is keyed by
+        (source_spec, target_spec) string pairs and is used to compute the difference between DimSpec pairs.
+        '''
+
+        source_spec_list = ['R', 'S0', 'S1', 'S01']
+        target_spec_list = ['R', 'S0', 'S1', 'S01']
+        difference_dict = {}
+        for source_spec in source_spec_list:
+            for target_spec in target_spec_list:
+                legal_sharding_dims = []    # NOTE(review): never read afterwards
+                spec_pair = (deepcopy(source_spec), deepcopy(target_spec))
+                source_shard_list = self._convert_str_to_shard_list(source_spec)
+                target_shard_list = self._convert_str_to_shard_list(target_spec)
+
+                # source same as target
+                if source_shard_list == target_shard_list:
+                    difference = 0
+
+                # all_gather(source) -> target: target is source without its last mesh axis
+                elif len(source_shard_list
+                        ) == len(target_shard_list) + 1 and source_shard_list[:-1] == target_shard_list:
+                    difference = ALLGATHER_COST
+
+                # shard(source) -> target: target is source plus one new trailing mesh axis
+                elif len(source_shard_list) == len(
+                        target_shard_list) - 1 and source_shard_list == target_shard_list[:-1] and target_shard_list[
+                            -1] not in source_shard_list:
+                    difference = SHARD_COST
+
+                # S1 -> S0 or S0 -> S1
+                elif len(source_shard_list) == len(target_shard_list):
+                    # source -> R -> target
+                    difference = ALLGATHER_COST + STEP_PENALTY + SHARD_COST
+
+                # R -> S01
+                elif len(source_shard_list) == len(target_shard_list) - 2:
+                    difference = SHARD_COST + STEP_PENALTY + SHARD_COST
+
+                # S01 -> R
+                elif len(source_shard_list) == len(target_shard_list) + 2:
+                    difference = ALLGATHER_COST + STEP_PENALTY + ALLGATHER_COST
+
+                # S1 -> S01 (costed as all_gather, shard, shard)
+                elif len(source_shard_list) == len(target_shard_list) - 1:
+                    difference = ALLGATHER_COST + STEP_PENALTY + SHARD_COST + STEP_PENALTY + SHARD_COST
+
+                # S01 -> S1 (costed as all_gather, all_gather, shard)
+                elif len(source_shard_list) == len(target_shard_list) + 1:
+                    difference = ALLGATHER_COST + STEP_PENALTY + ALLGATHER_COST + STEP_PENALTY + SHARD_COST
+
+                else:
+                    difference = NAN
+                difference_dict[spec_pair] = difference
+
+        self.difference_dict = difference_dict
+
+    def dim_diff(self, other):
+        '''
+        The difference between two DimSpec, looked up in self.difference_dict by the str() of both specs.
+
+        Argument:
+            other(DimSpec): the dim spec to compare with.
+
+        Return:
+            difference(int): the difference between two DimSpec.
+
+        Example:
+            dim_spec = DimSpec([0])
+            other_dim_spec = DimSpec([0, 1])
+            print(dim_spec.dim_diff(other_dim_spec))
+
+        Output:
+            5
+        '''
+        difference = self.difference_dict[(str(self), str(other))]
+        return difference
+
+
+class ShardingSpec:
+    '''
+    Sharding spec describes how to shard a tensor with dim_size dimensions. The sharding sequence looks like
+    [R, R, S0, S1], which means dim 0 and 1 are replicated, dim 2 is sharded on mesh axis 0 and dim 3 on mesh axis 1.
+
+    Argument:
+        dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
+            and the value of the key describes which logical axes will be sharded in that dimension.
+        sharding_sequence(List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
+    '''
+
+    def __init__(self,
+                 dim_size: int,
+                 dim_partition_dict: Dict[int, List[int]] = None,
+                 sharding_sequence: List[DimSpec] = None):
+        self.dims = dim_size
+        self.dim_partition_dict = dim_partition_dict
+        self.sharding_sequence = sharding_sequence
+        if self.sharding_sequence is None:
+            assert self.dim_partition_dict is not None, f'dim_partition_dict should not be None, if sharding_sequence is NoneType object.'
+            self.dim_partition_dict = merge_same_dim_mesh_list(dim_size=self.dims,
+                                                               dim_partition_dict=self.dim_partition_dict)
+            self.sharding_sequence = self.convert_dict_to_shard_sequence()
+
+        elif self.dim_partition_dict is None:
+            assert self.sharding_sequence is not None, f'sharding_sequence should not be None, if dim_partition_dict is NoneType object.'
+            self.dim_partition_dict = self.convert_shard_sequence_to_dict()
+
+        self._sanity_check()
+
+    def _sanity_check(self):
+        if len(self.sharding_sequence) > self.dims:
+            raise ShardingOutOfIndexError(
+                f'sharding_sequence should have {self.dims} elements, but got index {len(self.sharding_sequence)}.')
+        # default=-1: a fully replicated spec has an empty dim_partition_dict, which must not crash max()
+        max_dim = max(self.dim_partition_dict, default=-1)
+        if max_dim >= self.dims:
+            raise ShardingOutOfIndexError(
+                f'the key of dim_partition_dict should be less than {self.dims}, but got {max_dim}.')
+
+    def __repr__(self):
+        res_list = ["ShardingSpec:"]
+        res_list.append(f"\n\tshard_sequence: " + ",".join(str(dimspec) for dimspec in self.sharding_sequence))
+        return ' '.join(res_list)
+
+    def convert_dict_to_shard_sequence(self):
+        '''
+        Convert dim_partition_dict into a list of DimSpec (one per tensor dim, 'R' by default) and return it.
+        '''
+        sharding_sequence = [DimSpec([]) for _ in range(self.dims)]    # distinct objects, no aliasing between dims
+        for dim, shard_list in self.dim_partition_dict.items():
+            sharding_sequence[dim] = DimSpec(shard_list)
+        return sharding_sequence
+
+    def convert_shard_sequence_to_dict(self):
+        '''
+        Convert sharding_sequence into dim_partition_dict.
+        '''
+        new_dim_partition_dict = {}
+        for index, dim_spec in enumerate(self.sharding_sequence):
+            if not dim_spec.is_replica:
+                if index not in new_dim_partition_dict:
+                    new_dim_partition_dict[index] = []
+                new_dim_partition_dict[index].extend(dim_spec.shard_list)
+        return new_dim_partition_dict
+
+    def spec_diff(self, other):
+        '''
+        This function is a naive version of difference computation. It simply accumulates the per-dimension
+        difference between the pair of sharding sequences.
+
+        Example:
+            dim_partition_dict = {0: [0, 1]}
+            # ShardingSpec:
+            #     shard_sequence: S01,R,R
+            #     (dim_size is 3 in this example)
+            sharding_spec = ShardingSpec(3, dim_partition_dict=dim_partition_dict)
+            dim_partition_dict_to_compare = {0: [0], 1: [1]}
+            # ShardingSpec:
+            #     shard_sequence: S0,S1,R
+            #     (dim_size is 3 in this example)
+            sharding_spec_to_compare = ShardingSpec(3, dim_partition_dict=dim_partition_dict_to_compare)
+            print(sharding_spec.spec_diff(sharding_spec_to_compare))
+
+        Output:
+            25
+
+        Argument:
+            other(ShardingSpec): The ShardingSpec to compare with.
+
+        Return:
+            difference(int): Difference between two ShardingSpec.
+        '''
+        assert len(self.sharding_sequence) == len(
+            other.sharding_sequence), f'Cannot compare difference for two sharding specs with different length.'
+        difference = 0
+        for orig_dim_spec, other_dim_spec in zip(self.sharding_sequence, other.sharding_sequence):
+            difference += orig_dim_spec.dim_diff(other_dim_spec)
+        return difference
diff --git a/tests/test_tensor/test_dtensor.py b/tests/test_tensor/test_dtensor/test_dtensor.py
similarity index 94%
rename from tests/test_tensor/test_dtensor.py
rename to tests/test_tensor/test_dtensor/test_dtensor.py
index 1de9563a2eff..80e275d9740e 100644
--- a/tests/test_tensor/test_dtensor.py
+++ b/tests/test_tensor/test_dtensor/test_dtensor.py
@@ -37,7 +37,10 @@ def check_dtensor(rank, world_size, port):
     target_sharding_spec = ShardingSpec(device_mesh=device_mesh,
                                         entire_shape=original_tensor.shape,
                                         dim_partition_dict={0: [0]})
-    layout = Layout(device_mesh=device_mesh, device_type=torch.device('cuda'), sharding_spec=target_sharding_spec)
+    layout = Layout(device_mesh=device_mesh,
+                    device_type=torch.device('cuda'),
+                    sharding_spec=target_sharding_spec,
+                    entire_shape=original_tensor.shape)
     d_tensor = DTensor(original_tensor, layout)
 
     assert d_tensor.entire_shape == original_tensor.shape
diff --git a/tests/test_tensor/test_dtensor/test_sharding_spec.py b/tests/test_tensor/test_dtensor/test_sharding_spec.py
new file mode 100644
index 000000000000..e02f710482a5
--- /dev/null
+++ b/tests/test_tensor/test_dtensor/test_sharding_spec.py
@@ -0,0 +1,34 @@
+import operator
+from functools import reduce
+
+from colossalai.tensor.d_tensor.sharding_spec import ALLGATHER_COST, SHARD_COST, STEP_PENALTY, ShardingSpec
+
+
+def test_sharding_spec():
+    dims = 4
+    dim_partition_dict_0 = {0: [0, 1]}
+    # DistSpec:
+    #     shard_sequence: S01,R,R,R
+    sharding_spec_0 = ShardingSpec(dims, dim_partition_dict=dim_partition_dict_0)
+    assert str(sharding_spec_0.sharding_sequence) == "[S01, R, R, R]"
+
+    dim_partition_dict_1 = {1: [0, 1]}
+    # DistSpec:
+    #     shard_sequence: R,S01,R,R
+    sharding_spec_1 = ShardingSpec(dims, dim_partition_dict=dim_partition_dict_1)
+    assert str(sharding_spec_1.sharding_sequence) == "[R, S01, R, R]"
+
+    dim_spec_list_0 = [dim_spec for dim_spec in sharding_spec_0.sharding_sequence]
+    dim_spec_list_1 = [dim_spec for dim_spec in sharding_spec_1.sharding_sequence]
+
+    assert dim_spec_list_0[0].dim_diff(dim_spec_list_1[0]) == ALLGATHER_COST + STEP_PENALTY + ALLGATHER_COST
+    assert dim_spec_list_0[1].dim_diff(dim_spec_list_1[1]) == SHARD_COST + STEP_PENALTY + SHARD_COST
+    assert dim_spec_list_0[2].dim_diff(dim_spec_list_1[2]) == 0
+    assert dim_spec_list_0[3].dim_diff(dim_spec_list_1[3]) == 0
+
+    assert sharding_spec_0.spec_diff(sharding_spec_1) == \
+        reduce(operator.add, [dim_spec_list_0[i].dim_diff(dim_spec_list_1[i]) for i in range(dims)], 0)
+
+
+if __name__ == '__main__':
+    test_sharding_spec()

From e86d9bb2e1e166c028007650797b8911ebd2fa31 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 7 Mar 2023 12:55:17 +0800
Subject: [PATCH 425/503] [format] applied code formatting on changed files in
 pull request 3025 (#3026)

Co-authored-by: github-actions <github-actions@github.com>
---
 applications/ChatGPT/examples/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
index 39a76911027f..3876d20f02d7 100644
--- a/applications/ChatGPT/examples/README.md
+++ b/applications/ChatGPT/examples/README.md
@@ -15,9 +15,9 @@ Use these code to train your reward model.
 
 ```shell
 # Naive reward model training
-python train_reward_model.py --pretrain <your model path> --model <your model type> --strategy naive 
+python train_reward_model.py --pretrain <your model path> --model <your model type> --strategy naive
 # use colossalai_zero2
-torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain <your model path> --model <your model type> --strategy colossalai_zero2 
+torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain <your model path> --model <your model type> --strategy colossalai_zero2
 ```
 
 ## Train with dummy prompt data (Stage 3)

From 2e427ddf422ad0a450e680ba73354cc12c80bb2e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Mar 2023 13:31:23 +0800
Subject: [PATCH 426/503] [revert] recover "[refactor] restructure
 configuration files (#2977)" (#3022)

This reverts commit 35c8f4ce479e7dc7aab59e03bf00cba2d777ddb0.
---
 .clang-format                              |  1 +
 .isort.cfg                                 |  5 +++++
 .pre-commit-config.yaml                    |  3 +--
 .style.yapf                                |  5 +++++
 .github/CONTRIBUTING.md => CONTRIBUTING.md |  2 +-
 pyproject.toml                             | 19 -------------------
 pytest.ini                                 |  6 ++++++
 7 files changed, 19 insertions(+), 22 deletions(-)
 create mode 100644 .clang-format
 create mode 100644 .isort.cfg
 create mode 100644 .style.yapf
 rename .github/CONTRIBUTING.md => CONTRIBUTING.md (99%)
 delete mode 100644 pyproject.toml
 create mode 100644 pytest.ini

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000000..f6cb8ad931f5
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1 @@
+BasedOnStyle: Google
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 000000000000..090aa28e39f3
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,5 @@
+[settings]
+line_length = 120
+multi_line_output=3
+include_trailing_comma = true
+ignore_comments = true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b98edb6c9a8b..725d266375ef 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,14 +11,13 @@ repos:
     hooks:
     - id: yapf
       name: yapf formatter
-      args: ['--style=pyproject.toml', '--parallel', '--in-place']
+      args: ['--style=.style.yapf', '--parallel', '--in-place']
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v13.0.1
     hooks:
     - id: clang-format
       name: clang formatter
-      args: [--style, "{BasedOnStyle: Google}"]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
diff --git a/.style.yapf b/.style.yapf
new file mode 100644
index 000000000000..05be0dc6a3a5
--- /dev/null
+++ b/.style.yapf
@@ -0,0 +1,5 @@
+[style]
+based_on_style = google
+spaces_before_comment = 4
+split_before_logical_operator = true
+column_limit = 120
diff --git a/.github/CONTRIBUTING.md b/CONTRIBUTING.md
similarity index 99%
rename from .github/CONTRIBUTING.md
rename to CONTRIBUTING.md
index 915c43174c6a..00abcf650158 100644
--- a/.github/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -138,4 +138,4 @@ You can now create a pull request on the GitHub webpage of your repository. The
 
 Do write clearly the description of your pull request and [link the pull request to your target issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). This will automatically close the issue when the pull request is approved.
 
-In case of code conflict, you should rebase your branch and resolve the conflicts manually.
+In case of code conflict, you should rebase your branch and resolve the conflicts manually.
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 5d918a5784ea..000000000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,19 +0,0 @@
-[tool.isort]
-line_length = 120
-multi_line_output = 3
-include_trailing_comma = true
-ignore_comments = true
-
-[tool.yapf]
-based_on_style = "google"
-spaces_before_comment = 4
-split_before_logical_operator = true
-column_limit = 120
-
-[tool.pytest.ini_options]
-markers = [
-    "cpu: tests which can run on CPU",
-    "gpu: tests which requires a single GPU",
-    "dist: tests which are run in a multi-GPU or multi-machine environment",
-    "experiment: tests for experimental features",
-]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 000000000000..ac31ace4bfae
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+markers =
+    cpu: tests which can run on CPU
+    gpu: tests which requires a single GPU
+    dist: tests which are run in a multi-GPU or multi-machine environment
+    experiment: tests for experimental features
\ No newline at end of file

From 2cd6ba30987ac1cb4312d6a2011c9a44f6007a2f Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Mar 2023 13:35:45 +0800
Subject: [PATCH 427/503] [workflow] fixed the post-commit failure when no
 formatting needed (#3020)

* [workflow] fixed the post-commit failure when no formatting needed

* polish code

* polish code

* polish code
---
 .github/workflows/post_commit.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/post_commit.yml b/.github/workflows/post_commit.yml
index 765a4d4281e9..bf93eabbf43f 100644
--- a/.github/workflows/post_commit.yml
+++ b/.github/workflows/post_commit.yml
@@ -68,7 +68,11 @@ jobs:
         done
 
     # create commit for pre-commit
+    # when all files are well formatted, there is no need to create a commit
+    # therefore, this step will produce an error, which should be allowed
     - name: Create commits
+      id: commit
+      continue-on-error: true
       run: |
         git config --global user.name 'github-actions'
         git config --global user.email 'github-actions@github.com'
@@ -78,6 +82,7 @@ jobs:
 
     # create pull request
     - name: Create Pull Request
+      if: steps.commit.outcome == 'success'
       id: cpr
       uses: peter-evans/create-pull-request@v4
       with:
@@ -85,6 +90,7 @@ jobs:
         title: "[format] applied code formatting on changed files in PR ${{ github.event.pull_request.number }}"
 
     - name: Enable Auto-merge for the New PR
+      if: steps.commit.outcome == 'success'
       uses: peter-evans/enable-pull-request-automerge@v2
       with:
         pull-request-number: ${{ steps.cpr.outputs.pull-request-number }}

From 8fedc8766a7fc0c072337ac348b02b5da1037861 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Mar 2023 14:21:26 +0800
Subject: [PATCH 428/503] [workflow] supported conda package installation in
 doc test (#3028)

* [workflow] supported conda package installation in doc test

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 .github/workflows/doc_check_on_pr.yml |  4 ----
 .github/workflows/doc_test_on_pr.yml  | 15 +++++++++++----
 docs/README.md                        |  2 +-
 docs/conda-doc-test-deps.yml          |  2 ++
 4 files changed, 14 insertions(+), 9 deletions(-)
 create mode 100644 docs/conda-doc-test-deps.yml

diff --git a/.github/workflows/doc_check_on_pr.yml b/.github/workflows/doc_check_on_pr.yml
index 6593ac50e168..2022c957fba8 100644
--- a/.github/workflows/doc_check_on_pr.yml
+++ b/.github/workflows/doc_check_on_pr.yml
@@ -44,10 +44,6 @@ jobs:
         with:
           python-version: '3.8.14'
 
-      - run: |
-          ls -la
-          ls -la ..
-
       # we use the versions in the main branch as the guide for versions to display
       # checkout will give your merged branch
       # therefore, we need to make the merged branch as the main branch
diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml
index a0572766f093..a083362a7f0f 100644
--- a/.github/workflows/doc_test_on_pr.yml
+++ b/.github/workflows/doc_test_on_pr.yml
@@ -54,6 +54,9 @@ jobs:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
       options: --gpus all --rm
     timeout-minutes: 20
+    defaults:
+      run:
+        shell: bash
     steps:
       - name: Checkout ColossalAI-Documentation
         uses: actions/checkout@v2
@@ -68,17 +71,21 @@ jobs:
 
       - name: Checkout ColossalAI
         uses: actions/checkout@v3
+      
+      - name: Install Doc Test Requirements
+        run: |
+          source activate pytorch
+          conda env update --file docs/conda-doc-test-deps.yml --prune
+          pip install -r docs/requirements-doc-test.txt
 
       - name: Install ColossalAI
         run: |
+          source activate pytorch
           pip install -v .
 
-      - name: Install Doc Test Requirements
-        run: |
-          pip install -r docs/requirements-doc-test.txt
-
       - name: Test the Doc
         run: |
+          source activate pytorch
           for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
             echo "Testing $file now..."
             docer test -p $file
diff --git a/docs/README.md b/docs/README.md
index 7261a6bc7c19..d5e0c22f5d51 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -98,7 +98,7 @@ Lastly, if you want to skip some code, you just need to add the following annota
 <!--- doc-test-ignore-end -->
 ```
 
-If you have any dependency required, please add it to `requriements-doc-test.txt`.
+If you have any dependency required, please add it to `requirements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for conda.
 
 
 ### 💉 Auto Documentation
diff --git a/docs/conda-doc-test-deps.yml b/docs/conda-doc-test-deps.yml
new file mode 100644
index 000000000000..74a232214adc
--- /dev/null
+++ b/docs/conda-doc-test-deps.yml
@@ -0,0 +1,2 @@
+dependencies:
+  - cmake

From 4269196c79ca61bfbe48ce19a79a1788011c073b Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Tue, 7 Mar 2023 15:50:00 +0800
Subject: [PATCH 429/503] [hotfix] skip auto checkpointing tests (#3029)

* [hotfix] skip auto checkpointing tests

* fix test name issue
---
 .../test_ckpt_solvers/test_C_solver_consistency.py        | 8 +++++---
 .../test_ckpt_solvers/test_ckpt_torchvision.py            | 5 +++--
 .../test_ckpt_solvers/test_linearize.py                   | 7 +++++--
 ...est_sharding_spec.py => test_dtensor_sharding_spec.py} | 4 ++--
 4 files changed, 15 insertions(+), 9 deletions(-)
 rename tests/{test_fx => test_auto_parallel}/test_ckpt_solvers/test_C_solver_consistency.py (94%)
 rename tests/{test_fx => test_auto_parallel}/test_ckpt_solvers/test_ckpt_torchvision.py (97%)
 rename tests/{test_fx => test_auto_parallel}/test_ckpt_solvers/test_linearize.py (95%)
 rename tests/test_tensor/test_dtensor/{test_sharding_spec.py => test_dtensor_sharding_spec.py} (95%)

diff --git a/tests/test_fx/test_ckpt_solvers/test_C_solver_consistency.py b/tests/test_auto_parallel/test_ckpt_solvers/test_C_solver_consistency.py
similarity index 94%
rename from tests/test_fx/test_ckpt_solvers/test_C_solver_consistency.py
rename to tests/test_auto_parallel/test_ckpt_solvers/test_C_solver_consistency.py
index 773cf151d2e9..f8dd0b16b7f6 100644
--- a/tests/test_fx/test_ckpt_solvers/test_C_solver_consistency.py
+++ b/tests/test_auto_parallel/test_ckpt_solvers/test_C_solver_consistency.py
@@ -1,16 +1,17 @@
 import copy
 
-import colossalai
 import pytest
 import torch
 import torch.fx
 import torch.multiprocessing as mp
 import torchvision.models as tm
+
+import colossalai
 from colossalai.core import global_context as gpc
 from colossalai.fx import ColoGraphModule, ColoTracer
 from colossalai.fx._compatibility import is_compatible_with_meta
-from colossalai.fx.passes.algorithms import solver_rotor
-from colossalai.fx.passes.algorithms.operation import Sequence
+# from colossalai.fx.passes.algorithms import solver_rotor
+# from colossalai.fx.passes.algorithms.operation import Sequence
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.utils import free_port
 
@@ -67,6 +68,7 @@ def _run_C_solver_consistency_test(rank=0):
     gpc.destroy()
 
 
+@pytest.mark.skip("TODO(lyl): refactor all tests.")
 @pytest.mark.skipif(not withcodegen, reason="torch version is less than 1.12.0")
 def test_C_solver_consistency():
     mp.spawn(_run_C_solver_consistency_test, nprocs=1)
diff --git a/tests/test_fx/test_ckpt_solvers/test_ckpt_torchvision.py b/tests/test_auto_parallel/test_ckpt_solvers/test_ckpt_torchvision.py
similarity index 97%
rename from tests/test_fx/test_ckpt_solvers/test_ckpt_torchvision.py
rename to tests/test_auto_parallel/test_ckpt_solvers/test_ckpt_torchvision.py
index 9949d49c1e01..89600ea098a9 100644
--- a/tests/test_fx/test_ckpt_solvers/test_ckpt_torchvision.py
+++ b/tests/test_auto_parallel/test_ckpt_solvers/test_ckpt_torchvision.py
@@ -13,7 +13,7 @@
 from colossalai.fx import ColoTracer
 from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.algorithms import chen_greedy, solver_rotor
+# from colossalai.fx.passes.algorithms import chen_greedy, solver_rotor
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 from colossalai.utils import free_port
 
@@ -28,7 +28,8 @@
     from colossalai.fx.codegen import python_code_with_activation_checkpoint
     with_codegen = False
 
-SOLVERS = [chen_greedy, solver_rotor]
+# SOLVERS = [chen_greedy, solver_rotor]
+SOLVERS = []
 
 
 def _is_activation_checkpoint_available(gm: GraphModule):
diff --git a/tests/test_fx/test_ckpt_solvers/test_linearize.py b/tests/test_auto_parallel/test_ckpt_solvers/test_linearize.py
similarity index 95%
rename from tests/test_fx/test_ckpt_solvers/test_linearize.py
rename to tests/test_auto_parallel/test_ckpt_solvers/test_linearize.py
index a803f8c07277..0f90ba0b0989 100644
--- a/tests/test_fx/test_ckpt_solvers/test_linearize.py
+++ b/tests/test_auto_parallel/test_ckpt_solvers/test_linearize.py
@@ -1,11 +1,12 @@
 import pytest
 import torch
 import torchvision.models as tm
+
 from colossalai.fx import ColoTracer
 from colossalai.fx._compatibility import is_compatible_with_meta
 from colossalai.fx.graph_module import ColoGraphModule
-from colossalai.fx.passes.algorithms import linearize, solver_rotor
-from colossalai.fx.passes.algorithms.operation import (ForwardCheck, ForwardEnable, ForwardNograd, Loss)
+# from colossalai.fx.passes.algorithms import linearize, solver_rotor
+# from colossalai.fx.passes.algorithms.operation import (ForwardCheck, ForwardEnable, ForwardNograd, Loss)
 from colossalai.fx.passes.meta_info_prop import MetaInfoProp
 
 if is_compatible_with_meta():
@@ -21,6 +22,7 @@
 
 
 @pytest.mark.skip(reason='TODO: modify the logger')
+@pytest.mark.skip("TODO(lyl): refactor all tests.")
 @pytest.mark.skipif(not with_codegen, reason="torch version is lower than 1.12.0")
 def test_linearize():
     MODEL_DICT = {tm.resnet18: [2100, 3000], tm.densenet121: [8100, 17000]}
@@ -79,6 +81,7 @@ def test_linearize():
             del node_list
 
 
+@pytest.mark.skip("TODO(lyl): refactor all tests.")
 @pytest.mark.skip(reason="torch11 meta tensor not implemented")
 @pytest.mark.skipif(with_codegen, reason="torch version is equal to or higher than 1.12.0")
 def test_linearize_torch11():
diff --git a/tests/test_tensor/test_dtensor/test_sharding_spec.py b/tests/test_tensor/test_dtensor/test_dtensor_sharding_spec.py
similarity index 95%
rename from tests/test_tensor/test_dtensor/test_sharding_spec.py
rename to tests/test_tensor/test_dtensor/test_dtensor_sharding_spec.py
index e02f710482a5..7fd1c3d90fc4 100644
--- a/tests/test_tensor/test_dtensor/test_sharding_spec.py
+++ b/tests/test_tensor/test_dtensor/test_dtensor_sharding_spec.py
@@ -4,7 +4,7 @@
 from colossalai.tensor.d_tensor.sharding_spec import ALLGATHER_COST, SHARD_COST, STEP_PENALTY, ShardingSpec
 
 
-def test_sharding_spec():
+def test_dtensor_sharding_spec():
     dims = 4
     dim_partition_dict_0 = {0: [0, 1]}
     # DistSpec:
@@ -31,4 +31,4 @@ def test_sharding_spec():
 
 
 if __name__ == '__main__':
-    test_sharding_spec()
+    test_dtensor_sharding_spec()

From c21b11edce3b772cdbcb4e5fafe95f62ac49af94 Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Tue, 7 Mar 2023 16:34:22 +0800
Subject: [PATCH 430/503] change nn to models (#3032)

---
 applications/ChatGPT/README.md                 |  3 ++-
 .../ChatGPT/benchmarks/benchmark_gpt_dummy.py  |  3 ++-
 .../benchmarks/benchmark_opt_lora_dummy.py     |  3 ++-
 .../ChatGPT/chatgpt/experience_maker/base.py   |  2 +-
 .../ChatGPT/chatgpt/experience_maker/naive.py  |  2 +-
 .../ChatGPT/chatgpt/models/__init__.py         |  4 ++++
 .../ChatGPT/chatgpt/models/base/__init__.py    |  5 +++++
 .../chatgpt/{nn => models/base}/actor.py       |  6 +++---
 .../chatgpt/{nn => models/base}/critic.py      |  4 ++--
 .../{nn => models/base}/reward_model.py        |  2 +-
 .../ChatGPT/chatgpt/models/bloom/__init__.py   |  5 +++++
 .../{nn => models/bloom}/bloom_actor.py        |  2 +-
 .../{nn => models/bloom}/bloom_critic.py       |  2 +-
 .../chatgpt/{nn => models/bloom}/bloom_rm.py   |  2 +-
 .../chatgpt/{nn => models}/generation.py       |  0
 .../chatgpt/{nn => models}/generation_utils.py |  0
 .../ChatGPT/chatgpt/models/gpt/__init__.py     |  5 +++++
 .../chatgpt/{nn => models/gpt}/gpt_actor.py    |  2 +-
 .../chatgpt/{nn => models/gpt}/gpt_critic.py   |  2 +-
 .../chatgpt/{nn => models/gpt}/gpt_rm.py       |  2 +-
 .../ChatGPT/chatgpt/{nn => models}/lora.py     |  0
 .../ChatGPT/chatgpt/{nn => models}/loss.py     |  0
 .../ChatGPT/chatgpt/models/opt/__init__.py     |  5 +++++
 .../chatgpt/{nn => models/opt}/opt_actor.py    |  2 +-
 .../chatgpt/{nn => models/opt}/opt_critic.py   |  2 +-
 .../chatgpt/{nn => models/opt}/opt_rm.py       |  2 +-
 .../ChatGPT/chatgpt/{nn => models}/utils.py    |  0
 applications/ChatGPT/chatgpt/nn/__init__.py    | 18 ------------------
 applications/ChatGPT/chatgpt/trainer/ppo.py    |  5 +++--
 applications/ChatGPT/chatgpt/trainer/rm.py     |  2 +-
 .../ChatGPT/chatgpt/trainer/strategies/base.py |  2 +-
 .../chatgpt/trainer/strategies/colossalai.py   |  2 +-
 .../ChatGPT/chatgpt/trainer/strategies/ddp.py  |  2 +-
 applications/ChatGPT/examples/inference.py     |  4 +++-
 applications/ChatGPT/examples/train_dummy.py   |  5 ++++-
 applications/ChatGPT/examples/train_prompts.py |  5 ++++-
 .../ChatGPT/examples/train_reward_model.py     |  5 ++++-
 applications/ChatGPT/tests/test_checkpoint.py  |  2 +-
 applications/ChatGPT/tests/test_data.py        |  3 ++-
 39 files changed, 72 insertions(+), 50 deletions(-)
 create mode 100644 applications/ChatGPT/chatgpt/models/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/models/base/__init__.py
 rename applications/ChatGPT/chatgpt/{nn => models/base}/actor.py (95%)
 rename applications/ChatGPT/chatgpt/{nn => models/base}/critic.py (95%)
 rename applications/ChatGPT/chatgpt/{nn => models/base}/reward_model.py (97%)
 create mode 100644 applications/ChatGPT/chatgpt/models/bloom/__init__.py
 rename applications/ChatGPT/chatgpt/{nn => models/bloom}/bloom_actor.py (97%)
 rename applications/ChatGPT/chatgpt/{nn => models/bloom}/bloom_critic.py (97%)
 rename applications/ChatGPT/chatgpt/{nn => models/bloom}/bloom_rm.py (96%)
 rename applications/ChatGPT/chatgpt/{nn => models}/generation.py (100%)
 rename applications/ChatGPT/chatgpt/{nn => models}/generation_utils.py (100%)
 create mode 100644 applications/ChatGPT/chatgpt/models/gpt/__init__.py
 rename applications/ChatGPT/chatgpt/{nn => models/gpt}/gpt_actor.py (97%)
 rename applications/ChatGPT/chatgpt/{nn => models/gpt}/gpt_critic.py (97%)
 rename applications/ChatGPT/chatgpt/{nn => models/gpt}/gpt_rm.py (96%)
 rename applications/ChatGPT/chatgpt/{nn => models}/lora.py (100%)
 rename applications/ChatGPT/chatgpt/{nn => models}/loss.py (100%)
 create mode 100644 applications/ChatGPT/chatgpt/models/opt/__init__.py
 rename applications/ChatGPT/chatgpt/{nn => models/opt}/opt_actor.py (97%)
 rename applications/ChatGPT/chatgpt/{nn => models/opt}/opt_critic.py (97%)
 rename applications/ChatGPT/chatgpt/{nn => models/opt}/opt_rm.py (96%)
 rename applications/ChatGPT/chatgpt/{nn => models}/utils.py (100%)
 delete mode 100644 applications/ChatGPT/chatgpt/nn/__init__.py

diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
index d26206144614..23c6aa3726ce 100644
--- a/applications/ChatGPT/README.md
+++ b/applications/ChatGPT/README.md
@@ -41,7 +41,8 @@ Simplest usage:
 ```python
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy
-from chatgpt.nn import GPTActor, GPTCritic, RewardModel
+from chatgpt.models.gpt import GPTActor, GPTCritic
+from chatgpt.models.base import RewardModel
 from copy import deepcopy
 from colossalai.nn.optimizer import HybridAdam
 
diff --git a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
index b5730c7c7bbc..5ee65763b936 100644
--- a/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
+++ b/applications/ChatGPT/benchmarks/benchmark_gpt_dummy.py
@@ -4,7 +4,8 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from chatgpt.nn import GPTActor, GPTCritic, RewardModel
+from chatgpt.models.base import RewardModel
+from chatgpt.models.gpt import GPTActor, GPTCritic
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.callbacks import PerformanceEvaluator
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy
diff --git a/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
index 6777cb770d53..207edbca94b5 100644
--- a/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/ChatGPT/benchmarks/benchmark_opt_lora_dummy.py
@@ -4,7 +4,8 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from chatgpt.nn import OPTActor, OPTCritic, RewardModel
+from chatgpt.models.base import RewardModel
+from chatgpt.models.opt import OPTActor, OPTCritic
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.callbacks import PerformanceEvaluator
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy
diff --git a/applications/ChatGPT/chatgpt/experience_maker/base.py b/applications/ChatGPT/chatgpt/experience_maker/base.py
index 61895322cb31..f3640fc1e496 100644
--- a/applications/ChatGPT/chatgpt/experience_maker/base.py
+++ b/applications/ChatGPT/chatgpt/experience_maker/base.py
@@ -4,7 +4,7 @@
 
 import torch
 import torch.nn as nn
-from chatgpt.nn.actor import Actor
+from chatgpt.models.base import Actor
 
 
 @dataclass
diff --git a/applications/ChatGPT/chatgpt/experience_maker/naive.py b/applications/ChatGPT/chatgpt/experience_maker/naive.py
index f4fd2078c1eb..64835cfa1918 100644
--- a/applications/ChatGPT/chatgpt/experience_maker/naive.py
+++ b/applications/ChatGPT/chatgpt/experience_maker/naive.py
@@ -1,5 +1,5 @@
 import torch
-from chatgpt.nn.utils import compute_reward, normalize
+from chatgpt.models.utils import compute_reward, normalize
 
 from .base import Experience, ExperienceMaker
 
diff --git a/applications/ChatGPT/chatgpt/models/__init__.py b/applications/ChatGPT/chatgpt/models/__init__.py
new file mode 100644
index 000000000000..376fed8de792
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/__init__.py
@@ -0,0 +1,4 @@
+from .base import Actor, Critic, RewardModel
+from .loss import PairWiseLoss, PolicyLoss, PPOPtxActorLoss, ValueLoss
+
+__all__ = ['Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'PPOPtxActorLoss', 'PairWiseLoss']
diff --git a/applications/ChatGPT/chatgpt/models/base/__init__.py b/applications/ChatGPT/chatgpt/models/base/__init__.py
new file mode 100644
index 000000000000..86f403556904
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/base/__init__.py
@@ -0,0 +1,5 @@
+from .actor import Actor
+from .critic import Critic
+from .reward_model import RewardModel
+
+__all__ = ['Actor', 'Critic', 'RewardModel']
diff --git a/applications/ChatGPT/chatgpt/nn/actor.py b/applications/ChatGPT/chatgpt/models/base/actor.py
similarity index 95%
rename from applications/ChatGPT/chatgpt/nn/actor.py
rename to applications/ChatGPT/chatgpt/models/base/actor.py
index c4c0d579de58..e2841dc68feb 100644
--- a/applications/ChatGPT/chatgpt/nn/actor.py
+++ b/applications/ChatGPT/chatgpt/models/base/actor.py
@@ -4,9 +4,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from .generation import generate
-from .lora import LoRAModule
-from .utils import log_probs_from_logits
+from ..generation import generate
+from ..lora import LoRAModule
+from ..utils import log_probs_from_logits
 
 
 class Actor(LoRAModule):
diff --git a/applications/ChatGPT/chatgpt/nn/critic.py b/applications/ChatGPT/chatgpt/models/base/critic.py
similarity index 95%
rename from applications/ChatGPT/chatgpt/nn/critic.py
rename to applications/ChatGPT/chatgpt/models/base/critic.py
index f3a1238540f9..4bff5ee97e51 100644
--- a/applications/ChatGPT/chatgpt/nn/critic.py
+++ b/applications/ChatGPT/chatgpt/models/base/critic.py
@@ -3,8 +3,8 @@
 import torch
 import torch.nn as nn
 
-from .lora import LoRAModule
-from .utils import masked_mean
+from ..lora import LoRAModule
+from ..utils import masked_mean
 
 
 class Critic(LoRAModule):
diff --git a/applications/ChatGPT/chatgpt/nn/reward_model.py b/applications/ChatGPT/chatgpt/models/base/reward_model.py
similarity index 97%
rename from applications/ChatGPT/chatgpt/nn/reward_model.py
rename to applications/ChatGPT/chatgpt/models/base/reward_model.py
index 27cd1ccaee93..ce8c0a1d3568 100644
--- a/applications/ChatGPT/chatgpt/nn/reward_model.py
+++ b/applications/ChatGPT/chatgpt/models/base/reward_model.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
 
-from .lora import LoRAModule
+from ..lora import LoRAModule
 
 
 class RewardModel(LoRAModule):
diff --git a/applications/ChatGPT/chatgpt/models/bloom/__init__.py b/applications/ChatGPT/chatgpt/models/bloom/__init__.py
new file mode 100644
index 000000000000..d0e7f7b1ef94
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/bloom/__init__.py
@@ -0,0 +1,5 @@
+from .bloom_actor import BLOOMActor
+from .bloom_critic import BLOOMCritic
+from .bloom_rm import BLOOMRM
+
+__all__ = ['BLOOMActor', 'BLOOMCritic', 'BLOOMRM']
diff --git a/applications/ChatGPT/chatgpt/nn/bloom_actor.py b/applications/ChatGPT/chatgpt/models/bloom/bloom_actor.py
similarity index 97%
rename from applications/ChatGPT/chatgpt/nn/bloom_actor.py
rename to applications/ChatGPT/chatgpt/models/bloom/bloom_actor.py
index 103536bc3940..d7577f096493 100644
--- a/applications/ChatGPT/chatgpt/nn/bloom_actor.py
+++ b/applications/ChatGPT/chatgpt/models/bloom/bloom_actor.py
@@ -3,7 +3,7 @@
 import torch
 from transformers import BloomConfig, BloomForCausalLM, BloomModel
 
-from .actor import Actor
+from ..base import Actor
 
 
 class BLOOMActor(Actor):
diff --git a/applications/ChatGPT/chatgpt/nn/bloom_critic.py b/applications/ChatGPT/chatgpt/models/bloom/bloom_critic.py
similarity index 97%
rename from applications/ChatGPT/chatgpt/nn/bloom_critic.py
rename to applications/ChatGPT/chatgpt/models/bloom/bloom_critic.py
index 3b03471a3d1d..5a907309a674 100644
--- a/applications/ChatGPT/chatgpt/nn/bloom_critic.py
+++ b/applications/ChatGPT/chatgpt/models/bloom/bloom_critic.py
@@ -4,7 +4,7 @@
 import torch.nn as nn
 from transformers import BloomConfig, BloomForCausalLM, BloomModel
 
-from .critic import Critic
+from ..base import Critic
 
 
 class BLOOMCritic(Critic):
diff --git a/applications/ChatGPT/chatgpt/nn/bloom_rm.py b/applications/ChatGPT/chatgpt/models/bloom/bloom_rm.py
similarity index 96%
rename from applications/ChatGPT/chatgpt/nn/bloom_rm.py
rename to applications/ChatGPT/chatgpt/models/bloom/bloom_rm.py
index 12c37957dd83..4dc2646e36ae 100644
--- a/applications/ChatGPT/chatgpt/nn/bloom_rm.py
+++ b/applications/ChatGPT/chatgpt/models/bloom/bloom_rm.py
@@ -3,7 +3,7 @@
 import torch.nn as nn
 from transformers import BloomConfig, BloomForCausalLM, BloomModel
 
-from .reward_model import RewardModel
+from ..base import RewardModel
 
 
 class BLOOMRM(RewardModel):
diff --git a/applications/ChatGPT/chatgpt/nn/generation.py b/applications/ChatGPT/chatgpt/models/generation.py
similarity index 100%
rename from applications/ChatGPT/chatgpt/nn/generation.py
rename to applications/ChatGPT/chatgpt/models/generation.py
diff --git a/applications/ChatGPT/chatgpt/nn/generation_utils.py b/applications/ChatGPT/chatgpt/models/generation_utils.py
similarity index 100%
rename from applications/ChatGPT/chatgpt/nn/generation_utils.py
rename to applications/ChatGPT/chatgpt/models/generation_utils.py
diff --git a/applications/ChatGPT/chatgpt/models/gpt/__init__.py b/applications/ChatGPT/chatgpt/models/gpt/__init__.py
new file mode 100644
index 000000000000..63dc5ab0f5ea
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/gpt/__init__.py
@@ -0,0 +1,5 @@
+from .gpt_actor import GPTActor
+from .gpt_critic import GPTCritic
+from .gpt_rm import GPTRM
+
+__all__ = ['GPTActor', 'GPTCritic', 'GPTRM']
diff --git a/applications/ChatGPT/chatgpt/nn/gpt_actor.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py
similarity index 97%
rename from applications/ChatGPT/chatgpt/nn/gpt_actor.py
rename to applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py
index 491182ffa405..da24685e16c8 100644
--- a/applications/ChatGPT/chatgpt/nn/gpt_actor.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py
@@ -3,7 +3,7 @@
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
 
-from .actor import Actor
+from ..base import Actor
 
 
 class GPTActor(Actor):
diff --git a/applications/ChatGPT/chatgpt/nn/gpt_critic.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
similarity index 97%
rename from applications/ChatGPT/chatgpt/nn/gpt_critic.py
rename to applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
index b0a001f4aff5..897ddb4aeb03 100644
--- a/applications/ChatGPT/chatgpt/nn/gpt_critic.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
@@ -4,7 +4,7 @@
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 from transformers.models.gpt2.modeling_gpt2 import GPT2Model
 
-from .critic import Critic
+from ..base import Critic
 
 
 class GPTCritic(Critic):
diff --git a/applications/ChatGPT/chatgpt/nn/gpt_rm.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_rm.py
similarity index 96%
rename from applications/ChatGPT/chatgpt/nn/gpt_rm.py
rename to applications/ChatGPT/chatgpt/models/gpt/gpt_rm.py
index fcfb61cd4b82..0132dbf27ffc 100644
--- a/applications/ChatGPT/chatgpt/nn/gpt_rm.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_rm.py
@@ -4,7 +4,7 @@
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 from transformers.models.gpt2.modeling_gpt2 import GPT2Model
 
-from .reward_model import RewardModel
+from ..base import RewardModel
 
 
 class GPTRM(RewardModel):
diff --git a/applications/ChatGPT/chatgpt/nn/lora.py b/applications/ChatGPT/chatgpt/models/lora.py
similarity index 100%
rename from applications/ChatGPT/chatgpt/nn/lora.py
rename to applications/ChatGPT/chatgpt/models/lora.py
diff --git a/applications/ChatGPT/chatgpt/nn/loss.py b/applications/ChatGPT/chatgpt/models/loss.py
similarity index 100%
rename from applications/ChatGPT/chatgpt/nn/loss.py
rename to applications/ChatGPT/chatgpt/models/loss.py
diff --git a/applications/ChatGPT/chatgpt/models/opt/__init__.py b/applications/ChatGPT/chatgpt/models/opt/__init__.py
new file mode 100644
index 000000000000..334f4df0032a
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/opt/__init__.py
@@ -0,0 +1,5 @@
+from .opt_actor import OPTActor
+from .opt_critic import OPTCritic
+from .opt_rm import OPTRM
+
+__all__ = ['OPTActor', 'OPTCritic', 'OPTRM']
diff --git a/applications/ChatGPT/chatgpt/nn/opt_actor.py b/applications/ChatGPT/chatgpt/models/opt/opt_actor.py
similarity index 97%
rename from applications/ChatGPT/chatgpt/nn/opt_actor.py
rename to applications/ChatGPT/chatgpt/models/opt/opt_actor.py
index ff2bf7c00bea..c14e4377ffb2 100644
--- a/applications/ChatGPT/chatgpt/nn/opt_actor.py
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_actor.py
@@ -3,7 +3,7 @@
 from transformers.models.opt.configuration_opt import OPTConfig
 from transformers.models.opt.modeling_opt import OPTForCausalLM
 
-from .actor import Actor
+from ..base import Actor
 
 
 class OPTActor(Actor):
diff --git a/applications/ChatGPT/chatgpt/nn/opt_critic.py b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
similarity index 97%
rename from applications/ChatGPT/chatgpt/nn/opt_critic.py
rename to applications/ChatGPT/chatgpt/models/opt/opt_critic.py
index 9c9cb873f38a..767cecb79353 100644
--- a/applications/ChatGPT/chatgpt/nn/opt_critic.py
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
@@ -4,7 +4,7 @@
 from transformers.models.opt.configuration_opt import OPTConfig
 from transformers.models.opt.modeling_opt import OPTModel
 
-from .critic import Critic
+from ..base import Critic
 
 
 class OPTCritic(Critic):
diff --git a/applications/ChatGPT/chatgpt/nn/opt_rm.py b/applications/ChatGPT/chatgpt/models/opt/opt_rm.py
similarity index 96%
rename from applications/ChatGPT/chatgpt/nn/opt_rm.py
rename to applications/ChatGPT/chatgpt/models/opt/opt_rm.py
index 5f518a3cc05e..7ad7b3887e53 100644
--- a/applications/ChatGPT/chatgpt/nn/opt_rm.py
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_rm.py
@@ -3,7 +3,7 @@
 import torch.nn as nn
 from transformers import OPTConfig, OPTModel
 
-from .reward_model import RewardModel
+from ..base import RewardModel
 
 
 class OPTRM(RewardModel):
diff --git a/applications/ChatGPT/chatgpt/nn/utils.py b/applications/ChatGPT/chatgpt/models/utils.py
similarity index 100%
rename from applications/ChatGPT/chatgpt/nn/utils.py
rename to applications/ChatGPT/chatgpt/models/utils.py
diff --git a/applications/ChatGPT/chatgpt/nn/__init__.py b/applications/ChatGPT/chatgpt/nn/__init__.py
deleted file mode 100644
index c728d7df37d4..000000000000
--- a/applications/ChatGPT/chatgpt/nn/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from .actor import Actor
-from .bloom_actor import BLOOMActor
-from .bloom_critic import BLOOMCritic
-from .bloom_rm import BLOOMRM
-from .critic import Critic
-from .gpt_actor import GPTActor
-from .gpt_critic import GPTCritic
-from .gpt_rm import GPTRM
-from .loss import PairWiseLoss, PolicyLoss, PPOPtxActorLoss, ValueLoss
-from .opt_actor import OPTActor
-from .opt_critic import OPTCritic
-from .opt_rm import OPTRM
-from .reward_model import RewardModel
-
-__all__ = [
-    'Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'PPOPtxActorLoss', 'PairWiseLoss', 'GPTActor',
-    'GPTCritic', 'GPTRM', 'BLOOMActor', 'BLOOMCritic', 'BLOOMRM', 'OPTActor', 'OPTCritic', 'OPTRM'
-]
diff --git a/applications/ChatGPT/chatgpt/trainer/ppo.py b/applications/ChatGPT/chatgpt/trainer/ppo.py
index 2c1fd2fb6cd3..789e0c2f8f1e 100644
--- a/applications/ChatGPT/chatgpt/trainer/ppo.py
+++ b/applications/ChatGPT/chatgpt/trainer/ppo.py
@@ -2,8 +2,9 @@
 
 import torch.nn as nn
 from chatgpt.experience_maker import Experience, NaiveExperienceMaker
-from chatgpt.nn import Actor, Critic, PolicyLoss, ValueLoss
-from chatgpt.nn.generation_utils import update_model_kwargs_fn
+from chatgpt.models.base import Actor, Critic
+from chatgpt.models.generation_utils import update_model_kwargs_fn
+from chatgpt.models.loss import PolicyLoss, ValueLoss
 from chatgpt.replay_buffer import NaiveReplayBuffer
 from torch.optim import Optimizer
 
diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
index d44944aeeb35..c07d65f84ca5 100644
--- a/applications/ChatGPT/chatgpt/trainer/rm.py
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -3,7 +3,7 @@
 import loralib as lora
 import torch
 from chatgpt.dataset import RewardDataset
-from chatgpt.nn import PairWiseLoss
+from chatgpt.models.loss import PairWiseLoss
 from torch.optim import Adam, Optimizer
 from torch.utils.data import DataLoader
 from tqdm import tqdm
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/base.py b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
index 2a96078e98c1..4347c08b4333 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/base.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/base.py
@@ -5,7 +5,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
-from chatgpt.nn import Actor
+from chatgpt.models.base import Actor, Critic, RewardModel
 from chatgpt.replay_buffer import ReplayBuffer
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
index b6ed1d451b78..f08018fd232f 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
@@ -5,7 +5,7 @@
 import torch.distributed as dist
 import torch.nn as nn
 import torch.optim as optim
-from chatgpt.nn import Actor
+from chatgpt.models.base import Actor
 from torch.optim import Optimizer
 
 import colossalai
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
index 66e99dd3977c..530dd998d193 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
@@ -5,7 +5,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from chatgpt.nn import Actor
+from chatgpt.models.base import Actor
 from chatgpt.replay_buffer import ReplayBuffer
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
diff --git a/applications/ChatGPT/examples/inference.py b/applications/ChatGPT/examples/inference.py
index 239b6e19b282..08885c33b194 100644
--- a/applications/ChatGPT/examples/inference.py
+++ b/applications/ChatGPT/examples/inference.py
@@ -1,7 +1,9 @@
 import argparse
 
 import torch
-from chatgpt.nn import BLOOMActor, GPTActor, OPTActor
+from chatgpt.models.bloom import BLOOMActor
+from chatgpt.models.gpt import GPTActor
+from chatgpt.models.opt import OPTActor
 from transformers import AutoTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index df64515a1ce8..27ee7f0f1bd3 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -2,7 +2,10 @@
 from copy import deepcopy
 
 import torch
-from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
+from chatgpt.models.base import RewardModel
+from chatgpt.models.bloom import BLOOMActor, BLOOMCritic
+from chatgpt.models.gpt import GPTActor, GPTCritic
+from chatgpt.models.opt import OPTActor, OPTCritic
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.callbacks import SaveCheckpoint
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index db4c7d475aa7..576685234f27 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -3,7 +3,10 @@
 
 import pandas as pd
 import torch
-from chatgpt.nn import BLOOMActor, BLOOMCritic, GPTActor, GPTCritic, OPTActor, OPTCritic, RewardModel
+from chatgpt.models.base import RewardModel
+from chatgpt.models.bloom import BLOOMActor, BLOOMCritic
+from chatgpt.models.gpt import GPTActor, GPTCritic
+from chatgpt.models.opt import OPTActor, OPTCritic
 from chatgpt.trainer import PPOTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from torch.optim import Adam
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index 44acba192245..19b20b0847cc 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -3,7 +3,10 @@
 import loralib as lora
 import torch
 from chatgpt.dataset import RewardDataset
-from chatgpt.nn import BLOOMRM, GPTRM, OPTRM
+from chatgpt.models.base import RewardModel
+from chatgpt.models.bloom import BLOOMRM
+from chatgpt.models.gpt import GPTRM
+from chatgpt.models.opt import OPTRM
 from chatgpt.trainer import RewardModelTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from datasets import load_dataset
diff --git a/applications/ChatGPT/tests/test_checkpoint.py b/applications/ChatGPT/tests/test_checkpoint.py
index 6cbe51569ff3..1bbd133f76d3 100644
--- a/applications/ChatGPT/tests/test_checkpoint.py
+++ b/applications/ChatGPT/tests/test_checkpoint.py
@@ -7,7 +7,7 @@
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
-from chatgpt.nn import GPTActor
+from chatgpt.models.gpt import GPTActor
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 
diff --git a/applications/ChatGPT/tests/test_data.py b/applications/ChatGPT/tests/test_data.py
index b5a84c4d0ef2..3d8fe912cb27 100644
--- a/applications/ChatGPT/tests/test_data.py
+++ b/applications/ChatGPT/tests/test_data.py
@@ -7,7 +7,8 @@
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from chatgpt.experience_maker import NaiveExperienceMaker
-from chatgpt.nn import GPTActor, GPTCritic, RewardModel
+from chatgpt.models.base import RewardModel
+from chatgpt.models.gpt import GPTActor, GPTCritic
 from chatgpt.replay_buffer import NaiveReplayBuffer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config

From 378d827c6b8adabd9020a95ab7b0b4979b325143 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Tue, 7 Mar 2023 17:49:01 +0800
Subject: [PATCH 431/503] [doc] update nvme offload doc (#3014)

* [doc] update nvme offload doc

* [doc] add doc testing cmd and requirements

* [doc] add api reference

* [doc] add dependencies
---
 docs/requirements-doc-test.txt               |   4 +
 docs/source/en/features/nvme_offload.md      | 214 +++++++++++++++++++
 docs/source/zh-Hans/features/nvme_offload.md | 202 +++++++++++++++++
 3 files changed, 420 insertions(+)

diff --git a/docs/requirements-doc-test.txt b/docs/requirements-doc-test.txt
index b49a94554afb..6a6bb3bee9b0 100644
--- a/docs/requirements-doc-test.txt
+++ b/docs/requirements-doc-test.txt
@@ -1,2 +1,6 @@
 colossalai
 torch
+packaging
+tensornvme
+psutil
+transformers
diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md
index 8c0fd2053f8b..68c422116e1d 100644
--- a/docs/source/en/features/nvme_offload.md
+++ b/docs/source/en/features/nvme_offload.md
@@ -1,3 +1,4 @@
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 nvme_offload.py  -->
 # NVMe offload
 
 Author: Hongxin Liu
@@ -36,12 +37,225 @@ pip install tensornvme
 
 We implement NVMe offload of optimizer states for Adam ([CPUAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.cpu_adam.html) and [HybridAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.hybrid_adam.html)).
 
+
+<!--- doc-test-ignore-start -->
+
 ```python
 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 
 optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, nvme_offload_dir='./')
 ```
 
+<!--- doc-test-ignore-end -->
+
 `nvme_offload_fraction` is the fraction of optimizer states to be offloaded to NVMe. `nvme_offload_dir` is the directory to save NVMe offload files. If `nvme_offload_dir` is `None`, a random temporary directory will be used.
 
 It's compatible with all parallel methods in ColossalAI.
+
+> ⚠ It only offloads optimizer states on CPU. This means it only affects CPU training or Zero/Gemini with offloading.
+
+## Examples
+
+Let's start with two simple examples -- training GPT with different methods. These examples rely on `transformers`.
+
+We should install dependencies first:
+
+```shell
+pip install psutil transformers
+```
+
+First, we import essential packages and modules:
+
+```python
+import os
+import time
+from typing import Dict, Optional
+
+import psutil
+import torch
+import torch.nn as nn
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
+from colossalai.utils.model.colo_init_context import ColoInitContext
+```
+
+Then we define a loss function:
+
+```python
+class GPTLMLoss(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+```
+
+And we define some utility functions, which generate random data, compute the number of parameters of a model and get the memory usage of the current process:
+
+```python
+def get_data(batch_size: int, seq_len: int,
+             vocab_size: int, device: Optional[str] = None) -> Dict[str, torch.Tensor]:
+    device = torch.cuda.current_device() if device is None else device
+    input_ids = torch.randint(vocab_size, (batch_size, seq_len),
+                              device=device)
+    attn_mask = torch.ones_like(input_ids)
+    return dict(input_ids=input_ids, attention_mask=attn_mask)
+
+
+def get_model_numel(model: nn.Module) -> int:
+    return sum(p.numel() for p in model.parameters())
+
+
+def get_mem_usage() -> int:
+    proc = psutil.Process(os.getpid())
+    return proc.memory_info().rss
+```
+
+We first try to train GPT model on CPU:
+
+```python
+def train_cpu(nvme_offload_fraction: float = 0.0):
+    config = GPT2Config()
+    model = GPT2LMHeadModel(config)
+    criterion = GPTLMLoss()
+    optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction)
+    print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B')
+
+    start = time.time()
+    for step in range(3):
+        data = get_data(4, 128, config.vocab_size, device='cpu')
+        outputs = model(**data)
+        loss = criterion(outputs.logits, data['input_ids'])
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+        print(f'[{step}] loss: {loss.item():.3f}')
+
+    print(f'Time: {time.time() - start:.3f} s')
+    print(f'Mem usage: {get_mem_usage() / 1024**2:.3f} MB')
+```
+
+Run without NVME offload:
+
+```python
+train_cpu(0.0)
+```
+
+We may get the output below:
+
+```
+Model numel: 0.116 B
+[0] loss: 10.953
+[1] loss: 10.974
+[2] loss: 10.965
+Time: 7.739 s
+Mem usage: 5966.445 MB
+```
+
+And then run with (full) NVME offload:
+
+```python
+train_cpu(1.0)
+```
+
+We may get:
+
+```
+Model numel: 0.116 B
+[0] loss: 10.951
+[1] loss: 10.994
+[2] loss: 10.984
+Time: 8.527 s
+Mem usage: 4968.016 MB
+```
+
+For GPT2-S, which has 0.116 billion parameters, its optimizer states take about 0.928 GB memory. And NVME offload saves about 998 MB memory, which meets our expectations.
+
+Then we can train GPT model with Gemini. The placement policy of Gemini should be `"auto"`, `"cpu"` or `"const"`.
+
+```python
+def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
+    colossalai.launch_from_torch({})
+    config = GPT2Config()
+    with ColoInitContext(device=torch.cuda.current_device()):
+        model = GPT2LMHeadModel(config)
+    criterion = GPTLMLoss()
+    optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction)
+    print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B')
+
+    gemini_config = dict(strict_ddp_mode=True, device=torch.cuda.current_device(),
+                         placement_policy='cpu', pin_memory=True, hidden_dim=config.n_embd)
+    model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config)
+    optimizer = zero_optim_wrapper(model, optimizer, initial_scale=2**5)
+
+    start = time.time()
+    for step in range(3):
+        data = get_data(4, 128, config.vocab_size)
+        outputs = model(**data)
+        loss = criterion(outputs.logits, data['input_ids'])
+        optimizer.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+        print(f'[{step}] loss: {loss.item():.3f}')
+
+    print(f'Time: {time.time() - start:.3f} s')
+    print(f'Mem usage: {get_mem_usage() / 1024**2:.3f} MB')
+```
+
+Run without NVME offload:
+
+```python
+train_gemini_cpu(0.0)
+```
+
+We may get:
+
+```
+Model numel: 0.116 B
+searching chunk configuration is completed in 0.27 s.
+used number: 118.68 MB, wasted number: 0.75 MB
+total wasted percentage is 0.63%
+[0] loss: 10.953
+[1] loss: 10.938
+[2] loss: 10.969
+Time: 2.997 s
+Mem usage: 5592.227 MB
+```
+
+And run with (full) NVME offload:
+
+```python
+train_gemini_cpu(1.0)
+```
+
+We may get:
+
+```
+Model numel: 0.116 B
+searching chunk configuration is completed in 0.27 s.
+used number: 118.68 MB, wasted number: 0.75 MB
+total wasted percentage is 0.63%
+[0] loss: 10.953
+[1] loss: 10.938
+[2] loss: 10.969
+Time: 3.691 s
+Mem usage: 5298.344 MB
+```
+
+NVME offload saves about 294 MB memory. Note that enabling `pin_memory` of Gemini can accelerate training but increases memory usage. So this result also meets our expectation. If we disable `pin_memory`, we can also observe a memory usage drop of about 900 MB.
+
+## API Reference
+
+{{ autodoc:colossalai.nn.optimizer.HybridAdam }}
+
+{{ autodoc:colossalai.nn.optimizer.CPUAdam }}
diff --git a/docs/source/zh-Hans/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md
index 6f3280fe19d4..f8ecdab42069 100644
--- a/docs/source/zh-Hans/features/nvme_offload.md
+++ b/docs/source/zh-Hans/features/nvme_offload.md
@@ -1,3 +1,4 @@
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 nvme_offload.py  -->
 # NVMe offload
 
 作者: Hongxin Liu
@@ -36,12 +37,213 @@ pip install tensornvme
 
 我们为 Adam ([CPUAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.cpu_adam.html) 和 [HybridAdam](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.nn.optimizer.hybrid_adam.html)) 实现了优化器状态的 NVMe offload。
 
+<!--- doc-test-ignore-start -->
+
 ```python
 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 
 optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, nvme_offload_dir='./')
 ```
 
+<!--- doc-test-ignore-end -->
+
 `nvme_offload_fraction` 是要 offload 到 NVMe 的优化器状态的比例。 `nvme_offload_dir` 是保存 NVMe offload 文件的目录。如果 `nvme_offload_dir` 为 `None`，将使用随机临时目录。
 
 它与 ColossalAI 中的所有并行方法兼容。
+
+
+> ⚠ 它只会卸载在 CPU 上的优化器状态。这意味着它只会影响 CPU 训练或者使用卸载的 Zero/Gemini。
+
+## 示例
+
+Let's start with two simple examples -- training GPT with different methods. These examples rely on `transformers`.
+首先让我们从两个简单的例子开始 -- 用不同的方法训练 GPT。这些例子依赖`transformers`。
+
+我们首先应该安装依赖：
+
+```shell
+pip install psutil transformers
+```
+
+首先，我们导入必要的包和模块：
+
+```python
+import os
+import time
+from typing import Dict, Optional
+import psutil
+import torch
+import torch.nn as nn
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
+from colossalai.utils.model.colo_init_context import ColoInitContext
+```
+
+然后我们定义一个损失函数：
+
+```python
+class GPTLMLoss(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+```
+
+我们定义一些工具函数，用来生成随机数据、计算模型参数量和获取当前进程内存占用：
+
+```python
+def get_data(batch_size: int, seq_len: int,
+             vocab_size: int, device: Optional[str] = None) -> Dict[str, torch.Tensor]:
+    device = torch.cuda.current_device() if device is None else device
+    input_ids = torch.randint(vocab_size, (batch_size, seq_len),
+                              device=device)
+    attn_mask = torch.ones_like(input_ids)
+    return dict(input_ids=input_ids, attention_mask=attn_mask)
+def get_model_numel(model: nn.Module) -> int:
+    return sum(p.numel() for p in model.parameters())
+def get_mem_usage() -> int:
+    proc = psutil.Process(os.getpid())
+    return proc.memory_info().rss
+```
+
+我们首先尝试在 CPU 上训练 GPT 模型：
+
+```python
+def train_cpu(nvme_offload_fraction: float = 0.0):
+    config = GPT2Config()
+    model = GPT2LMHeadModel(config)
+    criterion = GPTLMLoss()
+    optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction)
+    print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B')
+    start = time.time()
+    for step in range(3):
+        data = get_data(4, 128, config.vocab_size, device='cpu')
+        outputs = model(**data)
+        loss = criterion(outputs.logits, data['input_ids'])
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+        print(f'[{step}] loss: {loss.item():.3f}')
+    print(f'Time: {time.time() - start:.3f} s')
+    print(f'Mem usage: {get_mem_usage() / 1024**2:.3f} MB')
+```
+
+不使用 NVME 卸载：
+
+```python
+train_cpu(0.0)
+```
+
+我们可能得到如下输出：
+
+```
+Model numel: 0.116 B
+[0] loss: 10.953
+[1] loss: 10.974
+[2] loss: 10.965
+Time: 7.739 s
+Mem usage: 5966.445 MB
+```
+
+然后使用（全量） NVME 卸载：
+
+```python
+train_cpu(1.0)
+```
+
+我们可能得到：
+
+```
+Model numel: 0.116 B
+[0] loss: 10.951
+[1] loss: 10.994
+[2] loss: 10.984
+Time: 8.527 s
+Mem usage: 4968.016 MB
+```
+
+对于有1.16亿参数的 GPT2-S 来说，它的优化器状态大约需要占用 0.928 GB 内存。NVME 卸载节省了大约 998 MB 内存，符合我们的预期。
+
+然后我们可以用 Gemini 来训练 GPT 模型。放置策略应该设置为`"auto"`、 `"cpu"` 或 `"const"`。
+
+```python
+def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
+    colossalai.launch_from_torch({})
+    config = GPT2Config()
+    with ColoInitContext(device=torch.cuda.current_device()):
+        model = GPT2LMHeadModel(config)
+    criterion = GPTLMLoss()
+    optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction)
+    print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B')
+    gemini_config = dict(strict_ddp_mode=True, device=torch.cuda.current_device(),
+                         placement_policy='cpu', pin_memory=True, hidden_dim=config.n_embd)
+    model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config)
+    optimizer = zero_optim_wrapper(model, optimizer, initial_scale=2**5)
+    start = time.time()
+    for step in range(3):
+        data = get_data(4, 128, config.vocab_size)
+        outputs = model(**data)
+        loss = criterion(outputs.logits, data['input_ids'])
+        optimizer.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+        print(f'[{step}] loss: {loss.item():.3f}')
+    print(f'Time: {time.time() - start:.3f} s')
+    print(f'Mem usage: {get_mem_usage() / 1024**2:.3f} MB')
+```
+
+不使用 NVME 卸载：
+
+```python
+train_gemini_cpu(0.0)
+```
+
+我们可能得到：
+
+```
+Model numel: 0.116 B
+searching chunk configuration is completed in 0.27 s.
+used number: 118.68 MB, wasted number: 0.75 MB
+total wasted percentage is 0.63%
+[0] loss: 10.953
+[1] loss: 10.938
+[2] loss: 10.969
+Time: 2.997 s
+Mem usage: 5592.227 MB
+```
+
+然后使用（全量） NVME 卸载：
+
+```python
+train_gemini_cpu(1.0)
+```
+
+我们可能得到：
+
+```
+Model numel: 0.116 B
+searching chunk configuration is completed in 0.27 s.
+used number: 118.68 MB, wasted number: 0.75 MB
+total wasted percentage is 0.63%
+[0] loss: 10.953
+[1] loss: 10.938
+[2] loss: 10.969
+Time: 3.691 s
+Mem usage: 5298.344 MB
+```
+
+NVME 卸载节省了大约 294 MB 内存。注意使用 Gemini 的 `pin_memory` 功能可以加速训练，但是会增加内存占用。所以这个结果也是符合我们预期的。如果我们关闭 `pin_memory`，我们仍然可以观察到大约 900 MB 的内存占用下降。
+
+## API 参考
+
+{{ autodoc:colossalai.nn.optimizer.HybridAdam }}
+
+{{ autodoc:colossalai.nn.optimizer.CPUAdam }}

From ea0b52c12ee7598fe126ed0d8b0557f7e8a0e999 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 7 Mar 2023 18:04:10 +0800
Subject: [PATCH 432/503] [doc] specified operating system requirement (#3019)

* [doc] specified operating system requirement

* polish code
---
 README-zh-Hans.md                               | 15 +++++++++++----
 README.md                                       | 12 +++++++++---
 docs/source/en/get_started/installation.md      | 12 +++++++++---
 docs/source/zh-Hans/get_started/installation.md | 13 ++++++++++---
 setup.py                                        |  5 +++++
 5 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 8ff25c64f5d4..8a3bee5ec17c 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -274,19 +274,26 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 
 ## 安装
-> Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试。
->
-> 环境要求: PyTorch 1.10 ~ 1.12 (更新版本正在兼容中), Python >= 3.7, CUDA >= 11.0。如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
+
+环境要求:
+
+- PyTorch >= 1.11 (PyTorch 2.x 正在适配中)
+- Python >= 3.7
+- CUDA >= 11.0
+  
+如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
 
 
 ### 从PyPI安装
 
-您可以用下面的命令直接从PyPI上下载并安装Colossal-AI。我们默认不会安装PyTorch扩展包
+您可以用下面的命令直接从PyPI上下载并安装Colossal-AI。我们默认不会安装PyTorch扩展包。
 
 ```bash
 pip install colossalai
 ```
 
+**注：目前只支持Linux。**
+
 但是，如果你想在安装时就直接构建PyTorch扩展，您可以设置环境变量`CUDA_EXT=1`.
 
 ```bash
diff --git a/README.md b/README.md
index 10d59e34c8f8..3115192d6ab2 100644
--- a/README.md
+++ b/README.md
@@ -276,9 +276,13 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 <p align="right">(<a href="#top">back to top</a>)</p>
 
 ## Installation
-> Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
->
-> Environment Requirement: PyTorch 1.10 ~ 1.12 (WIP higher version), Python >= 3.7, CUDA >= 11.0. If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
+
+Requirements:
+- PyTorch >= 1.11 (PyTorch 2.x in progress)
+- Python >= 3.7
+- CUDA >= 11.0
+  
+If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 ### Install from PyPI
 
@@ -288,6 +292,8 @@ You can easily install Colossal-AI with the following command. **By default, we
 pip install colossalai
 ```
 
+**Note: only Linux is supported for now.**
+
 However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
 
 ```bash
diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md
index da30ab4e5507..c1c34b4110d6 100644
--- a/docs/source/en/get_started/installation.md
+++ b/docs/source/en/get_started/installation.md
@@ -1,9 +1,13 @@
 <!-- doc-test-command: echo "installation.md does not need test" -->
 
 # Setup
-> Colossal-AI currently only supports the Linux operating system and has not been tested on other OS such as Windows and macOS.
->
-> Environment Requirement: PyTorch 1.10 ~ 1.12 (WIP higher version), Python >= 3.7, CUDA >= 11.0. If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
+
+Requirements:
+- PyTorch >= 1.11 (PyTorch 2.x in progress)
+- Python >= 3.7
+- CUDA >= 11.0
+  
+If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 
 ## Download From PyPI
@@ -14,6 +18,8 @@ You can install Colossal-AI with
 pip install colossalai
 ```
 
+**Note: only Linux is supported for now.**
+
 If you want to build PyTorch extensions during installation, you can use the command below. Otherwise, the PyTorch extensions will be built during runtime.
 
 ```shell
diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md
index 2ceb0231f2f3..fb79fc676c0b 100755
--- a/docs/source/zh-Hans/get_started/installation.md
+++ b/docs/source/zh-Hans/get_started/installation.md
@@ -1,7 +1,12 @@
 # 安装
-> Colossal-AI 目前仅支持Linux操作系统，没有在其他操作系统如Windows和macOS进行测试
->
-> 环境要求: PyTorch 1.10 ~ 1.12 (更新版本正在兼容中), Python >= 3.7, CUDA >= 11.0。如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
+
+环境要求:
+
+- PyTorch >= 1.11 (PyTorch 2.x 正在适配中)
+- Python >= 3.7
+- CUDA >= 11.0
+  
+如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
 
 ## 从PyPI上安装
 
@@ -11,6 +16,8 @@
 pip install colossalai
 ```
 
+**注：现在只支持Linux。**
+
 如果你想同时安装PyTorch扩展的话，可以添加`CUDA_EXT=1`。如果不添加的话，PyTorch扩展会在运行时自动安装。
 
 ```shell
diff --git a/setup.py b/setup.py
index 6c24cb504251..0a66a90084ee 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,5 @@
 import os
+import sys
 from datetime import datetime
 from typing import List
 
@@ -31,6 +32,10 @@
 # a variable to store the op builder
 ext_modules = []
 
+# we do not support windows currently
+if sys.platform == 'win32':
+    raise RuntimeError("Windows is not supported yet. Please try again within the Windows Subsystem for Linux (WSL).")
+
 
 # check for CUDA extension dependencies
 def environment_check_for_cuda_extension_build():

From 29386a54e66d7e5ca40cabf1686839fba9aac71d Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 8 Mar 2023 10:45:31 +0800
Subject: [PATCH 433/503] [DTensor] refactor CommSpec (#3034)

---
 colossalai/tensor/d_tensor/comm_spec.py       | 310 ++++++++++++++++++
 colossalai/tensor/d_tensor/sharding_spec.py   |   2 +-
 .../test_dtensor/test_comm_spec.py            | 190 +++++++++++
 3 files changed, 501 insertions(+), 1 deletion(-)
 create mode 100644 colossalai/tensor/d_tensor/comm_spec.py
 create mode 100644 tests/test_tensor/test_dtensor/test_comm_spec.py

diff --git a/colossalai/tensor/d_tensor/comm_spec.py b/colossalai/tensor/d_tensor/comm_spec.py
new file mode 100644
index 000000000000..765d8ec1b01a
--- /dev/null
+++ b/colossalai/tensor/d_tensor/comm_spec.py
@@ -0,0 +1,310 @@
+from enum import Enum
+from typing import Dict
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ReduceOp
+
+__all__ = [
+    'CollectiveCommPattern',
+    'CommSpec',
+]
+
+
+class CollectiveCommPattern(Enum):
+    GATHER_FWD_SPLIT_BWD = 'gather_fwd_split_bwd'
+    ALL2ALL_FWD_ALL2ALL_BWD = 'all2all_fwd_all2all_bwd'
+    SPLIT_FWD_GATHER_BWD = 'split_fwd_gather_bwd'
+    ALLREDUCE_FWD_IDENTITY_BWD = 'all_reduce_fwd_identity_bwd'
+    IDENTITY_FWD_ALLREDUCE_BWD = 'identity_fwd_all_reduce_bwd'
+    MIXGATHER_FWD_SPLIT_BWD = "mixgather_fwd_split_bwd"
+
+
+class CommSpec:
+    '''
+    Communication spec is used to record a communication action and to convert it into the real
+    collective call executed at runtime. It contains comm_pattern to determine the communication
+    method, process_groups_dict to determine the process groups, gather_dim and shard_dim to determine
+    the buffer shape, and logical_process_axis to select which entry of process_groups_dict is used.
+
+    Argument:
+        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
+        process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
+        gather_dim(int, Optional): The dimension of the tensor along which it will be gathered.
+        shard_dim(int, Optional): The dimension of the tensor along which it will be sharded.
+        logical_process_axis(Union(int, List[int]), Optional): The mesh_dim to implement the communication action.
+    '''
+
+    def __init__(self,
+                 comm_pattern: CollectiveCommPattern,
+                 process_groups_dict: Dict,
+                 gather_dim: int = None,
+                 shard_dim: int = None,
+                 logical_process_axis: int = None):
+        self.comm_pattern = comm_pattern
+        self.gather_dim = gather_dim
+        self.shard_dim = shard_dim
+        self.logical_process_axis = logical_process_axis
+        self.process_groups_dict = process_groups_dict
+
+    def __repr__(self):
+        res_list = ["CommSpec:("]
+        if self.comm_pattern == CollectiveCommPattern.GATHER_FWD_SPLIT_BWD:
+            res_list.append("comm_pattern:GATHER_FWD_SPLIT_BWD, ")
+            res_list.append(f"gather_dim:{self.gather_dim}, ")
+            res_list.append(f"shard_dim:{self.shard_dim}, ")
+            res_list.append(f"logical_process_axis:{self.logical_process_axis})")
+        elif self.comm_pattern == CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD:
+            res_list.append("comm_pattern:ALL2ALL_FWD_ALL2ALL_BWD, ")
+            res_list.append(f"gather_dim:{self.gather_dim}, ")
+            res_list.append(f"shard_dim:{self.shard_dim}, ")
+            res_list.append(f"logical_process_axis: {self.logical_process_axis})")
+        elif self.comm_pattern == CollectiveCommPattern.SPLIT_FWD_GATHER_BWD:
+            res_list.append("comm_pattern:SPLIT_FWD_GATHER_BWD, ")
+            res_list.append(f"gather_dim:{self.gather_dim}, ")
+            res_list.append(f"shard_dim:{self.shard_dim}, ")
+            res_list.append(f"logical_process_axis:{self.logical_process_axis})")
+        elif self.comm_pattern == CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD:
+            res_list.append("comm_pattern:ALLREDUCE_FWD_IDENTITY_BWD, ")
+            res_list.append(f"logical_process_axis:{self.logical_process_axis})")
+        elif self.comm_pattern == CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD:
+            res_list.append("comm_pattern:IDENTITY_FWD_ALLREDUCE_BWD, ")
+            res_list.append(f"logical_process_axis:{self.logical_process_axis})")
+        else:
+            res_list.append(f"comm_pattern:{self.comm_pattern})")
+
+        return ''.join(res_list)
+
+    def covert_spec_to_action(self, tensor):
+        '''
+        Convert CommSpec into runtime action, implement real collective communication on the target tensor.
+        The collective communication action is directed by the CommSpec.
+
+        Argument:
+            tensor(torch.Tensor): Tensor stored in each device, which could be different in different ranks.
+        '''
+        # patterns with no registered function leave the tensor untouched
+        comm_func = pattern_to_func_dict.get(self.comm_pattern)
+        return tensor if comm_func is None else comm_func(tensor, self)
+
+
+def _all_gather(tensor: torch.Tensor, comm_spec: CommSpec):
+    '''
+    Implement all gather operation on device mesh based on information provided by comm_spec.
+    '''
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            tensor_list = [
+                torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
+            ]
+            # without this contiguous operation, the all gather may get some unexpected results.
+            tensor = tensor.contiguous()
+            dist.all_gather(tensor_list, tensor, group=process_group)
+            output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
+            return output
+
+
+def _split(tensor: torch.Tensor, comm_spec: CommSpec):
+    '''
+    Implement shard operation on device mesh based on information provided by comm_spec.
+    '''
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, _ in process_groups_list:
+        if dist.get_rank() in rank_list:
+            dim = comm_spec.shard_dim
+            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
+            start = length * rank_list.index(dist.get_rank())
+            output = torch.narrow(tensor, dim, start, length).contiguous()
+            return output
+
+
+def _all_to_all(tensor: torch.Tensor, comm_spec: CommSpec):
+    '''
+    Implement all to all operation on device mesh based on information provided by comm_spec.
+    '''
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            new_shape = list(tensor.shape)
+            new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // len(rank_list)
+            new_shape = torch.Size(new_shape)
+            output_tensor_list = [
+                torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
+            ]
+            dim = comm_spec.shard_dim
+            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
+            input_tensor_list = [
+                torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(len(rank_list))
+            ]
+            group = process_group
+            dist.all_to_all(output_tensor_list, input_tensor_list, group)
+            output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
+            return output
+
+
+def _all_reduce(tensor: torch.Tensor, comm_spec: CommSpec, async_op: bool = False):
+    '''
+    Implement all reduce operation on device mesh based on information provided by comm_spec.
+    '''
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            if not tensor.is_contiguous():
+                tensor = tensor.contiguous()
+            dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
+            return tensor
+
+
+class _ReduceGrad(torch.autograd.Function):
+    """
+    A customized communication operation which forward is an identity operation,
+    backward is all_reduce operation.
+
+    Args:
+        input_: input matrix.
+        comm_spec: comm_spec will give information like process group, rank list, etc.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return input_
+
+    @staticmethod
+    def forward(ctx, input_, comm_spec):
+        ctx.comm_spec = comm_spec
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _all_reduce(grad_output, ctx.comm_spec), None
+
+
+class _ReduceInput(torch.autograd.Function):
+    """
+    A customized communication operation which forward is all_reduce operation,
+    backward is an identity operation.
+
+    Args:
+        input_: input matrix.
+        comm_spec: comm_spec will give information like process group, rank list, etc.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _all_reduce(input_)
+
+    @staticmethod
+    def forward(ctx, input_, comm_spec):
+        return _all_reduce(input_, comm_spec)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output, None
+
+
+class _SplitForwardGatherBackward(torch.autograd.Function):
+    """
+    A customized communication operation which forward is split operation,
+    backward is an all gather operation.
+
+    Args:
+        input_: input matrix.
+        comm_spec: comm_spec will give information like process group, rank list, etc.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split(input_)
+
+    @staticmethod
+    def forward(ctx, input_, comm_spec):
+        ctx.comm_spec = comm_spec
+        return _split(input_, comm_spec)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _all_gather(grad_output, ctx.comm_spec), None
+
+
+class _GatherForwardSplitBackward(torch.autograd.Function):
+    """
+    A customized communication operation which forward is an all gather operation,
+    backward is split operation.
+
+    Args:
+        input_: input matrix.
+        comm_spec: comm_spec will give information like process group, rank list, etc.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _all_gather(input_)
+
+    @staticmethod
+    def forward(ctx, input_, comm_spec):
+        ctx.comm_spec = comm_spec
+        return _all_gather(input_, comm_spec)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split(grad_output, ctx.comm_spec), None
+
+
+class _AllToAll(torch.autograd.Function):
+    """
+    A customized communication operation which forward is an all to all operation,
+    backward is an all to all operation.
+
+    Args:
+        input_: input matrix.
+        comm_spec: comm_spec will give information like process group, rank list, etc.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _all_to_all(input_)
+
+    @staticmethod
+    def forward(ctx, input_, comm_spec):
+        output = _all_to_all(input_, comm_spec)
+        comm_spec_for_backward = CommSpec(comm_pattern=comm_spec.comm_pattern,
+                                          process_groups_dict=comm_spec.process_groups_dict,
+                                          gather_dim=comm_spec.shard_dim,
+                                          shard_dim=comm_spec.gather_dim,
+                                          logical_process_axis=comm_spec.logical_process_axis)
+        ctx.comm_spec = comm_spec_for_backward
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_outputs):
+        return _all_to_all(grad_outputs, ctx.comm_spec), None
+
+
+def reduce_grad(input_, comm_spec):
+    return _ReduceGrad.apply(input_, comm_spec)
+
+
+def reduce_input(input_, comm_spec):
+    return _ReduceInput.apply(input_, comm_spec)
+
+
+def split_forward_gather_backward(input_, comm_spec):
+    return _SplitForwardGatherBackward.apply(input_, comm_spec)
+
+
+def gather_forward_split_backward(input_, comm_spec):
+    return _GatherForwardSplitBackward.apply(input_, comm_spec)
+
+
+def all_to_all(input_, comm_spec):
+    return _AllToAll.apply(input_, comm_spec)
+
+
+pattern_to_func_dict = {
+    CollectiveCommPattern.GATHER_FWD_SPLIT_BWD: gather_forward_split_backward,
+    CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD: all_to_all,
+    CollectiveCommPattern.SPLIT_FWD_GATHER_BWD: split_forward_gather_backward,
+    CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD: reduce_input,
+    CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD: reduce_grad,
+}
diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py
index b135c46d68ac..7591f760cb30 100644
--- a/colossalai/tensor/d_tensor/sharding_spec.py
+++ b/colossalai/tensor/d_tensor/sharding_spec.py
@@ -171,7 +171,7 @@ def _sanity_check(self):
             raise ShardingOutOfIndexError(
                 f'sharding_sequence should have {self.dims} elements, but got index {len(self.sharding_sequence)}.')
 
-        if max(list(self.dim_partition_dict.keys())) >= self.dims:
+        if list(self.dim_partition_dict.keys()) and max(list(self.dim_partition_dict.keys())) >= self.dims:
             raise ShardingOutOfIndexError(
                 f'the key of dim_partition_dict should be less than {self.dims}, but got {max(list(self.dim_partition_dict.keys()))}.'
             )
diff --git a/tests/test_tensor/test_dtensor/test_comm_spec.py b/tests/test_tensor/test_dtensor/test_comm_spec.py
new file mode 100644
index 000000000000..547a96b264dc
--- /dev/null
+++ b/tests/test_tensor/test_dtensor/test_comm_spec.py
@@ -0,0 +1,190 @@
+from functools import partial
+
+import pytest
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from torch.distributed import ReduceOp
+
+from colossalai.core import global_context as gpc
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.tensor.d_tensor.comm_spec import CollectiveCommPattern, CommSpec
+from colossalai.tensor.d_tensor.sharding_spec import ShardingSpec
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.utils import free_port
+
+
+def check_all_gather(process_groups_dict, rank):
+    # tensor to comm: ranks 0 and 2 hold a (2, 2) block of ones, every other rank a block of zeros
+    if rank in (0, 2):
+        sharded_tensor_to_comm = torch.ones(2, 2).cuda()
+    else:
+        sharded_tensor_to_comm = torch.zeros(2, 2).cuda()
+
+    # tensor to check: the ones block and the zeros block concatenated along dim 1
+    tensor_to_check = torch.cat((torch.ones(2, 2), torch.zeros(2, 2)), 1).cuda()
+
+    # CommSpec:(comm_pattern:allgather, gather_dim:1, logical_process_axis:1)
+    comm_spec = CommSpec(CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
+                         process_groups_dict,
+                         gather_dim=1,
+                         logical_process_axis=1)
+    sharded_tensor_to_comm = comm_spec.covert_spec_to_action(sharded_tensor_to_comm)
+
+    assert sharded_tensor_to_comm.equal(tensor_to_check)
+
+
+def check_shard(process_groups_dict, rank):
+    # expected shards after the split: zeros for ranks 0/2, ones for ranks 1/3
+    sharded_tensor_to_comm_0 = torch.zeros(2, 2).cuda()
+    sharded_tensor_to_comm_1 = torch.ones(2, 2).cuda()
+    # tensor([[0., 0., 1., 1.],
+    #         [0., 0., 1., 1.]])
+    tensor_to_shard = torch.cat((sharded_tensor_to_comm_0, sharded_tensor_to_comm_1), 1)
+
+    # CommSpec:(comm_pattern:shard, shard_dim:1, logical_process_axis:1)
+    comm_spec = CommSpec(CollectiveCommPattern.SPLIT_FWD_GATHER_BWD,
+                         process_groups_dict,
+                         shard_dim=1,
+                         logical_process_axis=1)
+    tensor_to_shard = comm_spec.covert_spec_to_action(tensor_to_shard)
+
+    if rank in (0, 2):
+        assert tensor_to_shard.equal(sharded_tensor_to_comm_0)
+    if rank in (1, 3):
+        assert tensor_to_shard.equal(sharded_tensor_to_comm_1)
+
+
+def check_all_to_all(process_groups_dict, rank):
+    # tensor to comm: ranks 0/1 hold [[0, 1], [0, 1]], ranks 2/3 hold [[2, 3], [2, 3]]
+    if rank in (0, 1):
+        sharded_tensor_0 = torch.zeros(2, 1)
+        sharded_tensor_1 = torch.ones(2, 1)
+        # tensor([[0., 1.],
+        #         [0., 1.]])
+        tensor_to_comm = torch.cat((sharded_tensor_0, sharded_tensor_1), 1).cuda()
+    if rank in (2, 3):
+        sharded_tensor_0 = torch.ones(2, 1) * 2
+        sharded_tensor_1 = torch.ones(2, 1) * 3
+        # tensor([[2., 3.],
+        #         [2., 3.]])
+        tensor_to_comm = torch.cat((sharded_tensor_0, sharded_tensor_1), 1).cuda()
+
+    if rank in (0, 1):
+        # tensor([[0.],
+        #         [0.],
+        #         [2.],
+        #         [2.]])
+        tensor_to_check = torch.tensor([[0], [0], [2], [2]], dtype=tensor_to_comm.dtype).cuda()
+    if rank in (2, 3):
+        # tensor([[1.],
+        #         [1.],
+        #         [3.],
+        #         [3.]])
+        tensor_to_check = torch.tensor([[1], [1], [3], [3]], dtype=tensor_to_comm.dtype).cuda()
+
+    # CommSpec:(comm_pattern:all2all, gather_dim:0, shard_dim:1, logical_process_axis:0)
+    comm_spec = CommSpec(CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD,
+                         process_groups_dict,
+                         gather_dim=0,
+                         shard_dim=1,
+                         logical_process_axis=0)
+    tensor_to_comm = comm_spec.covert_spec_to_action(tensor_to_comm)
+
+    assert tensor_to_comm.equal(tensor_to_check)
+
+
+def check_all_reduce_fwd(process_groups_dict, rank):
+    # tensor to comm: every rank holds a (2, 2) tensor filled with its own rank
+    tensor_to_comm = torch.ones(2, 2).cuda() * rank
+
+    # reduce through logical process axis 0: ranks 0 and 2 expect 0 + 2, ranks 1 and 3 expect 1 + 3
+    # tensor to check
+    if rank in (0, 2):
+        # tensor([[2., 2.],
+        #         [2., 2.]])
+        tensor_to_check = torch.tensor([[2, 2], [2, 2]], dtype=tensor_to_comm.dtype).cuda()
+    if rank in (1, 3):
+        # tensor([[4., 4.],
+        #         [4., 4.]])
+        tensor_to_check = torch.tensor([[4, 4], [4, 4]], dtype=tensor_to_comm.dtype).cuda()
+
+    comm_spec = CommSpec(CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD, process_groups_dict, logical_process_axis=0)
+    tensor_to_comm = comm_spec.covert_spec_to_action(tensor_to_comm)
+
+    assert tensor_to_comm.equal(tensor_to_check)
+
+
+def check_all_reduce_bwd(process_groups_dict, rank):
+    # tensor to comm; only the forward action is run here and it is expected to return the input unchanged
+    tensor_to_comm = torch.ones(2, 2).cuda() * rank
+
+    tensor_to_check = torch.ones(2, 2).cuda() * rank
+
+    comm_spec = CommSpec(CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD, process_groups_dict, logical_process_axis=0)
+    tensor_to_comm = comm_spec.covert_spec_to_action(tensor_to_comm)
+
+    assert tensor_to_comm.equal(tensor_to_check)
+
+
+def check_all_reduce_in_flatten_device_mesh(process_groups_dict, rank):
+    # tensor to comm: every rank holds a (2, 2) tensor filled with its own rank
+    tensor_to_comm = torch.ones(2, 2).cuda() * rank
+
+    # reduce through logical process axis 0 at flatten device mesh
+    # tensor to check
+    # tensor([[6., 6.],
+    #         [6., 6.]])
+    tensor_to_check = torch.tensor([[6, 6], [6, 6]], dtype=tensor_to_comm.dtype).cuda()
+
+    # CommSpec:(comm_pattern:all_reduce, logical_process_axis:0), using the flattened mesh's process groups
+    comm_spec = CommSpec(CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD, process_groups_dict, logical_process_axis=0)
+    tensor_to_comm = comm_spec.covert_spec_to_action(tensor_to_comm)
+
+    assert tensor_to_comm.equal(tensor_to_check)
+
+
+def check_comm(rank, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+
+    physical_mesh_id = torch.arange(0, 4)
+    assert rank == gpc.get_global_rank()
+
+    mesh_shape = (2, 2)
+    # [[0, 1],
+    #  [2, 3]]
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+    process_groups_dict = device_mesh.process_groups_dict
+
+    # test all gather
+    check_all_gather(process_groups_dict, rank)
+
+    # test shard
+    check_shard(process_groups_dict, rank)
+
+    # test all to all
+    check_all_to_all(process_groups_dict, rank)
+
+    # test all reduce
+    check_all_reduce_fwd(process_groups_dict, rank)
+    check_all_reduce_bwd(process_groups_dict, rank)
+
+    flatten_process_groups_dict = device_mesh.flatten_device_mesh.process_groups_dict
+    # test all reduce in 1D flatten device mesh
+    check_all_reduce_in_flatten_device_mesh(flatten_process_groups_dict, rank)
+    gpc.destroy()
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_comm_spec():
+    world_size = 4
+    run_func = partial(check_comm, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_comm_spec()

From 2ef855c79885c028dccf40172125f7a63e30e4a0 Mon Sep 17 00:00:00 2001
From: ramos <49182011+nemoramo@users.noreply.github.com>
Date: Wed, 8 Mar 2023 13:45:15 +0800
Subject: [PATCH 434/503] support shardinit option to avoid OPT OOM
 initializing problem (#3037)

Co-authored-by: poe <poe@nemoramo>
---
 examples/language/opt/run_gemini.sh       |  8 +++++++
 examples/language/opt/train_gemini_opt.py | 28 ++++++++++++++++++++---
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/examples/language/opt/run_gemini.sh b/examples/language/opt/run_gemini.sh
index 92fd481c5bc3..73f231292a13 100644
--- a/examples/language/opt/run_gemini.sh
+++ b/examples/language/opt/run_gemini.sh
@@ -4,10 +4,17 @@ export MEMCAP=${MEMCAP:-0}
 # Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b`
 export MODEL=${MODEL:-"125m"}
 export GPUNUM=${GPUNUM:-1}
+export USE_SHARD_INIT=${USE_SHARD_INIT:-"false"}
 
 # make directory for logs
 mkdir -p ./logs
 
+if [ ${USE_SHARD_INIT} = "true" ]; then
+  USE_SHARD_INIT="--shardinit"
+else
+  USE_SHARD_INIT=""
+fi
+
 export MODLE_PATH="facebook/opt-${MODEL}"
 
 # HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1
@@ -17,4 +24,5 @@ torchrun \
   train_gemini_opt.py \
   --mem_cap ${MEMCAP} \
   --model_name_or_path ${MODLE_PATH} \
+  ${USE_SHARD_INIT} \
   --batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log
diff --git a/examples/language/opt/train_gemini_opt.py b/examples/language/opt/train_gemini_opt.py
index 64426ba4285c..1546b31ba922 100755
--- a/examples/language/opt/train_gemini_opt.py
+++ b/examples/language/opt/train_gemini_opt.py
@@ -39,6 +39,8 @@
 from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
 
+from colossalai.tensor import ProcessGroup, ShardSpec
+
 
 def get_data(batch_size, seq_len, vocab_size):
     input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
@@ -102,6 +104,11 @@ def parse_args():
         help="Model type to use if training from scratch.",
         choices=MODEL_TYPES,
     )
+    parser.add_argument(
+        "--shardinit",
+        action="store_true",
+        help="Initialize the model with tensor parallel",
+    )
     parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap")
     parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu")
     args = parser.parse_args()
@@ -159,16 +166,30 @@ def main():
     else:
         init_dev = get_current_device()
 
+    # shard init parameters
+    if args.shardinit:
+        logger.info("Sharding initialization !", ranks=[0])
+    else:
+        logger.info("Skipping sharding initialization", ranks=[0])
+
+    world_size = torch.distributed.get_world_size()
+    shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
+    default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
+
     # build model
     if args.model_name_or_path is None or args.model_name_or_path == 'facebook/opt-13b':
         # currently, there has a bug in pretrained opt-13b
         # we can not import it until huggingface fix it
         logger.info("Train a new model from scratch", ranks=[0])
-        with ColoInitContext(device=init_dev, dtype=torch.half):
+        with ColoInitContext(device=init_dev, dtype=torch.half,
+                             default_dist_spec=default_dist_spec,
+                             default_pg=shard_pg):
             model = OPTForCausalLM(config)
     else:
         logger.info("Finetune a pre-trained model", ranks=[0])
-        with ColoInitContext(device=init_dev, dtype=torch.half):
+        with ColoInitContext(device=init_dev, dtype=torch.half,
+                             default_dist_spec=default_dist_spec,
+                             default_pg=shard_pg):
             model = OPTForCausalLM.from_pretrained(args.model_name_or_path,
                                                    from_tf=bool(".ckpt" in args.model_name_or_path),
                                                    config=config,
@@ -179,7 +200,8 @@ def main():
 
     numel = sum([p.numel() for p in model.parameters()])
     PLACEMENT_POLICY = 'cpu'
-    model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
+    model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY,
+                      pin_memory=True, strict_ddp_mode=args.shardinit)
     optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0)
 
     SEQ_LEN = 1024

From b51bfec3573e2d217a8ab4f314cf891a53e18e19 Mon Sep 17 00:00:00 2001
From: wenjunyang <wendaleyang@gmail.com>
Date: Wed, 8 Mar 2023 15:18:02 +0800
Subject: [PATCH 435/503] [chatgpt] change critic input as state (#3042)

* fix Critic

* fix Critic

* fix Critic

* fix neglect of attention mask

* fix neglect of attention mask

* fix neglect of attention mask

* add return

---------

Co-authored-by: yangwenjun <yangwenjun@soyoung.com>
Co-authored-by: yangwjd <yangwjd@chanjet.com>
---
 applications/ChatGPT/chatgpt/models/base/critic.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/models/base/critic.py b/applications/ChatGPT/chatgpt/models/base/critic.py
index 4bff5ee97e51..b12bddfcb2e5 100644
--- a/applications/ChatGPT/chatgpt/models/base/critic.py
+++ b/applications/ChatGPT/chatgpt/models/base/critic.py
@@ -36,12 +36,15 @@ def forward(self,
         outputs = self.model(sequences, attention_mask=attention_mask)
         last_hidden_states = outputs['last_hidden_state']
 
-        values = self.value_head(last_hidden_states).squeeze(-1)[:, :-1]
+        values = self.value_head(last_hidden_states).squeeze(-1)
 
         if action_mask is not None:
             num_actions = action_mask.size(1)
-            values = values[:, -num_actions:]
-            value = masked_mean(values, action_mask, dim=1)
+            prompt_mask = attention_mask[:, :-num_actions]
+            values = values[:, :-num_actions]
+            value = masked_mean(values, prompt_mask, dim=1)
             return value
+
+        values = values[:, :-1]
         value = values.mean(dim=1).squeeze(1)
         return value

From 2ca9728cbb0863e27d74c7318358e339f9450d3e Mon Sep 17 00:00:00 2001
From: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Date: Wed, 8 Mar 2023 16:22:30 +0800
Subject: [PATCH 436/503] [autochunk] refactor chunk memory estimation (#2762)

* refact memory code

* dont log free var memory

* add memory align

* update chunk target

* update setting for new memory

* finish test

* update tracer

* update typo

* update test
---
 colossalai/autochunk/autochunk_codegen.py     |  42 ++-
 colossalai/autochunk/estimate_memory.py       | 313 +++++++-----------
 colossalai/autochunk/search_chunk.py          | 135 +++-----
 colossalai/autochunk/select_chunk.py          |  31 +-
 colossalai/autochunk/trace_indice.py          |  92 ++---
 colossalai/autochunk/utils.py                 |  13 +-
 .../benchmark_autochunk_alphafold.py          |  29 +-
 .../test_autochunk_evoformer_block.py         |   8 +-
 .../test_autochunk_extramsa_block.py          |  11 -
 .../benchmark_autochunk_transformer.py        |   5 +-
 .../test_autochunk_gpt.py                     |  23 +-
 .../test_autochunk_transformer_utils.py       |  30 +-
 12 files changed, 302 insertions(+), 430 deletions(-)

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 90bde8730052..15e15517ba01 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable, List, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Tuple
 
 import torch
 
@@ -216,14 +216,13 @@ def _add_node_slice(
     return body
 
 
-def emit_code_with_chunk(
-    body: List[str],
-    nodes: Iterable[Node],
-    emit_node_func,
-    delete_unused_value_func,
-    search_chunk: SearchChunk,
-    chunk_infos: List,
-):
+def emit_code_with_chunk(body: List[str],
+                         nodes: Iterable[Node],
+                         emit_node_func: Callable,
+                         delete_unused_value_func: Callable,
+                         search_chunk: SearchChunk,
+                         chunk_infos: List,
+                         eval_mem: bool = False):
     """
     Emit code with chunk according to chunk_infos.
 
@@ -260,6 +259,9 @@ def emit_code_with_chunk(
     region_idx = 0
     within_chunk_region = False
 
+    if eval_mem:
+        body.append("init_memory = torch.cuda.memory_allocated() / 1024**2\n")
+
     while node_idx < len(node_list):
         node = node_list[node_idx]
 
@@ -289,10 +291,18 @@ def emit_code_with_chunk(
             body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"])
             body[-1] = "    " + body[-1]
             delete_unused_value_func(node, body, chunk_inputs_names)
+            if eval_mem:
+                body.append(
+                    "    if chunk_idx == 0:\n        print('%s', torch.cuda.max_memory_allocated() / 1024**2 - init_memory);  torch.cuda.reset_peak_memory_stats()\n"
+                    % (node.name))
         else:
             emit_node_func(node, body)
             if node_idx not in chunk_inputs:
                 delete_unused_value_func(node, body, chunk_inputs_names)
+            if eval_mem:
+                body.append(
+                    "print('%s', torch.cuda.max_memory_allocated() / 1024**2 - init_memory);  torch.cuda.reset_peak_memory_stats()\n"
+                    % (node.name))
 
         # generate chunk region end
         if node_idx in chunk_ends:
@@ -312,8 +322,10 @@ def __init__(self,
                      meta_graph,
                      max_memory: int = None,
                      print_mem: bool = False,
-                     print_progress: bool = False) -> None:
+                     print_progress: bool = False,
+                     eval_mem: bool = False) -> None:
             super().__init__()
+            self.eval_mem = eval_mem
             # find the chunk regions
             self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem, print_progress)
             self.chunk_infos = self.search_chunk.search_region()
@@ -511,14 +523,8 @@ def emit_node(node: Node, body):
 
             # if any node has a list of labels for activation_checkpoint, we
             # will use nested type of activation checkpoint codegen
-            emit_code_with_chunk(
-                body,
-                nodes,
-                emit_node,
-                delete_unused_values,
-                self.search_chunk,
-                self.chunk_infos,
-            )
+            emit_code_with_chunk(body, nodes, emit_node, delete_unused_values, self.search_chunk, self.chunk_infos,
+                                 self.eval_mem)
 
             if len(body) == 0:
                 # If the Graph has no non-placeholder nodes, no lines for the body
diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py
index f457696e6310..08a55f9aa04a 100644
--- a/colossalai/autochunk/estimate_memory.py
+++ b/colossalai/autochunk/estimate_memory.py
@@ -2,11 +2,11 @@
 from typing import Any, Callable, Dict, Iterable, List, Tuple
 
 import torch
-from torch.fx.node import Node, map_arg
+from torch.fx.node import Node
 
 from colossalai.fx.profiler import activation_size, parameter_size
 
-from .utils import NodeMgr, delete_free_var_from_last_use, get_node_shape, is_non_memory_node
+from .utils import NodeMgr, get_node_shape, is_non_memory_node
 
 
 class EstimateMemory(object):
@@ -14,102 +14,85 @@ class EstimateMemory(object):
     Estimate memory with chunk
     """
 
-    def __init__(self, node_mgr: NodeMgr) -> None:
-        self.node_mgr = node_mgr
+    def __init__(self) -> None:
+        pass
 
-    def _get_meta_node_size(self, x):
+    def _get_node_size(self, x: Node) -> float:
+        """
+        return node size in MB
+        """
         x = x.meta["tensor_meta"]
-        x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
-        return x
-
-    def _get_output_node(self, n):
-        out_size = activation_size(n.meta["fwd_out"])
-        out_node = [n.name] if out_size > 0 else []
-        return out_size, out_node
-
-    def _get_output_node_size(self, n):
-        return self._get_output_node(n)[0]
-
-    def _add_active_node(self, n, active_list):
-        new_active = self._get_output_node(n)[1]
-        if n.op == "placeholder" and get_node_shape(n) is not None:
-            new_active.append(n.name)
-        for i in new_active:
-            if i not in active_list and get_node_shape(n) is not None:
-                active_list.append(i)
-
-    def _get_delete_node(self, user, user_to_last_uses, to_keep=None):
-        delete_size = 0
-        delete_node = []
-        if user.op not in ("output",):
-            nodes_to_delete = user_to_last_uses.get(user, [])
-            if len(user.users) == 0:
-                nodes_to_delete.append(user)
-            if to_keep is not None:
-                keep_list = []
-                for n in nodes_to_delete:
-                    if n.name in to_keep:
-                        keep_list.append(n)
-                for n in keep_list:
-                    if n in nodes_to_delete:
-                        nodes_to_delete.remove(n)
-            if len(nodes_to_delete):
-                out_node = [self._get_output_node(i) for i in nodes_to_delete]
-                delete_size = sum([i[0] for i in out_node])
-                for i in range(len(out_node)):
-                    if out_node[i][0] > 0:
-                        delete_node.append(out_node[i][1][0])
-                    elif nodes_to_delete[i].op == "placeholder":
-                        delete_node.append(nodes_to_delete[i].name)
-                    # elif any(j in nodes_to_delete[i].name for j in ['transpose', 'permute', 'view']):
-                    #     delete_node.append(nodes_to_delete[i].name)
-        return delete_size, delete_node
-
-    def _get_delete_node_size(self, user, user_to_last_uses, to_keep):
-        return self._get_delete_node(user, user_to_last_uses, to_keep)[0]
-
-    def _remove_deactive_node(self, user, user_to_last_uses, active_list):
-        delete_node = self._get_delete_node(user, user_to_last_uses)[1]
-        for i in delete_node:
-            if i in active_list:
-                active_list.remove(i)
-
-    def _get_chunk_inputs_size(self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx):
-        nodes_to_delete = []
-        for chunk_input in chunk_inputs + chunk_inputs_non_chunk:
-            chunk_input_users = chunk_input.users.keys()
-            chunk_input_users_idx = [self.node_mgr.find_node_idx(i) for i in chunk_input_users]
-            if all(i <= chunk_end_idx for i in chunk_input_users_idx):
-                if chunk_input not in nodes_to_delete:
-                    nodes_to_delete.append(chunk_input)
-        out_node = [self._get_output_node(i) for i in nodes_to_delete]
-        delete_size = sum([i[0] for i in out_node])
-        return delete_size
-
-    def _get_last_usr(self, nodes):
-        node_to_last_use: Dict[Node, Node] = {}
-        user_to_last_uses: Dict[Node, List[Node]] = {}
-
-        def register_last_uses(n: Node, user: Node):
-            if n not in node_to_last_use:
-                node_to_last_use[n] = user
-                user_to_last_uses.setdefault(user, []).append(n)
-
-        for node in reversed(nodes):
-            map_arg(node.args, lambda n: register_last_uses(n, node))
-            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
-        return user_to_last_uses
-
-    def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
+        if not hasattr(x, "numel"):
+            out = sum([i.numel * torch.tensor([], dtype=i.dtype).element_size() for i in x])
+        else:
+            out = x.numel * torch.tensor([], dtype=x.dtype).element_size()
+        out = float(out) / 1024**2
+        return out
+
+    def _add_active_node(self, n: Node, active_nodes: Dict, chunk_ratio: float) -> None:
+        """
+        add an active node and its shape to active node dict
+        """
+        if get_node_shape(n) is None:
+            return
+        if n.op == "placeholder":
+            return
+        if n not in active_nodes:
+            node_size = self._get_node_size(n) * chunk_ratio
+            active_nodes[n] = node_size
+
+    def _build_delete_node_dict(self, node_mgr: NodeMgr) -> Dict:
+        """
+        build a dict that maps every node with a shape to the node index at which it can be deleted
+        """
+        delete_node_dict = {}
+        for idx, node in enumerate(node_mgr.get_node_list()):
+            # skip nodes without a shape
+            if get_node_shape(node) is None:
+                continue
+            # never delete placeholders: give them an index past the end of the node list
+            elif node.op == "placeholder":
+                delete_node_dict[node] = len(node_mgr.get_node_list())
+            # node has no user: it can be deleted at its own index
+            elif len(node.users) == 0:
+                delete_node_dict[node] = idx
+            # otherwise record the index of its last user
+            else:
+                node_user_idx = [node_mgr.find_node_idx(i) for i in node.users.keys()]
+                delete_node_dict[node] = max(node_user_idx)
+        return delete_node_dict
+
+    def _remove_deactive_node(self,
+                              user_idx: int,
+                              user: Node,
+                              active_nodes: Dict,
+                              delete_node_dict: Dict,
+                              kept_nodes: List = None) -> None:
+        """
+        pop from active_nodes (in place) every node whose delete index is <= user_idx, except kept_nodes
+        """
+        if kept_nodes is None:
+            kept_nodes = []
+        if user.op in ("output",):
+            return
+
+        for node in list(active_nodes.keys()):
+            # don't delete kept nodes
+            if node in kept_nodes:
+                continue
+            # the node's delete index has been reached, so it is no longer active
+            if delete_node_dict[node] <= user_idx:
+                active_nodes.pop(node)
+
+    def _get_tmp_memory(self, node, not_contiguous_list, delete=False):
         mem = 0
         not_contiguous_ops = ["permute"]
-        inherit_contiguous_ops = ["transpose", "view"]
 
         if node.op == "call_function" and any(n in node.name for n in ["matmul", "reshape"]):
             for n in node.args:
                 if n in not_contiguous_list:
                     # matmul won't change origin tensor, but create a tmp copy
-                    mem += self._get_output_node_size(n)
+                    mem += self._get_node_size(n)
         elif node.op == "call_module":
             for n in node.args:
                 if n in not_contiguous_list:
@@ -129,31 +112,7 @@ def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size):
         if chunk_dim is None:
             return 1.0
         else:
-            return float(chunk_size) / node_shape[chunk_dim]
-
-    def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names):
-        # if any(j in user.name for j in ['transpose', 'permute', 'view']):
-        #     return 0
-        if user.op in ("placeholder", "output"):
-            return 0
-        nodes_to_delete = user_to_last_uses.get(user, [])
-        if len(user.users) == 0:
-            nodes_to_delete.append(user)
-        delete_size = 0
-        for n in nodes_to_delete:
-            if n.name in chunk_inputs_names:
-                continue
-            delete_size += self._get_output_node_size(n) * chunk_ratio
-        return delete_size
-
-    def _print_mem_log(self, log, nodes, title=None):
-        if title:
-            print(title)
-        for idx, (l, n) in enumerate(zip(log, nodes)):
-            print("%s:%.2f \t" % (n.name, l), end="")
-            if (idx + 1) % 3 == 0:
-                print("")
-        print("\n")
+            return chunk_size / float(node_shape[chunk_dim])
 
     def _print_compute_op_mem_log(self, log, nodes, title=None):
         if title:
@@ -168,12 +127,22 @@ def _print_compute_op_mem_log(self, log, nodes, title=None):
                 print("")
         print("\n")
 
-    def estimate_chunk_inference_mem(
-        self,
-        node_list: List,
-        chunk_infos=None,
-        print_mem=False,
-    ):
+    def _add_active_nodes_from_list(self, active_nodes: Dict, nodes: List) -> None:
+        """
+        add every node in nodes to the active_nodes dict (in place) with a chunk ratio of 1
+        """
+        for n in nodes:
+            self._add_active_node(n, active_nodes, 1)
+
+    def _get_memory_from_active_nodes(self, active_nodes: Dict) -> float:
+        """
+        sum all memory of active nodes
+        """
+        out = [i for i in active_nodes.values()]
+        out = sum(out)
+        return out
+
+    def estimate_chunk_inference_mem(self, node_list: List, chunk_infos: Dict = None, print_mem: bool = False):
         """
         Estimate inference memory with chunk
 
@@ -191,18 +160,17 @@ def estimate_chunk_inference_mem(
         act_memory = 0.0
         act_memory_peak_log = []
         act_memory_after_node_log = []
-        active_node_list = []
-        active_node_list_log = []
+        active_nodes = {}
+        active_nodes_log = []
         not_contiguous_list = []
-        user_to_last_uses = self._get_last_usr(node_list)
-        user_to_last_uses_no_free_var = self._get_last_usr(node_list)
-        delete_free_var_from_last_use(user_to_last_uses_no_free_var)
+        node_mgr = NodeMgr(node_list)
+        delete_node_dict = self._build_delete_node_dict(node_mgr)
 
         use_chunk = True if chunk_infos is not None else False
         chunk_within = False
         chunk_region_idx = None
         chunk_ratio = 1    # use it to estimate chunk mem
-        chunk_inputs_names = []
+        chunk_inputs_all = []
 
         if use_chunk:
             chunk_regions = [i["region"] for i in chunk_infos]
@@ -210,30 +178,30 @@ def estimate_chunk_inference_mem(
             chunk_ends = [i[1] for i in chunk_regions]
             chunk_inputs = [i["inputs"] for i in chunk_infos]
             chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos]
-            chunk_inputs_names = [j.name for i in chunk_inputs for j in i
-                                 ] + [j.name for i in chunk_inputs_non_chunk for j in i]
+            chunk_inputs_all = [j for i in chunk_inputs for j in i] + [j for i in chunk_inputs_non_chunk for j in i]
             chunk_outputs = [i["outputs"] for i in chunk_infos]
             chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos]
             chunk_sizes = [i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos]
 
-        for idx, node in enumerate(node_list):
+        for idx, node in enumerate(node_mgr.get_node_list()):
+
             # if node in chunk start nodes, change chunk ratio and add chunk_tensor
             if use_chunk and idx in chunk_starts:
                 chunk_within = True
                 chunk_region_idx = chunk_starts.index(idx)
-                act_memory += sum(self._get_output_node_size(i) for i in chunk_outputs[chunk_region_idx]) / (1024**2)
+                self._add_active_nodes_from_list(active_nodes, chunk_outputs[chunk_region_idx])
 
             # determine chunk ratio for current node
             if chunk_within:
-                chunk_ratio = self._get_chunk_ratio(
-                    node,
-                    chunk_node_dim[chunk_region_idx],
-                    chunk_sizes[chunk_region_idx],
-                )
+                chunk_ratio = self._get_chunk_ratio(node, chunk_node_dim[chunk_region_idx],
+                                                    chunk_sizes[chunk_region_idx])
+
+            # add current node as active node
+            self._add_active_node(node, active_nodes, chunk_ratio)
+            act_memory = self._get_memory_from_active_nodes(active_nodes)
 
             # if node is placeholder, just add the size of the node
             if node.op == "placeholder":
-                act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2)
                 act_memory_peak_log.append(act_memory)
             # skip output
             elif node.op == "output":
@@ -241,83 +209,32 @@ def estimate_chunk_inference_mem(
             # no change for non compute node
             elif is_non_memory_node(node):
                 act_memory_peak_log.append(act_memory)
-            # node is a compute op
-            # calculate tmp, output node and delete node memory
+            # node is a compute op, calculate tmp
             else:
                 # forward memory
                 # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose
-                act_memory += (self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024**2))
-                act_memory += (self._get_output_node_size(node) * chunk_ratio / (1024**2))
+                tmp_memory = self._get_tmp_memory(node, not_contiguous_list, delete=True) * chunk_ratio
                 # record max act memory
-                act_memory_peak_log.append(act_memory)
-                # delete useless memory
-                act_memory -= (self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio /
-                               (1024**2))
-                # delete unused vars not in chunk_input_list
-                # we can't delete input nodes until chunk ends
-                if chunk_within:
-                    act_memory -= self._get_chunk_delete_node_size(
-                        node,
-                        user_to_last_uses_no_free_var,
-                        chunk_ratio,
-                        chunk_inputs_names,
-                    ) / (1024**2)
-                else:
-                    act_memory -= self._get_delete_node_size(node, user_to_last_uses_no_free_var,
-                                                             chunk_inputs_names) / (1024**2)
-
-            # log active node, only effective without chunk
-            self._add_active_node(node, active_node_list)
-            self._remove_deactive_node(node, user_to_last_uses, active_node_list)
+                act_memory_peak_log.append(act_memory + tmp_memory)
+
+            # remove_deactive_node
+            self._remove_deactive_node(idx, node, active_nodes, delete_node_dict, kept_nodes=chunk_inputs_all)
 
             # if node in chunk end nodes, restore chunk settings
             if use_chunk and idx in chunk_ends:
-                act_memory -= (self._get_output_node_size(node) * chunk_ratio / (1024**2))
-                act_memory -= self._get_chunk_inputs_size(
-                    chunk_inputs[chunk_region_idx],
-                    chunk_inputs_non_chunk[chunk_region_idx],
-                    node_list,
-                    chunk_regions[chunk_region_idx][1],
-                ) / (1024**2)
+                self._remove_deactive_node(idx, node, active_nodes, delete_node_dict)    # dont provide kept nodes now
                 chunk_within = False
                 chunk_ratio = 1
                 chunk_region_idx = None
 
+            act_memory = self._get_memory_from_active_nodes(active_nodes)
             act_memory_after_node_log.append(act_memory)
-            active_node_list_log.append(copy.deepcopy(active_node_list))
+            active_nodes_log.append(active_nodes.copy())
 
         if print_mem:
             print("with chunk" if use_chunk else "without chunk")
-            # self._print_mem_log(act_memory_peak_log, node_list, "peak")
-            # self._print_mem_log(act_memory_after_node_log, node_list, "after")
-            self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak")
-            # self._print_compute_op_mem_log(
-            #     act_memory_after_node_log, node_list, "after"
-            # )
+            self._print_compute_op_mem_log(act_memory_peak_log, node_mgr.get_node_list(), "peak")
 
         # param_memory = parameter_size(gm)
         # all_memory = act_memory + param_memory
-        return act_memory_peak_log, act_memory_after_node_log, active_node_list_log
-
-    def get_active_nodes(self, node_list: List) -> List:
-        """
-        Get active nodes for every node
-
-        Args:
-            node_list (List): _description_
-
-        Returns:
-            active_node_list_log (List): active nodes of every node. active nodes refer to
-                nodes generated but not deleted.
-        """
-        active_node_list = []
-        active_node_list_log = []
-        user_to_last_uses = self._get_last_usr(node_list)
-        user_to_last_uses_no_free_var = self._get_last_usr(node_list)
-        delete_free_var_from_last_use(user_to_last_uses_no_free_var)
-        for _, node in enumerate(node_list):
-            # log active node, only effective without chunk
-            self._add_active_node(node, active_node_list)
-            self._remove_deactive_node(node, user_to_last_uses, active_node_list)
-            active_node_list_log.append(copy.deepcopy(active_node_list))
-        return active_node_list_log
+        return act_memory_peak_log, act_memory_after_node_log, active_nodes_log
diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py
index eb99490957aa..326445ee9f12 100644
--- a/colossalai/autochunk/search_chunk.py
+++ b/colossalai/autochunk/search_chunk.py
@@ -42,10 +42,11 @@ class SearchChunk(object):
 
     def __init__(self, gm, max_memory=None, print_mem=False, print_progress=False) -> None:
         self.print_mem = print_mem
+        self.max_memory = max_memory
         self.print_progress = print_progress
-        self.node_mgr = NodeMgr(gm)
+        self.node_mgr = NodeMgr(list(gm.graph.nodes))
         self.trace_indice = TraceIndice(self.node_mgr)
-        self.estimate_memory = EstimateMemory(self.node_mgr)
+        self.estimate_memory = EstimateMemory()
         self._init_trace()
         self.trace_flow = TraceFlow(self.trace_indice, self.node_mgr)
         self.reorder_graph = ReorderGraph(self.trace_indice, self.node_mgr)
@@ -63,45 +64,46 @@ def _init_trace(self) -> None:
         reduce the computation complexity of trace_indice
         """
         # find all max ranges
-        active_nodes = self.estimate_memory.get_active_nodes(self.node_mgr.get_node_list())
-        cur_node_idx = len(self._get_free_var_idx())
-        max_chunk_region_list = []
-        while True:
-            max_chunk_region = self._search_max_chunk_region(active_nodes, cur_node_idx)
-            cur_node_idx = max_chunk_region[1] + 1
-            if cur_node_idx >= len(active_nodes) - 1:
-                break
-            max_chunk_region_list.append(max_chunk_region)
-
-        # nothing to limit for the first range
-        max_chunk_region_list = max_chunk_region_list[1:]
-        max_chunk_region_list[0] = (0, max_chunk_region_list[0][1])
-
+        active_nodes = self.estimate_memory.estimate_chunk_inference_mem(self.node_mgr.get_node_list())[2]
         # set trace range and do the trace
         if self.print_progress:
             get_logger().info("AutoChunk start tracing indice")
-        self.trace_indice.set_trace_range(max_chunk_region_list, active_nodes)
+        self.trace_indice.set_active_nodes(active_nodes)
         self.trace_indice.trace_indice()
 
-    def _find_peak_node(self, mem_peak: List) -> int:
+    def _find_peak_region(self, mem_peak: List) -> int:
+        """
+        Find the peak node and extend the region to neighbouring nodes that exceed max_memory.
+        """
         max_value = max(mem_peak)
         max_idx = mem_peak.index(max_value)
-        return max_idx
-
-    def _get_free_var_idx(self) -> List:
-        """
-        Get free var index
+        peak_region = [max_idx, max_idx]
+        if self.max_memory is None:
+            return peak_region
+
+        # to left
+        count = 0
+        for i in range(max_idx - 1, -1, -1):
+            if mem_peak[i] > self.max_memory:
+                peak_region[0] = i
+            else:
+                count += 1
+            if count >= 3:
+                break
+        # to right
+        count = 0
+        for i in range(max_idx + 1, len(mem_peak) - 1):
+            if mem_peak[i] > self.max_memory:
+                peak_region[1] = i
+                count = 0
+            else:
+                count += 1
+            if count >= 3:
+                break
 
-        Returns:
-            free_var_idx (List): all indexs of free vars
-        """
-        free_var_idx = []
-        for idx, n in enumerate(self.node_mgr.get_node_list()):
-            if n.op == "placeholder" and get_node_shape(n) is not None:
-                free_var_idx.append(idx)
-        return free_var_idx
+        return peak_region
 
-    def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_regions: List = None) -> Tuple:
+    def _search_max_chunk_region(self, active_node: List, peak_region: int, chunk_regions: List = None) -> Tuple:
         """
         Search max chunk region according to peak memory node
 
@@ -119,50 +121,24 @@ def _search_max_chunk_region(self, active_node: List, peak_node_idx: int, chunk_
         # check if peak node already in chunkinfo
         if chunk_regions is not None:
             for i in chunk_regions:
-                if i["region"][0] < peak_node_idx <= i["region"][1]:
+                if i["region"][0] < peak_region[0] <= i["region"][1] or \
+                    i["region"][0] < peak_region[1] <= i["region"][1]:
                     return None
 
-        free_vars = self._get_free_var_idx()
-        free_var_num = len(free_vars)
         active_node_num = [len(i) for i in active_node]
-        min_active_node_num = min(active_node_num[free_var_num:])
-        threshold = max(free_var_num, min_active_node_num)
-
-        # normal search
-        # from peak_node to free_var
-        inside_flag = False
-        chunk_region_start = free_var_num
-        for i in range(peak_node_idx, -1, -1):
-            if active_node_num[i] <= threshold:
-                inside_flag = True
-            if inside_flag and active_node_num[i] > threshold:
-                chunk_region_start = i + 1
-                break
-        # from peak_node to len-2
-        inside_flag = False
-        chunk_region_end = len(active_node) - 1
-        for i in range(peak_node_idx, len(active_node)):
-            if active_node_num[i] <= threshold:
-                inside_flag = True
-            if inside_flag and active_node_num[i] > threshold:
+        window_size = 100
+        # search min for start
+        min_num = 1e4
+        for i in range(peak_region[0], max(peak_region[0] - window_size, -1), -1):
+            if active_node_num[i] < min_num:
+                min_num = active_node_num[i]
+                chunk_region_start = i
+        # search min for end
+        min_num = 1e4
+        for i in range(peak_region[1], min(peak_region[1] + window_size, len(active_node_num))):
+            if active_node_num[i] < min_num:
+                min_num = active_node_num[i]
                 chunk_region_end = i
-                break
-
-        # if normal search fails, use approximate search
-        if (chunk_region_end - chunk_region_start) > 250:
-            window_size = 100
-            # search min for start
-            min_num = 1e3
-            for i in range(max(peak_node_idx - window_size, 0), peak_node_idx + 1):
-                if active_node_num[i] < min_num:
-                    min_num = active_node_num[i]
-                    chunk_region_start = i
-            # search min for end
-            min_num = 1e3
-            for i in range(min(peak_node_idx + window_size, len(active_node_num) - 1), peak_node_idx - 1, -1):
-                if active_node_num[i] < min_num:
-                    min_num = active_node_num[i]
-                    chunk_region_end = i
 
         # avoid chunk regions overlap
         if chunk_regions is not None:
@@ -214,7 +190,7 @@ def _find_chunk_info(self, input_trace, output_trace, start_idx, end_idx) -> Lis
                     chunk_infos.append(chunk_info)
         return chunk_infos
 
-    def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Node) -> List:
+    def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_region: Node) -> List:
         """
         Search every possible region within the max chunk region.
 
@@ -235,8 +211,8 @@ def _search_possible_chunk_regions(self, max_chunk_region: Tuple, peak_node: Nod
                     cur_trace[arg] = self.trace_indice._find_trace_from_node(arg)
             input_trace.append(cur_trace)
 
-        for start_idx in range(max_chunk_region[0], peak_node + 1):
-            for end_idx in range(peak_node, max_chunk_region[1] + 1):
+        for start_idx in range(max_chunk_region[0], peak_region[0] + 1):
+            for end_idx in range(peak_region[1], max_chunk_region[1] + 1):
                 # skip non compute nodes
                 if is_non_compute_node(self.node_mgr.get_node_by_idx(start_idx)) or is_non_compute_node(
                         self.node_mgr.get_node_by_idx(end_idx)):
@@ -270,13 +246,12 @@ def _step_search(
         Returns:
             best_chunk_region (Dict)
         """
-        peak_node = self._find_peak_node(mem_peak)
-        max_chunk_region = self._search_max_chunk_region(active_node, peak_node, chunk_infos)
+        peak_region = self._find_peak_region(mem_peak)
+        max_chunk_region = self._search_max_chunk_region(active_node, peak_region, chunk_infos)
         if max_chunk_region == None:
             return None
-        possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node)
-        best_chunk_region = self.select_chunk._select_best_chunk_region(possible_chunk_regions, chunk_infos, peak_node,
-                                                                        max_chunk_region, mem_peak)
+        possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_region)
+        best_chunk_region = self.select_chunk._select_best_chunk_region(possible_chunk_regions, chunk_infos, mem_peak)
         best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region)
         return best_chunk_region
 
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index 1bb7d318cacf..94a29bfd5691 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -24,29 +24,16 @@ def __init__(
         else:
             self.stratge = "min_memory"
 
-    def _select_best_chunk_region(self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak):
+    def _select_best_chunk_region(self, possible_chunk_regions, chunk_infos, mem_peak):
         if self.stratge == "min_memory":
-            best_region = self._select_min_memory_chunk_region(
-                possible_chunk_regions,
-                chunk_infos,
-                peak_node,
-                max_chunk_region,
-                mem_peak,
-            )
+            best_region = self._select_min_memory_chunk_region(possible_chunk_regions, chunk_infos)
         elif self.stratge == "fit_memory":
-            best_region = self._select_fit_memory_chunk_region(
-                possible_chunk_regions,
-                chunk_infos,
-                peak_node,
-                max_chunk_region,
-                mem_peak,
-            )
+            best_region = self._select_fit_memory_chunk_region(possible_chunk_regions, chunk_infos, mem_peak)
         else:
             raise RuntimeError()
         return best_region
 
-    def _select_fit_memory_chunk_region(self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region,
-                                        mem_peak):
+    def _select_fit_memory_chunk_region(self, possible_chunk_regions, chunk_infos, mem_peak):
         # stop chunk if max memory satisfy memory limit
         if max(mem_peak) < self.max_memory:
             return None
@@ -63,17 +50,14 @@ def _select_fit_memory_chunk_region(self, possible_chunk_regions, chunk_infos, p
         if len(possible_chunk_regions) == 0:
             return None
 
-        max_possible_chunk_region = (min([i["region"][0] for i in possible_chunk_regions]),
-                                     max([i["region"][1] for i in possible_chunk_regions]))
-
         # get mem for chunk region
         regions_dict = []
         for region in possible_chunk_regions:
             cur_region = region.copy()
             cur_node_list, cur_region = self.reorder_graph.tmp_reorder(self.node_mgr.get_node_list(), cur_region)
             cur_chunk_infos = chunk_infos + [cur_region]
-            cur_mem_peak = self.estimate_memory.estimate_chunk_inference_mem(cur_node_list, cur_chunk_infos)[0]
-            cur_chunk_region_peak = cur_mem_peak[max_possible_chunk_region[0]:max_possible_chunk_region[1] + 1]
+            cur_mem = self.estimate_memory.estimate_chunk_inference_mem(cur_node_list, cur_chunk_infos)[0]
+            cur_chunk_region_peak = cur_mem[cur_region["region"][0]:cur_region["region"][1] + 1]
             cur_chunk_region_max_peak = max(cur_chunk_region_peak)
             if cur_chunk_region_max_peak < self.max_memory:
                 regions_dict.append({
@@ -141,8 +125,7 @@ def _get_compute_node_num(self, start, end):
                 count += 1
         return count
 
-    def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region,
-                                        mem_peak):
+    def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
         # remove illegal regions
         illegal_regions = []
         for i in possible_chunk_regions:
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 1e41073d7da6..92199b79a2be 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -33,7 +33,6 @@ def __init__(self, node_mgr: NodeMgr) -> None:
         self.indice_trace_list = self._init_indice_trace_list()
         self.indice_view_list = {}
         self.indice_count = -1
-        self.trace_range = []
         self.active_node_list = []
 
     def _init_indice_trace_list(self) -> List:
@@ -50,8 +49,7 @@ def _init_indice_trace_list(self) -> List:
             indice_trace_list.append(cur_trace)
         return indice_trace_list
 
-    def set_trace_range(self, trace_range: List, active_node_list: List) -> None:
-        self.trace_range = trace_range
+    def set_active_nodes(self, active_node_list: List) -> None:
         self.active_node_list = active_node_list
 
     def _add_indice(self) -> int:
@@ -731,23 +729,35 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
         dim_from.reverse()
 
         # search view list
-        for view_node, view_dict in self.indice_view_list.items():
-            if (view_dict["idx_to"] == idx_from and view_dict["dim_to"] == dim_from
-                    and view_dict["dim_from"] == dim_to):
-                # inheirt indice from current node
-                if len_diff == 1:
-                    if origin_shape[dim_from[0]] == 1:
-                        self._inherit_indice(origin_node, dim_from[1], node, dim_to[0], init=False)
-                    elif origin_shape[dim_from[1]] == 1:
-                        self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
-                elif len_diff == -1:
-                    if target_shape[dim_to[0]] == 1:
-                        self._inherit_indice(origin_node, dim_from[0], node, dim_to[1], init=False)
-                    elif target_shape[dim_to[1]] == 1:
-                        self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
-                # inherid indice from input node of last view
-                for dim_to_i in dim_to:
-                    self._inherit_indice(view_node.args[0], dim_to_i, node, dim_to_i, init=False)
+        # for view_node, view_dict in self.indice_view_list.items():
+        #     if (view_dict["idx_to"] == idx_from and view_dict["dim_to"] == dim_from
+        #             and view_dict["dim_from"] == dim_to):
+        #         # inheirt indice from current node
+        #         if len_diff == 1:
+        #             if origin_shape[dim_from[0]] == 1:
+        #                 self._inherit_indice(origin_node, dim_from[1], node, dim_to[0], init=False)
+        #             elif origin_shape[dim_from[1]] == 1:
+        #                 self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
+        #         elif len_diff == -1:
+        #             if target_shape[dim_to[0]] == 1:
+        #                 self._inherit_indice(origin_node, dim_from[0], node, dim_to[1], init=False)
+        #             elif target_shape[dim_to[1]] == 1:
+        #                 self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
+        #         # inherid indice from input node of last view
+        #         for dim_to_i in dim_to:
+        #             self._inherit_indice(view_node.args[0], dim_to_i, node, dim_to_i, init=False)
+
+        # inherit indice from current node
+        if len_diff == 1:
+            if origin_shape[dim_from[0]] == 1:
+                self._inherit_indice(origin_node, dim_from[1], node, dim_to[0], init=False)
+            elif origin_shape[dim_from[1]] == 1:
+                self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
+        elif len_diff == -1:
+            if target_shape[dim_to[0]] == 1:
+                self._inherit_indice(origin_node, dim_from[0], node, dim_to[1], init=False)
+            elif target_shape[dim_to[1]] == 1:
+                self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
 
         # log view, not used now
         view_dict = {
@@ -762,32 +772,22 @@ def _clear_trace(self, node_idx: int) -> None:
         """
         clear too far trace to speed up computation
         """
-        trace_range = None
-        for i in range(len(self.trace_range)):
-            if self.trace_range[i][1] == node_idx:
-                trace_range = (self.trace_range[i][0], self.trace_range[i][1])
-                break
-            if self.trace_range[i][1] > node_idx:
-                break
-        if trace_range is None:
-            return
-
-        active_nodes = self.active_node_list[trace_range[0]:trace_range[1] + 1]
-        active_nodes = set(flat_list(active_nodes))
-        active_nodes = [self.node_mgr.find_node_idx_by_name(i) for i in active_nodes]
-        for i in range(trace_range[0], trace_range[1] + 1):
-            trace = self.indice_trace_list[i]
-            # clear compute
-            for dim_compute in trace["compute"]:
-                for i in range(len(dim_compute) - 1, -1, -1):
-                    if (dim_compute[i] < trace_range[0] and dim_compute[i] not in active_nodes):
-                        dim_compute.pop(i)
-                continue
-            # clear source
-            for dim_source in trace["source"]:
-                for k in list(dim_source.keys()):
-                    if k < trace_range[0] and k not in active_nodes:
-                        dim_source.pop(k)
+        trace_barrier = max(node_idx - 100, 0)
+        active_nodes = self.active_node_list[trace_barrier]
+        active_nodes = [self.node_mgr.find_node_idx(i) for i in active_nodes.keys()]
+
+        trace = self.indice_trace_list[node_idx]
+        # clear compute
+        for dim_compute in trace["compute"]:
+            for i in range(len(dim_compute) - 1, -1, -1):
+                if (dim_compute[i] < trace_barrier and dim_compute[i] not in active_nodes):
+                    dim_compute.pop(i)
+            continue
+        # clear source
+        for dim_source in trace["source"]:
+            for k in list(dim_source.keys()):
+                if k < trace_barrier and k not in active_nodes:
+                    dim_source.pop(k)
 
     def trace_indice(self) -> None:
         for idx, node in enumerate(self.node_mgr.get_node_list()):
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index c6bbc219e41f..7c0bc29b5893 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -11,8 +11,8 @@
 
 class NodeMgr(object):
 
-    def __init__(self, gm) -> None:
-        self._node_list = list(gm.graph.nodes)
+    def __init__(self, nodes_list: List[Node]) -> None:
+        self._node_list = nodes_list
         self._node_dict = {}
         self._set_node_dict()
 
@@ -76,6 +76,8 @@ def flat_list(inputs: Any) -> List:
     for i in inputs:
         if isinstance(i, list) or isinstance(i, set) or isinstance(i, tuple):
             res.extend(flat_list(i))
+        elif isinstance(i, dict):
+            res.extend(flat_list(list(i.keys())))
         else:
             res.append(i)
     return res
@@ -135,13 +137,6 @@ def is_non_compute_node_except_placeholder_output(node: Node) -> bool:
     return is_non_compute_node_except_placeholder(node)
 
 
-def find_node_idx(name: str, nodes_list: List) -> int:
-    for idx, node in enumerate(nodes_list):
-        if node.name == name:
-            return idx
-    raise RuntimeError("name %s not found in node list" % name)
-
-
 def delete_free_var_from_last_use(user_to_last_uses: Dict) -> None:
     for key, value in user_to_last_uses.items():
         for n in value:
diff --git a/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py b/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
index 2f56f139abaf..896751e40146 100644
--- a/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
+++ b/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
@@ -61,7 +61,7 @@ def _benchmark_evoformer_stack_gm(
     # bench
     mem = _benchmark_memory(gm, inputs)
     speed = _benchmark_speed(gm, inputs)
-    print("evoformer stack gm, mem: %.2fMB, time: %.4fs, data_args: %s" % (mem, speed, str(data_args)))
+    print("evoformer stack gm, mem: %.2fMB, time: %.4fs" % (mem, speed))
 
 
 def _benchmark_evoformer_stack_origin(
@@ -83,14 +83,15 @@ def _benchmark_evoformer_stack_origin(
     # bench
     mem = _benchmark_memory(model, inputs)
     speed = _benchmark_speed(model, inputs)
-    print("evoformer stack origin, mem: %.2fMB, time: %.4fs, data_args: %s" % (mem, speed, str(data_args)))
+    print("evoformer stack origin, mem: %.2fMB, time: %.4fs" % (mem, speed))
+    return mem
 
 
 def _benchmark_memory(model, inputs):
     with torch.no_grad():
         torch.cuda.reset_peak_memory_stats()
         now_mem = torch.cuda.memory_allocated() / 1024**2
-        model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        model(*inputs)
         new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
     return new_max_mem - now_mem
 
@@ -108,13 +109,18 @@ def _benchmark_speed(model, inputs, loop=5):
     return (time2 - time1) / loop
 
 
-def benchmark_evoformer_stack():
+def benchmark_evoformer_stack(data_args):
     from test_autochunk_evoformer_stack import get_data, get_model
-    data_args = [128, 256]
-    print("")
-    _benchmark_evoformer_stack_origin(data_args, get_model, get_data)
-    _benchmark_evoformer_stack_gm(data_args, 600, get_model, get_data)
-    _benchmark_evoformer_stack_gm(data_args, 400, get_model, get_data)
+    print("\nmsa len: %d, pair len: %d" % (data_args[0], data_args[1]))
+    max_mem = _benchmark_evoformer_stack_origin(data_args, get_model, get_data)
+    for ratio in [0.5, 0.4, 0.3, 0.2, 0.1]:
+        try:
+            _benchmark_evoformer_stack_gm(data_args, max_mem * ratio, get_model, get_data)
+        except RuntimeError as e:
+            if e.args[0] == 'Search failed. Try a larger memory threshold.':
+                break
+        except Exception as e:
+            raise e
     _benchmark_evoformer_stack_gm(data_args, None, get_model, get_data)
 
 
@@ -128,4 +134,7 @@ def benchmark_evoformer_stack():
         port=free_port(),
         backend="nccl",
     )
-    benchmark_evoformer_stack()
+    benchmark_evoformer_stack((256, 256))
+    benchmark_evoformer_stack((256, 512))
+    benchmark_evoformer_stack((256, 1024))
+    benchmark_evoformer_stack((256, 1280))
diff --git a/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py
index be727701c091..17a5abf4cab8 100644
--- a/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py
+++ b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py
@@ -55,10 +55,10 @@ def get_data(msa_len: int, pair_len: int) -> Tuple[List, List]:
 
 def get_chunk_target() -> Dict:
     return {
-        None: [(120, 123), (222, 237), (269, 289), (305, 311), (100, 105), (146, 152), (187, 193), (241, 242),
-               (25, 50)],
-        20: [(120, 123), (232, 237), (277, 282), (305, 306), (100, 101), (34, 39)],
-        24: [(120, 123)],
+        None: [(120, 126), (225, 244), (270, 289), (306, 311), (70, 106), (23, 46), (146, 152), (187, 193), (181, 184),
+               (140, 145), (162, 163), (203, 204)],
+        20: [(120, 123), (232, 237), (277, 282), (305, 306)],
+        24: [(122, 123)],
     }
 
 
diff --git a/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py
index f8102f351982..ad955479e617 100644
--- a/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py
+++ b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py
@@ -53,15 +53,6 @@ def get_data(msa_len: int, pair_len: int) -> Tuple[List, List]:
     return meta_args, concrete_args
 
 
-def get_chunk_target() -> Dict:
-    return {
-        None: [(128, 131), (230, 245), (277, 297), (313, 319), (108, 113), (154, 160), (195, 201), (249, 250),
-               (36, 46)],
-        20: [(128, 131), (240, 245), (285, 290), (313, 314), (108, 109), (41, 46)],
-        24: [(128, 131)],
-    }
-
-
 @pytest.mark.skipif(
     not (AUTOCHUNK_AVAILABLE and HAS_REPO),
     reason="torch version is lower than 1.12.0",
@@ -75,7 +66,6 @@ def test_extramsa_block(data_args, max_memory):
         max_memory=max_memory,
         get_model=get_model,
         get_data=get_data,
-        get_chunk_target=get_chunk_target,
     )
     mp.spawn(run_func, nprocs=1)
 
@@ -87,7 +77,6 @@ def test_extramsa_block(data_args, max_memory):
         max_memory=None,
         get_model=get_model,
         get_data=get_data,
-        get_chunk_target=get_chunk_target,
         print_code=False,
         print_mem=False,
         print_progress=False,
diff --git a/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py b/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
index 43cefcb74988..5791af35124b 100644
--- a/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
+++ b/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
@@ -95,7 +95,7 @@ def _benchmark_memory(model, inputs):
     with torch.no_grad():
         torch.cuda.reset_peak_memory_stats()
         now_mem = float(torch.cuda.memory_allocated()) / 1024**2
-        model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        model(*inputs)
         new_max_mem = float(torch.cuda.max_memory_allocated()) / 1024**2
     return new_max_mem - now_mem
 
@@ -116,8 +116,7 @@ def _benchmark_speed(model, inputs, loop=5):
 def benchmark_autochunk_gpt(batch=1, seq=512, n_embd=768, n_head=12):
     from test_autochunk_gpt import GPT2Config, GPT2Model, get_data
     model = GPT2Model
-    config = GPT2Config(n_embd=n_embd, n_position=seq, n_layer=2, n_head=n_head)
-    config.max_position_embeddings = seq
+    config = GPT2Config(n_embd=n_embd, n_positions=seq, n_layer=2, n_head=n_head)
     model = model(config=config)
     shape = [batch, seq]
     print("\nbatch: %d, seq: %d, n_embd: %d, n_head: %d" % (batch, seq, n_embd, n_head))
diff --git a/tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py b/tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py
index 6e1076ec792b..018a2557a974 100644
--- a/tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py
+++ b/tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py
@@ -44,20 +44,19 @@ def test_autochunk_gpt(model, shape, max_memory):
         data=get_data(shape),
         max_memory=max_memory,
         model=model,
-        config=GPT2Config(n_embd=96, n_position=shape[1], n_layer=2, n_head=4),
+        config=GPT2Config(n_embd=96, n_positions=shape[1], n_layer=2, n_head=4),
     )
     mp.spawn(run_func, nprocs=1)
 
 
 if __name__ == "__main__":
-    run_test(
-        rank=0,
-        data=get_data((BATCH_SIZE, SEQ_LENGTH)),
-        max_memory=None,
-        model=GPT2Model,
-        config=GPT2Config(n_embd=96, n_position=SEQ_LENGTH, n_layer=2, n_head=4),
-        print_code=False,
-        print_est_mem=False,
-        print_mem=False,
-        print_progress=False,
-    )
+    run_test(rank=0,    # called directly here; the pytest entry point goes through mp.spawn
+             data=get_data((BATCH_SIZE, SEQ_LENGTH)),
+             max_memory=None,
+             model=GPT2Model,
+             config=GPT2Config(n_embd=96, n_positions=SEQ_LENGTH, n_layer=2, n_head=4),
+             print_code=False,
+             print_est_mem=False,
+             print_mem=False,
+             print_progress=False,
+             eval_mem=False)
diff --git a/tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py b/tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py
index cc26168c7191..bc5eda7edf91 100644
--- a/tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py
+++ b/tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py
@@ -24,6 +24,7 @@ def assert_codegen_run(
     print_mem: bool = False,
     print_progress: bool = False,
     print_code: bool = False,
+    eval_mem: bool = False,
 ) -> List[Dict]:
     meta_args, concrete_args, sequence = data
     if concrete_args is None:
@@ -39,12 +40,11 @@ def assert_codegen_run(
     meta_tensors = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
     meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
     interp.propagate(*meta_tensors)
-    codegen = AutoChunkCodeGen(
-        meta_graph,
-        max_memory=max_memory,
-        print_mem=print_est_mem,
-        print_progress=print_progress,
-    )
+    codegen = AutoChunkCodeGen(meta_graph,
+                               max_memory=max_memory,
+                               print_mem=print_est_mem,
+                               print_progress=print_progress,
+                               eval_mem=eval_mem)
     chunks = codegen.chunk_infos
 
     # trace and recompile
@@ -108,6 +108,7 @@ def run_test(
     print_est_mem: bool = False,
     print_mem: bool = False,
     print_progress: bool = False,
+    eval_mem: bool = False,
     get_chunk_target: Any = None,
 ) -> None:
     model = model(config=config)
@@ -122,15 +123,14 @@ def run_test(
     )
 
     # build model and input
-    chunks = assert_codegen_run(
-        model,
-        data=data,
-        max_memory=max_memory,
-        print_code=print_code,
-        print_est_mem=print_est_mem,
-        print_mem=print_mem,
-        print_progress=print_progress,
-    )
+    chunks = assert_codegen_run(model,
+                                data=data,
+                                max_memory=max_memory,
+                                print_code=print_code,
+                                print_est_mem=print_est_mem,
+                                print_mem=print_mem,
+                                print_progress=print_progress,
+                                eval_mem=eval_mem)
 
     if get_chunk_target is not None:
         chunk_found = [i["region"] for i in chunks]

From af3888481d6c5e2602ffe4afa69276778b161e7a Mon Sep 17 00:00:00 2001
From: Tomek <58812616+tomekrut@users.noreply.github.com>
Date: Thu, 9 Mar 2023 03:47:41 +0100
Subject: [PATCH 437/503] [example] fixed opt model downloading from
 huggingface

---
 examples/language/opt/train_gemini_opt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/opt/train_gemini_opt.py b/examples/language/opt/train_gemini_opt.py
index 1546b31ba922..4ea81ca01c45 100755
--- a/examples/language/opt/train_gemini_opt.py
+++ b/examples/language/opt/train_gemini_opt.py
@@ -177,7 +177,7 @@ def main():
     default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
 
     # build model
-    if args.model_name_or_path is None or args.model_name_or_path == 'facebook/opt-13b':
+    if args.model_name_or_path is None:
         # currently, there has a bug in pretrained opt-13b
         # we can not import it until huggingface fix it
         logger.info("Train a new model from scratch", ranks=[0])

From 360674283d617ff11c81653a0a67cd6113d03bcc Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Thu, 9 Mar 2023 10:59:28 +0800
Subject: [PATCH 438/503] [example] fix redundant note (#3065)

---
 examples/language/opt/train_gemini_opt.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/language/opt/train_gemini_opt.py b/examples/language/opt/train_gemini_opt.py
index 4ea81ca01c45..4993ce25db17 100755
--- a/examples/language/opt/train_gemini_opt.py
+++ b/examples/language/opt/train_gemini_opt.py
@@ -178,8 +178,6 @@ def main():
 
     # build model
     if args.model_name_or_path is None:
-        # currently, there has a bug in pretrained opt-13b
-        # we can not import it until huggingface fix it
         logger.info("Train a new model from scratch", ranks=[0])
         with ColoInitContext(device=init_dev, dtype=torch.half,
                              default_dist_spec=default_dist_spec,

From faa8526b857bd1c4e2ff75f1e35466b0667e3d63 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 9 Mar 2023 11:22:56 +0800
Subject: [PATCH 439/503] Automated submodule synchronization (#3062)

Co-authored-by: github-actions <github-actions@github.com>
---
 inference | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference b/inference
index a2c9905d94f9..56b35f3c06ea 160000
--- a/inference
+++ b/inference
@@ -1 +1 @@
-Subproject commit a2c9905d94f926a3135acd7228ae6b4e7474217c
+Subproject commit 56b35f3c06eaac11b1bee633d1e836563f74bcea

From f19b49e16460e78b2f92dedf6f115b98afd60913 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 9 Mar 2023 11:27:46 +0800
Subject: [PATCH 440/503] [booster] init module structure and definition
 (#3056)

---
 colossalai/booster/__init__.py          |  5 ++
 colossalai/booster/accelerator.py       | 14 ++++++
 colossalai/booster/booster.py           | 66 +++++++++++++++++++++++++
 colossalai/booster/environment_table.py | 18 +++++++
 colossalai/booster/plugin.py            | 46 +++++++++++++++++
 colossalai/booster/precision.py         | 25 ++++++++++
 6 files changed, 174 insertions(+)
 create mode 100644 colossalai/booster/__init__.py
 create mode 100644 colossalai/booster/accelerator.py
 create mode 100644 colossalai/booster/booster.py
 create mode 100644 colossalai/booster/environment_table.py
 create mode 100644 colossalai/booster/plugin.py
 create mode 100644 colossalai/booster/precision.py

diff --git a/colossalai/booster/__init__.py b/colossalai/booster/__init__.py
new file mode 100644
index 000000000000..d475676ba06a
--- /dev/null
+++ b/colossalai/booster/__init__.py
@@ -0,0 +1,5 @@
+from .accelerator import Accelerator
+from .booster import Booster
+from .environment_table import EnvironmentTable
+from .plugin import Plugin
+from .precision import Precision
diff --git a/colossalai/booster/accelerator.py b/colossalai/booster/accelerator.py
new file mode 100644
index 000000000000..63ba193e3e4f
--- /dev/null
+++ b/colossalai/booster/accelerator.py
@@ -0,0 +1,14 @@
+import torch
+import torch.nn as nn
+
+__all__ = ['Accelerator']
+
+
+class Accelerator:
+
+    def __init__(self, device: torch.device):
+        self.device = device
+
+    def setup_model(self, model: nn.Module) -> nn.Module:
+        # TODO: implement this method
+        pass
diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
new file mode 100644
index 000000000000..4aae200a0607
--- /dev/null
+++ b/colossalai/booster/booster.py
@@ -0,0 +1,66 @@
+from contextlib import contextmanager
+from typing import Callable, Iterator, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+from torch.utils.data import DataLoader
+
+from .plugin import Plugin
+
+__all__ = ['Booster']
+
+
+class Booster:
+
+    def __init__(self,
+                 device: Union[str, torch.device] = 'cuda',
+                 precision: str = 'fp32',
+                 grad_clipping_type: str = 'norm',
+                 grad_clipping_value: float = 0.0,
+                 plugin: Optional[Plugin] = None) -> None:
+        # TODO: implement this method
+        pass
+
+    def boost(
+        self, *args: Union[nn.Module, Optimizer, LRScheduler, DataLoader]
+    ) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
+        # TODO: implement this method
+        pass
+
+    def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:
+        # TODO: implement this method
+        pass
+
+    def execute_pipeline(self,
+                         data_iter: Iterator,
+                         model: nn.Module,
+                         criterion: Callable[[torch.Tensor], torch.Tensor],
+                         optimizer: Optimizer,
+                         return_loss: bool = True,
+                         return_outputs: bool = False) -> Tuple[Optional[torch.Tensor], ...]:
+        # TODO: implement this method
+        # run pipeline forward backward pass
+        # return loss or outputs if needed
+        pass
+
+    def no_sync(self, model: nn.Module) -> contextmanager:
+        # TODO: implement this method
+        pass
+
+    def save(self,
+             obj: Union[nn.Module, Optimizer, LRScheduler],
+             path_like: str,
+             plan: str = 'torch',
+             **kwargs) -> None:
+        # TODO: implement this method
+        pass
+
+    def load(self,
+             obj: Union[nn.Module, Optimizer, LRScheduler],
+             path_like: str,
+             plan: str = 'torch',
+             **kwargs) -> None:
+        # TODO: implement this method
+        pass
diff --git a/colossalai/booster/environment_table.py b/colossalai/booster/environment_table.py
new file mode 100644
index 000000000000..4b16f120c1b9
--- /dev/null
+++ b/colossalai/booster/environment_table.py
@@ -0,0 +1,18 @@
+from typing import List
+
+__all__ = ['EnvironmentTable']
+
+
+class EnvironmentTable:
+
+    def __init__(self, intra_op_world_sizes: List[int]):
+        # TODO: implement this method
+        pass
+
+    @property
+    def is_master(self) -> bool:
+        # TODO: implement this method
+        pass
+
+    # TODO: implement more utility methods as given in
+    # https://github.com/hpcaitech/ColossalAI/issues/3051
diff --git a/colossalai/booster/plugin.py b/colossalai/booster/plugin.py
new file mode 100644
index 000000000000..32e0a7bde3f7
--- /dev/null
+++ b/colossalai/booster/plugin.py
@@ -0,0 +1,46 @@
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader
+
+from colossalai.device.device_mesh import DeviceMesh
+
+__all__ = ['Plugin']
+
+
+class Plugin:
+
+    @property
+    def supported_devices(self) -> List[torch.device]:
+        pass
+
+    @property
+    def supported_precisions(self) -> List[str]:
+        pass
+
+    @property
+    def control_precision(self) -> bool:
+        pass
+
+    @property
+    def control_device(self) -> bool:
+        pass
+
+    @property
+    def support_no_sync(self) -> bool:
+        pass
+
+    def setup_model(self, model: nn.Module, device_mesh_pool: DeviceMesh) -> nn.Module:
+        pass
+
+    def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
+        pass
+
+    def setup_dataloader(self, dataloader: DataLoader) -> DataLoader:
+        pass
+
+    @property
+    def device_mesh_shape(self) -> List[Tuple[int, ...]]:
+        pass
diff --git a/colossalai/booster/precision.py b/colossalai/booster/precision.py
new file mode 100644
index 000000000000..8a391d9e4c88
--- /dev/null
+++ b/colossalai/booster/precision.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+from torch.optim import Optimizer
+
+__all__ = ['Precision']
+
+
+class Precision:
+
+    def __init__(self, precision_type: torch.dtype, grad_clipping_type: str, grad_clipping_value: float):
+        self.precision_type = precision_type
+        self.grad_clipping_type = grad_clipping_type
+        self.grad_clipping_value = grad_clipping_value
+
+    def setup_model(self, model: nn.Module) -> nn.Module:
+        # TODO: implement this method
+        pass
+
+    def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
+        # TODO: implement this method
+        # inject grad clipping and unscale loss
+        pass
+
+    def scale_loss(self, loss: torch.Tensor) -> torch.Tensor:
+        pass

From 91ccf97514af50111551e88a8a194c60f82590b4 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 9 Mar 2023 17:31:41 +0800
Subject: [PATCH 441/503] [workflow] fixed doc build trigger condition (#3072)

---
 .github/workflows/doc_build_after_merge.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/doc_build_after_merge.yml b/.github/workflows/doc_build_after_merge.yml
index 2f7b708ab3a8..ede04b336620 100644
--- a/.github/workflows/doc_build_after_merge.yml
+++ b/.github/workflows/doc_build_after_merge.yml
@@ -1,11 +1,11 @@
-name: Build Documentation upon Release
+name: Build Documentation After Merge
 
 on:
   workflow_dispatch:
   pull_request:
     paths:
       - 'version.txt'
-      - 'docs/'
+      - 'docs/**'
     types:
       - closed
 

From 416a50dbd713edab5ccb39aaf6dd1aecb0520e09 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 9 Mar 2023 18:10:45 +0800
Subject: [PATCH 442/503] [doc] moved doc test command to bottom (#3075)

---
 docs/README.md                                  | 2 +-
 docs/source/en/features/nvme_offload.md         | 4 +++-
 docs/source/en/get_started/installation.md      | 7 ++++---
 docs/source/zh-Hans/features/nvme_offload.md    | 4 +++-
 docs/source/zh-Hans/get_started/installation.md | 4 +++-
 5 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index d5e0c22f5d51..edeb4ec253f4 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -72,7 +72,7 @@ Meanwhile, you need to ensure the `sidebars.json` is updated such that it contai
 
 ### 🧹 Doc Testing
 
-Every documentation is tested to ensure it works well. You need to add the following line to the top of your file and replace `$command` with the actual command. Do note that the markdown will be converted into a Python file. Assuming you have a `demo.md` file, the test file generated will be `demo.py`. Therefore, you should use `demo.py` in your command, e.g. `python demo.py`.
+Every documentation is tested to ensure it works well. You need to add the following line to the **bottom of your file** and replace `$command` with the actual command. Do note that the markdown will be converted into a Python file. Assuming you have a `demo.md` file, the test file generated will be `demo.py`. Therefore, you should use `demo.py` in your command, e.g. `python demo.py`.
 
 ```markdown
 <!-- doc-test-command: $command  -->
diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md
index 68c422116e1d..2933c3db6c58 100644
--- a/docs/source/en/features/nvme_offload.md
+++ b/docs/source/en/features/nvme_offload.md
@@ -1,4 +1,3 @@
-<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 nvme_offload.py  -->
 # NVMe offload
 
 Author: Hongxin Liu
@@ -259,3 +258,6 @@ NVME offload saves about 294 MB memory. Note that enabling `pin_memory` of Gemin
 {{ autodoc:colossalai.nn.optimizer.HybridAdam }}
 
 {{ autodoc:colossalai.nn.optimizer.CPUAdam }}
+
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 nvme_offload.py  -->
diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md
index c1c34b4110d6..672fd8ae03a4 100644
--- a/docs/source/en/get_started/installation.md
+++ b/docs/source/en/get_started/installation.md
@@ -1,12 +1,10 @@
-<!-- doc-test-command: echo "installation.md does not need test" -->
-
 # Setup
 
 Requirements:
 - PyTorch >= 1.11 (PyTorch 2.x in progress)
 - Python >= 3.7
 - CUDA >= 11.0
-  
+
 If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 
@@ -47,3 +45,6 @@ If you don't want to install and enable CUDA kernel fusion (compulsory installat
 ```shell
 CUDA_EXT=1 pip install .
 ```
+
+
+<!-- doc-test-command: echo "installation.md does not need test" -->
diff --git a/docs/source/zh-Hans/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md
index f8ecdab42069..f33474efaa78 100644
--- a/docs/source/zh-Hans/features/nvme_offload.md
+++ b/docs/source/zh-Hans/features/nvme_offload.md
@@ -1,4 +1,3 @@
-<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 nvme_offload.py  -->
 # NVMe offload
 
 作者: Hongxin Liu
@@ -247,3 +246,6 @@ NVME 卸载节省了大约 294 MB 内存。注意使用 Gemini 的 `pin_memory`
 {{ autodoc:colossalai.nn.optimizer.HybridAdam }}
 
 {{ autodoc:colossalai.nn.optimizer.CPUAdam }}
+
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 nvme_offload.py  -->
diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md
index fb79fc676c0b..7a9b20255e77 100755
--- a/docs/source/zh-Hans/get_started/installation.md
+++ b/docs/source/zh-Hans/get_started/installation.md
@@ -5,7 +5,7 @@
 - PyTorch >= 1.11 (PyTorch 2.x 正在适配中)
 - Python >= 3.7
 - CUDA >= 11.0
-  
+
 如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
 
 ## 从PyPI上安装
@@ -44,3 +44,5 @@ pip install .
 ```shell
 NO_CUDA_EXT=1 pip install .
 ```
+
+<!-- doc-test-command: echo "installation.md does not need test" -->

From 89aa7926acf4b9d7c51df76ffa682a0fe796bf07 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 10 Mar 2023 09:47:20 +0800
Subject: [PATCH 443/503] [release] v0.2.6 (#3057)

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 3a4036fb450f..53a75d673557 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.5
+0.2.6

From 8e4e8601b7b339d7028a09cd1cbfe5e2e324efd6 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Fri, 10 Mar 2023 09:53:52 +0800
Subject: [PATCH 444/503] [DTensor] implement layout converter (#3055)

* [DTensor] refactor LayoutConverter for DTensor

* polish code

* polish docstring
---
 .../tensor/d_tensor/layout_converter.py       | 556 ++++++++++++++++++
 colossalai/tensor/d_tensor/utils.py           |  66 +++
 .../test_dtensor/test_layout_converter.py     | 206 +++++++
 3 files changed, 828 insertions(+)
 create mode 100644 colossalai/tensor/d_tensor/layout_converter.py
 create mode 100644 colossalai/tensor/d_tensor/utils.py
 create mode 100644 tests/test_tensor/test_dtensor/test_layout_converter.py

diff --git a/colossalai/tensor/d_tensor/layout_converter.py b/colossalai/tensor/d_tensor/layout_converter.py
new file mode 100644
index 000000000000..22bbb1d2fe74
--- /dev/null
+++ b/colossalai/tensor/d_tensor/layout_converter.py
@@ -0,0 +1,556 @@
+import math
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, TrainCycleItem
+from colossalai.context.singleton_meta import SingletonMeta
+from colossalai.tensor.d_tensor.comm_spec import *
+from colossalai.tensor.d_tensor.layout import Layout
+from colossalai.tensor.sharding_spec import ShardingSpecException
+from colossalai.tensor.utils import all_gather_simulator, all_to_all_simulator, shard_simulator
+
+from .sharding_spec import ShardingSpec
+from .utils import get_comm_cost
+
+__all__ = ['LayoutConverter', 'LayoutConverterOptions', 'set_layout_converting_options']
+
+
+@dataclass
+class LayoutConverterOptions:
+    """
+    LayoutConverterOptions is a dataclass which specifies the preferences for layout converting.
+    """
+    # TODO: layout converter option is not implemented yet
+    pass
+
+
+def to_global(distributed_tensor: torch.Tensor, layout: Layout) -> torch.Tensor:
+    shape_consistency_manager = LayoutConverter()
+    global_sharding_spec = ShardingSpec(distributed_tensor.dim(), {})
+    global_layout = Layout(device_mesh=layout.device_mesh,
+                           device_type=layout.device_type,
+                           sharding_spec=global_sharding_spec,
+                           entire_shape=layout.entire_shape)
+    with torch.no_grad():
+        global_tensor = shape_consistency_manager.apply(distributed_tensor, layout, global_layout)
+    return global_tensor
+
+
+def set_layout_converting_options(options: LayoutConverterOptions):
+    """
+    Configure the LayoutConverter by assigning ``options`` to its ``options`` property (type-checked by the setter).
+    """
+    manager = LayoutConverter()
+    manager.options = options
+
+
+class LayoutConverter(metaclass=SingletonMeta):
+
+    def __init__(self):
+        self._options = None
+        self._forward_only = False
+        self.cached_solution = {}
+
+    @property
+    def options(self):
+        return self._options
+
+    @options.setter
+    def options(self, options_: LayoutConverterOptions):
+        assert isinstance(options_, LayoutConverterOptions)
+        self._options = options_
+
+    @property
+    def forward_only(self):
+        return self._forward_only
+
+    @forward_only.setter
+    def forward_only(self, value):
+        assert isinstance(value, bool)
+        self._forward_only = value
+
+    def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, CommSpec]:
+        '''
+        Get all valid layouts from source_layout with single all-gather operation.
+        For the all-gather operation, we only care about the sharded (S) dimensions.
+
+        Argument:
+            source_layout(Layout): the layout to be transformed.
+
+        Return:
+            valid_spec_dict(Dict[Layout, CommSpec]): all valid layouts from source_layout with single all-gather operation.
+
+        Example:
+            layout_converter = LayoutConverter()
+            physical_mesh_id = torch.arange(0, 4)
+            mesh_shape = (2, 2)
+            # [[0, 1],
+            #  [2, 3]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+            entire_shape = (4, 4, 4)
+            dim_partition_dict = {0: [0], 1: [1]}
+
+            # [S0,S1,R]
+            sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
+            layout = Layout(device_mesh=device_mesh,
+                            device_type=torch.device('cuda'),
+                            sharding_spec=sharding_spec,
+                            entire_shape=entire_shape)
+
+            rst_dict = layout_converter.all_gather_transform_layouts(layout)
+            for layout, comm_spec in rst_dict.items():
+                print(f'{layout.sharding_spec.sharding_sequence}: {comm_spec}')
+
+        Output:
+            [R, S1, R]: CommSpec:(comm_pattern:GATHER_FWD_SPLIT_BWD, gather_dim:0, shard_dim:0, logical_process_axis:0)
+            [S0, R, R]: CommSpec:(comm_pattern:GATHER_FWD_SPLIT_BWD, gather_dim:1, shard_dim:1, logical_process_axis:1)
+        '''
+        valid_spec_dict = {}
+        comm_pattern = CollectiveCommPattern.GATHER_FWD_SPLIT_BWD
+        source_spec = source_layout.sharding_spec
+        process_groups_dict = source_layout.device_mesh.process_groups_dict
+        for target_pair in source_spec.dim_partition_dict.items():
+            shard_list = all_gather_simulator(target_pair)
+            index = target_pair[0]
+            new_dim_partition_dict = deepcopy(source_spec.dim_partition_dict)
+
+            # We won't add empty list into dim_partition_dict
+            # The key will be popped if the related shard_list is empty
+            if shard_list:
+                new_dim_partition_dict[index] = shard_list
+            else:
+                new_dim_partition_dict.pop(index)
+
+            # generate the CommSpec to record the action of source_sharding_spec->new_sharding_spec
+            gather_dim = index
+            logical_process_axis = target_pair[1][-1]
+            comm_spec = CommSpec(
+                comm_pattern,
+                process_groups_dict=process_groups_dict,
+                gather_dim=gather_dim,
+                # shard_dim will be used during backward
+                shard_dim=gather_dim,
+                logical_process_axis=logical_process_axis)
+
+            # generate new sharding spec
+            try:
+                new_sharding_spec = ShardingSpec(source_spec.dims, dim_partition_dict=new_dim_partition_dict)
+                new_layout = Layout(device_mesh=source_layout.device_mesh,
+                                    sharding_spec=new_sharding_spec,
+                                    device_type=source_layout.device_type,
+                                    entire_shape=source_layout.entire_shape)
+
+                valid_spec_dict[new_layout] = comm_spec
+            except ShardingSpecException:
+                pass    # candidate rejected with ShardingSpecException: leave it out of the result
+        return valid_spec_dict
+
+    def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec]:
+        '''
+        Get all valid layouts from source_layout with single all-to-all operation.
+        For the all-to-all operation, we just care about the pairs containing S dimension.
+
+        Argument:
+            source_layout(Layout): the layout to be transformed.
+
+        Return:
+            valid_spec_dict(Dict[Layout, CommSpec]): all valid layouts from source_layout with single all-to-all operation.
+
+        Example:
+            layout_converter = LayoutConverter()
+            physical_mesh_id = torch.arange(0, 4)
+            mesh_shape = (2, 2)
+            # [[0, 1],
+            #  [2, 3]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+            entire_shape = (4, 4, 4)
+            dim_partition_dict = {0: [0], 1: [1]}
+
+            # [S0,S1,R]
+            sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
+            layout = Layout(device_mesh=device_mesh,
+                            device_type=torch.device('cuda'),
+                            sharding_spec=sharding_spec,
+                            entire_shape=entire_shape)
+            rst_dict = layout_converter.all_to_all_transform_layout(layout)
+
+            for layout, comm_spec in rst_dict.items():
+                print(f'{layout.sharding_spec.sharding_sequence}: {comm_spec}')
+
+        Output:
+            [S01, R, R]: CommSpec:(comm_pattern:ALL2ALL_FWD_ALL2ALL_BWD, gather_dim:1, shard_dim:0, logical_process_axis: 1)
+            [R, S1, S0]: CommSpec:(comm_pattern:ALL2ALL_FWD_ALL2ALL_BWD, gather_dim:0, shard_dim:2, logical_process_axis: 0)
+            [S0, R, S1]: CommSpec:(comm_pattern:ALL2ALL_FWD_ALL2ALL_BWD, gather_dim:1, shard_dim:2, logical_process_axis: 1)
+        '''
+        valid_spec_dict = {}
+        comm_pattern = CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD
+        process_groups_dict = source_layout.device_mesh.process_groups_dict
+        source_spec = source_layout.sharding_spec
+        tensor_dims = source_spec.dims
+        for f_index in range(tensor_dims - 1):
+            for b_index in range(f_index + 1, tensor_dims):
+                # skip (R, R) cases
+                if f_index not in source_spec.dim_partition_dict and b_index not in source_spec.dim_partition_dict:
+                    continue
+                else:
+                    if f_index in source_spec.dim_partition_dict:
+                        # skip: (S01, R) -> (R, S01) is NOT allowed
+                        if len(source_spec.dim_partition_dict[f_index]) >= 2:
+                            continue
+                        f_target_pair = (f_index, deepcopy(source_spec.dim_partition_dict[f_index]))
+                    else:
+                        f_target_pair = (f_index, [])
+                    if b_index in source_spec.dim_partition_dict:
+                        # skip: (R, S01) -> (S01, R) is NOT allowed
+                        if len(source_spec.dim_partition_dict[b_index]) >= 2:
+                            continue
+                        b_target_pair = (b_index, deepcopy(source_spec.dim_partition_dict[b_index]))
+                    else:
+                        b_target_pair = (b_index, [])
+
+                # skip: (S1, S0) -> S10 is NOT allowed
+                if f_target_pair[1] and b_target_pair[1] and f_target_pair[1][0] >= b_target_pair[1][0]:
+                    continue
+                f_shard_list, b_shard_list = all_to_all_simulator(f_target_pair, b_target_pair)
+                f_index = f_target_pair[0]
+                b_index = b_target_pair[0]
+
+                # generate the CommSpec to record the action of source_sharding_spec->new_sharding_spec
+                if len(f_shard_list) < len(f_target_pair[1]):
+                    gather_dim = f_index
+                    shard_dim = b_index
+                    logical_process_axis = f_target_pair[1][-1]
+                else:
+                    gather_dim = b_index
+                    shard_dim = f_index
+                    logical_process_axis = b_target_pair[1][-1]
+                comm_spec = CommSpec(comm_pattern,
+                                     process_groups_dict,
+                                     gather_dim=gather_dim,
+                                     shard_dim=shard_dim,
+                                     logical_process_axis=logical_process_axis)
+
+                new_dim_partition_dict = deepcopy(source_spec.dim_partition_dict)
+
+                # We won't add empty list into dim_partition_dict
+                # The key will be popped if the related shard_list is empty
+                if f_shard_list:
+                    new_dim_partition_dict[f_index] = f_shard_list
+                else:
+                    new_dim_partition_dict.pop(f_index)
+                if b_shard_list:
+                    new_dim_partition_dict[b_index] = b_shard_list
+                else:
+                    new_dim_partition_dict.pop(b_index)
+
+                # generate new sharding spec
+                try:
+                    new_sharding_spec = ShardingSpec(source_spec.dims, dim_partition_dict=new_dim_partition_dict)
+                    new_layout = Layout(device_mesh=source_layout.device_mesh,
+                                        sharding_spec=new_sharding_spec,
+                                        device_type=source_layout.device_type,
+                                        entire_shape=source_layout.entire_shape)
+                    valid_spec_dict[new_layout] = comm_spec
+                except ShardingSpecException:
+                    pass    # candidate rejected with ShardingSpecException: leave it out of the result
+
+        return valid_spec_dict
+
+    def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec]:
+        '''
+        Get all valid layouts from source_layout with single shard operation.
+        Only legal sharding dimensions are considered, i.e. device mesh axes not yet used by source_layout.
+
+        Argument:
+            source_layout(Layout): the layout to be transformed.
+
+        Return:
+            valid_spec_dict(Dict[Layout, CommSpec]): all valid layouts from source_layout with single shard operation.
+
+        Example:
+            layout_converter = LayoutConverter()
+            physical_mesh_id = torch.arange(0, 4)
+            mesh_shape = (2, 2)
+            # [[0, 1],
+            #  [2, 3]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+            entire_shape = (4, 4, 4)
+
+            dim_partition_dict = {0: [0]}
+
+            # [S0,R,R]
+            sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
+            layout = Layout(device_mesh=device_mesh,
+                          device_type=torch.device('cuda'),
+                          sharding_spec=sharding_spec,
+                          entire_shape=entire_shape)
+            rst_dict = layout_converter.shard_transform_layout(layout)
+
+            for layout, comm_spec in rst_dict.items():
+                print(f'{layout.sharding_spec.sharding_sequence}: {comm_spec}')
+
+        Output:
+            [S01, R, R]: CommSpec:(comm_pattern:SPLIT_FWD_GATHER_BWD, gather_dim:0, shard_dim:0, logical_process_axis:1)
+            [S0, S1, R]: CommSpec:(comm_pattern:SPLIT_FWD_GATHER_BWD, gather_dim:1, shard_dim:1, logical_process_axis:1)
+            [S0, R, S1]: CommSpec:(comm_pattern:SPLIT_FWD_GATHER_BWD, gather_dim:2, shard_dim:2, logical_process_axis:1)
+        '''
+        valid_spec_dict = {}
+        comm_pattern = CollectiveCommPattern.SPLIT_FWD_GATHER_BWD
+        source_spec = source_layout.sharding_spec
+        process_groups_dict = source_layout.device_mesh.process_groups_dict
+
+        # legal sharding dims means the mesh_id is still available to use.
+        legal_sharding_dims = [i for i in range(len(source_layout.device_mesh.mesh_shape))]
+        for dim, shard_list in source_spec.dim_partition_dict.items():
+            for element in shard_list:
+                legal_sharding_dims.remove(element)    # this mesh axis is already used by tensor dim `dim`
+
+        if len(legal_sharding_dims) == 0:    # every mesh axis is in use, nothing is left to shard over
+            return valid_spec_dict
+
+        tensor_dims = source_spec.dims
+
+        for index in range(tensor_dims):
+            if index not in source_spec.dim_partition_dict:
+                shard_list_list = shard_simulator((index, []), legal_sharding_dims)
+            else:
+                shard_list_list = shard_simulator((index, source_spec.dim_partition_dict[index]), legal_sharding_dims)
+            if not shard_list_list:
+                continue
+            for shard_list in shard_list_list:
+                new_dim_partition_dict = deepcopy(source_spec.dim_partition_dict)
+                new_dim_partition_dict[index] = shard_list
+
+                # generate the CommSpec to record the action of source_sharding_spec->new_sharding_spec
+                shard_dim = index
+                logical_process_axis = shard_list[-1]    # presumably the axis shard_simulator just added - TODO confirm
+                comm_spec = CommSpec(comm_pattern,
+                                     process_groups_dict,
+                                     gather_dim=shard_dim,
+                                     shard_dim=shard_dim,
+                                     logical_process_axis=logical_process_axis)
+
+                # generate new sharding spec
+                try:
+                    new_sharding_spec = ShardingSpec(dim_size=source_spec.dims,
+                                                     dim_partition_dict=new_dim_partition_dict)
+                    new_layout = Layout(device_mesh=source_layout.device_mesh,
+                                        sharding_spec=new_sharding_spec,
+                                        device_type=source_layout.device_type,
+                                        entire_shape=source_layout.entire_shape)
+                    valid_spec_dict[new_layout] = comm_spec
+                except ShardingSpecException:
+                    pass    # candidates that raise ShardingSpecException are skipped on purpose
+        return valid_spec_dict
+
+    def get_all_one_step_transform_spec(self, source_layout: Layout) -> Dict[Layout, CommSpec]:
+        '''
+        Collect every layout that can be reached from source_layout with exactly one transform.
+
+        Note:
+            all-gather removes a sharding dimension, all-to-all keeps the number of sharding dimensions
+            unchanged, and shard adds one. The three result sets are therefore mutually exclusive, so
+            they can be merged into a single dict safely.
+
+        Argument:
+            source_layout(Layout): the layout to be transformed.
+        Return:
+            valid_spec_dict(Dict[Layout, CommSpec]): all valid layouts from source_layout with one step transform.
+        '''
+        one_step_generators = (self.all_gather_transform_layouts, self.all_to_all_transform_layout,
+                               self.shard_transform_layout)
+        valid_spec_dict = {}
+        for generate in one_step_generators:
+            valid_spec_dict.update(generate(source_layout))
+        return valid_spec_dict
+
+    def layout_converting(self, source_layout: Layout,
+                          target_layout: Layout) -> Tuple[List[Layout], List[CommSpec]]:
+        '''
+        Find a path that transforms source_layout into target_layout with a greedy algorithm.
+        The basic idea is:
+        Step1: Generate all one-step transform layouts from the current layout.
+        Step2: Pick the 'best' layout following the heuristic function (the smallest spec_diff to the target).
+        Step3: Repeat above steps until the current layout is transformed into the target layout.
+
+        Additionally, to avoid repeating the path search in runtime, we cache all solved paths
+        in auto parallel strategy building time, which could handle most of the cases in runtime.
+        The cache key is the pair of sharding sequences, and the cached lists themselves are returned.
+
+        Args:
+            source_layout(Layout): the layout to be transformed.
+            target_layout(Layout): the layout to be achieved after a series of transforms.
+
+        Return:
+            transform_path(List[Layout]): The transform path from source_layout to target_layout, it contains
+                                          the source_layout and target_layout (empty if the specs already match).
+            comm_action_sequence(List[CommSpec]): Keep the communication operations to complete the layout converting in order.
+
+        Raises:
+            RuntimeError: if a step has no candidate layout, or the target is not reached within the step limit.
+
+        Example:
+            physical_mesh_id = torch.arange(0, 4)
+            mesh_shape = (2, 2)
+            # [[0, 1],
+            #  [2, 3]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+            entire_shape = (4, 4, 4)
+
+            dim_partition_source = {1: [0, 1]}
+            dim_partition_target = {0: [0, 1]}
+
+            # [R,S01,R]
+            sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
+            source_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
+                                sharding_spec=sharding_spec_source,
+                                entire_shape=entire_shape)
+
+            # [S01,R,R]
+            sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
+            target_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
+                                sharding_spec=sharding_spec_target,
+                                entire_shape=entire_shape)
+
+            transform_path, comm_action_sequence = layout_converter.layout_converting(source_layout, target_layout)
+            transform_path_str = '->'.join([str(layout.sharding_spec.sharding_sequence) for layout in transform_path])
+            print(transform_path_str)
+
+        output:
+            [R, S01, R]->[R, S0, R]->[S0, R, R]->[S01, R, R]
+        '''
+        source_spec = source_layout.sharding_spec
+        target_spec = target_layout.sharding_spec
+        MAX_TRANSFORM_STEPS = 20
+        total_steps = 0
+        transform_path = []
+        comm_action_sequence = []
+        spec_pairs = (str(source_spec.sharding_sequence), str(target_spec.sharding_sequence))
+
+        if spec_pairs in self.cached_solution:
+            return self.cached_solution[spec_pairs]
+
+        # We do nothing if the sharding spec is all the same.
+        if source_spec.spec_diff(target_spec) == 0:
+            self.cached_solution[spec_pairs] = (transform_path, comm_action_sequence)
+            return (transform_path, comm_action_sequence)
+
+        temp_sharding_layout = source_layout
+        transform_path.append(temp_sharding_layout)
+        # To avoid dead loop, the loop will break after MAX_TRANSFORM_STEPS transforms
+        while total_steps < MAX_TRANSFORM_STEPS:
+            valid_transform_spec_dict = self.get_all_one_step_transform_spec(temp_sharding_layout)
+            best_difference_score = math.inf
+            # Reset the candidate on every step so that a step without candidates cannot reuse a stale one.
+            best_layout, best_comm_spec = None, None
+
+            for layout, comm_spec in valid_transform_spec_dict.items():
+                spec_difference = layout.sharding_spec.spec_diff(target_spec)
+
+                if spec_difference == 0:
+                    transform_path.append(layout)
+                    comm_action_sequence.append(comm_spec)
+                    self.cached_solution[spec_pairs] = (transform_path, comm_action_sequence)
+                    return (transform_path, comm_action_sequence)
+
+                # Greedy choice: keep the first candidate that is closest to the target spec.
+                if spec_difference < best_difference_score:
+                    best_layout, best_comm_spec = layout, comm_spec
+                    best_difference_score = spec_difference
+
+            if best_layout is None:
+                stuck_sequence = temp_sharding_layout.sharding_spec.sharding_sequence
+                raise RuntimeError(f"No valid one-step transform is available from {stuck_sequence}.")
+            temp_sharding_layout = best_layout
+            transform_path.append(temp_sharding_layout)
+            comm_action_sequence.append(best_comm_spec)
+            total_steps += 1
+
+        raise RuntimeError(f"Could not find a valid transform path with in {MAX_TRANSFORM_STEPS} steps.")
+
+    def get_total_comm_cost(self, source_layout: Layout, target_layout: Layout) -> Dict[str, float]:
+        '''
+        Sum the 'forward', 'backward' and 'total' communication costs of every step on the converting path.
+        '''
+        layouts, comm_specs = self.layout_converting(source_layout, target_layout)
+        total_cost = {'forward': 0.0, 'backward': 0.0, 'total': 0.0}
+        # each comm spec is charged against the layout stored at the same index of the path
+        for step_layout, step_comm_spec in zip(layouts, comm_specs):
+            step_cost = get_comm_cost(step_layout, step_comm_spec, self.forward_only)
+            total_cost = {name: subtotal + step_cost[name] for name, subtotal in total_cost.items()}
+        return total_cost
+
+    def apply(self, tensor: torch.Tensor, source_layout: Layout, target_layout: Layout) -> torch.Tensor:
+        '''
+        Redistribute tensor from source_layout to target_layout by replaying, in order, the communication
+        actions returned by the layout_converting method, and return the resulting tensor.
+
+        Argument:
+            tensor (torch.Tensor): The tensor to be redistributed.
+            source_layout(Layout): The source layout of the tensor.
+            target_layout (Layout): The tensor will be redistributed to the target_layout.
+
+        Example:
+            layout_converter = LayoutConverter()
+            dim_partition_source = {0: [0]}
+            dim_partition_target = {1: [0]}
+            physical_mesh_id = torch.arange(0, 4)
+            mesh_shape = (2, 2)
+            # [[0, 1],
+            #  [2, 3]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+            entire_shape = (4, 4, 4)
+
+            # [S0,R,R]
+            sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
+            source_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
+                                sharding_spec=sharding_spec_source,
+                                entire_shape=entire_shape)
+
+            # [R,S0,R]
+            sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
+            target_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
+                                sharding_spec=sharding_spec_target,
+                                entire_shape=entire_shape)
+
+            if rank in (0, 1):
+                sharded_tensor_0 = torch.zeros(2, 1)
+                sharded_tensor_1 = torch.ones(2, 1)
+                # tensor([[0., 1.],
+                #         [0., 1.]])
+                tensor_to_comm = torch.cat((sharded_tensor_0, sharded_tensor_1), 1).cuda()
+            if rank in (2, 3):
+                sharded_tensor_0 = torch.ones(2, 1) * 2
+                sharded_tensor_1 = torch.ones(2, 1) * 3
+                # tensor([[2., 3.],
+                #         [2., 3.]])
+                tensor_to_comm = torch.cat((sharded_tensor_0, sharded_tensor_1), 1).cuda()
+
+            # converted_tensor: [R, S0, R]
+            converted_tensor = layout_converter.apply(tensor_to_comm, source_layout, target_layout)
+            print(converted_tensor)
+
+        Output in rank0 and rank1:
+            tensor([[0.],
+                    [0.],
+                    [2.],
+                    [2.]])
+
+        Output in rank2 and rank3:
+            tensor([[1.],
+                    [1.],
+                    [3.],
+                    [3.]])
+        '''
+        _, comm_action_sequence = self.layout_converting(source_layout, target_layout)    # the path itself is unused
+        for comm_spec in comm_action_sequence:
+            tensor = comm_spec.covert_spec_to_action(tensor)    # each action consumes the previous action's output
+        return tensor
diff --git a/colossalai/tensor/d_tensor/utils.py b/colossalai/tensor/d_tensor/utils.py
new file mode 100644
index 000000000000..644bb6306b42
--- /dev/null
+++ b/colossalai/tensor/d_tensor/utils.py
@@ -0,0 +1,66 @@
+import operator
+from functools import reduce
+from typing import Dict
+
+from colossalai.tensor.d_tensor.comm_spec import CollectiveCommPattern, CommSpec
+from colossalai.tensor.d_tensor.layout import Layout
+
+
+def get_comm_cost(layout: Layout, comm_spec: CommSpec, forward_only: bool = False) -> Dict[str, float]:
+    '''
+    This method is used to compute the communication cost for a given layout and comm_spec.
+
+    For all_gather, all2all, and all_reduce operation, the formula provided in DeviceMesh with alpha-beta model is used to
+    compute the communication cost. For shard operation, it is an on-chip operation, so the communication cost is a tiny cost.
+
+    Args:
+        layout: the layout of the tensor.
+        comm_spec: the comm_spec to instruct the communication operation.
+        forward_only: if it is True, we will just count the forward communication cost.
+            If it is False, we will count both forward and backward communication cost.
+
+    Returns:
+        A dict with the keys "forward", "backward" and "total" ("total" is the sum of the other two).
+
+    Raises:
+        ValueError: if comm_spec.comm_pattern is not one of the patterns handled below.
+    '''
+    comm_size = reduce(operator.mul, layout.get_sharded_shape_per_device(), 1)
+    device_mesh = layout.device_mesh
+    comm_pattern = comm_spec.comm_pattern
+    logical_process_axis = comm_spec.logical_process_axis
+    # give a tiny cost to shard
+    shard_cost = 100
+
+    if comm_pattern == CollectiveCommPattern.GATHER_FWD_SPLIT_BWD:
+        # the comm size for all gather is the size of the gathered tensor
+        gather_dim = comm_spec.gather_dim
+        all_gather_axis = layout.sharding_spec.dim_partition_dict[gather_dim][-1]
+        all_gather_size = device_mesh.mesh_shape[all_gather_axis]
+        comm_size_for_all_gather = comm_size * all_gather_size
+        forward_communication_cost = device_mesh.all_gather_cost(comm_size_for_all_gather, logical_process_axis)
+        backward_communication_cost = shard_cost
+    elif comm_pattern == CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD:
+        forward_communication_cost = device_mesh.all_to_all_cost(comm_size, logical_process_axis)
+        # grad should have same shape as input tensor
+        # all to all operation has same logical process axis as forward.
+        backward_communication_cost = device_mesh.all_to_all_cost(comm_size, logical_process_axis)
+    elif comm_pattern == CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD:
+        forward_communication_cost = device_mesh.all_reduce_cost(comm_size, logical_process_axis)
+        backward_communication_cost = 0
+    elif comm_pattern == CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD:
+        forward_communication_cost = 0
+        backward_communication_cost = device_mesh.all_reduce_cost(comm_size, logical_process_axis)
+    elif comm_pattern == CollectiveCommPattern.SPLIT_FWD_GATHER_BWD:
+        forward_communication_cost = shard_cost
+        backward_communication_cost = device_mesh.all_gather_cost(comm_size, logical_process_axis)
+    else:
+        # fail with a clear message instead of an UnboundLocalError further down
+        raise ValueError(f"Unsupported comm_pattern for communication cost estimation: {comm_pattern}")
+
+    if forward_only:
+        backward_communication_cost = 0
+
+    total_communication_cost = forward_communication_cost + backward_communication_cost
+    return {"forward": forward_communication_cost, "backward": backward_communication_cost,
+            "total": total_communication_cost}
diff --git a/tests/test_tensor/test_dtensor/test_layout_converter.py b/tests/test_tensor/test_dtensor/test_layout_converter.py
new file mode 100644
index 000000000000..70cf8726dbd0
--- /dev/null
+++ b/tests/test_tensor/test_dtensor/test_layout_converter.py
@@ -0,0 +1,206 @@
+import math
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.tensor.d_tensor.comm_spec import CollectiveCommPattern
+from colossalai.tensor.d_tensor.layout import Layout
+from colossalai.tensor.d_tensor.layout_converter import LayoutConverter
+from colossalai.tensor.d_tensor.sharding_spec import DimSpec, ShardingSpec
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.utils import free_port
+
+entire_shape = torch.Size((64, 32, 16))
+layout_converter = LayoutConverter()
+physical_mesh_id = torch.arange(0, 4).reshape(2, 2)
+mesh_shape = (2, 2)
+
+
+def check_one_step_transform(rank, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    # [[0, 1],
+    #  [2, 3]]
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+
+    dim_partition_dict = {0: [0], 1: [1]}
+    # DistSpec:
+    #     shard_sequence: S0,S1,R
+    #     device_mesh_shape: (2, 2)
+    sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
+    layout = Layout(device_mesh=device_mesh,
+                    device_type=torch.device('cuda'),
+                    sharding_spec=sharding_spec,
+                    entire_shape=entire_shape)
+
+    rst_dict = layout_converter.all_gather_transform_layouts(layout)
+
+    assert '[R, S1, R]' in [
+        str(all_gather_layout.sharding_spec.sharding_sequence) for all_gather_layout in rst_dict.keys()
+    ]
+    assert '[S0, R, R]' in [
+        str(all_gather_layout.sharding_spec.sharding_sequence) for all_gather_layout in rst_dict.keys()
+    ]
+
+    dim_partition_dict_all2all = {0: [0], 1: [1]}
+    # DistSpec:
+    #     shard_sequence: S0,S1,R
+    #     device_mesh_shape: (2, 2)
+    sharding_spec_all2all = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict_all2all)
+    layout_all2all = Layout(device_mesh=device_mesh,
+                            device_type=torch.device('cuda'),
+                            sharding_spec=sharding_spec_all2all,
+                            entire_shape=entire_shape)
+
+    rst_dict_all2all = layout_converter.all_to_all_transform_layout(layout_all2all)
+
+    assert '[S01, R, R]' in [
+        str(all2all_layout.sharding_spec.sharding_sequence) for all2all_layout in rst_dict_all2all.keys()
+    ]
+    assert '[R, S1, S0]' in [
+        str(all2all_layout.sharding_spec.sharding_sequence) for all2all_layout in rst_dict_all2all.keys()
+    ]
+    assert '[S0, R, S1]' in [
+        str(all2all_layout.sharding_spec.sharding_sequence) for all2all_layout in rst_dict_all2all.keys()
+    ]
+
+    dim_partition_shard = {0: [0]}
+    # DistSpec:
+    #     shard_sequence: S0,R,R
+    #     device_mesh_shape: (2, 2)
+    sharding_spec_shard = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_shard)
+    shard_layout = Layout(device_mesh=device_mesh,
+                          device_type=torch.device('cuda'),
+                          sharding_spec=sharding_spec_shard,
+                          entire_shape=entire_shape)
+
+    rst_dict_shard = layout_converter.shard_transform_layout(shard_layout)
+
+    assert '[S01, R, R]' in [
+        str(shard_layout.sharding_spec.sharding_sequence) for shard_layout in rst_dict_shard.keys()
+    ]
+    assert '[S0, S1, R]' in [
+        str(shard_layout.sharding_spec.sharding_sequence) for shard_layout in rst_dict_shard.keys()
+    ]
+    assert '[S0, R, S1]' in [
+        str(shard_layout.sharding_spec.sharding_sequence) for shard_layout in rst_dict_shard.keys()
+    ]
+
+
+def check_layout_converting(rank, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    dim_partition_source = {1: [0, 1]}
+    dim_partition_target = {0: [0, 1]}
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+
+    # DistSpec:
+    #     shard_sequence: R,S01,R
+    #     device_mesh_shape: (2, 2)
+    sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
+    source_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_source,
+                           entire_shape=entire_shape)
+
+    # DistSpec:
+    #     shard_sequence: S01,R,R
+    #     device_mesh_shape: (2, 2)
+    sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
+    target_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_target,
+                           entire_shape=entire_shape)
+
+    transform_path, comm_action_sequence = layout_converter.layout_converting(source_layout, target_layout)
+
+    # check transform path
+    transform_path_str = '->'.join([str(layout.sharding_spec.sharding_sequence) for layout in transform_path])
+    assert transform_path_str == '[R, S01, R]->[R, S0, R]->[S0, R, R]->[S01, R, R]'
+
+    # check comm action sequence
+    # all-gather(S01) -> S0
+    assert comm_action_sequence[0].comm_pattern == CollectiveCommPattern.GATHER_FWD_SPLIT_BWD
+    assert comm_action_sequence[0].gather_dim == 1
+    assert comm_action_sequence[0].logical_process_axis == 1
+
+    # all-to-all(R, S0) -> [S0, R]
+    assert comm_action_sequence[1].comm_pattern == CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD
+    assert comm_action_sequence[1].gather_dim == 1
+    assert comm_action_sequence[1].shard_dim == 0
+    assert comm_action_sequence[1].logical_process_axis == 0
+
+    # shard(S0) -> [S01]
+    assert comm_action_sequence[2].comm_pattern == CollectiveCommPattern.SPLIT_FWD_GATHER_BWD
+    assert comm_action_sequence[2].shard_dim == 0
+    assert comm_action_sequence[2].logical_process_axis == 1
+
+    # check that the solved path was stored in cached_solution under the (source, target) sequence pair
+    assert layout_converter.cached_solution[('[R, S01, R]', '[S01, R, R]')][0] == transform_path
+    assert layout_converter.cached_solution[('[R, S01, R]', '[S01, R, R]')][1] == comm_action_sequence
+
+    comm_cost = layout_converter.get_total_comm_cost(source_layout, target_layout)
+
+    assert comm_cost['forward'] == comm_cost['backward']
+    assert math.floor(comm_cost['total']) == math.floor(comm_cost['forward'] + comm_cost['backward'])
+
+
+def check_layout_converting_apply(rank, world_size, port):
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+
+    dim_partition_source = {1: [0, 1]}
+    dim_partition_target = {0: [0, 1]}
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+
+    # DistSpec:
+    #     shard_sequence: R,S01,R
+    #     device_mesh_shape: (2, 2)
+    sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
+    source_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_source,
+                           entire_shape=entire_shape)
+
+    # DistSpec:
+    #     shard_sequence: S01,R,R
+    #     device_mesh_shape: (2, 2)
+    sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
+    target_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_target,
+                           entire_shape=entire_shape)
+
+    original_tensor = torch.rand(entire_shape).cuda()
+
+    # tensor_to_apply: [R, S01, R]
+    tensor_to_apply = original_tensor.narrow(1, rank * 8, 8)    # dim 1 has size 32: 8 entries per rank on 4 ranks
+
+    # tensor_to_check: [S01, R, R]
+    tensor_to_check = original_tensor.narrow(0, rank * 16, 16)    # dim 0 has size 64: 16 entries per rank on 4 ranks
+
+    converted_tensor = layout_converter.apply(tensor_to_apply, source_layout, target_layout)
+    assert converted_tensor.equal(tensor_to_check)
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_layout_converter():
+    # every check runs in its own group of spawned processes, each group on a freshly picked port
+    world_size = 4
+    check_funcs = (
+        check_one_step_transform,
+        check_layout_converting,
+        check_layout_converting_apply,
+    )
+    for check_func in check_funcs:
+        mp.spawn(partial(check_func, world_size=world_size, port=free_port()), nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_layout_converter()

From e58a3c804c4b107462d232c749a276cf7a704512 Mon Sep 17 00:00:00 2001
From: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Date: Fri, 10 Mar 2023 09:55:58 +0800
Subject: [PATCH 445/503] Fix the version of lightning and colossalai in Stable
 Diffusion environment requirement (#3073)

1. Modify the README of stable diffusion
2. Fix the version of pytorch lightning&lightning and colossalai version to enable codes running successfully.
---
 examples/images/diffusion/README.md        | 2 +-
 examples/images/diffusion/environment.yaml | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index cc57f6d54a8e..9e9900f2ca61 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -69,7 +69,7 @@ python setup.py install
 ##### From pip
 
 ```
-pip install pytorch-lightning
+pip install pytorch-lightning==2.0.0rc0
 ```
 
 #### Step 3:Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
diff --git a/examples/images/diffusion/environment.yaml b/examples/images/diffusion/environment.yaml
index 5164be72e556..d1ec69c1a585 100644
--- a/examples/images/diffusion/environment.yaml
+++ b/examples/images/diffusion/environment.yaml
@@ -27,5 +27,6 @@ dependencies:
     - torchmetrics==0.7.0
     - prefetch_generator
     - datasets
-    - colossalai
+    - colossalai==0.2.5
+    - lightning==1.9.0
     - -e .

From 10c61de2f7ba2f58de70d24de862b6c9bb3cd3f3 Mon Sep 17 00:00:00 2001
From: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Date: Fri, 10 Mar 2023 10:23:26 +0800
Subject: [PATCH 446/503] [autochunk] support vit (#3084)

support vit for autochunk
* support some new ops for vit
* fix some bugs
* add test for vit
---
 colossalai/autochunk/autochunk_codegen.py     |   4 +-
 colossalai/autochunk/trace_indice.py          | 156 ++++++++++++------
 colossalai/autochunk/utils.py                 |   7 +-
 colossalai/fx/profiler/opcount.py             |   3 +-
 .../benchmark_autochunk_diffuser.py           | 147 +++++++++++++++++
 .../test_autochunk_unet.py                    |   4 +-
 .../test_autochunk_vit/test_autochunk_vit.py  |  53 ++++++
 .../test_autochunk_vit_utils.py               | 128 ++++++++++++++
 8 files changed, 445 insertions(+), 57 deletions(-)
 create mode 100644 tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py
 create mode 100644 tests/test_autochunk/test_autochunk_vit/test_autochunk_vit.py
 create mode 100644 tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py

diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index 15e15517ba01..2cbc6c9221aa 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -63,7 +63,7 @@ def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_oup
     context = ""
     for i in range(len(chunk_output)):
         shape_str = str(list(get_node_shape(chunk_output[i])))
-        if get_node_name(chunk_output[i]) == "split":
+        if get_node_name(chunk_output[i]) in ["split", "unbind"]:
             tensor_str = "torch.empty(%s, dtype=%s.dtype, device=%s.device), " % (shape_str, input_node.name,
                                                                                   input_node.name)
             tensor_str = tensor_str * len(chunk_output[i].meta['tensor_meta'])
@@ -205,7 +205,7 @@ def _add_node_slice(
             if chunk_node.name == node.name or (chunk_node.name in [i.name for i in node.all_input_nodes]):
                 chunk_slice = _gen_chunk_slice_dim(chunk_nodes_dim[region_idx][chunk_node_idx], "chunk_idx",
                                                    get_node_shape(chunk_node))
-                if get_node_name(chunk_node) == "split":
+                if get_node_name(chunk_node) in ["split", "unbind"]:
                     split_chunk_slice = ""
                     for i in range(len(chunk_node.meta['tensor_meta'])):
                         split_chunk_slice += "%s[%d]%s, " % (chunk_node.name, i, chunk_slice)
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 92199b79a2be..307f4de326d7 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -74,6 +74,9 @@ def _add_dim(self, node_idx: int, dim_idx: int) -> None:
         """
         add a dim for indice, compute and source
         """
+        # need to remap if dim_idx < 0, e.g. -1
+        if dim_idx < 0:
+            dim_idx = list(range(len(self.indice_trace_list[node_idx]["indice"]) + 1))[dim_idx]
         self.indice_trace_list[node_idx]["indice"].insert(dim_idx, self._add_indice())
         self.indice_trace_list[node_idx]["compute"].insert(dim_idx, [])
         self.indice_trace_list[node_idx]["source"].insert(dim_idx, {})
@@ -575,6 +578,60 @@ def _assign_sum_indice(self, node: Node, node_idx: int) -> None:
         cat_dim = node.kwargs["dim"]
         self._del_dim(node_idx, cat_dim)
 
+    def _assign_flatten_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for flatten op.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        nodes_in = node.args[0]
+        nodes_in_shape = get_node_shape(nodes_in)
+        flatten_start_dim = node.args[1]
+        flatten_dim_num = len(nodes_in_shape) - flatten_start_dim - 1
+        assert flatten_dim_num > 0
+        for _ in range(flatten_dim_num):
+            self._add_dim(node_idx, 0)
+        self._assign_indice_as_input(node, node_idx, nodes_in)
+        for _ in range(flatten_dim_num + 1):
+            self._del_dim(node_idx, -1)
+        self._add_dim(node_idx, -1)
+
+    def _assign_expand_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for expand op. Expanded dims get a new indice; shrinking a dim raises RuntimeError.
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        expand_shape = node.args[1:]
+        node_in_shape = get_node_shape(node.args[0])
+        assert len(expand_shape) == len(node_in_shape)
+        self._assign_indice_as_input(node, node_idx)
+        for i in range(len(node_in_shape)):
+            if expand_shape[i] == node_in_shape[i] or expand_shape[i] == -1:
+                continue
+            elif expand_shape[i] > node_in_shape[i]:
+                self._del_dim(node_idx, i)
+                self._add_dim(node_idx, i)
+            else:
+                raise RuntimeError("expand can't shrink dim %d: %s -> %s" % (i, node_in_shape, expand_shape))
+
+    def _assign_unbind_indice(self, node: Node, node_idx: int) -> None:
+        """
+        Assign indice for unbind op. dim may be positional, keyword or omitted (defaults to 0).
+
+        Args:
+            node (node)
+            node_idx (int)
+        """
+        unbind_dim = node.args[1] if len(node.args) > 1 else node.kwargs.get("dim", 0)
+        self._add_dim(node_idx, unbind_dim)
+        self._assign_indice_as_input(node, node_idx)
+        self._del_dim(node_idx, unbind_dim)
+
     def _assign_embedding_indice(self, node: Node, node_idx: int) -> None:
         """
         Assign indice for embedding op.
@@ -695,32 +752,39 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
             shape_idx = target_shape.index(-1)
             target_shape[shape_idx] = origin_product // target_product
 
-        # determine changed dim
-        len_diff = len(origin_shape) - len(target_shape)
-        if len_diff == 1:
+        # find same dim
+        dim_to_same_dim = []
+        dim_from_same_dim = []
+        for i in range(len(origin_shape)):
+            if origin_shape[i] == target_shape[i]:
+                dim_to_same_dim.append(i)
+                dim_from_same_dim.append(i)
+            else:
+                break
+        for i in range(-1, -len(origin_shape), -1):
+            if origin_shape[i] == target_shape[i]:
+                dim_to_same_dim.append(len(target_shape) + i)
+                dim_from_same_dim.append(len(origin_shape) + i)
+            else:
+                break
+
+        dim_from = list(set(range(len(origin_shape))) - set(dim_from_same_dim))
+        dim_to = list(set(range(len(target_shape))) - set(dim_to_same_dim))
+        assert len(dim_from) == 1 or len(dim_to) == 1 or len(dim_from) == len(dim_to)
+
+        dim_diff = len(dim_from) - len(dim_to)
+        if dim_diff > 0:
             # dim merge
-            dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)]
-            dim_to = [dim_equal.index(False)]
-            dim_from = [dim_equal.index(False), dim_equal.index(False) + 1]
-            self._add_dim(node_idx, -1)
-        elif len_diff == -1:
+            for i in range(dim_diff):
+                self._add_dim(node_idx, -1)
+        elif dim_diff < 0:
             # dim expand
-            dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])]
-            dim_from = [dim_equal.index(False)]
-            dim_to = [dim_equal.index(False), dim_equal.index(False) + 1]
-            self._del_dim(node_idx, -1)
-        elif len_diff == 0:
-            # dim equal
-            dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])]
-            dim_from = []
-            dim_to = []
-        else:
-            raise NotImplementedError("shape" + str(origin_shape) + "and" + str(target_shape) + "view not implemented")
+            for i in range(-dim_diff):
+                self._del_dim(node_idx, -1)
 
         # get new indice
         origin_trace = self._find_indice_trace_from_node(origin_node)
         self._assign_indice_as_input(node, node_idx, origin_node)
-        idx_from = [origin_trace[i] for i in dim_from]
         dim_from.reverse()
         for i in dim_from:
             self._del_dim(node_idx, i)
@@ -728,36 +792,18 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
             self._add_dim(node_idx, i)
         dim_from.reverse()
 
-        # search view list
-        # for view_node, view_dict in self.indice_view_list.items():
-        #     if (view_dict["idx_to"] == idx_from and view_dict["dim_to"] == dim_from
-        #             and view_dict["dim_from"] == dim_to):
-        #         # inheirt indice from current node
-        #         if len_diff == 1:
-        #             if origin_shape[dim_from[0]] == 1:
-        #                 self._inherit_indice(origin_node, dim_from[1], node, dim_to[0], init=False)
-        #             elif origin_shape[dim_from[1]] == 1:
-        #                 self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
-        #         elif len_diff == -1:
-        #             if target_shape[dim_to[0]] == 1:
-        #                 self._inherit_indice(origin_node, dim_from[0], node, dim_to[1], init=False)
-        #             elif target_shape[dim_to[1]] == 1:
-        #                 self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
-        #         # inherid indice from input node of last view
-        #         for dim_to_i in dim_to:
-        #             self._inherit_indice(view_node.args[0], dim_to_i, node, dim_to_i, init=False)
-
         # inheirt indice from current node
-        if len_diff == 1:
-            if origin_shape[dim_from[0]] == 1:
-                self._inherit_indice(origin_node, dim_from[1], node, dim_to[0], init=False)
-            elif origin_shape[dim_from[1]] == 1:
-                self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
-        elif len_diff == -1:
-            if target_shape[dim_to[0]] == 1:
-                self._inherit_indice(origin_node, dim_from[0], node, dim_to[1], init=False)
-            elif target_shape[dim_to[1]] == 1:
-                self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
+        if len(dim_from) != 0 and len(dim_to) != 0:
+            if dim_diff == 1:
+                if origin_shape[dim_from[0]] == 1:
+                    self._inherit_indice(origin_node, dim_from[1], node, dim_to[0], init=False)
+                elif origin_shape[dim_from[1]] == 1:
+                    self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
+            elif dim_diff == -1:
+                if target_shape[dim_to[0]] == 1:
+                    self._inherit_indice(origin_node, dim_from[0], node, dim_to[1], init=False)
+                elif target_shape[dim_to[1]] == 1:
+                    self._inherit_indice(origin_node, dim_from[0], node, dim_to[0], init=False)
 
         # log view, not used now
         view_dict = {
@@ -809,6 +855,14 @@ def trace_indice(self) -> None:
                     self._assgin_no_change_indice(node, idx)
                 elif "new_ones" == node_name:
                     self._assign_all_indice(node, idx)
+                elif "flatten" == node_name:
+                    self._assign_flatten_indice(node, idx)
+                elif "expand" == node_name:
+                    self._assign_expand_indice(node, idx)
+                elif "unbind" == node_name:
+                    self._assign_unbind_indice(node, idx)
+                elif "softmax" == node_name:
+                    self._assign_softmax_indice(node, idx)
                 elif any(i == node_name for i in ["size"]):
                     continue
                 else:
@@ -859,7 +913,9 @@ def trace_indice(self) -> None:
                     self._assign_linear_indice(node, idx)
                 elif "conv2d" == node_name:
                     self._assign_conv2d_indice(node, idx)
-                elif any(n == node_name for n in ["sigmoid", "dropout", "relu", "silu"]):
+                elif "identity" == node_name:
+                    self._assgin_no_change_indice(node, idx)
+                elif any(n == node_name for n in ["sigmoid", "dropout", "relu", "silu", "gelu"]):
                     self._assign_elementwise_indice(node, idx)
                 else:
                     raise NotImplementedError(node_name, "module not implemented yet!")
diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py
index 7c0bc29b5893..064baa047155 100644
--- a/colossalai/autochunk/utils.py
+++ b/colossalai/autochunk/utils.py
@@ -109,8 +109,11 @@ def is_non_compute_node(node: Node) -> bool:
     return False
 
 
-def get_node_shape(node: Node) -> List:
-    if get_node_name(node) == "split":
+def get_node_shape(node: Node) -> Any:
+    """
+    return node data shape
+    """
+    if get_node_name(node) in ["split", "unbind"]:
         return node.meta["tensor_meta"][0].shape
     if hasattr(node.meta["tensor_meta"], "shape"):
         return node.meta["tensor_meta"].shape
diff --git a/colossalai/fx/profiler/opcount.py b/colossalai/fx/profiler/opcount.py
index 6bdec865fd84..e302c842126f 100644
--- a/colossalai/fx/profiler/opcount.py
+++ b/colossalai/fx/profiler/opcount.py
@@ -359,7 +359,8 @@ def zero_flop_jit(*args):
         aten.where.self,
         aten.zero_.default,
         aten.zeros_like.default,
-        aten.fill_.Scalar
+        aten.fill_.Scalar,
+        aten.stack.default
     ]  # yapf: disable
 
     for op in zero_flop_aten:
diff --git a/tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py b/tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py
new file mode 100644
index 000000000000..5c127bd69980
--- /dev/null
+++ b/tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py
@@ -0,0 +1,147 @@
+import time
+from typing import Any, Dict, List
+
+import torch
+import torch.fx
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.fx.profiler import parameter_size
+from colossalai.utils import free_port
+
+if AUTOCHUNK_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def _benchmark_autochunk_unet_gm(
+    model: Any,
+    data: tuple,
+    max_memory: int = None,
+) -> None:
+    model = model.cuda().eval()
+
+    # build model and input
+    meta_args, concrete_args = data
+    if concrete_args is None:
+        concrete_args = {}
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(
+        model,
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args={k: v for k, v in concrete_args},
+    )
+    model = model.cuda().eval()
+    interp = MetaInfoProp(meta_graph)
+    meta_tensors = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
+    interp.propagate(*meta_tensors)
+    codegen = AutoChunkCodeGen(
+        meta_graph,
+        max_memory=max_memory,
+    )
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model.cuda().eval(),
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
+        concrete_args={k: v for k, v in concrete_args},
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # init inputs
+    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
+    model.cuda().eval()
+
+    # bench
+    para_mem = float(parameter_size(model)) / 1024**2
+    act_mem = _benchmark_memory(gm, inputs)
+    speed = _benchmark_speed(gm, inputs)
+    print("unet autochunk, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
+          (speed, act_mem, para_mem, act_mem + para_mem))
+
+
+def _benchmark_autochunk_unet_origin(
+    model: Any,
+    data: tuple,
+) -> None:
+    # build model and input
+    meta_args, concrete_args = data
+    if concrete_args is None:
+        concrete_args = {}
+
+    # init inputs
+    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
+    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
+    model.cuda().eval()
+
+    # bench
+    para_mem = float(parameter_size(model)) / 1024**2
+    act_mem = _benchmark_memory(model, inputs)
+    speed = _benchmark_speed(model, inputs)
+    print("unet origin, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
+          (speed, act_mem, para_mem, act_mem + para_mem))
+    return act_mem
+
+
+def _benchmark_memory(model, inputs):
+    with torch.no_grad():
+        torch.cuda.reset_peak_memory_stats()
+        now_mem = float(torch.cuda.memory_allocated()) / 1024**2
+        model(*inputs)
+        new_max_mem = float(torch.cuda.max_memory_allocated()) / 1024**2
+    return new_max_mem - now_mem
+
+
+def _benchmark_speed(model, inputs, loop=5):
+    with torch.no_grad():
+        for _ in range(loop // 2 + 1):
+            model(*inputs)
+        torch.cuda.synchronize()
+        time1 = time.time()
+        for _ in range(loop):
+            model(*inputs)
+        torch.cuda.synchronize()
+        time2 = time.time()
+    return (time2 - time1) / loop
+
+
+def benchmark_autochunk_unet(batch=1, height=448, width=448):
+    """Benchmark the original UNet, then autochunk under decreasing max_memory ratios and with None."""
+    from test_autochunk_unet import UNet2DModel, get_data
+    model = UNet2DModel()
+    latent_shape = (batch, 3, height // 7, width // 7)
+    print("\nbatch: %d, height: %d, width: %d" % (batch, height, width))
+    max_mem = _benchmark_autochunk_unet_origin(model, get_data(latent_shape))
+    for ratio in [0.5, 0.4, 0.3, 0.2]:
+        try:
+            _benchmark_autochunk_unet_gm(model, get_data(latent_shape), max_mem * ratio)
+        except RuntimeError as e:
+            # stop lowering the budget once the chunk search reports failure
+            if e.args and e.args[0] == 'Search failed. Try a larger memory threshold.':
+                break
+            raise    # any other RuntimeError is a real failure and must not be swallowed
+    _benchmark_autochunk_unet_gm(model, get_data(latent_shape), None)
+
+
+if __name__ == "__main__":
+    # launch colossalai
+    colossalai.launch(
+        config={},
+        rank=0,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+    benchmark_autochunk_unet(batch=1, height=224 * 2, width=224 * 2)
+    benchmark_autochunk_unet(batch=1, height=224 * 3, width=224 * 3)
+    benchmark_autochunk_unet(batch=1, height=224 * 4, width=224 * 4)
diff --git a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
index 518c7f45124d..16c5b10ff4ae 100644
--- a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
+++ b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py
@@ -39,7 +39,7 @@ def get_data(shape: tuple) -> Tuple[List, List]:
 )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("shape", [LATENTS_SHAPE])
-@pytest.mark.parametrize("max_memory", [None])
+@pytest.mark.parametrize("max_memory", [None, 150, 300])
 def test_evoformer_block(model, shape, max_memory):
     run_func = partial(
         run_test,
@@ -57,7 +57,7 @@ def test_evoformer_block(model, shape, max_memory):
         max_memory=None,
         model=UNet2DModel,
         print_code=False,
-        print_mem=False,
+        print_mem=True,
         print_est_mem=False,
         print_progress=False,
     )
diff --git a/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit.py b/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit.py
new file mode 100644
index 000000000000..2b7cbf1390d2
--- /dev/null
+++ b/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit.py
@@ -0,0 +1,53 @@
+from functools import partial
+from typing import List, Tuple
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+
+try:
+    from timm.models.vision_transformer import vit_large_patch16_384 as vit
+    MODELS = [vit]
+    HAS_REPO = True
+except Exception:  # optional dependency: any import-time failure disables these tests
+    MODELS = []
+    HAS_REPO = False
+
+from test_autochunk_vit_utils import run_test
+
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+
+
+def get_data() -> Tuple[List, List]:
+    data = torch.rand(1, 3, 384, 384)
+    meta_args = {'x': data}
+    return data, meta_args
+
+
+@pytest.mark.skipif(
+    not (AUTOCHUNK_AVAILABLE and HAS_REPO),
+    reason="torch version is lower than 1.12.0 or timm is not installed",
+)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_memory", [None, 32, 40])
+def test_evoformer_block(model, max_memory):
+    """
+    Run the autochunk codegen check for a timm ViT in a spawned worker process.
+
+    Skipped when autochunk is unavailable or timm cannot be imported.
+    """
+    run_func = partial(run_test, max_memory=max_memory, model=model, data=get_data())
+    mp.spawn(run_func, nprocs=1)
+
+
+if __name__ == "__main__":
+    run_test(
+        rank=0,
+        data=get_data(),
+        max_memory=None,
+        model=vit,
+        print_code=False,
+        print_mem=False,
+        print_est_mem=False,
+        print_progress=False,
+    )
diff --git a/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py b/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py
new file mode 100644
index 000000000000..035dd59799b4
--- /dev/null
+++ b/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py
@@ -0,0 +1,128 @@
+from typing import Any, Dict, List
+
+import torch
+import torch.fx
+
+import colossalai
+from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
+from colossalai.core import global_context as gpc
+from colossalai.fx.graph_module import ColoGraphModule
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.utils import free_port
+
+if AUTOCHUNK_AVAILABLE:
+    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
+    from colossalai.fx.profiler import MetaTensor
+    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace
+
+
+def assert_codegen_run(
+    model: Any,
+    meta_args: Dict,
+    data: Any,
+    max_memory: int = None,
+    print_mem: bool = False,
+    print_est_mem: bool = False,
+    print_progress: bool = False,
+    print_code: bool = False,
+) -> List[Dict]:
+    model = model()
+
+    # trace the meta graph and setup codegen
+    meta_graph = symbolic_trace(model, meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()})
+    model = model.cuda().eval()
+    interp = MetaInfoProp(meta_graph)
+    meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args.items()]
+    interp.propagate(*meta_tensors)
+    codegen = AutoChunkCodeGen(
+        meta_graph,
+        max_memory=max_memory,
+        print_mem=print_est_mem,
+        print_progress=print_progress,
+    )
+    chunks = codegen.chunk_infos
+
+    # trace and recompile
+    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
+    graph = ColoTracer().trace(
+        model.cuda(),
+        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()},
+    )
+    graph.set_codegen(codegen)
+    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
+    gm.recompile()
+
+    # assert chunk in code
+    code = graph.python_code("self").src
+    if print_code:
+        print(code)
+    assert "chunk_size = None;  " in code
+
+    # assert result
+    inputs = [data.cuda()]
+    model.cuda().eval()
+    gm.eval()
+    with torch.no_grad():
+        if print_mem:
+            torch.cuda.reset_peak_memory_stats()
+            now_mem_gm = torch.cuda.memory_allocated() / 1024**2
+        out_gm = gm(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        if print_mem:
+            max_mem_gm = torch.cuda.max_memory_allocated() / 1024**2
+            torch.cuda.reset_peak_memory_stats()
+            now_mem_ori = torch.cuda.memory_allocated() / 1024**2
+        out_model = model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
+        if print_mem:
+            max_mem_ori = torch.cuda.max_memory_allocated() / 1024**2
+            print("origin mem: %.2fMB, autochunk mem: %.2fMB" % (max_mem_ori - now_mem_ori, max_mem_gm - now_mem_gm))
+
+    assert torch.allclose(out_gm, out_model,
+                          atol=1e-3), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
+                              torch.abs(out_gm - out_model))
+
+    return chunks
+
+
+def run_test(
+    rank: int,
+    model: Any,
+    data: tuple,
+    max_memory: int,
+    print_code: bool = False,
+    print_mem: bool = False,
+    print_est_mem: bool = False,
+    print_progress: bool = False,
+    get_chunk_target: Any = None,
+) -> None:
+    # launch colossalai
+    colossalai.launch(
+        config={},
+        rank=rank,
+        world_size=1,
+        host="localhost",
+        port=free_port(),
+        backend="nccl",
+    )
+
+    # build model and input
+    data, meta_args = data
+    chunks = assert_codegen_run(
+        model,
+        meta_args=meta_args,
+        data=data,
+        max_memory=max_memory,
+        print_code=print_code,
+        print_mem=print_mem,
+        print_est_mem=print_est_mem,
+        print_progress=print_progress,
+    )
+
+    if get_chunk_target is not None:
+        chunk_found = [i["region"] for i in chunks]
+        chunk_target = get_chunk_target()[max_memory]
+        assert (chunk_found == chunk_target), "found regions %s doesn't equal target regions %s" % (
+            str(chunk_found),
+            str(chunk_target),
+        )
+
+    gpc.destroy()

From 3213347b49b1d4644cc2d15fc9ee3c24dffd395f Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 10 Mar 2023 10:32:14 +0800
Subject: [PATCH 447/503] [doc] fixed typos in docs/README.md (#3082)

---
 docs/README.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index edeb4ec253f4..f520608d552c 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -14,7 +14,7 @@
 
 ## 📝 Overview
 
-We evaluated various existing solutions for documentation in the community and discussed their advantages and disadvangtes in the [issue #2651](https://github.com/hpcaitech/ColossalAI/issues/2651). Therefore, we propose to build a more modern and robust documentation system by integrating the Sphinx [autodoc](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html) function and the [Docusaurus](https://docusaurus.io/) framework.
+We evaluated various existing solutions for documentation in the community and discussed their advantages and disadvantages in the [issue #2651](https://github.com/hpcaitech/ColossalAI/issues/2651). Therefore, we propose to build a more modern and robust documentation system by integrating the Sphinx [autodoc](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html) function and the [Docusaurus](https://docusaurus.io/) framework.
 
 ## 🗺 Module Structure
 
@@ -29,15 +29,15 @@ We evaluated various existing solutions for documentation in the community and d
 ```
 
 The documentation module structure is shown above:
-1. source: This folder contains the multi-language documentation files.
+1. source: This folder contains multi-language documentation files.
 2. `sidebars.json`: The `sidebars.json` defines the table of content for the tutorials. You need to update this file when a new doc is added/deleted.
 3. `versions.json`: The `versions.json` in the **main branch** in the **latest commit** will be used to control the versions to be displayed on our website
 
 ## 🧱 Our Documentation System
 
-We believe that there are several advantages from the existing system can be combined for simplicity, usability and maintainability:1
-1. Support Markdown](https://www.markdownguide.org/), we belive is a more popular language for writing documentations comapred to [RST](https://docutils.sourceforge.io/rst.html).
-2. Support Autodoc, which can automatically generate documentation from the docstrings in the source code provided by [Sphinx](https://www.sphinx-doc.org/en/master/).
+We believe that the combination of the existing systems can yield several advantages such as simplicity, usability and maintainability:
+1. Support [Markdown](https://www.markdownguide.org/). We believe it is a more popular language for writing documentation compared to [RST](https://docutils.sourceforge.io/rst.html).
+2. Support Autodoc. It can automatically generate documentation from the docstrings in the source code provided by [Sphinx](https://www.sphinx-doc.org/en/master/).
 3. Support elegant and modern UI, which is provided by [Docusaurus](https://docusaurus.io/).
 4. Support MDX for more flexible and powerful documentation, which is provided by [Docusaurus](https://docusaurus.io/).
 5. Support hosting blogs/project home page/other pages besides the documentation, which is provided by [Docusaurus](https://docusaurus.io/).
@@ -46,7 +46,7 @@ Therefore, we have built the [ColossalAI-Documentation](https://github.com/hpcai
 
 ## 🎊 Contribution
 
-You can contribute to the documentation by directly set up a Pull Request towards the `docs/source` folder. There are several guidelines for documentation contribution.
+You can contribute to the documentation by directly setting up a Pull Request towards the `docs/source` folder. There are several guidelines for documentation contribution.
 
 1. The documentation is written in Markdown. You can refer to the [Markdown Guide](https://www.markdownguide.org/) for the syntax.
 2. You must ensure that the documentation exists for all languages. You can refer to the [Adding a New Documentation](#-adding-a-new-documentation) for more details.
@@ -68,7 +68,7 @@ Let's assume that you want to add a file called `your_doc.md`, your file structu
   - sidebars.json  # add your documentation file name here
 ```
 
-Meanwhile, you need to ensure the `sidebars.json` is updated such that it contains your documentation file. Our CI will check whether a documentation exists for all languages and can be used to build the website successfully.
+Meanwhile, you need to ensure the `sidebars.json` is updated such that it contains your documentation file. Our CI will check whether documentation exists for all languages and can be used to build the website successfully.
 
 ### 🧹 Doc Testing
 
@@ -78,7 +78,7 @@ Every documentation is tested to ensure it works well. You need to add the follo
 <!-- doc-test-command: $command  -->
 ```
 
-Meanwhile, only code labelled as a Python code block will be considered for testing.
+Meanwhile, only code labeled as a Python code block will be considered for testing.
 
 ```markdown
     ```python
@@ -98,13 +98,13 @@ Lastly, if you want to skip some code, you just need to add the following annota
 <!--- doc-test-ignore-end -->
 ```
 
-If you have any dependency required, please add it to `requriements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for conda.
+If you have any dependency required, please add it to `requirements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for Conda.
 
 
 ### 💉 Auto Documentation
 
 Lastly, you may want to include the API documentation for a class/function in your documentation for reference.
-We support `autodoc` to extract the docstring and transform it into a Web element for elegant display.
+We support `autodoc` to extract the docstring and transform it into a Web element for an elegant display.
 You just need to add `{{ autodoc:<mod-name> }}` in your markdown as a single line. An example is given below and you can see the outcome in [this PR](https://github.com/hpcaitech/ColossalAI-Documentation/pull/175).
 
 ```markdown

From 5d5f475d758347b5e61dbb4b0ccb6108821e3e93 Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Fri, 10 Mar 2023 10:35:15 +0800
Subject: [PATCH 448/503] [diffusers] fix ci and docker (#3085)

---
 examples/images/diffusion/README.md         | 19 +------------------
 examples/images/diffusion/docker/Dockerfile |  2 +-
 examples/images/diffusion/test_ci.sh        |  2 --
 3 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index 9e9900f2ca61..ff468f4f4acc 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -55,24 +55,7 @@ conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit
 pip install transformers diffusers invisible-watermark
 ```
 
-#### Step 2: install lightning
-
-Install Lightning version later than 2022.01.04. We suggest you install lightning from source.
-
-##### From Source
-```
-git clone https://github.com/Lightning-AI/lightning.git
-pip install -r requirements.txt
-python setup.py install
-```
-
-##### From pip
-
-```
-pip install pytorch-lightning==2.0.0rc0
-```
-
-#### Step 3:Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
+#### Step 2: Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
 
 ##### From pip
 
diff --git a/examples/images/diffusion/docker/Dockerfile b/examples/images/diffusion/docker/Dockerfile
index e8e6957b79e1..3b5301b89853 100644
--- a/examples/images/diffusion/docker/Dockerfile
+++ b/examples/images/diffusion/docker/Dockerfile
@@ -15,7 +15,7 @@ RUN git clone https://github.com/NVIDIA/apex && \
 #     && cd ./ColossalAI \
 #     && pip install -v --no-cache-dir .
 
-RUN pip install colossalai==0.1.12+torch1.12cu11.3 -f https://release.colossalai.org
+RUN pip install colossalai
 
 
 # install titans
diff --git a/examples/images/diffusion/test_ci.sh b/examples/images/diffusion/test_ci.sh
index 51ceeb41d47e..44cf47046684 100755
--- a/examples/images/diffusion/test_ci.sh
+++ b/examples/images/diffusion/test_ci.sh
@@ -10,8 +10,6 @@ pip install transformers diffusers invisible-watermark
 
 CUDA_EXT=1  pip install colossalai
 
-pip install pytorch-lightning
-
 wget https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt
 
 python main.py --logdir /tmp --train --base configs/Teyvat/train_colossalai_teyvat.yaml --ckpt 512-base-ema.ckpt

From fff98f06edfb0ec0aba339776db34ba5bb6405f9 Mon Sep 17 00:00:00 2001
From: Super Daniel <78588128+super-dainiu@users.noreply.github.com>
Date: Fri, 10 Mar 2023 13:21:05 +0800
Subject: [PATCH 449/503] [analyzer] a minimal implementation of static graph
 analyzer (#2852)

* [hotfix] meta tensor default device.

* [siu] add experimental submodules to main branch.

* [siu]

* [siu]

* [analyzer] init.

* [analyzer] readme.

* [analyzer] readme.

* [analyzer] readme.

* [analyzer] readme.

* [test] add test.

* Update symbolic_trace.py

* mark skip tests.

* try except.

* try except.

* try except.

* s

* init

* init

* fix

* skip

* skip

---------

Co-authored-by: Daniel Shao <superdainiu@MININT-PVARVID.fareast.corp.microsoft.com>
Co-authored-by: Daniel Shao <superdainiu@Daniels-Mac.local>
---
 colossalai/_analyzer/README.md                | 306 +++++++++
 colossalai/_analyzer/_subclasses/__init__.py  |   4 +
 .../_subclasses/_meta_registration.py         | 481 ++++++++++++++
 .../_analyzer/_subclasses/_monkey_patch.py    |  88 +++
 .../_analyzer/_subclasses/flop_tensor.py      | 536 +++++++++++++++
 .../_analyzer/_subclasses/meta_tensor.py      | 207 ++++++
 colossalai/_analyzer/envs.py                  |   7 +
 colossalai/_analyzer/fx/__init__.py           |   4 +
 colossalai/_analyzer/fx/bias_addition.py      | 155 +++++
 colossalai/_analyzer/fx/codegen.py            | 456 +++++++++++++
 colossalai/_analyzer/fx/graph_module.py       | 173 +++++
 colossalai/_analyzer/fx/node_util.py          | 211 ++++++
 colossalai/_analyzer/fx/passes/__init__.py    |   2 +
 .../_analyzer/fx/passes/graph_profile.py      | 347 ++++++++++
 colossalai/_analyzer/fx/passes/shape_prop.py  | 194 ++++++
 colossalai/_analyzer/fx/symbolic_profile.py   |  40 ++
 colossalai/_analyzer/fx/symbolic_trace.py     | 620 ++++++++++++++++++
 colossalai/fx/passes/concrete_info_prop.py    |   2 +-
 colossalai/fx/passes/meta_info_prop.py        |   4 +
 colossalai/fx/profiler/opcount.py             |   1 +
 tests/test_analyzer/test_fx/__init__.py       |   0
 .../test_fx/test_bias_addition.py             | 113 ++++
 tests/test_analyzer/test_fx/test_mod_dir.py   |  78 +++
 .../test_analyzer/test_fx/test_nested_ckpt.py |  55 ++
 .../test_analyzer/test_fx/test_shape_prop.py  |  63 ++
 .../test_fx/test_symbolic_profile.py          |  49 ++
 tests/test_analyzer/test_fx/zoo.py            |  53 ++
 .../test_analyzer/test_subclasses/__init__.py |   0
 .../test_subclasses/test_aten.py              |  82 +++
 .../test_subclasses/test_flop_tensor.py       |  50 ++
 .../test_subclasses/test_meta_mode.py         |  38 ++
 tests/test_analyzer/test_subclasses/zoo.py    |  53 ++
 32 files changed, 4471 insertions(+), 1 deletion(-)
 create mode 100644 colossalai/_analyzer/README.md
 create mode 100644 colossalai/_analyzer/_subclasses/__init__.py
 create mode 100644 colossalai/_analyzer/_subclasses/_meta_registration.py
 create mode 100644 colossalai/_analyzer/_subclasses/_monkey_patch.py
 create mode 100644 colossalai/_analyzer/_subclasses/flop_tensor.py
 create mode 100644 colossalai/_analyzer/_subclasses/meta_tensor.py
 create mode 100644 colossalai/_analyzer/envs.py
 create mode 100644 colossalai/_analyzer/fx/__init__.py
 create mode 100644 colossalai/_analyzer/fx/bias_addition.py
 create mode 100644 colossalai/_analyzer/fx/codegen.py
 create mode 100644 colossalai/_analyzer/fx/graph_module.py
 create mode 100644 colossalai/_analyzer/fx/node_util.py
 create mode 100644 colossalai/_analyzer/fx/passes/__init__.py
 create mode 100644 colossalai/_analyzer/fx/passes/graph_profile.py
 create mode 100644 colossalai/_analyzer/fx/passes/shape_prop.py
 create mode 100644 colossalai/_analyzer/fx/symbolic_profile.py
 create mode 100644 colossalai/_analyzer/fx/symbolic_trace.py
 create mode 100644 tests/test_analyzer/test_fx/__init__.py
 create mode 100644 tests/test_analyzer/test_fx/test_bias_addition.py
 create mode 100644 tests/test_analyzer/test_fx/test_mod_dir.py
 create mode 100644 tests/test_analyzer/test_fx/test_nested_ckpt.py
 create mode 100644 tests/test_analyzer/test_fx/test_shape_prop.py
 create mode 100644 tests/test_analyzer/test_fx/test_symbolic_profile.py
 create mode 100644 tests/test_analyzer/test_fx/zoo.py
 create mode 100644 tests/test_analyzer/test_subclasses/__init__.py
 create mode 100644 tests/test_analyzer/test_subclasses/test_aten.py
 create mode 100644 tests/test_analyzer/test_subclasses/test_flop_tensor.py
 create mode 100644 tests/test_analyzer/test_subclasses/test_meta_mode.py
 create mode 100644 tests/test_analyzer/test_subclasses/zoo.py

diff --git a/colossalai/_analyzer/README.md b/colossalai/_analyzer/README.md
new file mode 100644
index 000000000000..c5c55eddd325
--- /dev/null
+++ b/colossalai/_analyzer/README.md
@@ -0,0 +1,306 @@
+# Analyzer
+
+# Overview
+The Analyzer is a collection of static graph utils including Colossal-AI FX. Features include:
+- MetaTensor -- enabling:
+  - Ahead-of-time Profiling
+  - Shape Propagation
+  - Ideal Flop Counter
+- symbolic_trace()
+  - Robust Control-flow Tracing / Recompile
+  - Robust Activation Checkpoint Tracing / CodeGen
+  - Easy-to-define Bias-Addition Split
+- symbolic_profile()
+  - Support ``MetaTensorMode``, where all Tensor operations are executed symbolically.
+  - Shape Inference Across Device and Unified ``MetaInfo``
+  - Ideal Flop Counter https://dev-discuss.pytorch.org/t/the-ideal-pytorch-flop-counter-with-torch-dispatch/505
+
+# Quickstart
+## Analyzer.FX
+**Reference:**
+
+  https://pytorch.org/docs/stable/fx.html [[paper](https://arxiv.org/pdf/2112.08429)]
+
+
+torch.FX is a toolkit for developers to use to transform nn.Module instances. FX consists of three main components: a symbolic tracer, an intermediate representation, and Python code generation. FX.Tracer hacks _\_\_torch_function\_\__ and uses a Proxy object to propagate through any forward function of torch.nn.Module.
+![image](https://user-images.githubusercontent.com/78588128/212531495-bbb934dd-dbbb-4578-8869-6171973f7dd8.png)
+ColossalAI FX is modified from torch.FX, with the extra capability of ahead-of-time profiling enabled by the subclass of ``MetaTensor``.
+
+### Analyzer.FX.symbolic_trace()
+A drawback of the original torch.FX implementation is that it is poor at handling control flow. Control flow is not a native PyTorch operation, so tracing through it requires concrete values that determine which branch to execute. For example,
+
+```python
+class MyModule(nn.Module):
+    def forward(self, x):
+        if x.dim() == 3:
+            return x * 2 + 1
+        else:
+            return x - 5
+```
+
+The above function has the computation graph of
+
+![image](https://user-images.githubusercontent.com/78588128/212532631-dba30734-577b-4418-8dc9-004d7983abc5.png)
+
+However, since a Proxy does not carry concrete data, ``x.dim()`` cannot return a usable value during tracing. In the context of the auto-parallel system, at least the control-flow dependencies on tensor shape should be removed, since any searched strategy can only auto-parallelize a specific computation graph with a fixed tensor shape. It is therefore natural to attach concrete data to a Proxy and propagate it through control flow.
+
+![image](https://user-images.githubusercontent.com/78588128/212533403-1b620986-1c3a-420a-87c6-d08c9702135d.png)
+
+
+With ``MetaTensor``, the computation during shape propagation can be virtualized. This speeds up tracing by avoiding allocating actual memory on devices.
+
+#### Remarks
+There is no free lunch for PyTorch to unify all operands in both its repo and other repos in its eco-system. For example, the einops library currently has no intention to support torch.FX (See https://github.com/arogozhnikov/einops/issues/188). To support different PyTorch-based libraries without modifying source code, good practices can be to allow users to register their implementation to substitute the functions not supported by torch.FX, or to avoid entering incompatible submodules.
+
+### Analyzer.FX.symbolic_profile()
+
+``symbolic_profile`` is another important feature of Colossal-AI's auto-parallel system. Profiling a DNN can be costly, as you need to allocate memory and execute on real devices. However, the profiling requirements of auto-parallel are met as long as we can detect when and where the intermediate activations (i.e. Tensor) are generated, so we can profile the whole procedure without actually executing it. ``symbolic_profile``, as its name implies, profiles the whole network with symbolic information only.
+
+```python
+with MetaTensorMode():
+    model = MyModule().cuda()
+    sample = torch.rand(100, 3, 224, 224).cuda()
+meta_args = dict(
+    x = sample,
+)
+gm = symbolic_trace(model, meta_args=meta_args)
+gm = symbolic_profile(gm, sample)
+```
+
+``symbolic_profile`` is enabled by ``ShapeProp`` and ``GraphProfiler``.
+
+#### ShapeProp
+Both Tensor Parallel and Activation Checkpoint solvers need to know the shape information ahead of time. Unlike PyTorch's implementation, this ``ShapeProp`` can be executed under MetaTensorMode. With this, all the preparation for auto-parallel solvers can be done in milliseconds.
+
+Meanwhile, it is easy to keep track of the memory usage of each node when doing shape propagation. However, a drawback of FX is that not every ``call_function`` saves its input for backward, and different tensors that flow within one FX.Graph can actually have the same layout. This raises problems for fine-grained profiling.
+
+![image](https://user-images.githubusercontent.com/78588128/215312957-7eb6cbc3-61b2-49cf-95a4-6b859149eb8d.png)
+
+To address this problem, I came up with a simulated environment enabled by ``torch.autograd.graph.saved_tensors_hooks`` and fake ``data_ptr`` (check ``_subclasses/meta_tensor.py`` for more details of ``data_ptr`` updates).
+
+```python
+class sim_env(saved_tensors_hooks):
+    """
+    A simulation of memory allocation and deallocation in the forward pass
+    using ``saved_tensor_hooks``.
+
+    Attributes:
+        ctx (Dict[int, torch.Tensor]): A dictionary that maps the
+            data pointer of a tensor to the tensor itself. This is used
+            to track the memory allocation and deallocation.
+
+        param_ctx (Dict[int, torch.Tensor]): A dictionary that maps the
+            data pointer of all model parameters to the parameter itself.
+            This avoids overestimating the memory usage of the intermediate activations.
+    """
+
+    def __init__(self, module: Optional[torch.nn.Module] = None):
+        super().__init__(self.pack_hook, self.unpack_hook)
+        self.ctx = {}
+        self.param_ctx = {param.data_ptr(): param for param in module.parameters()}
+        self.buffer_ctx = {buffer.data_ptr(): buffer for buffer in module.buffers()} if module else {}
+
+    def pack_hook(self, tensor: torch.Tensor):
+        if tensor.data_ptr() not in self.param_ctx and tensor.data_ptr() not in self.buffer_ctx:
+            self.ctx[tensor.data_ptr()] = tensor
+        return tensor
+
+    def unpack_hook(self, tensor):
+        return tensor
+```
+The ``ctx`` variable will keep track of all saved tensors with a unique identifier. It is likely that ``nn.Parameter`` is also counted in the ``ctx``, which is not desired. To avoid this, we can use ``param_ctx`` to keep track of all parameters in the model. The ``buffer_ctx`` is used to keep track of all buffers in the model. The ``local_ctx`` that is attached to each ``Node`` marks the memory usage of the stage to which the node belongs. With simple ``intersect``, ``union`` and ``subtract`` operations, we can get any memory-related information. For non-profileable nodes, you might add your customized profile rules to simulate the memory allocation. If a ``Graph`` is modified with some non-PyTorch functions, such as fused operands, you can register the shape propagation rule with the decorator.
+
+```python
+@register_shape_impl(fuse_conv_bn)
+def fuse_conv_bn_shape_impl(*args, **kwargs):
+     # infer output shape here
+     return torch.empty(output_shape, device=output_device)
+```
+
+Note that ``ShapeProp`` attaches additional information to the graph, and this information is exactly the input of ``GraphProfiler``.
+
+#### GraphProfiler
+``GraphProfiler`` executes at the node level, and profiles both forward and backward within one node. For example, ``FlopProfiler`` will profile the forward and backward FLOPs of a node, and ``CommunicationProfiler`` will profile the forward and backward communication cost of a node. The ``GraphProfiler`` will attach the profiling results to the ``Node``. These procedures are decoupled for better extensibility.
+
+To provide a general insight of the profiled results, you can set ``verbose=True`` to print the summary as well.
+```python
+model = tm.resnet18()
+sample = torch.rand(100, 3, 224, 224)
+meta_args = dict(x=sample)
+gm = symbolic_trace(model, meta_args=meta_args)
+gm = symbolic_profile(gm, sample, verbose=True)
+
+============================================================ Results =====================================================================
+       Op type                                              Op    Accumulate size    Incremental size    Output size    Temp size    Param size    Backward size      Fwd FLOPs      Bwd FLOPs
+-------------  ----------------------------------------------  -----------------  ------------------  -------------  -----------  ------------  ---------------  -------------  -------------
+  placeholder                                               x            4.59 Mb                 0 b        4.59 Mb          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module                                       conv_proj            4.59 Mb                 0 b            0 b      4.59 Mb       2.25 Mb          4.59 Mb  924.84 MFLOPs  924.84 MFLOPs
+  call_method                                         reshape            4.59 Mb                 0 b            0 b      4.59 Mb           0 b          4.59 Mb        0 FLOPs        0 FLOPs
+  call_method                                         permute            4.59 Mb                 0 b            0 b      4.59 Mb           0 b          4.59 Mb        0 FLOPs        0 FLOPs
+     get_attr                                     class_token            4.59 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_method                                          expand            4.59 Mb                 0 b            0 b     24.00 Kb       3.00 Kb              0 b        0 FLOPs    6.14 kFLOPs
+call_function                                             cat            4.59 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+     get_attr                           encoder_pos_embedding            4.59 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                             add            9.21 Mb             4.62 Mb        4.62 Mb          0 b     591.00 Kb          4.62 Mb    1.21 MFLOPs    1.21 MFLOPs
+  call_module                                 encoder_dropout            9.21 Mb                 0 b        4.62 Mb          0 b           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_0_ln_1            9.22 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_0_self_attention           46.52 Mb            37.30 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                         getitem           46.52 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_1           46.52 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_0_dropout           46.52 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_1           51.14 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_0_ln_2           51.15 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_0           74.24 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_1           92.71 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_2           92.71 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_3           92.71 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_4           92.71 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_2           97.32 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_1_ln_1          101.95 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_1_self_attention          134.63 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_2          134.63 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_3          134.63 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_1_dropout          134.63 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_3          139.25 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_1_ln_2          139.26 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_0          162.35 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_1          180.82 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_2          180.82 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_3          180.82 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_4          180.82 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_4          185.43 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_2_ln_1          190.06 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_2_self_attention          222.74 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_4          222.74 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_5          222.74 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_2_dropout          222.74 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_5          227.36 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_2_ln_2          227.37 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_0          250.46 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_1          268.93 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_2          268.93 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_3          268.93 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_4          268.93 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_6          273.54 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_3_ln_1          278.17 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_3_self_attention          310.86 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_6          310.86 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_7          310.86 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_3_dropout          310.86 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_7          315.47 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_3_ln_2          315.48 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_0          338.57 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_1          357.04 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_2          357.04 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_3          357.04 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_4          357.04 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_8          361.66 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_4_ln_1          366.29 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_4_self_attention          398.97 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_8          398.97 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_9          398.97 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_4_dropout          398.97 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_9          403.58 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_4_ln_2          403.60 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_0          426.68 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_1          445.15 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_2          445.15 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_3          445.15 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_4          445.15 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_10          449.77 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_5_ln_1          454.40 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_5_self_attention          487.08 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_10          487.08 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_11          487.08 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_5_dropout          487.08 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_11          491.70 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_5_ln_2          491.71 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_0          514.79 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_1          533.26 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_2          533.26 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_3          533.26 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_4          533.26 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_12          537.88 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_6_ln_1          542.51 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_6_self_attention          575.19 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_12          575.19 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_13          575.19 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_6_dropout          575.19 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_13          579.81 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_6_ln_2          579.82 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_0          602.90 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_1          621.37 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_2          621.37 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_3          621.37 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_4          621.37 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_14          625.99 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_7_ln_1          630.62 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_7_self_attention          663.30 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_14          663.30 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_15          663.30 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_7_dropout          663.30 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_15          667.92 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_7_ln_2          667.93 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_0          691.02 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_1          709.48 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_2          709.48 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_3          709.48 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_4          709.48 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_16          714.10 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_8_ln_1          718.73 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_8_self_attention          751.41 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_16          751.41 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_17          751.41 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_8_dropout          751.41 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_17          756.03 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_8_ln_2          756.04 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_0          779.13 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_1          797.60 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_2          797.60 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_3          797.60 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_4          797.60 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_18          802.21 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_9_ln_1          806.84 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_9_self_attention          839.52 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_18          839.52 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_19          839.52 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_9_dropout          839.52 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_19          844.14 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_9_ln_2          844.15 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_0          867.24 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_1          885.71 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_2          885.71 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_3          885.71 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_4          885.71 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_20          890.32 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_10_ln_1          894.95 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module  encoder_layers_encoder_layer_10_self_attention          927.63 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_20          927.63 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_21          927.63 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module         encoder_layers_encoder_layer_10_dropout          927.63 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_21          932.25 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_10_ln_2          932.26 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_0          955.35 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_1          973.82 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_2          973.82 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_3          973.82 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_4          973.82 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_22          978.44 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_11_ln_1          983.06 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module  encoder_layers_encoder_layer_11_self_attention         1015.75 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_22         1015.75 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_23         1015.75 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module         encoder_layers_encoder_layer_11_dropout         1015.75 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_23         1020.36 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_11_ln_2         1020.38 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_0            1.02 Gb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_1            1.04 Gb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_2            1.04 Gb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_3            1.04 Gb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_4            1.04 Gb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_24            1.04 Gb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module                                      encoder_ln            1.04 Gb            36.31 Kb       24.00 Kb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+call_function                                      getitem_24            1.04 Gb                 0 b       24.00 Kb          0 b           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+  call_module                                      heads_head            1.04 Gb                 0 b            0 b     31.25 Kb       2.93 Mb         24.00 Kb    6.14 MFLOPs   12.30 MFLOPs
+       output                                          output            1.04 Gb                 0 b            0 b     31.25 Kb           0 b         31.25 Kb        0 FLOPs        0 FLOPs
+```
diff --git a/colossalai/_analyzer/_subclasses/__init__.py b/colossalai/_analyzer/_subclasses/__init__.py
new file mode 100644
index 000000000000..8464fed25edf
--- /dev/null
+++ b/colossalai/_analyzer/_subclasses/__init__.py
@@ -0,0 +1,4 @@
+from ._meta_registration import *
+from ._monkey_patch import *
+from .flop_tensor import flop_count, flop_mapping
+from .meta_tensor import MetaTensor, MetaTensorMode
diff --git a/colossalai/_analyzer/_subclasses/_meta_registration.py b/colossalai/_analyzer/_subclasses/_meta_registration.py
new file mode 100644
index 000000000000..20ab46054c8e
--- /dev/null
+++ b/colossalai/_analyzer/_subclasses/_meta_registration.py
@@ -0,0 +1,481 @@
+# meta patch from https://github.com/pytorch/pytorch/blob/master/torch/_meta_registrations.py
+# should be activated for PyTorch version 1.12.0 and below
+# refer to https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+# for more meta_registrations
+
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from torch.utils._pytree import tree_map
+
+aten = torch.ops.aten
+
+meta_lib = torch.library.Library("aten", "IMPL", "Meta")
+
+meta_table = {}
+
+orig_empty = torch.empty
+orig_empty_strided = torch.empty_strided
+orig_empty_like = torch.empty_like
+
+
+def new(*args, **kwargs):
+    return orig_empty(*args, **{**kwargs, 'device': torch.device('meta')})    # meta wins over any passed device
+
+
+def new_strided(*args, **kwargs):
+    return orig_empty_strided(*args, **{**kwargs, 'device': torch.device('meta')})    # meta wins over any passed device
+
+
+def new_like(*args, **kwargs):
+    return orig_empty_like(*args, **{**kwargs, 'device': torch.device('meta')})    # meta wins over any passed device
+
+
+def register_meta(op, register_dispatcher=True):
+    """Decorator: map each op in ``op`` to ``f`` in ``meta_table`` and optionally register ``f`` via ``meta_lib``."""
+    def wrapper(f):
+
+        def add_func(op):
+            meta_table[op] = f
+            if register_dispatcher:
+                name = (op.__name__ if op._overloadname != "default" else op.overloadpacket.__name__)
+                try:
+                    meta_lib.impl(name, f)
+                except Exception:    # best effort; do not swallow KeyboardInterrupt/SystemExit
+                    pass
+
+        tree_map(add_func, op)    # ``op`` may be a single overload or a container of overloads
+        return f
+
+    return wrapper
+
+
+# ============================== Convolutions ======================================
+# https://github.com/pytorch/pytorch/pull/79834
+@register_meta(aten.convolution.default)
+def meta_conv(
+    input_tensor: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    stride: List[int],
+    padding: List[int],
+    dilation: List[int],
+    is_transposed: bool,
+    output_padding: List[int],
+    groups: int,
+):
+    """Infer the output shape of a (transposed) convolution and return an empty tensor of that shape."""
+    def _formula(ln: int, p: int, d: int, k: int, s: int) -> int:
+        """
+        Formula to apply to calculate the length of some dimension of the output
+        See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+        Args:
+            ln: length of the dimension
+            p: padding in that dim
+            d: dilation in that dim
+            k: kernel size in that dim
+            s: stride in that dim
+        Returns:
+            The output length
+        """
+        return (ln + 2 * p - d * (k - 1) - 1) // s + 1
+
+    def _formula_transposed(ln: int, p: int, d: int, k: int, s: int, op: int) -> int:
+        """
+        Formula to apply to calculate the length of some dimension of the output
+        if transposed convolution is used.
+        See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html
+        Args:
+            ln: length of the dimension
+            p: padding in that dim
+            d: dilation in that dim
+            k: kernel size in that dim
+            s: stride in that dim
+            op: output padding in that dim
+        Returns:
+            The output length
+        """
+        return (ln - 1) * s - 2 * p + d * (k - 1) + op + 1
+
+    def calc_conv_nd_return_shape(
+        dims: torch.Size,
+        kernel_size: torch.Size,
+        stride: Union[List[int], int],
+        padding: Union[List[int], int],
+        dilation: Union[List[int], int],
+        output_padding: Optional[Union[List[int], int]] = None,
+    ):
+        ret_shape = []
+        if isinstance(stride, int):
+            stride = [stride] * len(dims)
+        elif len(stride) == 1:
+            stride = [stride[0]] * len(dims)
+
+        if isinstance(padding, int):
+            padding = [padding] * len(dims)
+        elif len(padding) == 1:
+            padding = [padding[0]] * len(dims)
+
+        if isinstance(dilation, int):
+            dilation = [dilation] * len(dims)
+        elif len(dilation) == 1:
+            dilation = [dilation[0]] * len(dims)
+
+        output_padding_list: Optional[List[int]] = None
+        if output_padding:
+            if isinstance(output_padding, int):
+                output_padding_list = [output_padding] * len(dims)
+            elif len(output_padding) == 1:
+                output_padding_list = [output_padding[0]] * len(dims)
+            else:
+                output_padding_list = output_padding
+
+        for i in range(len(dims)):
+            # If output_padding is present, we are dealing with a transposed convolution
+            if output_padding_list:
+                ret_shape.append(
+                    _formula_transposed(
+                        dims[i],
+                        padding[i],
+                        dilation[i],
+                        kernel_size[i],
+                        stride[i],
+                        output_padding_list[i],
+                    ))
+            else:
+                ret_shape.append(_formula(dims[i], padding[i], dilation[i], kernel_size[i], stride[i]))
+        return ret_shape
+
+    def pick_memory_format():
+        if input_tensor.is_contiguous(memory_format=torch.channels_last):
+            return torch.channels_last
+        elif input_tensor.is_contiguous(memory_format=torch.contiguous_format):
+            return torch.contiguous_format
+        elif input_tensor.is_contiguous(memory_format=torch.preserve_format):
+            return torch.preserve_format
+
+    kernel_size = weight.shape[2:]    # spatial extent of the kernel (everything after the first two dims)
+    dims = input_tensor.shape[2:]    # spatial extent of the input (everything after the first two dims)
+    if is_transposed:
+        out_channels = groups * weight.shape[1]
+
+        shape_out = calc_conv_nd_return_shape(
+            dims,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            output_padding,
+        )
+
+    else:
+        out_channels = weight.shape[0]
+        if weight.shape[1] != input_tensor.shape[1] / groups:
+            raise RuntimeError("Invalid channel dimensions")
+        shape_out = calc_conv_nd_return_shape(dims, kernel_size, stride, padding, dilation)
+    out = input_tensor.new_empty((input_tensor.shape[0], out_channels, *shape_out))
+    mem_fmt = pick_memory_format()    # None if input_tensor matches none of the probed formats
+    out = out.to(memory_format=mem_fmt)    # type: ignore[call-overload]
+    return out
+
+
+@register_meta(aten._convolution.default)
+def meta__conv(input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, stride: List[int],
+               padding: List[int], dilation: List[int], is_transposed: bool, output_padding: List[int], groups: int,
+               *extra_args):
+    # Any trailing arguments (``extra_args``) are ignored; the shape logic is shared with ``meta_conv``.
+    return meta_conv(input_tensor, weight, bias, stride, padding, dilation, is_transposed, output_padding, groups)
+
+
+@register_meta(aten.convolution_backward.default)
+def meta_conv_backward(grad_output: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, bias_sizes, stride,
+                       padding, dilation, transposed, output_padding, groups, output_mask):
+    return new_like(input), new_like(weight), new(bias_sizes)    # (grad_input, grad_weight, grad_bias)
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AdaptiveAveragePooling.cpp
+@register_meta(aten._adaptive_avg_pool2d_backward.default)
+def meta_adaptive_avg_pool2d_backward(grad_output: torch.Tensor, input: torch.Tensor):
+    """Return an empty meta tensor with the size and dtype of ``input``; ``grad_output`` is not inspected."""
+    # the gradient w.r.t. the input always has the input's own shape
+    grad_input = new_like(input)
+    return grad_input
+
+
+# ================================ RNN =============================================
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
+@register_meta(aten._cudnn_rnn.default)
+def meta_cuda_rnn(
+    input,
+    weight,
+    weight_stride0,
+    weight_buf,
+    hx,
+    cx,
+    mode,
+    hidden_size,
+    proj_size,
+    num_layers,
+    batch_first,
+    dropout,
+    train,
+    bidirectional,
+    batch_sizes,
+    dropout_state,
+):
+    """Return ``(output, hy, cy, reserve, weight_buf)``; all but ``weight_buf`` are newly created empty tensors."""
+    is_input_packed = len(batch_sizes) != 0    # a non-empty ``batch_sizes`` selects the packed-input branch
+    if is_input_packed:
+        seq_length = len(batch_sizes)
+        mini_batch = batch_sizes[0]
+        batch_sizes_sum = input.shape[0]
+    else:
+        seq_length = input.shape[1] if batch_first else input.shape[0]
+        mini_batch = input.shape[0] if batch_first else input.shape[1]
+        batch_sizes_sum = -1    # unused in the non-packed branch
+
+    num_directions = 2 if bidirectional else 1
+    out_size = proj_size if proj_size != 0 else hidden_size    # a non-zero projection size replaces hidden_size
+    if is_input_packed:
+        out_shape = [batch_sizes_sum, out_size * num_directions]
+    else:
+        out_shape = ([mini_batch, seq_length, out_size *
+                      num_directions] if batch_first else [seq_length, mini_batch, out_size * num_directions])
+    output = input.new_empty(out_shape)
+
+    cell_shape = [num_layers * num_directions, mini_batch, hidden_size]
+    cy = new(0) if cx is None else cx.new_empty(cell_shape)    # zero-element placeholder when no cell state is given
+
+    hy = hx.new_empty([num_layers * num_directions, mini_batch, out_size])
+
+    # TODO: Query cudnnGetRNNTrainingReserveSize (expose to python)
+    reserve_shape = 0 if train else 0    # both branches are 0 until the TODO above is resolved
+    reserve = input.new_empty(reserve_shape, dtype=torch.uint8)
+
+    return output, hy, cy, reserve, weight_buf
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
+@register_meta(aten._cudnn_rnn_backward.default)
+def meta_cudnn_rnn_backward(input: torch.Tensor,
+                            weight: torch.Tensor,
+                            weight_stride0: int,
+                            hx: torch.Tensor,
+                            cx: Optional[torch.Tensor] = None,
+                            *args,
+                            **kwargs):
+    grad_cx = new(()) if cx is None else new_like(cx)    # 0-dim placeholder when there is no cell state
+    return new_like(input), new_like(weight), new_like(hx), grad_cx    # (grad_input, grad_weight, grad_hx, grad_cx)
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Activation.cpp
+# ============================== Activations =======================================
+_unregistered_ewise = [
+    aten.relu.default,
+    aten.prelu.default,
+    aten.hardswish.default,
+    aten.hardtanh.default,
+    aten.prelu_backward.default,
+    aten.hardswish_backward.default,
+    aten.hardtanh_backward.default,
+]
+
+
+@register_meta(_unregistered_ewise)
+def meta_unregistered_ewise(input: torch.Tensor, *args):
+    return new_like(input)    # result mirrors the first argument; the remaining arguments are ignored
+
+
+# ============================== Normalization =====================================
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+@register_meta(aten.native_batch_norm.default)
+def meta_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
+    channels = input.size(1)
+    return new_like(input), new(channels), new(channels)    # (output, 1-D stat, 1-D stat), stats sized by dim 1
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+@register_meta(aten.native_batch_norm_backward.default)
+def meta_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var, save_mean,
+                     save_invstd, train, eps, output_mask):
+    return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta); both shaped like weight
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+@register_meta(aten.cudnn_batch_norm.default)
+def meta_cudnn_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
+    channels = input.size(1)
+    reserve = new(0, dtype=torch.uint8)    # zero-element byte buffer
+    return new_like(input), new(channels), new(channels), reserve    # (output, running_mean, running_var, reserve)
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+# NB: CuDNN only implements the backward algorithm for batchnorm
+# in training mode (evaluation mode batchnorm has a different algorithm),
+# which is why this doesn't accept a 'training' parameter.
+@register_meta(aten.cudnn_batch_norm_backward.default)
+def meta_cudnn_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var,
+                           save_mean, save_invstd, eps, reserve):
+    return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta); both shaped like weight
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
+@register_meta(aten.native_layer_norm.default)
+def meta_ln(input: torch.Tensor, normalized_shape, weight, bias, eps):
+    stat_shape = list(input.shape[:input.dim() - len(normalized_shape)]) + [1] * len(normalized_shape)
+    return new_like(input), new(stat_shape), new(stat_shape)    # (output, mean, rstd); stats are 1 over normalized dims
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
+@register_meta(aten.native_layer_norm_backward.default)
+def meta_ln_backward(dY: torch.Tensor, input: torch.Tensor, normalized_shape, mean, rstd, weight, bias,
+                     grad_input_mask):
+    return new_like(input), new_like(weight), new_like(bias)    # (dX, dgamma, dbeta); NOTE(review): needs tensor weight/bias
+
+
+# ================================== Misc ==========================================
+# NOTE: the im2col meta kernel below may return an incorrect shape
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Im2Col.cpp
+@register_meta(aten.im2col.default)
+def meta_im2col(input: torch.Tensor, kernel_size, dilation, padding, stride):
+    return new_like(input)    # NOTE(review): reuses input's shape; kernel_size/dilation/padding/stride are ignored
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+@register_meta(aten.eye.m_out)
+def meta_eye(n: int, m: int, out: torch.Tensor):
+    return out    # the caller-provided ``out`` tensor is returned as is; ``n`` and ``m`` are not used
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+@register_meta(aten.roll.default)
+def meta_roll(input: torch.Tensor, shifts, dims):
+    return input    # returns the argument itself rather than a fresh tensor; ``shifts``/``dims`` are ignored
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Scalar.cpp
+@register_meta(aten._local_scalar_dense.default)
+def meta_local_scalar_dense(self: torch.Tensor):
+    return 0    # constant stand-in; nothing is read from the tensor
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorCompare.cpp
+@register_meta(aten.where.self)
+def meta_where_self(condition: torch.Tensor, self: torch.Tensor, other: torch.Tensor):
+    out_dtype = torch.result_type(self, other)
+    return new_like(condition + self + other, dtype=out_dtype)    # the sum only supplies the broadcast shape
+
+
+@register_meta(aten.index.Tensor)
+def meta_index_Tensor(self, indices):    # shape inference for advanced indexing; returns ``self.new_empty(...)``
+    assert indices, "at least one index must be provided"
+    # aten::index is the internal advanced indexing implementation
+    # checkIndexTensorTypes and expandTensors
+    result: List[Optional[torch.Tensor]] = []
+    for i, index in enumerate(indices):
+        if index is not None:
+            assert index.dtype in [torch.long, torch.int8, torch.bool],\
+                "tensors used as indices must be long, byte or bool tensors"
+            if index.dtype in [torch.int8, torch.bool]:
+                nonzero = index.nonzero()
+                k = len(result)
+                assert k + index.ndim <= self.ndim, f"too many indices for tensor of dimension {self.ndim}"
+                for j in range(index.ndim):
+                    assert index.shape[j] == self.shape[
+                        k +
+                        j], f"The shape of the mask {index.shape} at index {i} does not match the shape of the indexed tensor {self.shape} at index {k + j}"
+                    result.append(nonzero.select(1, j))    # one index tensor per mask dimension
+            else:
+                result.append(index)
+        else:
+            result.append(index)
+    indices = result
+    assert len(indices) <= self.ndim, f"too many indices for tensor of dimension {self.ndim} (got {len(indices)})"
+    # expand_outplace
+    import torch._refs as refs
+
+    indices = list(refs._maybe_broadcast(*indices))
+    # add missing null tensors
+    while len(indices) < self.ndim:
+        indices.append(None)
+
+    # hasContiguousSubspace
+    #   true if all non-null tensors are adjacent
+    # See:
+    # https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
+    # https://stackoverflow.com/questions/53841497/why-does-numpy-mixed-basic-advanced-indexing-depend-on-slice-adjacency
+    state = 0    # 0: no tensor index seen yet, 1: inside the run of tensor indices, 2: past that run
+    has_contiguous_subspace = False
+    for index in indices:
+        if state == 0:
+            if index is not None:
+                state = 1
+        elif state == 1:
+            if index is None:
+                state = 2
+        else:
+            if index is not None:
+                break
+    else:
+        has_contiguous_subspace = True    # loop finished without ``break``: no second run of tensor indices
+
+    # transposeToFront
+    # This is the logic that causes the newly inserted dimensions to show up
+    # at the beginning of the tensor, if they're not contiguous
+    if not has_contiguous_subspace:
+        dims = []
+        transposed_indices = []
+        for i, index in enumerate(indices):
+            if index is not None:
+                dims.append(i)
+                transposed_indices.append(index)
+        for i, index in enumerate(indices):
+            if index is None:
+                dims.append(i)
+                transposed_indices.append(index)
+        self = self.permute(dims)
+        indices = transposed_indices
+
+    # AdvancedIndex::AdvancedIndex
+    # Now we can assume the indices have contiguous subspace
+    # This is simplified from AdvancedIndex which goes to more effort
+    # to put the input and indices in a form so that TensorIterator can
+    # take them.  If we write a ref for this, probably that logic should
+    # get implemented
+    before_shape: List[int] = []
+    after_shape: List[int] = []
+    replacement_shape: List[int] = []
+    for dim, index in enumerate(indices):
+        if index is None:
+            if replacement_shape:
+                after_shape.append(self.shape[dim])
+            else:
+                before_shape.append(self.shape[dim])
+        else:
+            replacement_shape = list(index.shape)    # presumably identical for every tensor index after the broadcast above
+    return self.new_empty(before_shape + replacement_shape + after_shape)
+
+
+# ============================== Embedding =========================================
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Embedding.cpp
+@register_meta(aten.embedding_dense_backward.default)
+def meta_embedding_dense_backward(grad_output: torch.Tensor, indices: torch.Tensor, num_weights, padding_idx,
+                                  scale_grad_by_freq):
+    # ``new`` already pins ``device`` to meta; forwarding ``device=`` as well raised
+    # "got multiple values for keyword argument 'device'", so it is not passed here.
+    grad_weight_shape = (num_weights, grad_output.size(-1))
+    return new(grad_weight_shape, dtype=grad_output.dtype, layout=grad_output.layout)
+
+
+# ============================== Dropout ===========================================
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
+@register_meta(aten.native_dropout.default)
+def meta_native_dropout_default(input: torch.Tensor, p: float, train: bool = False):
+    # the mask is a bool tensor, while the output keeps the dtype of ``input``; ``p`` and ``train`` are unused
+    return new_like(input), new_like(input, dtype=torch.bool)    # (output, mask)
+
+
+# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
+@register_meta(aten.native_dropout_backward.default)
+def meta_native_dropout_backward_default(grad: torch.Tensor, mask: torch.Tensor, scale: float):
+    return new_like(grad)    # (grad_in), sized like ``grad``; ``mask`` and ``scale`` are unused
diff --git a/colossalai/_analyzer/_subclasses/_monkey_patch.py b/colossalai/_analyzer/_subclasses/_monkey_patch.py
new file mode 100644
index 000000000000..1c7b972ab2f6
--- /dev/null
+++ b/colossalai/_analyzer/_subclasses/_monkey_patch.py
@@ -0,0 +1,88 @@
+import torch
+import torch.distributed as dist
+
+aten = torch.ops.aten
+
+__all__ = [    # every exported name starts with `_`, so only this list makes them visible to `import *`
+    "_TorchFactoryMethod",
+    "_TorchOverrideableFactoryMethod",
+    "_TorchNonOverrideableFactoryMethod",
+    "_TensorPropertyMethod",
+    "_DistCommMethod",
+    "_AliasATen",
+    "_InplaceATen",
+    "_MaybeInplaceATen",
+]
+
+_TorchOverrideableFactoryMethod = [    # torch.<name> factories that MetaTensorMode swaps for meta-device wrappers
+    "empty",
+    "eye",
+    "full",
+    "ones",
+    "rand",
+    "randn",
+    "zeros",
+]
+
+_TorchNonOverrideableFactoryMethod = [    # NOTE(review): not imported by meta_tensor.py; confirm where these are handled
+    "arange",
+    "finfo",
+    "linspace",
+    "logspace",
+    "randint",
+    "randperm",
+    "tensor",
+]
+
+_TorchFactoryMethod = _TorchOverrideableFactoryMethod + _TorchNonOverrideableFactoryMethod    # both lists, overrideable names first
+
+_TensorPropertyMethod = ["dtype", "shape", "device", "requires_grad", "grad", "grad_fn", "data"]    # attribute names of torch.Tensor
+
+_DistCommMethod = [    # torch.distributed.<name> collectives that MetaTensorMode replaces with a no-op
+    "all_gather",
+    "all_reduce",
+    "all_to_all",
+    "broadcast",
+    "gather",
+    "reduce",
+    "reduce_scatter",
+    "scatter",
+]
+
+# TODO: review this list in more depth;
+# refer to https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp
+_AliasATen = [    # ops whose result is registered under the first argument's data_ptr (see `_assert_alias`)
+    aten.detach.default,
+    aten.detach_.default,
+    aten.t.default,
+    aten.transpose.int,
+    aten.view.default,
+    aten._unsafe_view.default,
+    aten._reshape_alias.default,
+]
+
+_InplaceATen = [    # in-place arithmetic overloads; `_assert_alias` also treats them as aliasing their first argument
+    aten.add_.Tensor,
+    aten.add_.Scalar,
+    aten.sub_.Tensor,
+    aten.sub_.Scalar,
+    aten.mul_.Tensor,
+    aten.mul_.Scalar,
+    aten.div_.Tensor,
+    aten.div_.Scalar,
+    aten.pow_.Tensor,
+    aten.pow_.Scalar,
+]
+
+# named `MaybeInplace` because these ops call ``as_strided()`` or ``slice()``
+_MaybeInplaceATen = [
+    aten.diagonal.default,
+    aten.expand.default,
+    aten.select.int,
+    aten.slice.Tensor,
+    aten.split.Tensor,
+    aten.squeeze.default,
+    aten.permute.default,
+    aten.unsqueeze.default,
+    aten.as_strided.default,
+]
diff --git a/colossalai/_analyzer/_subclasses/flop_tensor.py b/colossalai/_analyzer/_subclasses/flop_tensor.py
new file mode 100644
index 000000000000..ab93551467b8
--- /dev/null
+++ b/colossalai/_analyzer/_subclasses/flop_tensor.py
@@ -0,0 +1,536 @@
+# adopted from https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/jit_handles.py
+# ideas from https://pastebin.com/AkvAyJBw
+# and https://dev-discuss.pytorch.org/t/the-ideal-pytorch-flop-counter-with-torch-dispatch/505
+
+import operator
+from collections import defaultdict
+from contextlib import contextmanager
+from enum import Enum, auto
+from functools import partial, reduce
+from numbers import Number
+from typing import Any, Callable, List, Optional, Union
+
+import torch
+from torch.utils._pytree import tree_map
+
+from .meta_tensor import MetaTensor
+
+aten = torch.ops.aten
+
+
+class Phase(Enum):    # key of the per-pass totals kept by flop_count
+    FWD = auto()    # active while the module's forward call runs
+    BWD = auto()    # active from just before torch.autograd.backward is invoked
+
+
+def normalize_tuple(x):
+    """Return ``x`` unchanged when it is already a tuple, otherwise wrap it as ``(x,)``."""
+    # lists and other containers are wrapped as a single element, never converted
+    return x if isinstance(x, tuple) else (x,)
+
+
+def _format_flops(flop):
+    """Render a flop count with two decimals and a K/M/B/T suffix.
+
+    Counts below 1e3 carry no suffix; each larger unit covers three more
+    orders of magnitude, and anything at or above 1e12 is reported in ``T``.
+    """
+    if flop < 1e3:
+        return f'{flop:.2f}'
+    # (exclusive upper bound, divisor, suffix), smallest unit first
+    units = ((1e6, 1e3, 'K'), (1e9, 1e6, 'M'), (1e12, 1e9, 'B'))
+    for upper_bound, divisor, suffix in units:
+        if flop < upper_bound:
+            return f'{flop / divisor:.2f}{suffix}'
+    # at or above 1e12 (this is also where NaN ends up, since it fails every `<`)
+    return f'{flop / 1e12:.2f}T'
+
+
+def flop_count(module: Union[torch.nn.Module, Callable] = None, *args, verbose: bool = False, **kwargs) -> tuple:
+    """
+    Count the number of floating point operations in a model.
+    Ideas from https://pastebin.com/AkvAyJBw.
+    Args:
+        module (torch.nn.Module or Callable): A PyTorch model, or a plain callable (wrapped in a module internally).
+        *args: Input arguments to the model.
+        verbose (bool): If True, print the number of flops for each module.
+        **kwargs: Input keyword arguments to the model.
+    Returns:
+        tuple: ``(fwd_flop, bwd_flop)``, the forward and backward flop counts, reported separately.
+    """
+    maybe_inplace = (getattr(module, 'inplace', False) or kwargs.get('inplace', False)
+                     or getattr(module, '__name__', None) in ('add_', 'mul_', 'div_', 'sub_'))
+
+    class DummyModule(torch.nn.Module):
+
+        def __init__(self, func):
+            super().__init__()
+            self.func = func
+            self.__name__ = getattr(func, '__name__', type(func).__name__)    # e.g. functools.partial has no __name__
+
+        def forward(self, *args, **kwargs):
+            return self.func(*args, **kwargs)
+
+    total_flop_count = {Phase.FWD: 0, Phase.BWD: 0}
+    flop_counts = defaultdict(lambda: defaultdict(int))
+    parents = ['Global']    # stack of module names every dispatched op is charged to
+    module = module if isinstance(module, torch.nn.Module) else DummyModule(module)
+
+    class FlopTensor(MetaTensor):
+        _tensor: torch.Tensor
+
+        def __repr__(self):
+            name = 'FlopParameter' if getattr(self, '_is_param', False) else 'FlopTensor'
+            if self.grad_fn:
+                return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype}, grad_fn={self.grad_fn})"
+            return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype})"
+
+        @classmethod
+        def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+
+            # MetaTensor executes the op on the meta backend; the flop formula below
+            # only needs the shapes of the incoming args and of the produced outputs.
+            rs = super().__torch_dispatch__(func, types, args, kwargs)
+
+            outs = normalize_tuple(rs)
+
+            if func in flop_mapping:
+                nonlocal flop_counts, total_flop_count
+                op_flops = flop_mapping[func](args, outs)
+                for par in parents:
+                    flop_counts[par][func.__name__] += op_flops
+                total_flop_count[cur_phase] += op_flops
+
+            def wrap(x):
+                if isinstance(x, MetaTensor):
+                    x = FlopTensor(x)
+                return x
+
+            rs = tree_map(wrap, rs)
+
+            return rs
+
+    def is_autogradable(x):
+        return isinstance(x, torch.Tensor) and x.is_floating_point()
+
+    def create_backwards_push(name):
+
+        class PushState(torch.autograd.Function):
+
+            @staticmethod
+            def forward(ctx, *args):
+                args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
+                if len(args) == 1:
+                    return args[0]
+                return args
+
+            @staticmethod
+            def backward(ctx, *grad_outs):
+                nonlocal parents
+                parents.append(name)    # backward passes the module's outputs first: open its scope
+                return grad_outs
+
+        return PushState.apply
+
+    def create_backwards_pop(name):
+
+        class PopState(torch.autograd.Function):
+
+            @staticmethod
+            def forward(ctx, *args):
+                args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
+                if len(args) == 1:
+                    return args[0]
+                return args
+
+            @staticmethod
+            def backward(ctx, *grad_outs):
+                nonlocal parents
+                assert (parents[-1] == name)
+                parents.pop()    # backward has reached the module's inputs: close its scope
+                return grad_outs
+
+        return PopState.apply
+
+    def enter_module(name):
+
+        def f(module, inputs):
+            nonlocal parents
+            parents.append(name)
+            inputs = normalize_tuple(inputs)
+            out = create_backwards_pop(name)(*inputs)
+            return out
+
+        return f
+
+    def exit_module(name):
+
+        def f(module, inputs, outputs):
+            nonlocal parents
+            assert (parents[-1] == name)
+            parents.pop()
+            outputs = normalize_tuple(outputs)
+            return create_backwards_push(name)(*outputs)
+
+        return f
+
+    @contextmanager
+    def instrument_module(mod):
+        registered = []
+        try:
+            for name, child in mod.named_children():
+                registered.append(child.register_forward_pre_hook(enter_module(name)))
+                registered.append(child.register_forward_hook(exit_module(name)))
+            yield
+        finally:
+            # always detach the hooks, even when the forward or backward pass raises
+            for handle in registered:
+                handle.remove()
+
+    def display_flops():
+        for mod in flop_counts.keys():
+            print("Module: ", mod)
+            for k, v in flop_counts[mod].items():
+                print('\t', k, _format_flops(v))
+            print()
+
+    def detach_variables(r):
+        if isinstance(r, torch.Tensor):
+            requires_grad = r.requires_grad
+            r = r.detach()
+            r.requires_grad = requires_grad
+        return r
+
+    def wrap(r):
+        if isinstance(r, torch.Tensor):
+            data_ptr_fn = getattr(r, '_tensor', r).data_ptr
+            r = FlopTensor(detach_variables(r))
+            if maybe_inplace:
+                r = r + 0
+            r._tensor.data_ptr = data_ptr_fn
+        return r
+
+    with instrument_module(module):
+        cur_phase = Phase.FWD
+        rst = module(*tree_map(wrap, args), **tree_map(wrap, kwargs))
+        rst = tuple(r for r in normalize_tuple(rst) if is_autogradable(r) and r.requires_grad)
+        cur_phase = Phase.BWD
+
+        if rst:
+            grad = [torch.zeros_like(t) for t in rst]
+            torch.autograd.backward(rst, grad)
+
+    if verbose:
+        display_flops()
+
+    return total_flop_count[Phase.FWD], total_flop_count[Phase.BWD]
+
+
+def matmul_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for matmul: every element of the left operand is multiplied
+    once per column of the right operand, i.e. ``numel(lhs) * rhs.shape[-1]``.
+    """
+    # exactly two operands, whose contraction dims (lhs[-1], rhs[-2]) must agree
+    shapes = [operand.shape for operand in inputs]
+    assert len(shapes) == 2, shapes
+    lhs_shape, rhs_shape = shapes
+    assert lhs_shape[-1] == rhs_shape[-2], shapes
+    return reduce(operator.mul, lhs_shape) * rhs_shape[-1]
+
+
+def addmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for fully connected layers (nn.Linear lowered to addmm).
+
+    ``inputs[0]`` is not looked at; only the two matrices ``inputs[1]`` and
+    ``inputs[2]`` contribute, as ``batch_size * input_dim * output_dim``.
+    """
+    shapes = [v.shape for v in inputs[1:3]]
+    # shapes[0]: [batch size, input feature dimension]
+    # shapes[1]: [input feature dimension, output feature dimension]
+    assert len(shapes[0]) == 2, shapes[0]
+    assert len(shapes[1]) == 2, shapes[1]
+    (batch_size, input_dim), weight_shape = shapes
+    output_dim = weight_shape[1]
+    return batch_size * input_dim * output_dim
+
+
+def linear_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for the aten::linear operator as ``numel(input) * output_feature_dim``.
+    """
+    # Of the three inputs only the first two matter here (aten::addmm instead
+    # uses the last two):
+    #   activation: [dim0, dim1, ..., input_feature_dim]
+    #   weight:     [output_feature_dim, input_feature_dim]
+    activation_shape, weight_shape = inputs[0].shape, inputs[1].shape
+    assert activation_shape[-1] == weight_shape[-1]
+    out_features = weight_shape[0]
+    return reduce(operator.mul, activation_shape) * out_features
+
+
+def bmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for the bmm operation: ``n * c * t * d`` for a first operand
+    of shape ``(n, c, t)`` and a second operand whose last dim is ``d``.
+    """
+    # Exactly two tensors are expected; the first must be 3-dimensional or
+    # the unpacking below raises.
+    assert len(inputs) == 2, len(inputs)
+    lhs_shape, rhs_shape = inputs[0].shape, inputs[-1].shape
+    n, c, t = lhs_shape
+    d = rhs_shape[-1]
+    return n * c * t * d
+
+
+def conv_flop_count(
+    x_shape: List[int],
+    w_shape: List[int],
+    out_shape: List[int],
+    transposed: bool = False,
+) -> Number:
+    """
+    Count flops for convolution. Only multiplications are counted;
+    additions and the bias term are ignored. The result is
+    batch_size * prod(w_shape) * prod(spatial dims), where the spatial
+    dims come from the output for a regular convolution and from the
+    input (x_shape[2:]) for a transposed one.
+    Args:
+        x_shape (list(int)): The input shape before convolution.
+        w_shape (list(int)): The filter shape.
+        out_shape (list(int)): The output shape after convolution.
+        transposed (bool): is the convolution transposed
+    Returns:
+        int: the number of flops
+    """
+    spatial_dims = x_shape[2:] if transposed else out_shape[2:]
+    weight_numel = reduce(operator.mul, w_shape)
+    return x_shape[0] * weight_numel * reduce(operator.mul, spatial_dims)
+
+
+def conv_flop_jit(inputs: List[Any], outputs: List[Any]):
+    """
+    Count flops for convolution from the dispatched argument list.
+    """
+    x, w = inputs[:2]
+    shapes = (x.shape, w.shape, outputs[0].shape)
+    # the `transposed` flag is read positionally, from index 6 of the arguments
+    is_transposed = inputs[6]
+    return conv_flop_count(*shapes, transposed=is_transposed)
+
+
+def transpose_shape(shape):
+    return [shape[1], shape[0], *shape[2:]]    # swap the two leading dims, keep the rest; always a list
+
+
+def conv_backward_flop_jit(inputs: List[Any], outputs: List[Any]):
+    """Count flops for a convolution backward, adding only the gradients enabled in its output mask."""
+    grad_out_shape, x_shape, w_shape = (t.shape for t in inputs[:3])
+    fwd_transposed, output_mask = inputs[7], inputs[-1]
+    total = 0
+
+    # gradient w.r.t. the input: grad_out against the weight, with the transposed flag flipped
+    if output_mask[0]:
+        total += conv_flop_count(grad_out_shape, w_shape, outputs[0].shape, not fwd_transposed)
+    # gradient w.r.t. the weight: the input with its two leading dims swapped, against grad_out
+    if output_mask[1]:
+        total += conv_flop_count(transpose_shape(x_shape), grad_out_shape, outputs[1].shape, fwd_transposed)
+
+    return total
+
+
+def norm_flop_counter(affine_arg_index: int, input_arg_index: int) -> Callable:
+    """
+    Args:
+        affine_arg_index: index of the affine argument in inputs
+        input_arg_index: index of the normalized tensor in inputs
+    """
+
+    def norm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+        """
+        Count flops for norm layers.
+        """
+        input_shape = inputs[input_arg_index].shape
+        affine = inputs[affine_arg_index]
+        # an argument exposing `.shape` counts as affine; anything else
+        # (e.g. None or a plain flag) is judged by its truthiness
+        has_affine = (affine.shape is not None) if hasattr(affine, 'shape') else affine
+        assert 2 <= len(input_shape) <= 5, input_shape
+        per_element = 5 if has_affine else 4    # 5 is just a rough estimate
+        return reduce(operator.mul, input_shape) * per_element
+
+    return norm_flop_jit
+
+
+def batchnorm_flop_jit(inputs: List[Any], outputs: List[Any], training: bool = None) -> Number:
+    """Count batch-norm flops; ``training`` defaults to ``inputs[-3]``, and eval mode costs 1 or 2 flops per element."""
+    training = inputs[-3] if training is None else training
+    assert isinstance(training, bool), "Signature of aten::batch_norm has changed!"
+    if training:
+        return norm_flop_counter(1, 0)(inputs, outputs)    # pyre-ignore
+    # the weight is None for affine=False layers, so test the argument itself rather than its `.shape`
+    has_affine = inputs[1] is not None
+    return reduce(operator.mul, inputs[0].shape) * (2 if has_affine else 1)
+
+
+def ewise_flop_counter(input_scale: float = 1, output_scale: float = 0) -> Callable:
+    """
+    Build a counter that returns
+        input_tensor.numel() * input_scale + output_tensor.numel() * output_scale
+    A zero scale skips that tensor entirely; an empty shape contributes 0.
+    Args:
+        input_scale: scale of the input tensor (first argument)
+        output_scale: scale of the output tensor (first element in outputs)
+    """
+
+    def _scaled_numel(scale, tensors):
+        if scale == 0:
+            return 0
+        shape = tensors[0].shape
+        return scale * reduce(operator.mul, shape) if shape else 0
+
+    def ewise_flop(inputs: List[Any], outputs: List[Any]) -> Number:
+        return 0 + _scaled_numel(input_scale, inputs) + _scaled_numel(output_scale, outputs)
+
+    return ewise_flop
+
+
+def zero_flop_jit(*args):
+    """
+        Flop counter for ops treated as free: ignores every argument and returns 0.
+    """
+    return 0
+
+
+flop_mapping = {    # aten overload -> counter called as counter(args, outputs) by FlopTensor.__torch_dispatch__
+    # gemm (matrix multiplications)
+    aten.mm.default: matmul_flop_jit,
+    aten.matmul.default: matmul_flop_jit,
+    aten.addmm.default: addmm_flop_jit,
+    aten.bmm.default: bmm_flop_jit,
+
+    # convolution (forward and backward)
+    aten.convolution.default: conv_flop_jit,
+    aten._convolution.default: conv_flop_jit,
+    aten.convolution_backward.default: conv_backward_flop_jit,
+
+    # normalization (batch norm reads `training` from the arguments unless it is bound via partial)
+    aten.native_batch_norm.default: batchnorm_flop_jit,
+    aten.native_batch_norm_backward.default: batchnorm_flop_jit,
+    aten.cudnn_batch_norm.default: batchnorm_flop_jit,
+    aten.cudnn_batch_norm_backward.default: partial(batchnorm_flop_jit, training=True),
+    aten.native_layer_norm.default: norm_flop_counter(2, 0),
+    aten.native_layer_norm_backward.default: norm_flop_counter(2, 0),
+
+    # pooling and embedding: charged per element of the first input (1, 0) or of the first output (0, 1)
+    aten.avg_pool1d.default: ewise_flop_counter(1, 0),
+    aten.avg_pool2d.default: ewise_flop_counter(1, 0),
+    aten.avg_pool2d_backward.default: ewise_flop_counter(0, 1),
+    aten.avg_pool3d.default: ewise_flop_counter(1, 0),
+    aten.avg_pool3d_backward.default: ewise_flop_counter(0, 1),
+    aten.max_pool1d.default: ewise_flop_counter(1, 0),
+    aten.max_pool2d.default: ewise_flop_counter(1, 0),
+    aten.max_pool3d.default: ewise_flop_counter(1, 0),
+    aten.max_pool1d_with_indices.default: ewise_flop_counter(1, 0),
+    aten.max_pool2d_with_indices.default: ewise_flop_counter(1, 0),
+    aten.max_pool2d_with_indices_backward.default: ewise_flop_counter(0, 1),
+    aten.max_pool3d_with_indices.default: ewise_flop_counter(1, 0),
+    aten.max_pool3d_with_indices_backward.default: ewise_flop_counter(0, 1),
+    aten._adaptive_avg_pool2d.default: ewise_flop_counter(1, 0),
+    aten._adaptive_avg_pool2d_backward.default: ewise_flop_counter(0, 1),
+    aten._adaptive_avg_pool3d.default: ewise_flop_counter(1, 0),
+    aten._adaptive_avg_pool3d_backward.default: ewise_flop_counter(0, 1),
+    aten.embedding_dense_backward.default: ewise_flop_counter(0, 1),
+    aten.embedding.default: ewise_flop_counter(1, 0),
+}
+
+ewise_flop_aten = [    # ops charged one flop per element of their first argument (see the loop below)
+    # basic arithmetic and reductions
+    aten.add.Tensor,
+    aten.add_.Tensor,
+    aten.div.Tensor,
+    aten.div_.Tensor,
+    aten.div.Scalar,
+    aten.div_.Scalar,
+    aten.mul.Tensor,
+    aten.mul.Scalar,
+    aten.mul_.Tensor,
+    aten.neg.default,
+    aten.pow.Tensor_Scalar,
+    aten.rsub.Scalar,
+    aten.sum.default,
+    aten.sum.dim_IntList,
+    aten.mean.dim,
+
+    # activation ops (forward, in-place and backward variants)
+    aten.hardswish.default,
+    aten.hardswish_.default,
+    aten.hardswish_backward.default,
+    aten.hardtanh.default,
+    aten.hardtanh_.default,
+    aten.hardtanh_backward.default,
+    aten.hardsigmoid_backward.default,
+    aten.hardsigmoid.default,
+    aten.gelu.default,
+    aten.gelu_backward.default,
+    aten.silu.default,
+    aten.silu_.default,
+    aten.silu_backward.default,
+    aten.sigmoid.default,
+    aten.sigmoid_backward.default,
+    aten._softmax.default,
+    aten._softmax_backward_data.default,
+    aten.relu_.default,
+    aten.relu.default,
+    aten.tanh.default,
+    aten.tanh_backward.default,
+    aten.threshold_backward.default,
+
+    # dropout
+    aten.native_dropout.default,
+    aten.native_dropout_backward.default,
+
+    # distribution
+    aten.bernoulli_.float,
+
+    # where
+    aten.where.self,
+]
+for op in ewise_flop_aten:    # registered after the dict literal, so a duplicate key would be overwritten here
+    flop_mapping[op] = ewise_flop_counter(1, 0)
+
+# FIXME: this list is slated for removal; every op below is mapped to zero_flop_jit (0 flops)
+zero_flop_aten = [
+    aten.as_strided.default,
+    aten.as_strided_.default,
+    aten.cat.default,
+    aten.clone.default,
+    aten.copy_.default,
+    aten.detach.default,
+    aten.expand.default,
+    aten.empty_like.default,
+    aten.new_empty.default,
+    aten.new_empty_strided.default,
+    aten.ones_like.default,
+    aten._reshape_alias.default,
+    aten.select.int,
+    aten.select_backward.default,
+    aten.squeeze.dim,
+    aten.slice.Tensor,
+    aten.slice_backward.default,
+    aten.split.Tensor,
+    aten.permute.default,
+    aten.t.default,
+    aten.transpose.int,
+    aten._to_copy.default,
+    aten.unsqueeze.default,
+    aten.unbind.int,
+    aten._unsafe_view.default,
+    aten.view.default,
+    aten.zero_.default,
+    aten.zeros_like.default,
+]
+
+for op in zero_flop_aten:    # registering them keeps these ops visible in the per-module report with a 0 count
+    flop_mapping[op] = zero_flop_jit
diff --git a/colossalai/_analyzer/_subclasses/meta_tensor.py b/colossalai/_analyzer/_subclasses/meta_tensor.py
new file mode 100644
index 000000000000..2bc212938ee0
--- /dev/null
+++ b/colossalai/_analyzer/_subclasses/meta_tensor.py
@@ -0,0 +1,207 @@
+import uuid
+from functools import partial
+
+import torch
+import torch.distributed as dist
+from torch.types import _bool, _device, _dtype
+from torch.utils._pytree import tree_flatten, tree_map
+
+from ._monkey_patch import _AliasATen, _DistCommMethod, _InplaceATen, _MaybeInplaceATen, _TorchOverrideableFactoryMethod
+
+__all__ = ['MetaTensor', 'MetaTensorMode']
+
+
+def register_storage(r, data_ptr_fn=None):
+    """Give tensor ``r`` a ``data_ptr`` callable: ``data_ptr_fn`` if given, else a fresh uuid when its pointer is falsy."""
+    if isinstance(r, torch.Tensor) and (data_ptr_fn is not None or not r.data_ptr()):
+        if data_ptr_fn is None:
+            unique_id = uuid.uuid1()    # stand-in identity, fixed for the lifetime of this tensor
+            data_ptr_fn = lambda: unique_id
+        r.data_ptr = data_ptr_fn
+
+
+def _normalize_tuple(x):
+    """Return ``x`` unchanged when it is already a tuple, otherwise wrap it as ``(x,)``."""
+    # lists and other containers are wrapped as a single element, never converted
+    return x if isinstance(x, tuple) else (x,)
+
+
+# a hack of inplace execution in PyTorch
+def _assert_alias(func):
+    # True when `func` appears in any of the three op tables; despite the name, nothing is asserted
+    return any(func in ops for ops in (_AliasATen, _InplaceATen, _MaybeInplaceATen))    # TODO: check if should be this aggressive
+
+
+class MetaTensor(torch.Tensor):
+    """
+    A wrapping tensor that hacks ``torch.autograd`` without patching more ``torch.ops.aten`` ops.
+    `device` is the device that ``MetaTensor`` is supposed to run on. Meta tensors give you the
+    ability to run PyTorch code without having to actually do computation through tensors
+    allocated on a `meta` device. Because the device is `meta`, meta tensors do not model
+    device propagation. ``MetaTensor`` extends its usage by carrying an additional `device`
+    which tracks devices that would have been used.
+
+    Reference:
+        https://github.com/pytorch/pytorch/blob/master/torch/_subclasses/fake_tensor.py
+    """
+
+    _tensor: torch.Tensor
+
+    @staticmethod
+    def __new__(cls, elem, device=None, data_ptr_fn=None):
+        requires_grad = elem.requires_grad
+        # Avoid multiple wrapping
+        while isinstance(elem, MetaTensor):
+            device = elem.device if device is None else device
+            elem = elem._tensor
+
+        # The wrapping tensor (MetaTensor) shouldn't hold any
+        # memory for the class in question, but it should still
+        # advertise the same device as before
+        r = torch.Tensor._make_wrapper_subclass(
+            cls,
+            elem.size(),
+            strides=elem.stride(),
+            storage_offset=elem.storage_offset(),
+            dtype=elem.dtype,
+            layout=elem.layout,
+            device=device or (elem.device if elem.device.type != 'meta' else torch.device('cpu')),
+            requires_grad=requires_grad)    # deceive the frontend for aten selections
+        r._tensor = elem
+        # ...the real tensor is held as an element on the tensor.
+        if not r._tensor.is_meta:
+            val = elem.data_ptr()
+            data_ptr_fn = lambda: val
+            r._tensor = r._tensor.to(torch.device('meta'))
+
+        # only tensor not on `meta` should be copied to `meta`
+        register_storage(r._tensor, data_ptr_fn)
+        if isinstance(elem, torch.nn.Parameter):
+            r = torch.nn.Parameter(r)
+        return r
+
+    def __repr__(self):
+        name = 'MetaParameter' if getattr(self, '_is_param', False) else 'MetaTensor'
+        if self.grad_fn:
+            return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype}, grad_fn={self.grad_fn})"
+        return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype})"
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        device = None
+
+        def unwrap(x):
+            nonlocal device
+            if isinstance(x, MetaTensor):
+                device = x.device
+                x = x._tensor
+            elif isinstance(x, torch.Tensor):
+                device = x.device
+                x = x.to(torch.device('meta'))
+            return x
+
+        args = tree_map(unwrap, args)
+        kwargs = tree_map(unwrap, kwargs if kwargs is not None else {})    # honour the declared `kwargs=None` default
+
+        if 'device' in kwargs:
+            device = kwargs['device']
+            kwargs['device'] = torch.device('meta')
+
+        # run aten for backend=CPU but actually on backend=Meta
+        # here we detect whether or not the execution generates a physical copy
+        # of the input tensor
+        ret = func(*args, **kwargs)
+
+        if _assert_alias(func):
+            val = args[0].data_ptr()
+            tree_map(partial(register_storage, data_ptr_fn=lambda: val), _normalize_tuple(ret))
+
+        # Now, we want to continue propagating this tensor, so we rewrap Tensors in
+        # our custom tensor subclass
+        def wrap(x):
+            return MetaTensor(x, device=device) if isinstance(x, torch.Tensor) else x
+
+        return tree_map(wrap, ret)
+
+    def to(self, *args, **kwargs) -> torch.Tensor:
+        """An extension of `torch.Tensor.to()` to MetaTensor
+        Returns:
+            result (MetaTensor): MetaTensor
+        Usage:
+            >>> tensor = MetaTensor(torch.rand(10), device='cuda:100')
+            >>> tensor.to(torch.uint8)
+            MetaTensor(..., size=(10,), device='cuda:100', dtype=torch.uint8)
+            >>> tensor.to(torch.device('cuda:42'))
+            MetaTensor(..., size=(10,), device='cuda:42', dtype=torch.float32)
+            >>> tensor.to('vulkan')
+            MetaTensor(..., size=(10,), device='vulkan', dtype=torch.float32)
+        """
+        # this imitates c++ function in the way of @overload
+        device = self.device    # keep the tracked device unless the call names a new one
+
+        def replace(x):
+            nonlocal device
+            if isinstance(x, str) or isinstance(x, _device):
+                device = x
+                return torch.device('meta')
+            return x
+
+        elem = self._tensor.to(*tree_map(replace, args), **tree_map(replace, kwargs))
+        return MetaTensor(elem, device=device)
+
+    def cpu(self, *args, **kwargs):
+        if self.device.type == 'cpu':
+            return self.to(*args, **kwargs)
+        return self.to(*args, device='cpu', **kwargs)
+
+    def cuda(self, device=None, non_blocking=False):
+        if device is not None:
+            return self.to(device=device, non_blocking=non_blocking)
+        return self.to(device='cuda:0', non_blocking=non_blocking)
+
+    def data_ptr(self):
+        return self._tensor.data_ptr()
+
+
+class MetaTensorMode(object):
+    """
+    A context manager that enables MetaTensor mode.
+
+    Usage:
+        >>> with MetaTensorMode():
+        >>>     # all torch.xxx and torch.distributed.xxx will be replaced by patched functions
+        >>>     # and the actual execution will be on torch.device('meta')
+        >>>     a = torch.rand(100000, 100000)
+        >>>     b = torch.rand(100000, 100000)
+        >>>     c = torch.mm(a, b)
+    """
+
+    def __init__(self):
+        self.torch_overrides = {}    # override torch.xxx
+        self.dist_overrides = {}    # override torch.distributed.xxx
+
+    def __enter__(self):
+
+        def _dummy(*args, **kwargs):
+            pass
+
+        def _new(*args, orig_new=torch.empty, **kwargs):
+            requested = kwargs.get('device', torch.device('cpu'))    # allocate on `meta`, remember the requested device
+            return MetaTensor(orig_new(*args, **{**kwargs, 'device': 'meta'}), device=requested)
+
+        for func in _TorchOverrideableFactoryMethod:
+            self.torch_overrides[func] = getattr(torch, func)
+            setattr(torch, func, partial(_new, orig_new=getattr(torch, func)))
+
+        for func in _DistCommMethod:
+            self.dist_overrides[func] = getattr(dist, func)
+            setattr(dist, func, _dummy)
+
+        return self    # so that `with MetaTensorMode() as mode:` binds the manager instead of None
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        for func, func_impl in self.torch_overrides.items():
+            setattr(torch, func, func_impl)
+
+        for func, func_impl in self.dist_overrides.items():
+            setattr(dist, func, func_impl)
diff --git a/colossalai/_analyzer/envs.py b/colossalai/_analyzer/envs.py
new file mode 100644
index 000000000000..b537747c57a8
--- /dev/null
+++ b/colossalai/_analyzer/envs.py
@@ -0,0 +1,7 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class MeshConfig:
+    TFLOPS: float = 1.9e12    # NOTE(review): presumably device throughput in flop/s; confirm the unit
+    BANDWIDTH: float = 1.2e9    # annotated so it is a real dataclass field (init/repr/eq), like TFLOPS
diff --git a/colossalai/_analyzer/fx/__init__.py b/colossalai/_analyzer/fx/__init__.py
new file mode 100644
index 000000000000..2e857b1b054b
--- /dev/null
+++ b/colossalai/_analyzer/fx/__init__.py
@@ -0,0 +1,4 @@
+from .bias_addition import *
+from .node_util import MetaInfo
+from .symbolic_profile import symbolic_profile
+from .symbolic_trace import symbolic_trace
diff --git a/colossalai/_analyzer/fx/bias_addition.py b/colossalai/_analyzer/fx/bias_addition.py
new file mode 100644
index 000000000000..5359752d4cb4
--- /dev/null
+++ b/colossalai/_analyzer/fx/bias_addition.py
@@ -0,0 +1,155 @@
+"""
+If an FX.Graph is traced for the auto-parallel module, some extra nodes will be added during
+graph construction to handle the compatibility between bias addition and all-reduce.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.utils import _pair, _single, _triple
+
+from .symbolic_trace import register_tracer_impl
+
+__all__ = []    # no names are exported; the functions below are registered through their decorators
+
+
+@register_tracer_impl(F.linear, name='_bias_addition_impl')
+def linear_impl(input, weight, bias=None):
+    """Call ``F.linear`` without its bias, then add ``bias`` (when given) as a separate op."""
+    out = F.linear(input, weight)
+    # the bias stays outside F.linear so that the addition is its own operation
+    return out if bias is None else out + bias
+
+
+@register_tracer_impl(F.conv1d, name='_bias_addition_impl')
+def conv1d_impl(input, weight, bias=None, stride=_single(1), padding=_single(0), dilation=_single(1), groups=1):
+    """Call ``F.conv1d`` without its bias, then add ``bias`` (when given) as a separate op."""
+    out = F.conv1d(input, weight, stride=stride, padding=padding, dilation=dilation, groups=groups)
+    if bias is None:
+        return out
+    return out + bias.reshape((-1, 1))    # reshaped to (-1, 1) so the add broadcasts along the last axis
+
+
+@register_tracer_impl(F.conv2d, name='_bias_addition_impl')
+def conv2d_impl(input, weight, bias=None, stride=_pair(1), padding=_pair(0), dilation=_pair(1), groups=1):
+    """Call ``F.conv2d`` without its bias, then add ``bias`` (when given) as a separate op."""
+    out = F.conv2d(input, weight, stride=stride, padding=padding, dilation=dilation, groups=groups)
+    if bias is None:
+        return out
+    return out + bias.reshape((-1, 1, 1))    # reshaped to (-1, 1, 1) so the add broadcasts along the last two axes
+
+
+@register_tracer_impl(F.conv3d, name='_bias_addition_impl')
+def conv3d_impl(input, weight, bias=None, stride=_triple(1), padding=_triple(0), dilation=_triple(1), groups=1):
+    """Call ``F.conv3d`` without its bias, then add ``bias`` (when given) as a separate op."""
+    out = F.conv3d(input, weight, stride=stride, padding=padding, dilation=dilation, groups=groups)
+    if bias is None:
+        return out
+    return out + bias.reshape((-1, 1, 1, 1))    # reshaped to (-1, 1, 1, 1) so the add broadcasts along the last three axes
+
+
+@register_tracer_impl(F.conv_transpose1d, name='_bias_addition_impl')
+def conv_transpose1d_impl(input,
+                          weight,
+                          bias=None,
+                          stride=_single(1),
+                          padding=_single(0),
+                          output_padding=_single(0),
+                          groups=1,
+                          dilation=_single(1)):
+    """Replacement for ``F.conv_transpose1d`` that keeps the bias as a separate add.
+
+    The transposed convolution is always called without ``bias``; when a bias is
+    given it is reshaped to ``(-1, 1)`` and added to the result afterwards.
+    """
+    conv_out = F.conv_transpose1d(input,
+                                  weight,
+                                  stride=stride,
+                                  padding=padding,
+                                  output_padding=output_padding,
+                                  groups=groups,
+                                  dilation=dilation)
+    if bias is None:
+        return conv_out
+    # reshape so the add broadcasts along the trailing axis
+    return conv_out + bias.reshape((-1, 1))
+
+
+@register_tracer_impl(F.conv_transpose2d, name='_bias_addition_impl')
+def conv_transpose2d_impl(input,
+                          weight,
+                          bias=None,
+                          stride=_pair(1),
+                          padding=_pair(0),
+                          output_padding=_pair(0),
+                          groups=1,
+                          dilation=_pair(1)):
+    """Replacement for ``F.conv_transpose2d`` that keeps the bias as a separate add.
+
+    The transposed convolution is always called without ``bias``; when a bias is
+    given it is reshaped to ``(-1, 1, 1)`` and added to the result afterwards.
+    """
+    conv_out = F.conv_transpose2d(input,
+                                  weight,
+                                  stride=stride,
+                                  padding=padding,
+                                  output_padding=output_padding,
+                                  groups=groups,
+                                  dilation=dilation)
+    if bias is None:
+        return conv_out
+    # reshape so the add broadcasts along the two trailing axes
+    return conv_out + bias.reshape((-1, 1, 1))
+
+
+@register_tracer_impl(F.conv_transpose3d, name='_bias_addition_impl')
+def conv_transpose3d_impl(input,
+                          weight,
+                          bias=None,
+                          stride=_triple(1),
+                          padding=_triple(0),
+                          output_padding=_triple(0),
+                          groups=1,
+                          dilation=_triple(1)):
+    """Replacement for ``F.conv_transpose3d`` that keeps the bias as a separate add.
+
+    The transposed convolution is always called without ``bias``; when a bias is
+    given it is reshaped to ``(-1, 1, 1, 1)`` and added to the result afterwards.
+    """
+    conv_out = F.conv_transpose3d(input,
+                                  weight,
+                                  stride=stride,
+                                  padding=padding,
+                                  output_padding=output_padding,
+                                  groups=groups,
+                                  dilation=dilation)
+    if bias is None:
+        return conv_out
+    # reshape so the add broadcasts along the three trailing axes
+    return conv_out + bias.reshape((-1, 1, 1, 1))
+
+
+@register_tracer_impl(torch.addmm, name='_bias_addition_impl')
+@register_tracer_impl(torch.Tensor.addmm, name='_bias_addition_impl')
+def addmm_impl(input, mat1, mat2, beta=1, alpha=1):
+    """Express ``addmm`` as ``F.linear(mat1, mat2^T)`` plus an explicit add of ``input``.
+
+    Multiplications by ``alpha``/``beta`` are emitted only when the factor is not 1.
+    """
+    product = F.linear(mat1, mat2.transpose(0, 1))
+    scaled_product = product * alpha if alpha != 1 else product
+    scaled_input = input * beta if beta != 1 else input
+    return scaled_product + scaled_input
+
+
+@register_tracer_impl(torch.addbmm, name='_bias_addition_impl')
+@register_tracer_impl(torch.Tensor.addbmm, name='_bias_addition_impl')
+def addbmm_impl(input, batch1, batch2, beta=1, alpha=1):
+    """Express ``addbmm`` as ``beta * input + alpha * sum_b(batch1[b] @ batch2[b])``.
+
+    ``torch.addbmm`` multiplies ``batch1`` by ``batch2`` as given (no transpose) and
+    accumulates the per-batch products along dim 0 before ``input`` is added.
+    """
+    product = torch.bmm(batch1, batch2).sum(dim=0)
+    scaled_product = product * alpha if alpha != 1 else product
+    return scaled_product + (input * beta if beta != 1 else input)
diff --git a/colossalai/_analyzer/fx/codegen.py b/colossalai/_analyzer/fx/codegen.py
new file mode 100644
index 000000000000..1117c0103166
--- /dev/null
+++ b/colossalai/_analyzer/fx/codegen.py
@@ -0,0 +1,456 @@
+from typing import Any, Callable, Dict, Iterable, List, Tuple
+
+import torch
+from torch.fx.graph import (
+    CodeGen,
+    PythonCode,
+    _custom_builtins,
+    _format_target,
+    _is_from_torch,
+    _Namespace,
+    _origin_type_map,
+    _register_custom_builtin,
+    inplace_methods,
+    magic_methods,
+)
+from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg
+
+import colossalai
+from colossalai.fx._compatibility import compatibility
+
+_register_custom_builtin('colossalai', 'import colossalai', colossalai)    # adds an entry to torch.fx's `_custom_builtins`
+
+
+def _gen_ckpt_fn_def(label, free_vars: List[str]) -> str:
+    """Build the ``def checkpoint_<label>(self, ...):`` header line."""
+    # ``self`` always comes first, followed by the free variables in the given order
+    params = ', '.join(['self'] + free_vars)
+    return f"def checkpoint_{label}({params}):"
+
+
+def _gen_ckpt_output(output_vars: List[str]) -> str:
+    """Build the ``return`` statement that ends a checkpoint function."""
+    # the names are emitted comma-separated, in the given order
+    returned = ', '.join(output_vars)
+    return f"return {returned}"
+
+
+def _gen_ckpt_usage(label, input_vars, output_vars, use_reentrant=True):
+    """Build the ``torch.utils.checkpoint.checkpoint(...)`` call line for region ``label``.
+
+    Empty ``input_vars`` or ``output_vars`` are left out instead of producing a dangling ``, ,`` or `` = ``.
+    """
+    call_args = ', '.join([f'self.checkpoint_{label}', *input_vars, f'use_reentrant={use_reentrant}'])
+    return (f"{', '.join(output_vars)} = " if output_vars else '') + f'torch.utils.checkpoint.checkpoint({call_args})'
+
+
+def _end_of_ckpt(node: Node, ckpt_level: int) -> bool:
+    """Tell whether ``node`` may close the checkpoint region at ``ckpt_level``.
+
+    True when ``to_recompute`` has no entry at that level, or the entry is not None.
+    """
+    to_recompute = node.meta['info'].to_recompute
+    return len(to_recompute) <= ckpt_level or to_recompute[ckpt_level] is not None
+
+
+def _find_input_and_output_nodes(nodes: List[Node]):
+    """Collect the node names that cross the boundary of ``nodes``.
+
+    Returns:
+        Tuple[List[str], List[str]]: ``repr`` of every input node that is not in
+        ``nodes``, and ``repr`` of every node in ``nodes`` that has a user outside
+        ``nodes``. Both lists are de-duplicated and kept in first-seen order.
+    """
+    # sets give O(1) membership tests; the lists only preserve the emission order
+    members = set(nodes)
+    input_nodes, output_nodes = [], []
+    seen_inputs, seen_outputs = set(), set()
+    for node in nodes:
+        for input_node in node._input_nodes.keys():
+            node_repr = repr(input_node)
+            if input_node not in members and node_repr not in seen_inputs:
+                seen_inputs.add(node_repr)
+                input_nodes.append(node_repr)
+        # a node is an output as soon as one of its users lies outside ``nodes``
+        node_repr = repr(node)
+        if node_repr not in seen_outputs and any(user not in members for user in node.users.keys()):
+            seen_outputs.add(node_repr)
+            output_nodes.append(node_repr)
+    return input_nodes, output_nodes
+
+
+def _find_nested_ckpt_regions(node_list: List[Node], ckpt_level: int = 0):
+    """
+    Find the checkpoint regions at ``ckpt_level`` in a list of consecutive nodes.
+    Returns a list of ``(start_index, end_index)`` tuples; both indices are inclusive.
+    """
+    ckpt_regions = []
+    start = -1
+    end = -1
+    current_region = None    # label of the region being tracked; None while outside a region
+
+    for idx, node in enumerate(node_list):
+        if len(node.meta['info'].to_recompute) > ckpt_level:
+            act_ckpt_label = node.meta['info'].to_recompute[ckpt_level]
+
+            # no region is being tracked yet,
+            # meaning this is the first node of an activation ckpt region
+            if current_region is None:
+                current_region = act_ckpt_label
+                start = idx
+
+            # if the activation checkpoint label has changed,
+            # close the previous region at idx - 1 and restart the tracking here
+            # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2]
+            if act_ckpt_label != current_region:
+                assert start != -1
+                ckpt_regions.append((start, idx - 1))
+                current_region = act_ckpt_label
+                start = idx
+                end = -1
+
+        elif current_region is not None and _end_of_ckpt(node, ckpt_level):
+            # the node has no label at this level, so the tracked region ended on the previous node
+            # e.g. node ckpt states = [ckpt, ckpt, non-ckpt]
+            end = idx - 1
+            assert start != -1 and end != -1
+            ckpt_regions.append((start, end))
+            start = end = -1
+            current_region = None
+
+        else:
+            pass
+
+    if current_region is not None:    # a region still open after the loop runs to the last node
+        end = len(node_list) - 1
+        ckpt_regions.append((start, end))
+    return ckpt_regions
+
+
+def emit_ckpt_func(body,
+                   ckpt_func,
+                   node_list: List[Node],
+                   emit_node_func,
+                   delete_unused_value_func,
+                   ckpt_level=0,
+                   in_ckpt=False):
+    """Emit one checkpoint function, recursing into the regions nested inside it.
+
+    The definition of ``checkpoint_<label>`` is appended to ``ckpt_func`` and the
+    line that calls it through ``torch.utils.checkpoint.checkpoint`` to ``body``.
+
+    Args:
+        body: forward code - in recursive calls, this is the code of the
+            enclosing checkpoint function
+        ckpt_func: checkpoint functions code - in recursive calls, this is a
+            buffer that the caller appends after its own function
+        node_list (List[Node]): the nodes that make up this checkpoint region
+        emit_node_func: function to emit a node
+        delete_unused_value_func: function to delete unused values
+        ckpt_level (int, optional): checkpoint nesting level. Defaults to 0.
+        in_ckpt (bool, optional): whether this is a recursive call, in which
+            case the emitted call line is indented. Defaults to False.
+    """
+    indent = '    '
+    inputs, outputs = _find_input_and_output_nodes(node_list)
+
+    # the label joins the region ids of every level down to this one,
+    # e.g. at level (0, 1, 1) the label is '0_1_1'
+    region_ids = node_list[0].meta['info'].to_recompute[:ckpt_level + 1]
+    label = "_".join(str(idx) for idx in region_ids)
+    ckpt_func.append(f'{_gen_ckpt_fn_def(label, inputs)}\n')
+
+    def _emit_plain_node(node):
+        # a node that belongs directly to this function: emit it indented
+        emit_node_func(node, ckpt_func)
+        ckpt_func[-1] = indent + ckpt_func[-1]
+        delete_unused_value_func(node, ckpt_func)
+
+    deepest_level = max(len(node.meta['info'].to_recompute) for node in node_list)
+
+    if ckpt_level + 1 < deepest_level:
+        # there are deeper levels: nested regions become their own functions,
+        # collected in a buffer so they are placed after the current one
+        nested_regions = _find_nested_ckpt_regions(node_list, ckpt_level + 1)
+        region_end = {}
+        for first, last in nested_regions:
+            region_end.setdefault(first, last)
+
+        ckpt_func_buffer = []
+        node_idx = 0
+        while node_idx < len(node_list):
+            if node_idx in region_end:
+                ckpt_node_list = node_list[node_idx:region_end[node_idx] + 1]
+                emit_ckpt_func(ckpt_func, ckpt_func_buffer, ckpt_node_list, emit_node_func, delete_unused_value_func,
+                               ckpt_level + 1, True)
+                node_idx += len(ckpt_node_list)
+            else:
+                _emit_plain_node(node_list[node_idx])
+                node_idx += 1
+
+        ckpt_func.append(indent + _gen_ckpt_output(outputs) + '\n\n')
+        ckpt_func += ckpt_func_buffer
+    else:
+        # last level: every node is emitted straight into this function
+        for node in node_list:
+            _emit_plain_node(node)
+        ckpt_func.append(indent + _gen_ckpt_output(outputs) + '\n\n')
+
+    # the call that stands in for the whole region in the enclosing code
+    call_line = _gen_ckpt_usage(label, inputs, outputs, False) + '\n'
+    body.append(indent + call_line if in_ckpt else call_line)
+
+
+def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
+    """Emit the forward code, turning each level-0 checkpoint region into a function.
+
+    Nodes inside a region found at checkpoint level 0 are handed to
+    ``emit_ckpt_func``, which also appends the call to that function to ``body``;
+    every other node is emitted directly into ``body``.
+
+    Args:
+        body: forward code
+        ckpt_func: checkpoint functions code
+        nodes: graph.nodes
+        emit_node_func: function to emit node
+        delete_unused_value_func: function to remove the unused value
+    """
+    # map the first index of each level-0 region to its last (inclusive) index
+    region_end = {}
+    for first, last in _find_nested_ckpt_regions(nodes, 0):
+        region_end.setdefault(first, last)
+
+    node_list = list(nodes)
+    total = len(node_list)
+    node_idx = 0
+
+    while node_idx < total:
+        if node_idx in region_end:
+            # a region starts here: emit it as one checkpoint function and skip past it
+            ckpt_node_list = node_list[node_idx:region_end[node_idx] + 1]
+            emit_ckpt_func(body, ckpt_func, ckpt_node_list, emit_node_func, delete_unused_value_func)
+            node_idx += len(ckpt_node_list)
+            continue
+
+        # an ordinary node goes straight into the forward function
+        node = node_list[node_idx]
+        emit_node_func(node, body)
+        delete_unused_value_func(node, body)
+        node_idx += 1
+
+
+@compatibility(is_backward_compatible=True)
+class ActivationCheckpointCodeGen(CodeGen):
+    """``CodeGen`` whose ``_gen_python_code`` emits nodes through ``emit_code_with_activation_checkpoint``."""
+    def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
+        free_vars: List[str] = []
+        body: List[str] = []
+        globals_: Dict[str, Any] = {}
+        wrapped_fns: Dict[str, None] = {}
+
+        # Wrap string in list to pass by reference
+        maybe_return_annotation: List[str] = ['']
+
+        def add_global(name_hint: str, obj: Any):
+            """Add an obj to be tracked as a global.
+            We call this for names that reference objects external to the
+            Graph, like functions or types.
+            Returns: the global name that should be used to reference 'obj' in generated source.
+            """
+            if _is_from_torch(obj) and obj != torch.device:    # to support registering torch.device
+                # HACK: workaround for how torch custom ops are registered. We
+                # can't import them like normal modules so they must retain their
+                # fully qualified name.
+                return _get_qualified_name(obj)
+
+            # normalize the name hint to get a proper identifier
+            global_name = namespace.create_name(name_hint, obj)
+
+            if global_name in globals_:
+                assert globals_[global_name] is obj
+                return global_name
+            globals_[global_name] = obj
+            return global_name
+
+        # Pre-fill the globals table with registered builtins.
+        for name, (_, obj) in _custom_builtins.items():
+            add_global(name, obj)
+
+        def type_repr(o: Any):
+            if o == ():
+                # Empty tuple is used for empty tuple type annotation Tuple[()]
+                return '()'
+
+            typename = _type_repr(o)
+
+            if hasattr(o, '__origin__'):
+                # This is a generic type, e.g. typing.List[torch.Tensor]
+                origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
+                origin_typename = add_global(_type_repr(origin_type), origin_type)
+
+                if hasattr(o, '__args__'):
+                    # Assign global names for each of the inner type variables.
+                    args = [type_repr(arg) for arg in o.__args__]
+
+                    if len(args) == 0:
+                        # Bare type, such as `typing.Tuple` with no subscript
+                        # This code-path used in Python < 3.9
+                        return origin_typename
+
+                    return f'{origin_typename}[{",".join(args)}]'
+                else:
+                    # Bare type, such as `typing.Tuple` with no subscript
+                    # This code-path used in Python 3.9+
+                    return origin_typename
+
+            # Common case: this is a regular module name like 'foo.bar.baz'
+            return add_global(typename, o)
+
+        def _format_args(args: Tuple[Argument, ...], kwargs: Dict[str, Argument]) -> str:
+
+            def _get_repr(arg):
+                # Handle NamedTuples (if it has `_fields`) via add_global.
+                if isinstance(arg, tuple) and hasattr(arg, '_fields'):
+                    qualified_name = _get_qualified_name(type(arg))
+                    global_name = add_global(qualified_name, type(arg))
+                    return f"{global_name}{repr(tuple(arg))}"
+                return repr(arg)
+
+            args_s = ', '.join(_get_repr(a) for a in args)
+            kwargs_s = ', '.join(f'{k} = {_get_repr(v)}' for k, v in kwargs.items())
+            if args_s and kwargs_s:
+                return f'{args_s}, {kwargs_s}'
+            return args_s or kwargs_s
+
+        # Run through reverse nodes and record the first instance of a use
+        # of a given node. This represents the *last* use of the node in the
+        # execution order of the program, which we will use to free unused
+        # values
+        node_to_last_use: Dict[Node, Node] = {}
+        user_to_last_uses: Dict[Node, List[Node]] = {}
+
+        def register_last_uses(n: Node, user: Node):
+            if n not in node_to_last_use:
+                node_to_last_use[n] = user
+                user_to_last_uses.setdefault(user, []).append(n)
+
+        for node in reversed(nodes):
+            map_arg(node.args, lambda n: register_last_uses(n, node))
+            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+        # NOTE: we add a variable to distinguish body and ckpt_func
+        def delete_unused_values(user: Node, body):
+            """
+            Delete values after their last use. This ensures that values that are
+            not used in the remainder of the code are freed and the memory usage
+            of the code is optimal.
+            """
+            if user.op == 'placeholder':
+                return
+            if user.op == 'output':
+                body.append('\n')
+                return
+            nodes_to_delete = user_to_last_uses.get(user, [])
+            if len(nodes_to_delete):
+                to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
+                body.append(f';  {to_delete_str}\n')
+            else:
+                body.append('\n')
+
+        # NOTE: we add a variable to distinguish body and ckpt_func
+        def emit_node(node: Node, body):
+            maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
+            if node.op == 'placeholder':
+                assert isinstance(node.target, str)
+                maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}'
+                free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
+                raw_name = node.target.replace('*', '')
+                if raw_name != repr(node):
+                    body.append(f'{repr(node)} = {raw_name}\n')
+                return
+            elif node.op == 'call_method':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}'
+                            f'({_format_args(node.args[1:], node.kwargs)})')
+                return
+            elif node.op == 'call_function':
+                assert callable(node.target)
+                # pretty print operators
+                if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
+                    assert isinstance(node.args, tuple)
+                    body.append(f'{repr(node)}{maybe_type_annotation} = '
+                                f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
+                    return
+
+                # pretty print inplace operators; required for jit.script to work properly
+                # not currently supported in normal FX graphs, but generated by torchdynamo
+                if node.target.__module__ == '_operator' and node.target.__name__ in inplace_methods:
+                    body.append(f'{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  '
+                                f'{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}')
+                    return
+
+                qualified_name = _get_qualified_name(node.target)
+                global_name = add_global(qualified_name, node.target)
+                # special case for getattr: node.args could be 2-argument or 3-argument
+                # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
+                if global_name == 'getattr' and \
+                isinstance(node.args, tuple) and \
+                isinstance(node.args[1], str) and \
+                node.args[1].isidentifier() and \
+                len(node.args) == 2:
+                    body.append(
+                        f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
+                    return
+                body.append(
+                    f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
+                if node.meta.get('is_wrapped', False):
+                    wrapped_fns.setdefault(global_name)
+                return
+            elif node.op == 'call_module':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = '
+                            f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
+                return
+            elif node.op == 'get_attr':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
+                return
+            elif node.op == 'output':
+                if node.type is not None:
+                    maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
+                body.append(self.generate_output(node.args[0]))
+                return
+            raise NotImplementedError(f'node: {node.op} {node.target}')
+
+        # Modified for activation checkpointing: checkpoint function definitions are collected in ckpt_func
+        ckpt_func = []
+        emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values)
+
+        if len(body) == 0:
+            # If the Graph has no non-placeholder nodes, no lines for the body
+            # have been emitted. To continue to have valid Python code, emit a
+            # single pass statement
+            body.append('pass\n')
+
+        if len(wrapped_fns) > 0:
+            wrap_name = add_global('wrap', torch.fx.wrap)
+            wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
+        else:
+            wrap_stmts = ''
+
+        if self._body_transformer:
+            body = self._body_transformer(body)
+
+        for name, value in self.additional_globals():
+            add_global(name, value)
+
+        prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
+        prologue = ''.join(ckpt_func) + prologue    # checkpoint function definitions go before the forward definition
+        prologue = prologue    # NOTE(review): no-op self-assignment; safe to drop
+
+        code = ''.join(body)
+        code = '\n'.join('    ' + line for line in code.split('\n'))
+        fn_code = f"""
+{wrap_stmts}
+{prologue}
+{code}"""
+        return PythonCode(fn_code, globals_)
diff --git a/colossalai/_analyzer/fx/graph_module.py b/colossalai/_analyzer/fx/graph_module.py
new file mode 100644
index 000000000000..779b42ebaafd
--- /dev/null
+++ b/colossalai/_analyzer/fx/graph_module.py
@@ -0,0 +1,173 @@
+import os
+import warnings
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import torch
+import torch.fx
+import torch.nn as nn
+from torch.fx.graph import PythonCode, _PyTreeCodeGen
+from torch.fx.graph_module import _exec_with_source, _forward_from_src, _WrappedCall
+from torch.nn.modules.module import _addindent
+
+
+class ColoGraphModule(torch.fx.GraphModule):
+    """
+    ColoGraphModule is an nn.Module generated from an fx.Graph.
+    ColoGraphModule has a ``graph`` attribute, as well as ``code`` and ``forward``
+    attributes generated from that ``graph``.
+
+    The difference between ``ColoGraphModule`` and ``torch.fx.GraphModule`` is that
+    ``ColoGraphModule`` has a ``bind()`` function to bind customized functions
+    (i.e. activation checkpoint) to ``code`` of ``nn.Module``. If you want to use
+    specific features in Colossal-AI that are not supported by ``torch.fx.GraphModule``,
+    you can use ``ColoGraphModule`` instead.
+
+    ``colossalai.fx.symbolic_trace()`` will return a ``ColoGraphModule`` as default.
+
+    .. warning::
+
+        When ``graph`` is reassigned, ``code`` and ``forward`` will be automatically
+        regenerated. However, if you edit the contents of the ``graph`` without reassigning
+        the ``graph`` attribute itself, you must call ``recompile()`` to update the generated
+        code.
+    """
+
+    def __init__(self,
+                 root: Union[torch.nn.Module, Dict[str, Any]],
+                 graph: torch.fx.Graph,
+                 class_name: str = 'GraphModule'):
+        super().__init__(root, graph, class_name)
+
+    def bind(self, ckpt_def, globals):
+        """Bind the functions needed to correctly execute ``GraphModule.forward()``
+
+        We need to bind checkpoint functions to ``ColoGraphModule`` so that we could
+        correctly execute ``GraphModule.forward()``
+
+        Args:
+            ckpt_def (List[str]): definition before the forward function
+            globals (Dict[str, Any]): global variables
+        """
+        # NOTE(review): every global whose name contains "checkpoint" or "pack" is bound as a method below
+        ckpt_code = "\n".join(ckpt_def)
+        globals_copy = globals.copy()
+        _exec_with_source(ckpt_code, globals_copy)
+        func_list = [func for func in globals_copy.keys() if "checkpoint" in func or "pack" in func]
+        for func in func_list:
+            tmp_func = globals_copy[func]
+            setattr(self, func, tmp_func.__get__(self, self.__class__))
+            del globals_copy[func]
+
+    def recompile(self) -> PythonCode:
+        """
+        Recompile this GraphModule from its ``graph`` attribute. This should be
+        called after editing the contained ``graph``, otherwise the generated
+        code of this ``GraphModule`` will be out of date.
+        """
+        if isinstance(self._graph._codegen, _PyTreeCodeGen):
+            self._in_spec = self._graph._codegen.pytree_info.in_spec
+            self._out_spec = self._graph._codegen.pytree_info.out_spec
+        python_code = self._graph.python_code(root_module='self')
+        self._code = python_code.src
+
+        # To split ckpt functions code and forward code: everything before the first "def forward" line
+        _code_list = self._code.split("\n")
+        _fwd_def = [item for item in _code_list if "def forward" in item][0]
+        _fwd_idx = _code_list.index(_fwd_def)
+        ckpt_def = _code_list[:_fwd_idx]
+        self._code = "\n".join(_code_list[_fwd_idx:])
+
+        self.bind(ckpt_def, python_code.globals)
+
+        cls = type(self)
+        cls.forward = _forward_from_src(self._code, python_code.globals)
+
+        # Determine whether this class explicitly defines a __call__ implementation
+        # to wrap. If it does, save it in order to have wrapped_call invoke it.
+        # If it does not, wrapped_call can use a dynamic call to super() instead.
+        # In most cases, super().__call__ should be torch.nn.Module.__call__.
+        # We do not want to hold a reference to Module.__call__ here; doing so will
+        # bypass patching of torch.nn.Module.__call__ done while symbolic tracing.
+        cls_call = cls.__call__ if "__call__" in vars(cls) else None
+
+        if '_wrapped_call' not in vars(cls):
+            cls._wrapped_call = _WrappedCall(cls, cls_call)    # type: ignore[attr-defined]
+
+        def call_wrapped(self, *args, **kwargs):
+            return self._wrapped_call(self, *args, **kwargs)
+
+        cls.__call__ = call_wrapped
+
+        # reset self._code to original src, otherwise to_folder will be wrong
+        self._code = python_code.src
+        return python_code
+
+    def to_folder(self, folder: Union[str, os.PathLike], module_name: str = "FxModule"):
+        """Dumps out module to ``folder`` with ``module_name`` so that it can be
+        imported with ``from <folder> import <module_name>``
+
+        Args:
+
+            folder (Union[str, os.PathLike]): The folder to write the code out to
+
+            module_name (str): Top-level name to use for the ``Module`` while
+                writing out the code
+        """
+        folder = Path(folder)
+        Path(folder).mkdir(exist_ok=True)
+        torch.save(self.state_dict(), folder / 'state_dict.pt')
+        tab = " " * 4
+
+        # we add import colossalai here
+        model_str = f"""
+import torch
+from torch.nn import *
+import colossalai
+
+
+class {module_name}(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+"""
+
+        def _gen_model_repr(module_name: str, module: torch.nn.Module) -> Optional[str]:
+            safe_reprs = [nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d]
+            if type(module) in safe_reprs:
+                return f"{module.__repr__()}"
+            else:
+                return None
+
+        # children whose type is not in ``safe_reprs`` are saved to ``<name>.pt`` and loaded with torch.load
+        blobified_modules = []
+        for module_name, module in self.named_children():
+            module_str = _gen_model_repr(module_name, module)
+            if module_str is None:
+                module_file = folder / f'{module_name}.pt'
+                torch.save(module, module_file)
+                blobified_modules.append(module_name)
+                module_repr = module.__repr__().replace('\r', ' ').replace('\n', ' ')
+                module_str = f"torch.load(r'{module_file}') # {module_repr}"
+            model_str += f"{tab*2}self.{module_name} = {module_str}\n"
+
+        for buffer_name, buffer in self._buffers.items():
+            if buffer is None:
+                continue
+            model_str += f"{tab*2}self.register_buffer('{buffer_name}', torch.empty({list(buffer.shape)}, dtype={buffer.dtype}))\n"
+
+        for param_name, param in self._parameters.items():
+            if param is None:
+                continue
+            model_str += f"{tab*2}self.{param_name} = torch.nn.Parameter(torch.empty({list(param.shape)}, dtype={param.dtype}))\n"
+
+        model_str += f"{tab*2}self.load_state_dict(torch.load(r'{folder}/state_dict.pt'))\n"
+        model_str += f"{_addindent(self.code, 4)}\n"
+
+        module_file = folder / 'module.py'
+        module_file.write_text(model_str)
+
+        init_file = folder / '__init__.py'
+        init_file.write_text('from .module import *')
+
+        if len(blobified_modules) > 0:
+            warnings.warn("Was not able to save the following children modules as reprs -"
+                          f"saved as pickled files instead: {blobified_modules}")
diff --git a/colossalai/_analyzer/fx/node_util.py b/colossalai/_analyzer/fx/node_util.py
new file mode 100644
index 000000000000..d06fa8b93fc6
--- /dev/null
+++ b/colossalai/_analyzer/fx/node_util.py
@@ -0,0 +1,211 @@
+from dataclasses import dataclass, field
+from typing import Callable, ClassVar, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch.autograd.profiler_util import _format_memory, _format_time
+from torch.fx import Graph, GraphModule, Node
+
+from colossalai._analyzer.envs import MeshConfig
+
+
+def intersect(a, b):
+    """Return the entries of ``a`` whose keys also appear in ``b``.
+
+    Values always come from ``a``; ``b`` is only used for membership tests.
+    """
+    common = {}
+    for key in a:
+        if key in b:
+            common[key] = a[key]
+    return common
+
+
+def subtract(a, b):
+    """Return the entries of ``a`` whose keys do not appear in ``b``."""
+    remaining = {}
+    for key in a:
+        if key in b:
+            continue
+        remaining[key] = a[key]
+    return remaining
+
+
+def union(a, b):
+    """Merge two dicts into a new one; on key clashes the value from ``b`` wins."""
+    merged = dict(a)
+    merged.update(b)
+    return merged
+
+
+def compute_size_in_bytes(elem: Union[torch.Tensor, Dict, List, Tuple, int]) -> int:
+    """Compute the size of a tensor or a collection of tensors in bytes.
+
+    Args:
+        elem (Union[torch.Tensor, Dict, List, Tuple, int]): Arbitrary nested ``torch.Tensor``
+            data structure. Dict values and the items of tuples, lists and sets are visited
+            recursively; anything else (e.g. an ``int``) contributes 0 bytes.
+
+    Returns:
+        int: The size of the tensor or the collection of tensors in bytes.
+    """
+    # NOTE: the annotation uses ``typing.Union`` rather than ``X | Y`` because annotations
+    # are evaluated when the function is defined and ``type | type`` fails before Python 3.10.
+    if isinstance(elem, torch.Tensor):
+        # the per-element size is read from an empty tensor of the same dtype
+        if elem.is_quantized:
+            probe = torch._empty_affine_quantized([], dtype=elem.dtype)
+        else:
+            probe = torch.tensor([], dtype=elem.dtype)
+        return elem.numel() * probe.element_size()
+    if isinstance(elem, dict):
+        return compute_size_in_bytes(list(elem.values()))
+    if isinstance(elem, (tuple, list, set)):
+        return sum(compute_size_in_bytes(e) for e in elem)
+    return 0
+
+
+@dataclass
+class MetaInfo:
+    r"""
+    The base class to store all profiling and static graph analysis information
+    needed for auto-parallel system in Colossal-AI.
+    ============================================================================
+                            -------------------------------
+                            |          FX.Node            |    <-----
+    [input/param] are  ---> |[input/param]      [grad_inp]|    [grad_inp] contributes to the
+    placeholders (might be  |     | \__________     |     |    profiled peak memory in backward
+    saved for backward.     |     |            \    |     |    pass. [grad_param] is calculated
+                            |     |             \   |     |    separately.
+                            | [interm] -------> [grad_int]|    <-----
+                            |     |  \_________     |     |    [grad_interm] marks the peak
+                            |    / \           \    |     |    memory in backward pass.
+    [x] is not counted ---> | [x]  [interm] --> [grad_int]|    <-----
+    in [interm] because     |          |  \_____    |     |
+    it is not saved for     |          |        \   |     |
+    backward.               |      [output]      \  |     |    <----- [output] is potentially
+                            -------------------------------    [input] for the next node.
+    ============================================================================
+
+    Accumulate Size = ALL_PREVIOUS_CTX U {Interm Size + Output Size}
+    Output Size = ([output] in global_ctx and not is_alias)
+    Temp Size = ([output] not in global_ctx and not is_alias)
+    Backward Size = ([grad_inp])
+
+    Usage:
+        >>> for node in graph.nodes:
+        >>>     n_info = MetaInfo(node)     # will create a new MetaInfo instance and store in node.meta['info']
+        >>>                                 # if not exist, otherwise return the existing one
+        >>>     n_info.to_recompute = ...   # set the to_recompute attribute
+
+    Remarks:
+        This feature is experimental and all the entries are subject to change.
+    """
+
+    # reference
+    node: Node
+
+    # directory
+    mod_dir: str = ''
+
+    # ctx[data_ptr] = Tensor
+    # mark the storage for ctx.save_for_backward
+    global_ctx: Dict[int, torch.Tensor] = field(default_factory=lambda: {})    # globally shared
+    curr_ctx: Dict[int, torch.Tensor] = field(default_factory=lambda: {})    # global_ctx till this node
+
+    # should be updated after each graph manipulation
+    # ============================== Update ====================================
+    # parameter and buffer within ``Node``
+    parameters: Dict[str, torch.nn.Parameter] = field(default_factory=lambda: {})
+    buffers: Dict[str, torch.Tensor] = field(default_factory=lambda: {})
+
+    inputs: Tuple[torch.Tensor] = ()
+    outputs: Tuple[torch.Tensor] = ()
+    is_alias: Tuple[bool] = ()    # whether the output is an alias of input
+
+    # compute cost
+    fwd_flop: Optional[int] = 0
+    bwd_flop: Optional[int] = 0
+
+    # communication cost (should be the size in bytes of communication)
+    fwd_comm: Optional[int] = 0
+    bwd_comm: Optional[int] = 0
+
+    # should keep the same whenever manipulated
+    # ============================= Invariant ==================================
+    to_recompute: Tuple[torch.Tensor] = ()    # (region_0, region_1, ...) support nested codegen
+    to_offload: Optional[bool] = False
+    sharding_spec: str = 'RR'
+
+    def __new__(cls, node: Node, **kwargs):
+        """Return the ``MetaInfo`` already stored in ``node.meta['info']``, or allocate a new one."""
+        orig_init = cls.__init__
+
+        # if initialized, return the existing one
+        # should disable the __init__ function
+        if node.meta.get('info', None) is not None:
+
+            # Python still calls ``cls.__init__`` on the object returned from ``__new__``;
+            # ``_dummy`` stands in for that single call and then restores the real ``__init__``.
+            def _dummy(self, *args, **kwargs):
+                # NOTE(review): ``_is_init`` is not assigned anywhere else in this class, so this
+                # branch looks unreachable and the stored instance keeps its fields -- confirm.
+                if getattr(self, '_is_init', False):
+                    self._is_init = True
+                    orig_init(self, *args, **kwargs)
+                cls.__init__ = orig_init
+
+            cls.__init__ = _dummy
+            return node.meta['info']
+        return super().__new__(cls)
+
+    def __post_init__(self):
+        # register this instance on the node so later ``MetaInfo(node)`` calls find it
+        self.node.meta['info'] = self
+
+    def _output_ctx(self) -> Dict[int, torch.Tensor]:
+        """Map ``data_ptr`` to tensor for every output that is a non-alias, non-parameter tensor."""
+        return {
+            o.data_ptr(): o
+            for o, is_alias in zip(self.outputs, self.is_alias)
+            if not is_alias and isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter)
+        }
+
+    @property
+    def fwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH):
+        # accessed as a property, so ``tflops``/``bandwidth`` always take their defaults
+        return self.fwd_flop / tflops + self.fwd_comm / bandwidth
+
+    @property
+    def bwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH):
+        # accessed as a property, so ``tflops``/``bandwidth`` always take their defaults
+        return self.bwd_flop / tflops + self.bwd_comm / bandwidth
+
+    @property
+    def param_size(self):
+        """Size in bytes of ``self.parameters``."""
+        return compute_size_in_bytes(self.parameters)
+
+    @property
+    def buffer_size(self):
+        """Size in bytes of ``self.buffers``."""
+        return compute_size_in_bytes(self.buffers)
+
+    @property
+    def output_size(self):
+        """Used in CheckpointSolver"""
+        # entries of ``global_ctx`` whose key matches a non-alias output
+        return compute_size_in_bytes(intersect(self.global_ctx, self._output_ctx()))
+
+    @property
+    def accumulate_size(self):
+        """Used in CheckpointSolver"""
+        # ``curr_ctx`` merged with the entries of ``global_ctx`` that match a non-alias output
+        return compute_size_in_bytes(union(self.curr_ctx, intersect(self.global_ctx, self._output_ctx())))
+
+    @property
+    def temp_size(self):
+        """Used in CheckpointSolver"""
+        # non-alias outputs whose key is absent from ``global_ctx``
+        return compute_size_in_bytes(subtract(self._output_ctx(), self.global_ctx))
+
+    @property
+    def backward_size(self):
+        """Used in CheckpointSolver"""
+        return compute_size_in_bytes(self.inputs)
+
+    def __repr__(self):
+        s = f'Node {self.node.name}'
+        if self.parameters:
+            s += f'\n\thas parameter of size {_format_memory(self.param_size)}'
+        if self.buffers:
+            s += f'\n\thas buffer of size {_format_memory(self.buffer_size)}'
+        if self.output_size:
+            s += f'\n\thas output activation of size {_format_memory(self.output_size)}'
+        # ``total_size`` is not defined on this class; ``accumulate_size`` is reported instead
+        if self.accumulate_size:
+            s += f'\n\thas total activation of size {_format_memory(self.accumulate_size)}'
+        if self.temp_size:
+            s += f'\n\thas temp activation of size {_format_memory(self.temp_size)}'
+        if self.backward_size:
+            s += f'\n\thas backward activation of size {_format_memory(self.backward_size)}'
+        s += f'\n\tfwd_flop = {self.fwd_flop}'\
+            f'\n\tbwd_flop = {self.bwd_flop}'\
+            f'\n\tfwd_comm = {self.fwd_comm}'\
+            f'\n\tbwd_comm = {self.bwd_comm}'\
+            f'\n\tto_recompute = {self.to_recompute}'\
+            f'\n\tto_offload = {self.to_offload}'\
+            f'\n\tsharding_spec = {self.sharding_spec}'
+        return s
diff --git a/colossalai/_analyzer/fx/passes/__init__.py b/colossalai/_analyzer/fx/passes/__init__.py
new file mode 100644
index 000000000000..ae02d90a236c
--- /dev/null
+++ b/colossalai/_analyzer/fx/passes/__init__.py
@@ -0,0 +1,2 @@
+from .graph_profile import graph_profile_pass
+from .shape_prop import ShapeProp, shape_prop_pass, sim_env
diff --git a/colossalai/_analyzer/fx/passes/graph_profile.py b/colossalai/_analyzer/fx/passes/graph_profile.py
new file mode 100644
index 000000000000..c3e760b31e96
--- /dev/null
+++ b/colossalai/_analyzer/fx/passes/graph_profile.py
@@ -0,0 +1,347 @@
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import torch
+import torch.fx
+from torch.autograd.profiler_util import _format_memory, _format_time
+from torch.fx import GraphModule
+from torch.fx.node import Argument, Node, Target
+
+from colossalai._analyzer._subclasses import flop_count
+from colossalai._analyzer.fx.node_util import MetaInfo
+
+
+def _format_flops(flops: float) -> str:
+    """Render a FLOP count with the largest unit it strictly exceeds (T/G/M/k), two decimals."""
+    units = (
+        (1e12, 'TFLOPs'),
+        (1e9, 'GFLOPs'),
+        (1e6, 'MFLOPs'),
+        (1e3, 'kFLOPs'),
+    )
+    for scale, suffix in units:
+        if flops > scale:
+            return f'{flops / scale:.2f} {suffix}'
+    # values at or below 1e3 are printed unscaled
+    return f'{flops} FLOPs'
+
+
+def _denormalize_tuple(t: Tuple[int, ...]) -> Tuple[int, ...]:
+    """Unwrap a one-element tuple to its sole item; return any other tuple unchanged."""
+    if len(t) == 1:
+        return t[0]
+    return t
+
+
+def _normalize_tuple(x):
+    """Return ``x`` itself if it is a tuple, otherwise wrap it in a one-element tuple."""
+    return x if isinstance(x, tuple) else (x,)
+
+
+def _current_device(module):
+    """Return the device of the first parameter of ``module``.
+
+    A module without parameters would make ``next`` raise ``StopIteration``;
+    in that case fall back to ``cuda:0`` when CUDA is available, else ``cpu``.
+    """
+    try:
+        return next(module.parameters()).device
+    except StopIteration:
+        return torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+
+
+class GraphProfiler(torch.fx.Interpreter):
+    """
+    Fetch shape argument from ``ShapeProp`` without re-executing
+    the ``GraphModule`` from scratch.
+
+    The execution environment is pre-populated from ``MetaInfo(node).outputs``
+    (see ``fetch_initial_env``), so ``run`` calls ``run_node`` only for its
+    side effects and reads the final result back from ``self.env``.
+    """
+    # node ops that ``FlopProfiler.run_node`` dispatches to a same-named method
+    _profileable = [
+        'call_function',
+        'call_module',
+        'call_method',
+    ]
+
+    def __init__(self, module: GraphModule, garbage_collect_values: bool = True):
+        super().__init__(module, garbage_collect_values)
+
+    def run(self, *args, initial_env: Optional[Dict[Node, Any]] = None, enable_io_processing: bool = True) -> Any:
+        """
+        Run `module` via interpretation and return the result.
+
+        Args:
+            *args: The arguments to the Module to run, in positional order
+            initial_env (Optional[Dict[Node, Any]]): An optional starting environment for execution.
+                This is a dict mapping `Node` to any value. This can be used, for example, to
+                pre-populate results for certain `Nodes` so as to do only partial evaluation within
+                the interpreter.
+            enable_io_processing (bool): If true, we process the inputs and outputs with graph's process_inputs and
+                process_outputs function first before using them.
+
+        Returns:
+            Any: The value returned from executing the Module
+        """
+        self.env = initial_env if initial_env else {}
+
+        # Positional function args are consumed left-to-right by
+        # `placeholder` nodes. Use an iterator to keep track of
+        # position and extract those values.
+        if enable_io_processing:
+            args = self.module.graph.process_inputs(*args)
+        self.args_iter: Iterator[Any] = iter(args)
+
+        for node in self.module.graph.nodes:
+
+            # The return value is discarded on purpose: ``self.env`` is expected
+            # to hold every node's value already (supplied through ``initial_env``).
+            self.run_node(node)    # No need to store.
+
+            if self.garbage_collect_values:
+                for to_delete in self.user_to_last_uses.get(node, []):
+                    del self.env[to_delete]
+
+            if node.op == 'output':
+                output_val = self.env[node]
+                return self.module.graph.process_outputs(output_val) if enable_io_processing else output_val
+
+    def fetch_initial_env(self, device=None) -> Dict[Node, Any]:
+        """
+        Fetch ``initial_env`` for execution. This is because ``ShapeProp``
+        has already attached outputs of each ``Node`` to its ``MetaInfo``.
+
+        Args:
+            device (torch.device): Accepted for interface symmetry; currently unused here.
+
+        Returns:
+            Dict[Node, Any]: The initial environment for execution
+        """
+        initial_env = {}
+        for n in self.module.graph.nodes:
+            initial_env[n] = _denormalize_tuple(MetaInfo(n).outputs)
+        return initial_env
+
+    def propagate(self, *args, device=None):
+        """
+        Run `module` via interpretation and profile the execution
+        of each ``Node``.
+
+        Args:
+            *args (Tensor): The sample input, not used
+            device (torch.device): Forwarded to ``fetch_initial_env``, default to ``None``
+
+        Returns:
+            Any: The value returned from executing the Module
+        """
+        initial_env = self.fetch_initial_env(device)
+
+        return self.run(initial_env=initial_env)
+
+    def summary(self) -> str:
+        """
+        Summarizes the profiled statistics of the `GraphModule` in
+        tabular format. Note that this API requires the ``tabulate`` module
+        to be installed.
+
+        Returns:
+            str: The summary of the profiled statistics
+
+        Raises:
+            ImportError: If ``tabulate`` is not installed.
+        """
+        # https://github.com/pytorch/pytorch/blob/master/torch/fx/graph.py
+        try:
+            from tabulate import tabulate
+        except ImportError:
+            print("`summary` relies on the library `tabulate`, "
+                  "which could not be found on this machine. Run `pip "
+                  "install tabulate` to install the library.")
+            # Without ``tabulate`` the table cannot be rendered; re-raise instead of
+            # falling through to a ``NameError`` at the final ``tabulate(...)`` call.
+            raise
+
+        # Build up a list of summary information for each node
+        node_summaries: List[List[Any]] = []
+        last_n_info = None
+
+        for node in self.module.graph.nodes:
+            node: Node
+            n_info = MetaInfo(node)
+            # for the first node the "previous" info is the node itself, so its increment is 0
+            last_n_info = last_n_info or n_info
+            node_summaries.append([
+                node.op,
+                str(node),
+                _format_memory(n_info.accumulate_size),
+                _format_memory(n_info.accumulate_size - last_n_info.accumulate_size),
+                _format_memory(n_info.output_size),
+                _format_memory(n_info.temp_size),
+                _format_memory(n_info.param_size),
+                _format_memory(n_info.backward_size),
+                _format_flops(n_info.fwd_flop),
+                _format_flops(n_info.bwd_flop),
+            ])
+            last_n_info = n_info
+
+        # Use the ``tabulate`` library to create a well-formatted table
+        # presenting our summary information
+        headers: List[str] = [
+            'Op type',
+            'Op',
+            'Accumulate size',
+            'Incremental size',
+            'Output size',
+            'Temp size',
+            'Param size',
+            'Backward size',
+            'Fwd FLOPs',
+            'Bwd FLOPs',
+        ]
+
+        return tabulate(node_summaries, headers=headers, stralign='right')
+
+
+class CommunicationProfiler(GraphProfiler):
+    """
+    Placeholder for a communication-cost profiler; constructing it always
+    raises ``NotImplementedError``.
+
+    TODO(lyl): Add this for all comm nodes
+    """
+
+    def __init__(self, module: GraphModule, garbage_collect_values: bool = True):
+        raise NotImplementedError
+
+
+class FlopProfiler(GraphProfiler):
+    """
+    Walk an FX graph Node-by-Node and store the forward/backward FLOP counts
+    of each profileable node in its ``MetaInfo``.
+
+    Usage:
+        >>> model = MyModule()
+        >>> x = torch.rand(10, 10)
+        >>> gm = colossalai.fx.symbolic_trace(model, meta_args = {'x': x})
+        >>> shape_interp = ShapeProp(gm)    # must do this first
+        >>> shape_interp.propagate(x)
+        >>> profiler = FlopProfiler(gm)
+        >>> profiler.propagate(x)
+
+    Args:
+        module (GraphModule): The module to be executed
+
+    Hints:
+        If you want to add a new flop count rule, you can first
+        check the existing files in ``../_subclasses/flop_tensor.py``.
+        If your flop count rules are incompatible with the existing
+        ones, you can register your own with the
+        ``@register_flop_count_impl`` decorator. The registered callable
+        receives (*args, **kwargs) and returns the flop count for both
+        forward and backward.
+
+        For example, to add a flop count rule for ``my_fn``, a
+        hand-written operand not detected by PyTorch:
+
+        >>> @register_flop_count_impl(my_fn)
+        >>> def my_fn_flop_count_impl(*args, **kwargs):
+        >>>     return 0, 0
+    """
+    # custom flop-count implementations, keyed by ``call_function`` target
+    _custom_flop_count_impl = {}
+
+    def run_node(self, n: torch.fx.Node) -> Any:
+        """
+        Count the FLOPs of node ``n`` and record them in its ``MetaInfo``.
+        Only ``call_function``, ``call_method`` and ``call_module`` nodes are profiled.
+
+        Args:
+            n (Node): The Node to profile
+
+        Returns:
+            Any: The output of the node, as already stored in its ``MetaInfo``
+
+        Raises:
+            RuntimeError: If profiling the node fails.
+        """
+        args, kwargs = self.fetch_args_kwargs_from_env(n)
+        info = MetaInfo(n)
+
+        if n.op in self._profileable:
+            try:
+                # dispatch to the method named after the op (call_function / call_method / call_module)
+                fwd_flop, bwd_flop = getattr(self, n.op)(n.target, args, kwargs)
+                info.fwd_flop = fwd_flop
+                info.bwd_flop = bwd_flop
+            except Exception as e:
+                raise RuntimeError(
+                    f'Error {str(e)} occurred when profiling node {n}, node.target = {n.target}. '
+                    f'Please refer to function\'s docstring to register the relevant profile_impl for this node!'
+                ) from e
+
+        # retain the autograd graph
+        for p in self.module.parameters():
+            p.grad = None
+
+        return _denormalize_tuple(info.outputs)
+
+    def call_function(self, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
+        """
+        Return the profiling result of a ``call_function`` node.
+        A target registered in ``_custom_flop_count_impl`` is handled by its
+        registered implementation; everything else goes through ``flop_count``.
+
+        Args:
+            target (Target): The call target for this node. See
+                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return
+            flop_count (Tuple[int]): (fwd_flop, bwd_flop)
+        """
+        assert not isinstance(target, str)
+
+        # default path: no user-defined rule for this target
+        if target not in self._custom_flop_count_impl:
+            return flop_count(target, *args, **kwargs)
+        custom_impl = self._custom_flop_count_impl[target]
+        return custom_impl(*args, **kwargs)
+
+    def call_method(self, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
+        """
+        Return the profiling result of a ``call_method`` node.
+
+        Args:
+            target (Target): The call target for this node. See
+                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return
+            flop_count (Tuple[int]): (fwd_flop, bwd_flop)
+        """
+        assert isinstance(target, str)
+        # the method name is resolved on ``torch.Tensor``
+        tensor_method = getattr(torch.Tensor, target)
+        return flop_count(tensor_method, *args, **kwargs)
+
+    def call_module(self, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
+        """
+        Return the profiling result of a ``call_module`` node.
+
+        Args:
+            target (Target): The call target for this node. See
+                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return
+            flop_count (Tuple[int]): (fwd_flop, bwd_flop)
+        """
+        assert isinstance(target, str)
+        # look the submodule up by its qualified name, then count its flops
+        submodule = self.fetch_attr(target)
+        return flop_count(submodule, *args, **kwargs)
+
+
+def graph_profile_pass(module: GraphModule, *args, verbose=False) -> GraphModule:
+    """
+    Profile every ``Node`` of ``module`` by running each registered
+    profiler class over it.
+
+    Args:
+        module (GraphModule): The GraphModule to profile
+        *args (Any): The sample input, not used
+        verbose (bool): Whether to print the profiling summary
+
+    Returns:
+        GraphModule: The same GraphModule with profiling information
+    """
+    # TODO: add communication profiling (CommunicationProfiler)
+    profiler_classes = (FlopProfiler,)
+    for profiler_cls in profiler_classes:
+        profiler = profiler_cls(module)
+        device = _current_device(module)
+        profiler.propagate(*args, device=device)
+
+    # the summary is taken from the last profiler that ran
+    if verbose:
+        report = profiler.summary()
+        print(report)
+    return module
diff --git a/colossalai/_analyzer/fx/passes/shape_prop.py b/colossalai/_analyzer/fx/passes/shape_prop.py
new file mode 100644
index 000000000000..3691497ed8cd
--- /dev/null
+++ b/colossalai/_analyzer/fx/passes/shape_prop.py
@@ -0,0 +1,194 @@
+"""``torch.fx.ShapeProp``, but with ``MetaTensor``"""
+
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+import torch.fx
+from torch.autograd.graph import saved_tensors_hooks
+from torch.utils._pytree import tree_map
+
+from colossalai._analyzer._subclasses import MetaTensor, MetaTensorMode
+from colossalai._analyzer.fx.node_util import MetaInfo
+from colossalai.fx._compatibility import compatibility
+
+Target = Union[Callable[..., Any], str]
+
+
+class sim_env(saved_tensors_hooks):
+    """
+    A simulation of memory allocation and deallocation in the forward pass
+    using ``saved_tensor_hooks``.
+
+    Args:
+        module (Optional[torch.nn.Module]): Module whose parameters and buffers
+            are excluded from tracking. With ``None`` nothing is excluded.
+
+    Attributes:
+        ctx (Dict[int, torch.Tensor]): A dictionary that maps the
+            data pointer of a tensor to the tensor itself. This is used
+            to track the memory allocation and deallocation.
+
+        param_ctx (Dict[int, torch.Tensor]): A dictionary that maps the
+            data pointer of all model parameters to the parameter itself.
+            This avoids overestimating the memory usage of the intermediate activations.
+
+        buffer_ctx (Dict[int, torch.Tensor]): Same as ``param_ctx``, for the
+            module's buffers.
+    """
+
+    def __init__(self, module: Optional[torch.nn.Module] = None):
+        super().__init__(self.pack_hook, self.unpack_hook)
+        self.ctx = {}
+        # ``module`` is optional, so both lookup tables must tolerate ``None``
+        has_module = module is not None
+        self.param_ctx = {param.data_ptr(): param for param in module.parameters()} if has_module else {}
+        self.buffer_ctx = {buffer.data_ptr(): buffer for buffer in module.buffers()} if has_module else {}
+
+    def pack_hook(self, tensor: torch.Tensor):
+        """Record ``tensor`` in ``ctx`` unless its storage belongs to a parameter or buffer."""
+        ptr = tensor.data_ptr()
+        if ptr not in self.param_ctx and ptr not in self.buffer_ctx:
+            self.ctx[ptr] = tensor
+        return tensor
+
+    def unpack_hook(self, tensor):
+        """Return the packed value unchanged."""
+        return tensor
+
+
+def _normalize_tuple(x):
+    """Wrap ``x`` in a one-element tuple unless it already is a tuple."""
+    if isinstance(x, tuple):
+        return x
+    return (x,)
+
+
+def _current_device(module):
+    """Return the device of the first parameter of ``module``.
+
+    A module without parameters would make ``next`` raise ``StopIteration``;
+    in that case fall back to ``cuda:0`` when CUDA is available, else ``cpu``.
+    """
+    try:
+        return next(module.parameters()).device
+    except StopIteration:
+        return torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+
+
+@compatibility(is_backward_compatible=False)
+class ShapeProp(torch.fx.Interpreter):
+    """
+    Execute an FX graph Node-by-Node and record the meta data of the result
+    into the corresponding node.
+
+    Usage:
+        >>> model = MyModule()
+        >>> x = torch.rand(10, 10)
+        >>> gm = colossalai.fx.symbolic_trace(model, meta_args = {'x': x})
+        >>> interp = ShapeProp(gm)
+        >>> interp.propagate(x)
+
+    Args:
+        module (GraphModule): The module to be executed
+
+    Hints:
+        If you want to add a new shape propagation rule, you can do so by
+        adding a new method to this class with the ``@register_shape_impl``
+        decorator. The method should take (*args, **kwargs) instance as its
+        input and generate output.
+
+        For example, if you want to add a shape propagation rule for
+        ``torch.nn.functional.linear``, you can do so by adding a new method
+        to this class with the ``@register_shape_impl`` decorator (Since the
+        ``MetaTensorMode`` is compatible with ``torch.nn.functional.linear``,
+        in practice you don't have to do as follows):
+
+        >>> @register_shape_impl(torch.nn.functional.linear)
+        >>> def linear_shape_impl(*args, **kwargs):
+        >>>     # do something here
+        >>>     return torch.empty(output_shape, device=output_device)
+    """
+    # custom implementations keyed by ``call_function`` target (see ``call_function``)
+    _custom_dispatch_func = {}
+    # one ``MetaTensorMode`` instance shared at class level; entered in ``propagate``
+    _mode = MetaTensorMode()
+
+    def __init__(self, module: torch.fx.GraphModule, garbage_collect_values: bool = True):
+        super().__init__(module, garbage_collect_values)
+        # saved-tensor hooks whose ``ctx`` collects the tensors packed while nodes execute
+        self.global_hook = sim_env(module=self.module)
+
+    def run_node(self, n: torch.fx.Node) -> Any:
+        """
+        Run a specific node ``n`` and return the result. Attach
+        (
+            ``inputs``, ``outputs``, ``parameters``, ``buffers``
+        ) to ``n``.
+
+        Besides those, ``global_ctx``, ``curr_ctx`` and ``is_alias`` of the
+        node's ``MetaInfo`` are filled in, and ``n._meta_data`` is set.
+
+        Args:
+            n (Node): The ``Node`` to execute
+
+        Returns:
+            Any: The result of executing ``n``
+        """
+        args, kwargs = self.fetch_args_kwargs_from_env(n)
+        # execute under the hooks so tensors packed during this node land in ``global_hook.ctx``
+        with self.global_hook:
+            r = getattr(self, n.op)(n.target, args, kwargs)
+
+        # helpers: strip the ``MetaTensor`` wrapper / select MetaTensors that are not parameters
+        unwrap_fn = lambda elem: elem._tensor if isinstance(elem, MetaTensor) else elem
+        is_pure_tensor = lambda elem: isinstance(elem, MetaTensor) and not isinstance(elem, torch.nn.Parameter)
+        n_info = MetaInfo(n)
+        n_info.outputs = _normalize_tuple(r)
+
+        if n.op == 'call_module':
+            # a module node owns the named parameters and buffers of its submodule
+            submod = self.fetch_attr(n.target)
+            n_info.parameters.update({k: MetaTensor(v) for k, v in submod.named_parameters()})
+            n_info.buffers.update({k: MetaTensor(v) for k, v in submod.named_buffers()})
+
+        else:
+            # other ops: collect ``nn.Parameter`` values passed in directly, keyed by the
+            # producing node's name (positional) or by the keyword name
+            n_info.parameters.update({
+                k.name: MetaTensor(v)
+                for k, v in zip(n.args, args)
+                if isinstance(k, torch.fx.Node) and isinstance(v, torch.nn.Parameter)
+            })
+            n_info.parameters.update({k: MetaTensor(v) for k, v in kwargs.items() if isinstance(v, torch.nn.Parameter)})
+
+        # inputs: top-level positional and keyword values that pass ``is_pure_tensor``
+        n_info.inputs = tuple(v for v in args if is_pure_tensor(v)) + \
+                        tuple(v for v in kwargs.values() if is_pure_tensor(v))
+
+        n._meta_data = tree_map(unwrap_fn, _normalize_tuple(r))    # align with SPMD
+
+        # ``global_ctx`` is the hook's live dict (shared by reference across nodes);
+        # ``curr_ctx`` is a copy taken right after this node ran
+        n_info.global_ctx = self.global_hook.ctx
+        n_info.curr_ctx = self.global_hook.ctx.copy()
+
+        # per output: True when it is a tensor whose ``data_ptr`` is currently a key of the hook's ctx
+        crit = lambda x: x.data_ptr() in self.global_hook.ctx if isinstance(x, torch.Tensor) else False
+        n_info.is_alias = _normalize_tuple(tree_map(crit, n_info.outputs))
+        return r
+
+    def call_function(self, target: 'Target', args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> Any:
+        """
+        Execute a ``call_function`` node and return the result.
+        If the target of ``Node`` is registered with ``@register_shape_impl``,
+        the registered function will be used to execute the node. This is common
+        if we insert some customized kernels.
+
+        Args:
+            target (Target): The call target for this node. See
+                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return
+            Any: The value returned by the function invocation
+        """
+        if target in self._custom_dispatch_func:
+            return self._custom_dispatch_func[target](*args, **kwargs)
+        else:
+            return super().call_function(target, args, kwargs)
+
+    def propagate(self, *args, device=None):
+        """
+        Run `module` via interpretation and return the result and record the
+        shape of each node.
+        Args:
+            *args (Tensor): The sample input.
+            device (torch.device): Passed to ``MetaTensor`` when wrapping each input,
+                default to ``None``
+        Returns:
+            Any: The value returned from executing the Module
+        """
+        # every leaf of ``args`` is wrapped as a ``MetaTensor`` before interpretation
+        wrap_fn = lambda elem: MetaTensor(elem, device=device)
+        with self._mode:
+            return super().run(*tree_map(wrap_fn, args))
+
+
+def shape_prop_pass(module: torch.fx.GraphModule, *args) -> torch.fx.GraphModule:
+    """
+    Propagate shapes through ``module`` with ``ShapeProp`` so that every
+    ``Node`` carries its shape information afterwards.
+
+    Args:
+        module (GraphModule): The GraphModule to profile
+        *args (Any): The sample input
+
+    Returns:
+        GraphModule: The same GraphModule with shape information
+    """
+    interpreter = ShapeProp(module)
+    # run on the device the module's parameters live on
+    interpreter.propagate(*args, device=_current_device(module))
+    return module
diff --git a/colossalai/_analyzer/fx/symbolic_profile.py b/colossalai/_analyzer/fx/symbolic_profile.py
new file mode 100644
index 000000000000..dd7f22c6c98a
--- /dev/null
+++ b/colossalai/_analyzer/fx/symbolic_profile.py
@@ -0,0 +1,40 @@
+import torch
+import torch.fx
+from torch.fx import GraphModule
+
+from .passes import ShapeProp, graph_profile_pass, shape_prop_pass
+from .passes.graph_profile import FlopProfiler
+
+
+def register_flop_count_impl(func):
+    """Decorator factory: store the decorated callable in
+    ``FlopProfiler._custom_flop_count_impl`` under the key ``func``.
+
+    The decorated callable is returned unchanged.
+    """
+
+    def _register(impl):
+        registry = FlopProfiler._custom_flop_count_impl
+        registry[func] = impl
+        return impl
+
+    return _register
+
+
+def register_shape_impl(func):
+    """Decorator factory: store the decorated callable in
+    ``ShapeProp._custom_dispatch_func`` under the key ``func``.
+
+    The decorated callable is returned unchanged.
+    """
+
+    def _register(impl):
+        registry = ShapeProp._custom_dispatch_func
+        registry[func] = impl
+        return impl
+
+    return _register
+
+
+def symbolic_profile(module: GraphModule, *args, verbose=False) -> GraphModule:
+    """Symbolically profile a model with sample inputs.
+
+    Runs ``shape_prop_pass`` first and feeds its result to ``graph_profile_pass``.
+
+    Args:
+        module (GraphModule): The module to be profiled
+        args (Tuple): The sample inputs
+        verbose (bool): Whether to print the profiling result
+
+    Returns:
+        GraphModule: The profiled module
+    """
+    shaped = shape_prop_pass(module, *args)
+    return graph_profile_pass(shaped, *args, verbose=verbose)
diff --git a/colossalai/_analyzer/fx/symbolic_trace.py b/colossalai/_analyzer/fx/symbolic_trace.py
new file mode 100644
index 000000000000..5d858c87a3c8
--- /dev/null
+++ b/colossalai/_analyzer/fx/symbolic_trace.py
@@ -0,0 +1,620 @@
+import functools
+import inspect
+import operator
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union
+
+import torch
+import torch.nn as nn
+from torch.fx import Graph, Node, Proxy, Tracer
+from torch.fx.graph import _Namespace
+from torch.utils._pytree import tree_map
+
+from colossalai._analyzer._subclasses import MetaTensor, _TensorPropertyMethod, _TorchFactoryMethod
+
+from .codegen import ActivationCheckpointCodeGen
+from .graph_module import ColoGraphModule
+from .node_util import MetaInfo
+
+Target = Union[Callable[..., Any], str]
+Argument = Optional[Union[Tuple[Any, ...],    # actually Argument, but mypy can't represent recursive types
+                          List[Any],    # actually Argument
+                          Dict[str, Any],    # actually Argument
+                          slice,    # Slice[Argument, Argument, Argument], but slice is not a templated type in typing
+                          'Node',]]
+zeros = torch.zeros
+
+
+def _truncate_suffix(s: str):
+    import re
+
+    # FIXME: don't know why but torch.fx always gets a suffix like '_1' in the name
+    return re.sub(r'_\d+$', '', s)
+
+
+def _default_device():
+    return torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+
+
+def _current_device(module):    # device of the first parameter of `module`, else the default device
+    try:
+        return next(module.parameters()).device
+    except Exception:    # no parameters (StopIteration) or no `.parameters` at all; don't swallow Ctrl-C
+        return _default_device()
+
+
+def register_tracer_impl(func: Callable[..., Any], name: Optional[str] = '_custom_impl'):
+
+    def wrapper(impl):
+        assert hasattr(ColoTracer, name), f"Cannot register {func.__name__} in ColoTracer.{name}"
+        getattr(ColoTracer, name)[func] = impl
+        return impl
+
+    return wrapper
+
+
+def register_leaf_module_impl(module: nn.Module):
+
+    def wrapper(impl):
+        ColoTracer._custom_leaf_module_impl[module] = impl
+        return impl
+
+    return wrapper
+
+
+def register_leaf_module(module: nn.Module):
+    ColoTracer._custom_leaf_module.add(module)
+
+
+def register_non_leaf_module(module: nn.Module):
+    ColoTracer._custom_non_leaf_module.add(module)
+
+
+class ColoProxy(Proxy):
+    _func_dispatch: Dict[Target, Callable[..., Any]] = {}
+
+    def __init__(self, *args, data=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._meta_data = data
+
+    @property
+    def meta_data(self):
+        return self._meta_data
+
+    @meta_data.setter
+    def meta_data(self, args):
+        wrap_fn = lambda x: MetaTensor(x) if isinstance(x, torch.Tensor) else x
+        self._meta_data = tree_map(wrap_fn, args)
+
+    @classmethod
+    def __torch_function__(cls, orig_method, types, args=(), kwargs=None):
+        kwargs = {} if kwargs is None else kwargs
+        if orig_method in cls._func_dispatch:
+            impl = cls._func_dispatch.pop(orig_method)    # avoid recursion
+            proxy = impl(*args, **kwargs)
+            cls._func_dispatch[orig_method] = impl
+            return proxy
+        else:
+            proxy = cls.from_torch_proxy(super().__torch_function__(orig_method, types, args, kwargs))
+            unwrap_fn = lambda p: p.meta_data if isinstance(p, ColoProxy) else p
+            if proxy.meta_data is None:
+                proxy.meta_data = orig_method(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
+            return proxy
+
+    @classmethod
+    def from_torch_proxy(cls, proxy: Proxy):
+        return cls(proxy.node, proxy.tracer)
+
+    def __repr__(self):
+        return f"ColoProxy({self.node.name}, meta_data={self.meta_data})"
+
+    def __len__(self):
+        return len(self.meta_data)
+
+    def __int__(self):
+        return int(self.meta_data)
+
+    def __index__(self):
+        try:
+            return int(self.meta_data)
+        except:
+            return zeros(self.meta_data.shape, dtype=torch.bool).numpy().__index__()
+
+    def __float__(self):
+        return float(self.meta_data)
+
+    def __bool__(self):
+        return self.meta_data
+
+    def __getattr__(self, k):
+        return ColoAttribute(self, k, getattr(self._meta_data, k, None))
+
+    def __setitem__(self, key, value):
+        proxy = self.tracer.create_proxy('call_function', operator.setitem, (self, key, value), {})
+        proxy.meta_data = self._meta_data
+        return proxy
+
+    def __contains__(self, key):
+        if self.node.op == "placeholder":
+            # this is used to handle like
+            # if x in kwargs
+            # we don't handle this case for now
+            return False
+        return super().__contains__(key)
+
+    def __isinstancecheck__(self, type):
+        return isinstance(self.meta_data, type)
+
+    def size(self, dim=None):
+        if self._meta_data is not None:    # meta data is known: answer concretely, like dim()/shape below
+            return self._meta_data.size(*[dim] if dim is not None else [])    # dim=0 is a valid index
+        return self.tracer.create_proxy('call_method', 'size', (self, dim) if dim is not None else (self,), {})
+
+    def dim(self):
+        if self._meta_data is not None:
+            return self._meta_data.dim()
+        return self.tracer.create_proxy('call_method', 'dim', (self,), {})
+
+    @property
+    def shape(self):
+        if self._meta_data is not None:
+            return self._meta_data.shape
+        return self.tracer.create_proxy('call_function', getattr, (self, 'shape'), {})
+
+    @property
+    def ndim(self):
+        if self._meta_data is not None:
+            return self._meta_data.ndim
+        return self.tracer.create_proxy('call_function', getattr, (self, 'ndim'), {})
+
+    @property
+    def device(self):
+        if self._meta_data is not None:
+            return self._meta_data.device
+        return self.tracer.create_proxy('call_function', getattr, (self, 'device'), {})
+
+    @property
+    def dtype(self):
+        if self._meta_data is not None:
+            return self._meta_data.dtype
+        return self.tracer.create_proxy('call_function', getattr, (self, 'dtype'), {})
+
+    def to(self, *args, **kwargs):
+        return self.tracer.create_proxy('call_method', 'to', (self, *args), {**kwargs})
+
+    def cpu(self, *args, **kwargs):
+        return self.tracer.create_proxy('call_method', 'cpu', (self, *args), {**kwargs})
+
+    def cuda(self, *args, **kwargs):
+        return self.tracer.create_proxy('call_method', 'cuda', (self, *args), {**kwargs})
+
+
+class ColoAttribute(ColoProxy):
+
+    def __init__(self, root, attr: str, data=None):
+        self.root = root
+        self.attr = attr
+        self.tracer = root.tracer
+        self._meta_data = data
+        self._node: Optional[Node] = None
+
+    @property
+    def node(self):
+        # the node for attributes is added lazily, since most will just be method calls
+        # which do not rely on the getitem call
+        if self._node is None:
+            self._node = self.tracer.create_proxy('call_function', getattr, (self.root, self.attr), {}).node
+        return self._node
+
+    def __call__(self, *args, **kwargs):
+        return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs)
+
+    def __repr__(self):
+        return f"ColoAttribute({self.node.name}, attr={self.attr})"
+
+
+class ColoTracer(Tracer):
+    _custom_leaf_module: Set[Type[nn.Module]] = set()
+    _custom_leaf_module_impl: Dict[Type[nn.Module], Callable[..., Any]] = {}
+    _custom_non_leaf_module: Set[Type[nn.Module]] = set()
+    _custom_impl: Dict[Callable[..., Any], Callable[..., Any]] = {}
+    _bias_addition_impl: Dict[Callable[..., Any], Callable[..., Any]] = {}
+    _bias_addition_module = [
+        torch.nn.Linear,
+        torch.nn.Conv1d,
+        torch.nn.Conv2d,
+        torch.nn.Conv3d,
+        torch.nn.ConvTranspose1d,
+        torch.nn.ConvTranspose2d,
+        torch.nn.ConvTranspose3d,
+    ]
+
+    def __init__(self, trace_act_ckpt: bool = False, bias_addition_split: bool = False, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.disable_module_getattr = False
+        self.proxy_buffer_attributes = True
+
+        # whether the tracer will record the usage of torch.utils.checkpoint
+        self.trace_act_ckpt = trace_act_ckpt
+        self.ckpt_regions = []
+        self.ckpt_idx = 0
+
+        self.mod_dir = ''
+
+        # whether the tracer should split the bias_add ops into two ops
+        self.bias_addition_split = bias_addition_split
+
+    def is_leaf_module(self, m: nn.Module, module_qualified_name: str) -> bool:
+        # if bias-addition split is enabled, and module has bias, then it is not a leaf module
+        # we will enter the module and split the bias-addition ops
+        if self.bias_addition_split and type(m) in self._bias_addition_module and m.bias is not None:
+            return False
+
+        # registered non-leaf types always win; otherwise registered leaf types or torch.fx's own rule decide
+        return (type(m) not in self._custom_non_leaf_module
+                and (type(m) in self._custom_leaf_module or super().is_leaf_module(m, module_qualified_name)))
+
+    def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args: Tuple[Any, ...],
+                    kwargs: Dict[str, Any]) -> Any:
+        curr_dir = self.mod_dir
+        self.mod_dir = 'self.' + self.path_of_module(m)
+        rst = super().call_module(m, forward, args, kwargs)
+        self.mod_dir = curr_dir
+        return rst
+
+    def proxy(self, node: Node) -> 'ColoProxy':
+        return ColoProxy(node, self)
+
+    def create_proxy(self,
+                     kind: str,
+                     target: Target,
+                     args: Tuple[Any, ...],
+                     kwargs: Dict[str, Any],
+                     name: Optional[str] = None,
+                     type_expr: Optional[Any] = None,
+                     proxy_factory_fn: Callable[[Node], 'Proxy'] = None):
+
+        proxy: ColoProxy = super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
+        unwrap_fn = lambda p: p.meta_data if isinstance(p, ColoProxy) else p
+        if kind == 'placeholder':
+            proxy.meta_data = self.meta_args[target] if target in self.meta_args else self.concrete_args.get(
+                _truncate_suffix(target), None)
+        elif kind == 'get_attr':
+            self.disable_module_getattr = True
+            try:
+                attr_itr = self.root
+                atoms = target.split(".")
+                for atom in atoms:
+                    attr_itr = getattr(attr_itr, atom)
+                proxy.meta_data = attr_itr
+            finally:
+                self.disable_module_getattr = False
+        elif kind == 'call_function':
+            proxy.meta_data = target(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
+        elif kind == 'call_method':
+            self.disable_module_getattr = True
+            try:
+                if target == '__call__':
+                    proxy.meta_data = unwrap_fn(args[0])(*tree_map(unwrap_fn, args[1:]), **tree_map(unwrap_fn, kwargs))
+                else:
+                    if target not in _TensorPropertyMethod:
+                        proxy._meta_data = getattr(unwrap_fn(args[0]), target)(*tree_map(unwrap_fn, args[1:]),
+                                                                               **tree_map(unwrap_fn, kwargs))
+            finally:
+                self.disable_module_getattr = False
+        elif kind == 'call_module':
+            mod = self.root.get_submodule(target)
+            self.disable_module_getattr = True
+            try:
+                proxy.meta_data = self._custom_leaf_module_impl.get(type(mod),
+                                                                    mod.forward)(*tree_map(unwrap_fn, args),
+                                                                                 **tree_map(unwrap_fn, kwargs))
+            finally:
+                self.disable_module_getattr = False
+        return proxy
+
+    def create_node(self, *args, **kwargs) -> Node:
+        node = super().create_node(*args, **kwargs)
+        n_info = MetaInfo(node, mod_dir=self.mod_dir, to_recompute=tuple(self.ckpt_regions))
+        return node
+
+    def trace(self,
+              root: torch.nn.Module,
+              concrete_args: Optional[Dict[str, torch.Tensor]] = None,
+              meta_args: Optional[Dict[str, torch.Tensor]] = None) -> Graph:
+        # Trace ``root.forward`` into a ``Graph``. ``meta_args`` become the meta data of placeholders;
+        # ``concrete_args`` (plus the defaults of all other parameters) are forwarded to ``Tracer.trace``.
+        # NOTE: work on a private copy of ``concrete_args``; it is extended below and must not leak
+        # into the caller's dict or a shared default, which would carry stale keys into the next trace.
+        concrete_args = dict(concrete_args) if concrete_args else {}
+        meta_args = meta_args if meta_args is not None else {}
+        sig = inspect.signature(root.forward)
+        sig_names = set(sig.parameters.keys())
+        meta_arg_names = set(meta_args.keys())
+        concrete_arg_names = set(concrete_args.keys())
+
+        # update concrete args with default values
+        for k, v in sig.parameters.items():
+            if k not in meta_arg_names and k not in concrete_args and v.default is not inspect.Parameter.empty:
+                concrete_args[k] = v.default
+
+        def _check_arg_name_valid(names: Iterable[str]):
+            for name in names:
+                if name not in sig_names:
+                    raise ValueError(f"Argument {name} is not in the signature of {root.__class__.__name__}.forward")
+
+        _check_arg_name_valid(meta_arg_names)
+        _check_arg_name_valid(concrete_arg_names)
+
+        self.concrete_args, self.meta_args = concrete_args, meta_args
+        with self._torch_factory_override(), self._tracer_override(), torch.no_grad():
+            self.mod_dir = 'self'
+            self.graph = super().trace(root, concrete_args=concrete_args)
+            self.mod_dir = ''
+        self.graph.lint()
+        return self.graph
+
+    @contextmanager
+    def _tracer_override(self):
+        # Temporarily patch torch.utils.checkpoint and ColoProxy's dispatch table for the duration of a trace.
+        if self.trace_act_ckpt:
+            orig_ckpt_func_apply = torch.utils.checkpoint.CheckpointFunction.apply
+            orig_ckpt_func_without_reentrant = torch.utils.checkpoint._checkpoint_without_reentrant
+
+            def checkpoint(run_function, preserve_rng_state=False, *args):
+                # record the region index while running the checkpointed function inline
+                self.ckpt_regions.append(self.ckpt_idx)
+                out = run_function(*args)
+                self.ckpt_idx = self.ckpt_regions.pop(-1) + 1
+                return out
+
+            # override the checkpoint function
+            torch.utils.checkpoint.CheckpointFunction.apply = checkpoint
+            torch.utils.checkpoint._checkpoint_without_reentrant = checkpoint
+
+        # override the custom functions, and the bias addition functions when splitting is enabled
+        ColoProxy._func_dispatch.update(self._custom_impl)
+        if self.bias_addition_split:
+            ColoProxy._func_dispatch.update(self._bias_addition_impl)
+
+        try:
+            yield
+        finally:
+            # always undo the patches, even when tracing raised, so global torch state is not left modified
+            if self.trace_act_ckpt:
+                torch.utils.checkpoint.CheckpointFunction.apply = orig_ckpt_func_apply
+                # restore the attribute that was actually overridden (was `_checkpoint_reentrant`)
+                torch.utils.checkpoint._checkpoint_without_reentrant = orig_ckpt_func_without_reentrant
+            ColoProxy._func_dispatch = {}
+
+    @contextmanager
+    def _torch_factory_override(self):
+        # override the torch factory functions to create a proxy when the method
+        # is called during ``symbolic_trace()``.
+        def wrap_factory_method(target):
+
+            @functools.wraps(target)
+            def wrapper(*args, **kwargs):
+                is_proxy = any(isinstance(p, ColoProxy) for p in args) or any(
+                    isinstance(p, ColoProxy) for p in kwargs.values())
+                if is_proxy:
+                    # if the arg is a proxy, then need to record this function called on this proxy
+                    # e.g. torch.ones(size) where size is an input proxy
+                    self.disable_module_getattr = True
+                    try:
+                        proxy = self.create_proxy('call_function', target, args, kwargs)
+                    finally:
+                        self.disable_module_getattr = False
+                    return proxy
+                else:
+                    return target(*args, **kwargs)
+
+            return wrapper, target
+
+        overrides = {
+            target: wrap_factory_method(getattr(torch, target))
+            for target in _TorchFactoryMethod
+            if callable(getattr(torch, target))
+        }
+        for name, (wrapper, orig) in overrides.items():
+            setattr(torch, name, wrapper)
+        try:
+            yield
+        finally:
+            # recover the torch factory functions upon exit, even if tracing raised
+            for name, (wrapper, orig) in overrides.items():
+                setattr(torch, name, orig)
+
+    def _post_check(self, non_concrete_arg_names: Set[str]):
+        # This is necessary because concrete args are added as input to the traced module since
+        # https://github.com/pytorch/pytorch/pull/55888.
+        for node in self.graph.nodes:
+            if node.op == "placeholder":
+                # Removing default values for inputs as the forward pass will fail with them.
+                if node.target in non_concrete_arg_names:
+                    node.args = ()
+                    # Without this, torch.jit.script fails because the inputs type is Optional[torch.Tensor].
+                    # It cannot infer on the attributes and methods the input should have, and fails.
+                    node.type = torch.Tensor
+                # It is a concrete arg so it is not used and should be removed.
+                else:
+                    if hasattr(torch.fx._symbolic_trace, "_assert_is_none"):
+                        # Newer versions of torch.fx emit an assert statement
+                        # for concrete arguments; delete those before we delete
+                        # the concrete arg.
+                        to_delete = []
+                        for user in node.users:
+                            if user.target == torch.fx._symbolic_trace._assert_is_none:
+                                to_delete.append(user)
+                        for user in to_delete:
+                            self.graph.erase_node(user)
+
+                    self.graph.erase_node(node)
+
+            if node.op == "output":
+                node.type = None
+            self.graph.lint()
+     
+    def getattr(self, attr, attr_val, parameter_proxy_cache):
+        return self._module_getattr(attr, attr_val, parameter_proxy_cache)
+
+    def _module_getattr(self, attr, attr_val, parameter_proxy_cache):
+        if getattr(self, "disable_module_getattr", False):    # set while meta data is being computed
+            return attr_val
+
+        def maybe_get_proxy_for_attr(attr_val, collection_to_search, parameter_proxy_cache):
+            for n, p in collection_to_search:
+                if attr_val is p:
+                    if n not in parameter_proxy_cache:
+                        kwargs = {}
+                        if 'proxy_factory_fn' in inspect.signature(self.create_proxy).parameters:
+                            kwargs['proxy_factory_fn'] = (None if not self.param_shapes_constant else
+                                                          lambda node: ColoProxy(node, self))    # (node, tracer)
+                        val_proxy = self.create_proxy('get_attr', n, (), {}, **kwargs)    # type: ignore[arg-type]
+                        parameter_proxy_cache[n] = val_proxy
+                    return parameter_proxy_cache[n]
+            return None
+
+        if self.proxy_buffer_attributes and isinstance(attr_val, torch.Tensor):
+            maybe_buffer_proxy = maybe_get_proxy_for_attr(attr_val, self.root.named_buffers(), parameter_proxy_cache)
+            if maybe_buffer_proxy is not None:
+                return maybe_buffer_proxy
+
+        if isinstance(attr_val, torch.nn.Parameter):
+            maybe_parameter_proxy = maybe_get_proxy_for_attr(attr_val, self.root.named_parameters(),
+                                                             parameter_proxy_cache)
+            if maybe_parameter_proxy is not None:
+                return maybe_parameter_proxy
+
+        return attr_val    # not a tracked parameter/buffer of root: hand back the raw value
+
+
+def symbolic_trace(
+    root: Union[torch.nn.Module, Callable[..., Any]],
+    concrete_args: Optional[Dict[str, Any]] = {},
+    meta_args: Optional[Dict[str, Any]] = {},
+    trace_act_ckpt: bool = False,
+    bias_addition_split: bool = False,
+) -> ColoGraphModule:
+    """
+    Traces a ``torch.nn.Module`` or a function and returns a ``GraphModule`` with ``Node``s and ``MetaInfo``
+    attached to the ``Node``s.
+
+    Can be used to trace the usage of ``torch.utils.checkpoint`` and the path of module
+    (https://github.com/pytorch/examples/blob/main/fx/module_tracer.py).
+
+    This tracer is able to trace basic control flow and for loops.
+
+    It will split the bias addition into two parts if ``bias_addition_split`` is set to be ``True``.
+    (See ./bias_addition.py for more details).
+
+    Examples:
+    1. Tracing a ``torch.nn.Module`` with control flow.
+
+    .. code-block:: python
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2)
+
+            def forward(self, x):
+                if x.size(0) > 1:
+                    x = x.sum(dim=0)
+                return self.linear(x)
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)})
+
+        # traced code like:
+        # def forward(self, x):
+        #     linear_1 = self.linear(x)
+        #     return linear_1
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(2, 2, 2)})
+
+        # traced code like:
+        # def forward(self, x):
+        #     sum = x.sum(dim=0); x = None
+        #     linear = self.linear(sum); sum = None
+        #     return linear
+
+    2. Tracing a ``torch.nn.Module`` with ``torch.utils.checkpoint``.
+
+    .. code-block:: python
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2)
+
+            def forward(self, x):
+                def custom_forward(x):
+                    return self.linear(x)
+                return torch.utils.checkpoint.checkpoint(custom_forward, x)
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)}, trace_act_ckpt=True)
+
+        # traced code like:
+        # def checkpoint_0(self, x):
+        #     linear = self.linear(x); x = None
+        #     return linear
+        #
+        # def forward(self, x):
+        #     linear = torch.utils.checkpoint.checkpoint(checkpoint_0, x); x = None
+        #     return linear
+
+    3. Tracing a ``torch.nn.Module`` with ``bias_addition_split``.
+
+    .. code-block:: python
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2, bias=True)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)}, bias_addition_split=True)
+
+        # traced code like:
+        # def forward(self, x):
+        #     linear_bias = self.linear.bias
+        #     linear_weight = self.linear.weight
+        #     linear = torch._C._nn.linear(x, linear_weight);  x = linear_weight = None
+        #     add = linear + linear_bias;  linear = linear_bias = None
+        #     return add
+
+    Args:
+        root (Union[torch.nn.Module, Callable[..., Any]]): The ``torch.nn.Module`` or function to be traced.
+        concrete_args (Optional[Dict[str, Any]], optional): Concrete arguments to be passed to the ``root``.
+            Defaults to {}.
+        meta_args (Optional[Dict[str, Any]], optional): Meta arguments to be passed to the ``root``. Mostly used
+            for tracing control flow. Defaults to {}.
+        trace_act_ckpt (bool, optional): Whether to trace the usage of ``torch.utils.checkpoint``.
+            Defaults to False.
+        bias_addition_split (bool, optional): Whether to split the bias addition into two parts. Defaults to False.
+
+    Returns:
+        ColoGraphModule: A traced ``GraphModule`` that is ready for activation checkpoint ``CodeGen``.
+
+    Remarks:
+        This part of ``symbolic_trace()`` is maintained by Colossal-AI team. If you encountered
+        any unexpected error during tracing, feel free to raise an issue on Colossal-AI GitHub
+        repo. We welcome any feedback and contributions to enhance the extensibility of
+        Colossal-AI.
+    """
+    if meta_args:
+        device, orig_device = _default_device(), _current_device(root)
+        wrap_fn = lambda elem: MetaTensor(elem, device=device) if isinstance(elem, torch.Tensor) else elem
+        graph = ColoTracer(trace_act_ckpt=trace_act_ckpt,
+                           bias_addition_split=bias_addition_split).trace(root.to(device),
+                                                                          concrete_args=concrete_args,
+                                                                          meta_args=tree_map(wrap_fn, meta_args))
+        if trace_act_ckpt:
+            graph.set_codegen(ActivationCheckpointCodeGen())
+        root.to(orig_device)
+    else:
+        graph = Tracer().trace(root, concrete_args=concrete_args)
+    name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
+    return ColoGraphModule(root, graph, name)
diff --git a/colossalai/fx/passes/concrete_info_prop.py b/colossalai/fx/passes/concrete_info_prop.py
index ab38e8cb14e9..81ac64205528 100644
--- a/colossalai/fx/passes/concrete_info_prop.py
+++ b/colossalai/fx/passes/concrete_info_prop.py
@@ -226,7 +226,7 @@ def propagate(self, *args):
         Returns:
             Any: The value returned from executing the Module
         """
-        return super().run(*args)
+        return self.run(*args)
 
     def summary(self, unit: str = 'MB') -> str:
         """
diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py
index c2394a13c697..2b4a8749cfd7 100644
--- a/colossalai/fx/passes/meta_info_prop.py
+++ b/colossalai/fx/passes/meta_info_prop.py
@@ -288,13 +288,16 @@ def mem_repr(mem: int) -> str:
         def flops_repr(flop: int) -> str:
             return f"{flop:,} FLOPs"
 
+        accumulate_size = 0
         for node in self.module.graph.nodes:
             node: Node
+            accumulate_size += calculate_fwd_out(node) + calculate_fwd_tmp(node)
             node_summaries.append([
                 node.op,
                 str(node),
                 flops_repr(node.meta['fwd_flop']),
                 flops_repr(node.meta['bwd_flop']),
+                mem_repr(accumulate_size),
                 mem_repr(calculate_fwd_in(node)),
                 mem_repr(calculate_fwd_out(node)),
                 mem_repr(calculate_fwd_tmp(node)),
@@ -309,6 +312,7 @@ def flops_repr(flop: int) -> str:
             'Op',
             'Forward FLOPs',
             'Backward FLOPs',
+            'Accumulated Memory',
             'FWD_IN',
             'FWD_OUT',
             'FWD_TMP',
diff --git a/colossalai/fx/profiler/opcount.py b/colossalai/fx/profiler/opcount.py
index e302c842126f..407a6bed5200 100644
--- a/colossalai/fx/profiler/opcount.py
+++ b/colossalai/fx/profiler/opcount.py
@@ -347,6 +347,7 @@ def zero_flop_jit(*args):
         aten.squeeze.dim,
         aten.slice.Tensor,
         aten.slice_backward.default,
+        aten.stack.default,
         aten.split.Tensor,
         aten.permute.default,
         aten.t.default,
diff --git a/tests/test_analyzer/test_fx/__init__.py b/tests/test_analyzer/test_fx/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/test_analyzer/test_fx/test_bias_addition.py b/tests/test_analyzer/test_fx/test_bias_addition.py
new file mode 100644
index 000000000000..5c9ec7cc3477
--- /dev/null
+++ b/tests/test_analyzer/test_fx/test_bias_addition.py
@@ -0,0 +1,113 @@
+import pytest
+import torch
+from torch.utils.checkpoint import checkpoint
+
+try:
+    from colossalai._analyzer.fx import symbolic_trace
+except:
+    pass
+
+
+class LinearModel(torch.nn.Module):
+
+    def __init__(self, in_features, out_features, bias):
+        super().__init__()
+        self.linear = torch.nn.Linear(in_features, out_features, bias=bias)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x
+
+
+class ConvModel(torch.nn.Module):
+
+    def __init__(self, in_channel, out_channels, kernel_size, bias) -> None:
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channel,
+                                    out_channels,
+                                    kernel_size,
+                                    bias=bias,
+                                    padding=1,
+                                    stride=2,
+                                    dilation=2,
+                                    groups=3)
+        self.conv_transpose = torch.nn.ConvTranspose2d(in_channel,
+                                                       out_channels,
+                                                       kernel_size,
+                                                       bias=bias,
+                                                       padding=1,
+                                                       stride=2,
+                                                       dilation=2,
+                                                       groups=3)
+
+    def forward(self, x, select=0):
+        if select == 0:
+            x = self.conv(x)
+        else:
+            x = self.conv_transpose(x)
+        return x
+
+
+class SiuModel(torch.nn.Module):
+
+    def __init__(self, bias) -> None:
+        super().__init__()
+        self.linear = LinearModel(3, 3, bias)
+        self.conv = ConvModel(3, 6, 3, bias)
+
+    def forward(self, x, select=0):
+        x = self.linear(x)
+        x = checkpoint(self.conv, x, select)
+        return x
+
+
+class AddmmModel(torch.nn.Module):
+
+    def __init__(self, alpha, beta) -> None:
+        super().__init__()
+        self.alpha = alpha
+        self.beta = beta
+
+    def forward(self, x):
+        x = torch.addmm(x, x, x, alpha=self.alpha, beta=self.beta)
+        return x
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("bias_addition_split", [True, False])
+@pytest.mark.parametrize("shape", [(3, 3, 3), (3, 3, 3, 3)])
+@pytest.mark.parametrize("select", [0, 1])
+def test_siu_model(bias, bias_addition_split, shape, select):
+    model = SiuModel(bias=bias)
+    x = torch.rand(shape)
+    gm = symbolic_trace(model,
+                        meta_args={'x': x},
+                        concrete_args={'select': select},
+                        trace_act_ckpt=True,
+                        bias_addition_split=bias_addition_split)
+    assert torch.allclose(model(x, select), gm(x, select)), 'original model and traced model should be the same!'
+    if bias and bias_addition_split:
+        assert '+' in gm.code, 'bias addition should be split!'
+    else:
+        assert '+' not in gm.code, 'bias addition should not be split!'
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize("alpha", [1, 2])
+@pytest.mark.parametrize("beta", [1, 2])
+@pytest.mark.parametrize("bias_addition_split", [True, False])
+@pytest.mark.parametrize("shape", [(3, 3), (5, 5)])
+def test_addmm_model(alpha, beta, bias_addition_split, shape):
+    model = AddmmModel(alpha=alpha, beta=beta)
+    x = torch.rand(shape)
+    gm = symbolic_trace(model, meta_args={'x': x}, trace_act_ckpt=True, bias_addition_split=bias_addition_split)
+    assert torch.allclose(model(x), gm(x)), 'original model and traced model should be the same!'
+    if (alpha == 1 and beta == 1) or not bias_addition_split:
+        assert '*' not in gm.code, 'bias addition should not be split!'
+    elif bias_addition_split:
+        assert '+' in gm.code, 'bias addition should be split!'
+
+
+if __name__ == '__main__':
+    test_siu_model(True, True, (3, 3, 3), 0)    # `select` is a required argument of the test
diff --git a/tests/test_analyzer/test_fx/test_mod_dir.py b/tests/test_analyzer/test_fx/test_mod_dir.py
new file mode 100644
index 000000000000..15e0c2ec21c7
--- /dev/null
+++ b/tests/test_analyzer/test_fx/test_mod_dir.py
@@ -0,0 +1,78 @@
+import pytest
+import torch
+
+try:
+    from colossalai._analyzer.fx import symbolic_trace
+except:
+    pass
+
+
+class LinearModel(torch.nn.Module):
+
+    def __init__(self, in_features, out_features, bias):
+        super().__init__()
+        self.linear = torch.nn.Linear(in_features, out_features, bias=bias)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x
+
+
+class ConvModel(torch.nn.Module):
+
+    def __init__(self, in_channel, out_channels, kernel_size, bias) -> None:
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channel,
+                                    out_channels,
+                                    kernel_size,
+                                    bias=bias,
+                                    padding=1,
+                                    stride=2,
+                                    dilation=2,
+                                    groups=3)
+        self.conv_transpose = torch.nn.ConvTranspose2d(out_channels,
+                                                       out_channels,
+                                                       kernel_size,
+                                                       bias=bias,
+                                                       padding=1,
+                                                       stride=2,
+                                                       dilation=2,
+                                                       groups=3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.conv_transpose(x)
+        return x
+
+
+class AModel(torch.nn.Module):
+
+    def __init__(self, bias) -> None:
+        super().__init__()
+        self.linear_1 = LinearModel(3, 3, bias)
+        self.linear_2 = LinearModel(3, 3, bias)
+        self.conv = ConvModel(3, 6, 3, bias)
+
+    def forward(self, x):
+        for i in range(x.shape[0]):
+            x = self.linear_1(x)
+            x = self.linear_2(x)
+        x = self.conv(x)
+        return x
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("bias_addition_split", [True, False])
+@pytest.mark.parametrize("shape", [(3, 3, 3), (3, 3, 3, 3)])
+def test_mod_dir(bias, bias_addition_split, shape):
+    model = AModel(bias=bias)
+    x = torch.rand(shape)
+    gm = symbolic_trace(model, meta_args={'x': x}, bias_addition_split=bias_addition_split)
+    for node in gm.graph.nodes:
+        assert len(node.meta['info'].mod_dir), f"{node} should have non-trivial ``mod_dir``."
+        print(node, node.meta['info'].mod_dir)
+
+
+if __name__ == '__main__':
+    test_mod_dir(True, True, (3, 3, 3))
diff --git a/tests/test_analyzer/test_fx/test_nested_ckpt.py b/tests/test_analyzer/test_fx/test_nested_ckpt.py
new file mode 100644
index 000000000000..c31aab6752f8
--- /dev/null
+++ b/tests/test_analyzer/test_fx/test_nested_ckpt.py
@@ -0,0 +1,55 @@
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+import pytest
+
+try:
+    from colossalai._analyzer.fx import symbolic_trace
+except:
+    pass
+
+
+class MyModule(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.a = nn.Linear(10, 10)
+        self.b = nn.Linear(10, 10)
+        self.c = nn.Linear(10, 10)
+        self.d = nn.Linear(10, 10)
+        self.e = nn.Linear(10, 10)
+
+    def checkpoint_0(self, x):
+        return checkpoint(self.checkpoint_0_0, x) + checkpoint(self.checkpoint_0_1, x) + self.e(x)
+
+    def checkpoint_0_0(self, x):
+        return checkpoint(self.checkpoint_0_0_0, x) + checkpoint(self.checkpoint_0_0_1, x)
+
+    def checkpoint_0_0_0(self, x):
+        return self.a(x) + checkpoint(self.checkpoint_0_0_0_0, x, use_reentrant=False)
+
+    def checkpoint_0_0_0_0(self, x):
+        return self.b(x)
+
+    def checkpoint_0_0_1(self, x):
+        return self.b(x) + self.c(x)
+
+    def checkpoint_0_1(self, x):
+        return self.d(x)
+
+    def forward(self, x):
+        return checkpoint(self.checkpoint_0, x)
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+def test_nested_ckpt():
+    model = MyModule()
+    x = torch.rand(10, 10)
+    gm = symbolic_trace(model, meta_args={'x': x}, trace_act_ckpt=True)
+    assert torch.allclose(gm(x), model(x)), "The traced model should generate the same output as the original model."
+    for ckpt_def in filter(lambda s: s.startswith('checkpoint'), dir(model)):
+        assert ckpt_def in gm.code, f"Checkpoint {ckpt_def} should be in the traced code.\n Traced code = {gm.code}"
+
+
+if __name__ == "__main__":
+    test_nested_ckpt()
diff --git a/tests/test_analyzer/test_fx/test_shape_prop.py b/tests/test_analyzer/test_fx/test_shape_prop.py
new file mode 100644
index 000000000000..b19884a70fb2
--- /dev/null
+++ b/tests/test_analyzer/test_fx/test_shape_prop.py
@@ -0,0 +1,63 @@
+import pytest
+import timm.models as tmm
+import torch
+import torchvision.models as tm
+from .zoo import tm_models, tmm_models
+
+try:
+    from colossalai._analyzer._subclasses import MetaTensorMode
+    from colossalai._analyzer.fx import symbolic_trace
+    from colossalai._analyzer.fx.passes.shape_prop import shape_prop_pass
+    from colossalai._analyzer.fx.symbolic_profile import register_shape_impl
+    
+    
+    @register_shape_impl(torch.nn.functional.linear)
+    def linear_impl(*args, **kwargs):
+        assert True
+        return torch.nn.functional.linear(*args, **kwargs)
+except:
+    pass
+
+
+def _check_gm_validity(gm: torch.fx.GraphModule):
+    for node in gm.graph.nodes:
+        assert node.meta['info'].outputs, f'In {gm.__class__.__name__}, {node} has no output shape.'
+        if node.op in [
+        # 'call_module',    # can apply to params
+        # 'call_function',  # can apply to params
+        # 'call_method',    # can apply to params
+        ]:
+            assert node.meta['info'].inputs, f'In {gm.__class__.__name__}, {node} has no input shape.'
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize('m', tm_models)
+def test_torchvision_shape_prop(m):
+    with MetaTensorMode():
+        model = m()
+        data = torch.rand(100, 3, 224, 224)
+    meta_args = {
+        "x": data,
+    }
+    gm = symbolic_trace(model, meta_args=meta_args)
+    shape_prop_pass(gm, data)
+    _check_gm_validity(gm)
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize('m', tmm_models)
+def test_timm_shape_prop(m):
+    with MetaTensorMode():
+        model = m()
+        data = torch.rand(100, 3, 224, 224)
+    meta_args = {
+        "x": data,
+    }
+    gm = symbolic_trace(model, meta_args=meta_args)
+    shape_prop_pass(gm, data)
+    _check_gm_validity(gm)
+
+
+if __name__ == "__main__":
+    test_torchvision_shape_prop(tm.resnet18)
+    test_timm_shape_prop(tmm.vgg11)
diff --git a/tests/test_analyzer/test_fx/test_symbolic_profile.py b/tests/test_analyzer/test_fx/test_symbolic_profile.py
new file mode 100644
index 000000000000..5f749e6f3c50
--- /dev/null
+++ b/tests/test_analyzer/test_fx/test_symbolic_profile.py
@@ -0,0 +1,49 @@
+import pytest
+import timm.models as tmm
+import torch
+import torchvision.models as tm
+from .zoo import tm_models, tmm_models
+
+try:
+    from colossalai._analyzer._subclasses import MetaTensorMode
+    from colossalai._analyzer.fx import symbolic_profile, symbolic_trace
+except:
+    pass
+
+
+def _check_gm_validity(gm: torch.fx.GraphModule):
+    for node in gm.graph.nodes:
+        assert len(node.meta['info'].global_ctx), f'In {gm.__class__.__name__}, {node} has empty global context.'
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize('m', tm_models)
+def test_torchvision_profile(m, verbose=False, bias_addition_split=False):
+    with MetaTensorMode():
+        model = m()
+        data = torch.rand(8, 3, 224, 224)
+    meta_args = {
+        "x": data,
+    }
+    gm = symbolic_trace(model, meta_args=meta_args, bias_addition_split=bias_addition_split)
+    symbolic_profile(gm, data, verbose=verbose)
+    _check_gm_validity(gm)
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize('m', tmm_models)
+def test_timm_profile(m, verbose=False, bias_addition_split=False):
+    with MetaTensorMode():
+        model = m()
+        data = torch.rand(8, 3, 224, 224)
+    meta_args = {
+        "x": data,
+    }
+    gm = symbolic_trace(model, meta_args=meta_args, bias_addition_split=bias_addition_split)
+    symbolic_profile(gm, data, verbose=verbose)
+    _check_gm_validity(gm)
+
+
+if __name__ == "__main__":
+    test_torchvision_profile(tm.vit_b_16, verbose=True, bias_addition_split=False)
+    test_timm_profile(tmm.gmlp_b16_224, verbose=True, bias_addition_split=False)
diff --git a/tests/test_analyzer/test_fx/zoo.py b/tests/test_analyzer/test_fx/zoo.py
new file mode 100644
index 000000000000..925078d0dcbe
--- /dev/null
+++ b/tests/test_analyzer/test_fx/zoo.py
@@ -0,0 +1,53 @@
+import timm.models as tmm
+import torchvision.models as tm
+
+# input shape: (batch_size, 3, 224, 224)
+tm_models = [
+    tm.alexnet,
+    tm.convnext_base,
+    tm.densenet121,
+    # tm.efficientnet_v2_s,
+    # tm.googlenet,   # output bad case
+    # tm.inception_v3,  # bad case
+    tm.mobilenet_v2,
+    tm.mobilenet_v3_small,
+    tm.mnasnet0_5,
+    tm.resnet18,
+    tm.regnet_x_16gf,
+    tm.resnext50_32x4d,
+    tm.shufflenet_v2_x0_5,
+    tm.squeezenet1_0,
+    # tm.swin_s,  # fx bad case
+    tm.vgg11,
+    tm.vit_b_16,
+    tm.wide_resnet50_2,
+]
+
+tmm_models = [
+    tmm.beit_base_patch16_224,
+    tmm.beitv2_base_patch16_224,
+    tmm.cait_s24_224,
+    tmm.coat_lite_mini,
+    tmm.convit_base,
+    tmm.deit3_base_patch16_224,
+    tmm.dm_nfnet_f0,
+    tmm.eca_nfnet_l0,
+    tmm.efficientformer_l1,
+    tmm.ese_vovnet19b_dw,
+    tmm.gmixer_12_224,
+    tmm.gmlp_b16_224,
+    tmm.hardcorenas_a,
+    tmm.hrnet_w18_small,
+    tmm.inception_v3,
+    tmm.mixer_b16_224,
+    tmm.nf_ecaresnet101,
+    tmm.nf_regnet_b0,
+    # tmm.pit_b_224,  # pretrained only
+    tmm.regnetv_040,
+    tmm.skresnet18,
+    # tmm.swin_base_patch4_window7_224,     # fx bad case
+    # tmm.tnt_b_patch16_224,    # bad case
+    tmm.vgg11,
+    tmm.vit_base_patch16_18x2_224,
+    tmm.wide_resnet50_2,
+]
diff --git a/tests/test_analyzer/test_subclasses/__init__.py b/tests/test_analyzer/test_subclasses/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/test_analyzer/test_subclasses/test_aten.py b/tests/test_analyzer/test_subclasses/test_aten.py
new file mode 100644
index 000000000000..591a8d617580
--- /dev/null
+++ b/tests/test_analyzer/test_subclasses/test_aten.py
@@ -0,0 +1,82 @@
+from typing import Any, Callable, Union
+import pytest
+
+import torch
+import torch.nn as nn
+
+try:
+    from colossalai._analyzer._subclasses import MetaTensor
+except:
+    pass
+
+aten = torch.ops.aten
+
+registered_meta = {
+    ('aten.convolution.default', True): [    # (aten ops, requires_backward)
+        (nn.Conv1d(in_channels=3, out_channels=4, kernel_size=2, padding=1, dilation=2), torch.rand(2, 3, 4)),
+        (nn.Conv2d(in_channels=3, out_channels=4, kernel_size=2, padding=1, dilation=2), torch.rand(2, 3, 4, 4)),
+        (nn.Conv3d(in_channels=3, out_channels=4, kernel_size=2, padding=1, dilation=2), torch.rand(2, 3, 4, 4, 4)),
+        (nn.ConvTranspose1d(in_channels=3, out_channels=4, kernel_size=2, padding=1, dilation=2), torch.rand(2, 3, 4)),
+        (nn.ConvTranspose2d(in_channels=3, out_channels=4, kernel_size=2, padding=1,
+                            dilation=2), torch.rand(2, 3, 4, 4)),
+        (nn.ConvTranspose3d(in_channels=3, out_channels=4, kernel_size=2, padding=1,
+                            dilation=2), torch.rand(2, 3, 4, 4, 4)),
+    ],
+    ('aten.native_batch_norm.default', True): [
+        (nn.BatchNorm1d(4), torch.rand(2, 4)),
+        (nn.BatchNorm2d(4), torch.rand(1, 4, 4, 4)),
+        (nn.BatchNorm3d(4), torch.rand(1, 4, 4, 4, 4)),
+    ],
+    ('aten.native_layer_norm.default', True): [(nn.LayerNorm(4), torch.rand(1, 2, 3, 4)),],
+    ('aten.avg_pool1d.default', True): [
+        (nn.MaxPool1d(3, stride=2), torch.rand(4, 5, 5)),
+        (nn.AvgPool1d(3, stride=2), torch.rand(4, 5, 5)),
+        (nn.AdaptiveMaxPool1d(3), torch.rand(4, 5, 5)),
+        (nn.AdaptiveAvgPool1d(3), torch.rand(4, 5, 5)),
+    ],
+    ('aten.avg_pool2d.default', True): [
+        (nn.MaxPool2d((3, 2), stride=(2, 1)), torch.rand(2, 4, 5, 5)),
+        (nn.AvgPool2d((3, 2), stride=(2, 1)), torch.rand(2, 4, 5, 5)),
+        (nn.AdaptiveMaxPool2d((3, 2)), torch.rand(2, 4, 5, 5)),
+        (nn.AdaptiveAvgPool2d((3, 2)), torch.rand(2, 4, 5, 5)),
+    ],
+    ('aten.relu.default', True): [
+        (nn.ReLU(), torch.rand(4, 3, 1, 2)),
+        (nn.LeakyReLU(), torch.rand(4, 3, 1, 2)),
+        (nn.SiLU(), torch.rand(4, 3, 1, 2)),
+        (nn.GELU(), torch.rand(4, 3, 1, 2)),
+        (nn.ELU(), torch.rand(4, 3, 1, 2)),
+        (nn.Sigmoid(), torch.rand(4, 3, 1, 2)),
+        (nn.Tanh(), torch.rand(4, 3, 1, 2)),
+        (nn.Hardswish(), torch.rand(4, 3, 1, 2)),
+    ]
+}
+
+
+def compare_all(tensor: torch.Tensor, meta_tensor: torch.Tensor) -> Any:
+    assert tensor.shape == meta_tensor.shape, f'the shape of tensor ({tensor.shape}) and meta tensor ({meta_tensor.shape}) does not match.'
+    assert tensor.dtype == meta_tensor.dtype, f'the dtype of tensor ({tensor.dtype}) and meta tensor ({meta_tensor.dtype}) does not match.'
+    assert tensor.stride() == meta_tensor.stride(
+    ), f'the stride of tensor ({tensor.stride()}) and meta tensor ({meta_tensor.stride()}) does not match.'
+
+
+def run_and_compare(f: Union[nn.Module, Callable], x: torch.Tensor, requires_backward=False) -> Any:
+    x.requires_grad = requires_backward
+    meta_x = MetaTensor(x)
+    x_out, meta_out = f(x), f(meta_x)
+    compare_all(x_out, meta_out)
+    if requires_backward:
+        x_out.sum().backward()
+        meta_out.sum().backward()
+        compare_all(x.grad, meta_x.grad)
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+def test_meta_aten():
+    for (aten_op, requires_backward), v in registered_meta.items():
+        for f, x in v:
+            run_and_compare(f, x, requires_backward)
+
+
+if __name__ == '__main__':
+    test_meta_aten()
diff --git a/tests/test_analyzer/test_subclasses/test_flop_tensor.py b/tests/test_analyzer/test_subclasses/test_flop_tensor.py
new file mode 100644
index 000000000000..551628103325
--- /dev/null
+++ b/tests/test_analyzer/test_subclasses/test_flop_tensor.py
@@ -0,0 +1,50 @@
+import pytest
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as tm
+from .zoo import tm_models, tmm_models
+
+try:
+    from colossalai._analyzer._subclasses import MetaTensorMode, flop_count
+except:
+    pass
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize('m', tm_models + tmm_models)
+def test_flop_count_module(m):
+    x = torch.rand(2, 3, 224, 224)
+    with MetaTensorMode():    # save time for testing
+        module = m()
+    rs_fwd, rs_bwd = flop_count(module, x, verbose=True)
+    assert rs_fwd > 0, f'fwd flop count of {m.__name__} is {rs_fwd}'
+    assert rs_bwd > 0, f'bwd flop count of {m.__name__} is {rs_bwd}'
+
+
+odd_cases = [
+    (F.relu, (torch.rand(2, 3, 224, 224, requires_grad=True),), {
+        'inplace': True
+    }),
+    (F.max_pool2d, (torch.rand(2, 3, 224, 224, requires_grad=True),), {
+        'kernel_size': 3,
+        'stride': 2,
+        'padding': 1,
+        'dilation': 2
+    }),
+    (torch.where, (torch.rand(2, 3, 224, 224) > 0.5, torch.rand(2, 3, 224, 224, requires_grad=True),
+                   torch.rand(2, 3, 224, 224, requires_grad=True)), {}),
+]
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize('func, args, kwargs', odd_cases)
+def test_flop_count_function(func, args, kwargs):
+    rs_fwd, rs_bwd = flop_count(func, *args, **kwargs, verbose=True)
+    assert rs_fwd > 0, f'fwd flop count of {func.__name__} is {rs_fwd}'
+    assert rs_bwd > 0, f'bwd flop count of {func.__name__} is {rs_bwd}'
+
+
+if __name__ == '__main__':
+    test_flop_count_module(tm.resnet18)    # takes only the model factory; the test builds its own input tensor
+    test_flop_count_function(F.relu, (torch.rand(2, 3, 224, 224, requires_grad=True),), {'inplace': True})
diff --git a/tests/test_analyzer/test_subclasses/test_meta_mode.py b/tests/test_analyzer/test_subclasses/test_meta_mode.py
new file mode 100644
index 000000000000..d8122b019619
--- /dev/null
+++ b/tests/test_analyzer/test_subclasses/test_meta_mode.py
@@ -0,0 +1,38 @@
+import pytest
+import torch
+import torch.distributed as dist
+import torchvision.models as tm
+try:
+    from colossalai._analyzer._subclasses import MetaTensor, MetaTensorMode
+except:
+    pass
+from .zoo import tm_models, tmm_models
+
+
+def compare_all(tensor: torch.Tensor, meta_tensor: torch.Tensor):
+    assert tensor.shape == meta_tensor.shape, f'the shape of tensor ({tensor.shape}) and meta tensor ({meta_tensor.shape}) does not match.'
+    assert tensor.dtype == meta_tensor.dtype, f'the dtype of tensor ({tensor.dtype}) and meta tensor ({meta_tensor.dtype}) does not match.'
+    assert tensor.stride() == meta_tensor.stride(
+    ), f'the stride of tensor ({tensor.stride()}) and meta tensor ({meta_tensor.stride()}) does not match.'
+
+
+def run_and_compare(model):
+    x = torch.rand(2, 3, 224, 224, requires_grad=True)
+    x_out = model(x)
+    with MetaTensorMode():
+        meta_x = torch.rand(2, 3, 224, 224, requires_grad=True)
+        meta_out = model(meta_x)
+    compare_all(x_out, meta_out)
+    x_out.sum().backward()
+    meta_out.sum().backward()
+    compare_all(x.grad, meta_x.grad)
+
+
+@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.parametrize('m', tm_models + tmm_models)
+def test_meta_mode_shape(m):
+    run_and_compare(m())
+
+
+if __name__ == '__main__':
+    test_meta_mode_shape(tm.resnet18)
diff --git a/tests/test_analyzer/test_subclasses/zoo.py b/tests/test_analyzer/test_subclasses/zoo.py
new file mode 100644
index 000000000000..925078d0dcbe
--- /dev/null
+++ b/tests/test_analyzer/test_subclasses/zoo.py
@@ -0,0 +1,53 @@
+import timm.models as tmm
+import torchvision.models as tm
+
+# input shape: (batch_size, 3, 224, 224)
+tm_models = [
+    tm.alexnet,
+    tm.convnext_base,
+    tm.densenet121,
+    # tm.efficientnet_v2_s,
+    # tm.googlenet,   # output bad case
+    # tm.inception_v3,  # bad case
+    tm.mobilenet_v2,
+    tm.mobilenet_v3_small,
+    tm.mnasnet0_5,
+    tm.resnet18,
+    tm.regnet_x_16gf,
+    tm.resnext50_32x4d,
+    tm.shufflenet_v2_x0_5,
+    tm.squeezenet1_0,
+    # tm.swin_s,  # fx bad case
+    tm.vgg11,
+    tm.vit_b_16,
+    tm.wide_resnet50_2,
+]
+
+tmm_models = [
+    tmm.beit_base_patch16_224,
+    tmm.beitv2_base_patch16_224,
+    tmm.cait_s24_224,
+    tmm.coat_lite_mini,
+    tmm.convit_base,
+    tmm.deit3_base_patch16_224,
+    tmm.dm_nfnet_f0,
+    tmm.eca_nfnet_l0,
+    tmm.efficientformer_l1,
+    tmm.ese_vovnet19b_dw,
+    tmm.gmixer_12_224,
+    tmm.gmlp_b16_224,
+    tmm.hardcorenas_a,
+    tmm.hrnet_w18_small,
+    tmm.inception_v3,
+    tmm.mixer_b16_224,
+    tmm.nf_ecaresnet101,
+    tmm.nf_regnet_b0,
+    # tmm.pit_b_224,  # pretrained only
+    tmm.regnetv_040,
+    tmm.skresnet18,
+    # tmm.swin_base_patch4_window7_224,     # fx bad case
+    # tmm.tnt_b_patch16_224,    # bad case
+    tmm.vgg11,
+    tmm.vit_base_patch16_18x2_224,
+    tmm.wide_resnet50_2,
+]

From 95a36eae637563e75484b73ccb6dc3feb56ce05b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 10 Mar 2023 14:27:09 +0800
Subject: [PATCH 450/503] [kernel] added kernel loader to softmax autograd
 function (#3093)

* [kernel] added kernel loader to softmax autograd function

* [release] v0.2.6
---
 colossalai/kernel/cuda_native/scaled_softmax.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/colossalai/kernel/cuda_native/scaled_softmax.py b/colossalai/kernel/cuda_native/scaled_softmax.py
index 05c6ee35b8ce..24e458bb3ea5 100644
--- a/colossalai/kernel/cuda_native/scaled_softmax.py
+++ b/colossalai/kernel/cuda_native/scaled_softmax.py
@@ -180,4 +180,9 @@ def forward_torch_softmax(self, input, mask):
         return probs
 
     def get_batch_per_block(self, sq, sk, b, np):
+        # build and load kernel if not pre-built
+        global scaled_masked_softmax
+        if scaled_masked_softmax is None:
+            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
+
         return scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)

From 02ae80bf9c94c303327aa4ca06cfdb19d8dae60c Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Fri, 10 Mar 2023 14:40:14 +0800
Subject: [PATCH 451/503] [chatgpt]add flag of action mask in critic(#3086)

---
 .../ChatGPT/chatgpt/models/base/actor.py       |  2 +-
 .../ChatGPT/chatgpt/models/base/critic.py      | 18 +++++++++++-------
 .../chatgpt/models/bloom/bloom_critic.py       |  5 +++--
 .../ChatGPT/chatgpt/models/gpt/gpt_critic.py   |  5 +++--
 .../ChatGPT/chatgpt/models/opt/opt_critic.py   |  5 +++--
 5 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/models/base/actor.py b/applications/ChatGPT/chatgpt/models/base/actor.py
index e2841dc68feb..57db2bb11a6a 100644
--- a/applications/ChatGPT/chatgpt/models/base/actor.py
+++ b/applications/ChatGPT/chatgpt/models/base/actor.py
@@ -37,7 +37,7 @@ def generate(
         if pad_token_id is not None:
             attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
         if not return_action_mask:
-            return sequences, attention_mask
+            return sequences, attention_mask, None
         input_len = input_ids.size(1)
         eos_token_id = kwargs.get('eos_token_id', None)
         if eos_token_id is None:
diff --git a/applications/ChatGPT/chatgpt/models/base/critic.py b/applications/ChatGPT/chatgpt/models/base/critic.py
index b12bddfcb2e5..e68a743a7762 100644
--- a/applications/ChatGPT/chatgpt/models/base/critic.py
+++ b/applications/ChatGPT/chatgpt/models/base/critic.py
@@ -18,15 +18,19 @@ class Critic(LoRAModule):
         lora_train_bias (str): LoRA bias training mode.
     """
 
-    def __init__(self,
-                 model: nn.Module,
-                 value_head: nn.Module,
-                 lora_rank: int = 0,
-                 lora_train_bias: str = 'none') -> None:
+    def __init__(
+        self,
+        model: nn.Module,
+        value_head: nn.Module,
+        lora_rank: int = 0,
+        lora_train_bias: str = 'none',
+        use_action_mask: bool = False,
+    ) -> None:
 
         super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
         self.model = model
         self.value_head = value_head
+        self.use_action_mask = use_action_mask
         self.convert_to_lora()
 
     def forward(self,
@@ -38,7 +42,7 @@ def forward(self,
 
         values = self.value_head(last_hidden_states).squeeze(-1)
 
-        if action_mask is not None:
+        if action_mask is not None and self.use_action_mask:
             num_actions = action_mask.size(1)
             prompt_mask = attention_mask[:, :-num_actions]
             values = values[:, :-num_actions]
@@ -46,5 +50,5 @@ def forward(self,
             return value
 
         values = values[:, :-1]
-        value = values.mean(dim=1).squeeze(1)
+        value = values.mean(dim=1)
         return value
diff --git a/applications/ChatGPT/chatgpt/models/bloom/bloom_critic.py b/applications/ChatGPT/chatgpt/models/bloom/bloom_critic.py
index 5a907309a674..a32fb2e102f9 100644
--- a/applications/ChatGPT/chatgpt/models/bloom/bloom_critic.py
+++ b/applications/ChatGPT/chatgpt/models/bloom/bloom_critic.py
@@ -24,7 +24,8 @@ def __init__(self,
                  config: Optional[BloomConfig] = None,
                  checkpoint: bool = False,
                  lora_rank: int = 0,
-                 lora_train_bias: str = 'none') -> None:
+                 lora_train_bias: str = 'none',
+                 **kwargs) -> None:
         if pretrained is not None:
             model = BloomModel.from_pretrained(pretrained)
         elif config is not None:
@@ -34,4 +35,4 @@ def __init__(self,
         if checkpoint:
             model.gradient_checkpointing_enable()
         value_head = nn.Linear(model.config.hidden_size, 1)
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
+        super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)
diff --git a/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
index 897ddb4aeb03..01e824386d4a 100644
--- a/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
@@ -20,7 +20,8 @@ class GPTCritic(Critic):
     def __init__(self,
                  pretrained: Optional[str] = None,
                  config: Optional[GPT2Config] = None,
-                 checkpoint: bool = False) -> None:
+                 checkpoint: bool = False,
+                 **kwargs) -> None:
         if pretrained is not None:
             model = GPT2Model.from_pretrained(pretrained)
         elif config is not None:
@@ -30,4 +31,4 @@ def __init__(self,
         if checkpoint:
             model.gradient_checkpointing_enable()
         value_head = nn.Linear(model.config.n_embd, 1)
-        super().__init__(model, value_head)
+        super().__init__(model, value_head, **kwargs)
diff --git a/applications/ChatGPT/chatgpt/models/opt/opt_critic.py b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
index 767cecb79353..1f5ead7582f7 100644
--- a/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
@@ -24,7 +24,8 @@ def __init__(self,
                  config: Optional[OPTConfig] = None,
                  checkpoint: bool = False,
                  lora_rank: int = 0,
-                 lora_train_bias: str = 'none') -> None:
+                 lora_train_bias: str = 'none',
+                 **kwargs) -> None:    # extra Critic options, forwarded to super().__init__ (which reads `kwargs`)
         if pretrained is not None:
             model = OPTModel.from_pretrained(pretrained)
         elif config is not None:
@@ -34,4 +35,4 @@ def __init__(self,
         if checkpoint:
             model.gradient_checkpointing_enable()
         value_head = nn.Linear(model.config.hidden_size, 1)
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
+        super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)

From 26db1cb57bc03223fc8fea79763ebb3b2ea40cb7 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 10 Mar 2023 14:54:37 +0800
Subject: [PATCH 452/503] [release] v0.2.7 (#3094)

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 53a75d673557..b0032849c80b 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.6
+0.2.7

From 65a4dbda6c1e9061cb5d8f375f0f7fc830ba7fc5 Mon Sep 17 00:00:00 2001
From: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Date: Fri, 10 Mar 2023 00:24:08 -0800
Subject: [PATCH 453/503] [NVIDIA] Add FP8 example using TE (#3080)

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
---
 examples/tutorial/fp8/mnist/README.md |   7 +
 examples/tutorial/fp8/mnist/main.py   | 237 ++++++++++++++++++++++++++
 2 files changed, 244 insertions(+)
 create mode 100644 examples/tutorial/fp8/mnist/README.md
 create mode 100644 examples/tutorial/fp8/mnist/main.py

diff --git a/examples/tutorial/fp8/mnist/README.md b/examples/tutorial/fp8/mnist/README.md
new file mode 100644
index 000000000000..308549cd29f7
--- /dev/null
+++ b/examples/tutorial/fp8/mnist/README.md
@@ -0,0 +1,7 @@
+# Basic MNIST Example with optional FP8
+
+```bash
+python main.py
+python main.py --use-te   # Linear layers from TransformerEngine
+python main.py --use-fp8  # FP8 + TransformerEngine for Linear layers
+```
diff --git a/examples/tutorial/fp8/mnist/main.py b/examples/tutorial/fp8/mnist/main.py
new file mode 100644
index 000000000000..000ded2f111f
--- /dev/null
+++ b/examples/tutorial/fp8/mnist/main.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+import argparse
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+from torch.optim.lr_scheduler import StepLR
+
+try:
+    from transformer_engine import pytorch as te
+    HAVE_TE = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_TE = False
+
+
+class Net(nn.Module):
+    """Small MNIST CNN: two conv layers followed by three linear layers.
+
+    With ``use_te=True`` the first two linear layers are ``te.Linear``
+    instead of ``nn.Linear``; the final 16 -> 10 layer is always ``nn.Linear``.
+    """
+
+    def __init__(self, use_te=False):
+        super().__init__()
+        # Creation order determines parameter-init RNG consumption and the
+        # ``state_dict`` key order, so submodules are declared conv -> dropout -> fc.
+        self.conv1 = nn.Conv2d(1, 32, 3, 1)
+        self.conv2 = nn.Conv2d(32, 64, 3, 1)
+        self.dropout1 = nn.Dropout(0.25)
+        self.dropout2 = nn.Dropout(0.5)
+        linear_cls = te.Linear if use_te else nn.Linear
+        self.fc1 = linear_cls(9216, 128)
+        self.fc2 = linear_cls(128, 16)
+        self.fc3 = nn.Linear(16, 10)
+
+    def forward(self, x):
+        """Return per-class log-probabilities for the input batch ``x``."""
+        hidden = F.relu(self.conv1(x))
+        hidden = F.relu(self.conv2(hidden))
+        hidden = F.max_pool2d(hidden, 2)
+        hidden = self.dropout1(hidden)
+        hidden = torch.flatten(hidden, 1)
+        hidden = F.relu(self.fc1(hidden))
+        hidden = self.dropout2(hidden)
+        logits = self.fc3(self.fc2(hidden))
+        return F.log_softmax(logits, dim=1)
+
+
+def train(args, model, device, train_loader, optimizer, epoch, use_fp8):
+    """Train ``model`` for one epoch; runs with or without TransformerEngine."""
+    from contextlib import nullcontext  # local import keeps the file's import block untouched
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        # ``te`` is unbound when TransformerEngine is missing; fall back to a no-op
+        # context (main() already rejects the TE/FP8 flags in that case).
+        with (te.fp8_autocast(enabled=use_fp8) if HAVE_TE else nullcontext()):
+            output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:
+            seen, total = batch_idx * len(data), len(train_loader.dataset)
+            progress = 100. * batch_idx / len(train_loader)
+            print(f"Train Epoch: {epoch} [{seen}/{total} ({progress:.0f}%)]\tLoss: {loss.item():.6f}")
+            if args.dry_run:
+                break
+
+
+def calibrate(model, device, test_loader):
+    """Run forward passes over ``test_loader`` inside ``te.fp8_autocast(calibrating=True)``.
+
+    Outputs are discarded; no loss or accuracy is computed. Requires ``te`` to be imported.
+    """
+    model.eval()
+    with torch.no_grad():
+        for data, _ in test_loader:  # labels are not needed for calibration
+            with te.fp8_autocast(enabled=False, calibrating=True):
+                model(data.to(device))
+
+def test(model, device, test_loader, use_fp8):
+    """Evaluate ``model`` on ``test_loader`` and print average loss and accuracy.
+
+    Works without TransformerEngine: ``te`` is only used when it was imported.
+    """
+    from contextlib import nullcontext  # local import keeps the file's import block untouched
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            # ``te`` is unbound when TransformerEngine is missing; use a no-op context.
+            with (te.fp8_autocast(enabled=use_fp8) if HAVE_TE else nullcontext()):
+                output = model(data)
+            # Sum (not mean) so the total can be divided by the dataset size below.
+            test_loss += F.nll_loss(output, target, reduction="sum").item()
+            pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    num_samples = len(test_loader.dataset)
+    test_loss /= num_samples
+    accuracy = 100. * correct / num_samples
+    summary = f"Accuracy: {correct}/{num_samples} ({accuracy:.0f}%)"
+    print(f"\nTest set: Average loss: {test_loss:.4f}, {summary}\n")
+
+
+def main():
+    """Parse CLI options, train/evaluate the MNIST model each epoch, then optionally calibrate and save it."""
+    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=64,
+        metavar="N",
+        help="input batch size for training (default: 64)",
+    )
+    parser.add_argument(
+        "--test-batch-size",
+        type=int,
+        default=1000,
+        metavar="N",
+        help="input batch size for testing (default: 1000)",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=14,
+        metavar="N",
+        help="number of epochs to train (default: 14)",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=1.0,
+        metavar="LR",
+        help="learning rate (default: 1.0)",
+    )
+    parser.add_argument(
+        "--gamma",
+        type=float,
+        default=0.7,
+        metavar="M",
+        help="Learning rate step gamma (default: 0.7)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        default=False,
+        help="quickly check a single pass",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
+    )
+    parser.add_argument(
+        "--log-interval",
+        type=int,
+        default=10,
+        metavar="N",
+        help="how many batches to wait before logging training status",
+    )
+    parser.add_argument(
+        "--save-model",
+        action="store_true",
+        default=False,
+        help="For Saving the current Model",
+    )
+    parser.add_argument(
+        "--use-fp8", action="store_true", default=False, help="Use FP8 for inference and training without recalibration"
+    )
+    parser.add_argument(
+        "--use-fp8-infer", action="store_true", default=False, help="Use FP8 inference only"
+    )
+    parser.add_argument(
+        "--use-te", action="store_true", default=False, help="Use Transformer Engine"
+    )
+    args = parser.parse_args()
+    use_cuda = torch.cuda.is_available()
+
+    if args.use_te or args.use_fp8 or args.use_fp8_infer:
+        assert HAVE_TE, "TransformerEngine not installed."  # HAVE_TE is set by the guarded import at file top
+
+    if args.use_fp8 or args.use_fp8_infer:
+        args.use_te = True  # either FP8 mode implies building the model with te.Linear layers
+
+    if args.use_te:
+        assert use_cuda, "CUDA needed for FP8 execution."
+
+    if args.use_fp8_infer:
+        assert not args.use_fp8, "fp8-infer path currently only supports calibration from a bfloat checkpoint"
+
+    torch.manual_seed(args.seed)
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    train_kwargs = {"batch_size": args.batch_size}
+    test_kwargs = {"batch_size": args.test_batch_size}
+    if use_cuda:
+        cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}  # merged into BOTH loaders' kwargs
+        train_kwargs.update(cuda_kwargs)
+        test_kwargs.update(cuda_kwargs)
+
+    transform = transforms.Compose(
+        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
+    )
+    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)  # training split
+    dataset2 = datasets.MNIST("../data", train=False, transform=transform)  # test split
+    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
+    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
+
+    model = Net(use_te=args.use_te).to(device)
+    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
+
+    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
+    for epoch in range(1, args.epochs + 1):
+        train(args, model, device, train_loader, optimizer, epoch, args.use_fp8)
+        test(model, device, test_loader, args.use_fp8)
+        scheduler.step()  # stepped once per epoch, after that epoch's evaluation
+
+    if args.use_fp8_infer:
+        calibrate(model, device, test_loader)  # runs after all training epochs, on the test loader
+
+    if args.save_model or args.use_fp8_infer:
+        torch.save(model.state_dict(), "mnist_cnn.pt")
+        print('Eval with reloaded checkpoint : fp8='+str(args.use_fp8_infer))
+        weights = torch.load("mnist_cnn.pt")  # round-trip through disk before the final evaluation
+        model.load_state_dict(weights)
+        test(model, device, test_loader, args.use_fp8_infer)  # FP8 is enabled here only for --use-fp8-infer
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    main()

From 018936a3f30e8a08e7f99e094340ce322b0c2966 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Fri, 10 Mar 2023 16:30:52 +0800
Subject: [PATCH 454/503] [tutorial] update notes for TransformerEngine (#3098)

---
 examples/tutorial/fp8/mnist/README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/tutorial/fp8/mnist/README.md b/examples/tutorial/fp8/mnist/README.md
index 308549cd29f7..46711f9ebdd8 100644
--- a/examples/tutorial/fp8/mnist/README.md
+++ b/examples/tutorial/fp8/mnist/README.md
@@ -1,7 +1,13 @@
-# Basic MNIST Example with optional FP8
+# Basic MNIST Example with optional FP8 of TransformerEngine
+
+[TransformerEngine](https://github.com/NVIDIA/TransformerEngine) is a library for accelerating Transformer models on NVIDIA GPUs, including using 8-bit floating point (FP8) precision on Hopper GPUs, to provide better performance with lower memory utilization in both training and inference.
+
+Thanks to NVIDIA for contributing this tutorial.
 
 ```bash
 python main.py
 python main.py --use-te   # Linear layers from TransformerEngine
 python main.py --use-fp8  # FP8 + TransformerEngine for Linear layers
 ```
+
+> We are working to integrate it with Colossal-AI and will finish it soon.

From c9dd036592ee97e73faa9934831d2f8f7184e9b7 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Fri, 10 Mar 2023 17:58:10 +0800
Subject: [PATCH 455/503] [chatgpt] fix lora save bug (#3099)

* fix colo-stratergy

* polish

* fix lora

* fix ddp

* polish

* polish
---
 applications/ChatGPT/chatgpt/models/lora.py      |  3 +++
 .../chatgpt/trainer/strategies/colossalai.py     | 16 ++++++++++++++++
 .../ChatGPT/chatgpt/trainer/strategies/ddp.py    | 12 ++++++++++--
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/models/lora.py b/applications/ChatGPT/chatgpt/models/lora.py
index 46a43ec91681..9c19f472d726 100644
--- a/applications/ChatGPT/chatgpt/models/lora.py
+++ b/applications/ChatGPT/chatgpt/models/lora.py
@@ -74,6 +74,8 @@ def T(w):
             # Merge the weights and mark it
             if self.r > 0:
                 self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
+                delattr(self, 'lora_A')
+                delattr(self, 'lora_B')
             self.merged = True
 
     def forward(self, x: torch.Tensor):
@@ -125,3 +127,4 @@ def convert_to_lora(self) -> None:
             return
         convert_to_lora_recursively(self, self.lora_rank)
         lora.mark_only_lora_as_trainable(self, self.lora_train_bias)
+                
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
index f08018fd232f..b20b02d3d34d 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
@@ -6,11 +6,13 @@
 import torch.nn as nn
 import torch.optim as optim
 from chatgpt.models.base import Actor
+from chatgpt.models.lora import LoraLinear
 from torch.optim import Optimizer
 
 import colossalai
 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 from colossalai.nn.parallel import ZeroDDP, zero_model_wrapper, zero_optim_wrapper
+from colossalai.nn.parallel.utils import get_static_torch_model
 from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
@@ -143,6 +145,20 @@ def _unwrap_actor(actor: Actor) -> nn.Module:
 
     def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
         unwrapped_model = self._unwrap_model(model)
+        # TODO : better way to get torch model from gemini model
+        # to get torch model from gemini model
+        if isinstance(unwrapped_model, ZeroDDP):
+            state_dict = unwrapped_model.state_dict()
+            unwrapped_model = get_static_torch_model(unwrapped_model)
+            if only_rank0 and dist.get_rank() != 0:
+                return
+            unwrapped_model.load_state_dict(state_dict)
+        # merge lora_weights into weights
+        for module in unwrapped_model.modules():
+            if isinstance(module, LoraLinear):
+                module.merge_weights=True
+                module.eval()
+        # get state_dict and save
         state_dict = unwrapped_model.state_dict()
         if only_rank0 and dist.get_rank() != 0:
             return
diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
index 530dd998d193..c9f92c12fe0a 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/ddp.py
@@ -6,6 +6,7 @@
 import torch.distributed as dist
 import torch.nn as nn
 from chatgpt.models.base import Actor
+from chatgpt.models.lora import LoraLinear
 from chatgpt.replay_buffer import ReplayBuffer
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
@@ -72,10 +73,17 @@ def _unwrap_actor(actor: Actor) -> nn.Module:
         return model.module
 
     def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
+        for module in model.modules():
+            if isinstance(module, LoraLinear):
+                module.merge_weights=True
+                module.eval()
+                
         if only_rank0 and dist.get_rank() != 0:
             return
-        super().save_model(model, path, only_rank0)
-
+        model = model.model.module
+        state_dict = model.state_dict()
+        torch.save(state_dict, path)
+        
     def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
         if only_rank0 and dist.get_rank() != 0:
             return

From 145ccfd7d1053a643206bfbb91bf1d9203117034 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Sat, 11 Mar 2023 15:21:45 +0800
Subject: [PATCH 456/503] [doc] add Intel cooperation for biomedicine (#3108)

* [doc] add Intel cooperation for biomedicine
---
 README-zh-Hans.md | 12 +++++++++++-
 README.md         | 14 ++++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 8a3bee5ec17c..283cc27cb9c2 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -196,6 +196,10 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 
 - [Energon-AI](https://github.com/hpcaitech/EnergonAI) ：用相同的硬件推理加速50%
 
+<p id="OPT-Serving" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20serving.png" width=600/>
+</p>
+
 - [OPT推理服务](https://colossalai.org/docs/advanced_tutorials/opt_service): 体验1750亿参数OPT在线推理服务
 
 <p id="BLOOM-Inference" align="center">
@@ -265,6 +269,12 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 
 - [FastFold](https://github.com/hpcaitech/FastFold): 加速AlphaFold训练与推理、数据前处理、推理序列长度超过10000残基
 
+<p id="FastFold-Intel" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/data%20preprocessing%20with%20Intel.jpg" width=600/>
+</p>
+
+- [FastFold with Intel](https://github.com/hpcaitech/FastFold): 3倍推理加速和39%成本节省
+
 <p id="xTrimoMultimer" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTrimoMultimer_Table.jpg" width=800/>
 </p>
@@ -280,7 +290,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 - PyTorch >= 1.11 (PyTorch 2.x 正在适配中)
 - Python >= 3.7
 - CUDA >= 11.0
-  
+
 如果你遇到安装问题，可以向本项目 [反馈](https://github.com/hpcaitech/ColossalAI/issues/new/choose)。
 
 
diff --git a/README.md b/README.md
index 3115192d6ab2..602193f76def 100644
--- a/README.md
+++ b/README.md
@@ -198,6 +198,10 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 
 - [Energon-AI](https://github.com/hpcaitech/EnergonAI): 50% inference acceleration on the same hardware
 
+<p id="OPT-Serving" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20serving.png" width=600/>
+</p>
+
 - [OPT Serving](https://colossalai.org/docs/advanced_tutorials/opt_service): Try 175-billion-parameter OPT online services
 
 <p id="BLOOM-Inference" align="center">
@@ -264,7 +268,13 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/FastFold.jpg" width=800/>
 </p>
 
-- [FastFold](https://github.com/hpcaitech/FastFold): accelerating training and inference on GPU Clusters, faster data processing, inference sequence containing more than 10000 residues.
+- [FastFold](https://github.com/hpcaitech/FastFold): Accelerating training and inference on GPU Clusters, faster data processing, inference sequence containing more than 10000 residues.
+
+<p id="FastFold-Intel" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/data%20preprocessing%20with%20Intel.jpg" width=600/>
+</p>
+
+- [FastFold with Intel](https://github.com/hpcaitech/FastFold): 3x inference acceleration and 39% cost reduction.
 
 <p id="xTrimoMultimer" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTrimoMultimer_Table.jpg" width=800/>
@@ -281,7 +291,7 @@ Requirements:
 - PyTorch >= 1.11 (PyTorch 2.x in progress)
 - Python >= 3.7
 - CUDA >= 11.0
-  
+
 If you encounter any problem about installation, you may want to raise an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) in this repository.
 
 ### Install from PyPI

From 191daf74111251df7327f4ef7a069d0254554d2c Mon Sep 17 00:00:00 2001
From: hiko2MSP <hiko2msp@gmail.com>
Date: Mon, 13 Mar 2023 01:00:02 +0900
Subject: [PATCH 457/503] [chatgpt] type miss of kwargs (#3107)

---
 applications/ChatGPT/chatgpt/models/opt/opt_critic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/ChatGPT/chatgpt/models/opt/opt_critic.py b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
index 1f5ead7582f7..847813332ac1 100644
--- a/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
@@ -25,7 +25,7 @@ def __init__(self,
                  checkpoint: bool = False,
                  lora_rank: int = 0,
                  lora_train_bias: str = 'none',
-                 **kargs) -> None:
+                 **kwargs) -> None:
         if pretrained is not None:
             model = OPTModel.from_pretrained(pretrained)
         elif config is not None:

From 453f7ae5a0f8b11e3cae4b6b64227bd8e5ca87e0 Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Sun, 12 Mar 2023 17:50:31 -0700
Subject: [PATCH 458/503] prevent op_builder being installed in site-pkgs
 (#3104)

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 0a66a90084ee..89a7b0de461b 100644
--- a/setup.py
+++ b/setup.py
@@ -152,6 +152,7 @@ def get_version() -> str:
 setup(name=package_name,
       version=version,
       packages=find_packages(exclude=(
+          'op_builder',
           'benchmark',
           'docker',
           'tests',

From 0aa92c04092bf18919a511cdbbe9f7214bd53fdb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 13 Mar 2023 08:58:06 +0800
Subject: [PATCH 459/503] Automated submodule synchronization (#3105)

Co-authored-by: github-actions <github-actions@github.com>
---
 examples/tutorial/fastfold/FastFold | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tutorial/fastfold/FastFold b/examples/tutorial/fastfold/FastFold
index c9309ecf2437..867587b3aa4e 160000
--- a/examples/tutorial/fastfold/FastFold
+++ b/examples/tutorial/fastfold/FastFold
@@ -1 +1 @@
-Subproject commit c9309ecf2437ffd7f308ace7ea31042a380fb82c
+Subproject commit 867587b3aa4e43bdaf64f9910127842f1dfbfebd

From 0672b5afacf06735fe669fe568aadab3cbd9d8aa Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Mon, 13 Mar 2023 10:37:41 +0800
Subject: [PATCH 460/503] [chatgpt] fix lora support for gpt (#3113)

* fix gpt-actor

* fix gpt-critic

* fix opt-critic
---
 applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py  | 8 ++++++--
 applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py | 7 +++++--
 applications/ChatGPT/chatgpt/models/opt/opt_critic.py | 2 +-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py
index da24685e16c8..6a53ad40b817 100644
--- a/applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_actor.py
@@ -14,12 +14,16 @@ class GPTActor(Actor):
         pretrained (str): Pretrained model name or path.
         config (GPT2Config): Model config.
         checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): Rank of the LoRa layer.
+        lora_train_bias (str): Bias training strategy for the LoRa layer.
     """
 
     def __init__(self,
                  pretrained: Optional[str] = None,
                  config: Optional[GPT2Config] = None,
-                 checkpoint: bool = False) -> None:
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
         if pretrained is not None:
             model = GPT2LMHeadModel.from_pretrained(pretrained)
         elif config is not None:
@@ -28,4 +32,4 @@ def __init__(self,
             model = GPT2LMHeadModel(GPT2Config())
         if checkpoint:
             model.gradient_checkpointing_enable()
-        super().__init__(model)
+        super().__init__(model, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
index 01e824386d4a..25bb1ed94de4 100644
--- a/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_critic.py
@@ -15,13 +15,16 @@ class GPTCritic(Critic):
         pretrained (str): Pretrained model name or path.
         config (GPT2Config): Model config.
         checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): Rank of the LO-RA decomposition.
+        lora_train_bias (str): LoRA bias training mode.
     """
 
     def __init__(self,
                  pretrained: Optional[str] = None,
                  config: Optional[GPT2Config] = None,
                  checkpoint: bool = False,
-                 **kwargs) -> None:
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
         if pretrained is not None:
             model = GPT2Model.from_pretrained(pretrained)
         elif config is not None:
@@ -31,4 +34,4 @@ def __init__(self,
         if checkpoint:
             model.gradient_checkpointing_enable()
         value_head = nn.Linear(model.config.n_embd, 1)
-        super().__init__(model, value_head, **kwargs)
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/models/opt/opt_critic.py b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
index 847813332ac1..fcfebd8a8b03 100644
--- a/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_critic.py
@@ -34,5 +34,5 @@ def __init__(self,
             model = OPTModel(OPTConfig())
         if checkpoint:
             model.gradient_checkpointing_enable()
-        value_head = nn.Linear(model.config.hidden_size, 1)
+        value_head = nn.Linear(model.config.word_embed_proj_dim, 1)
         super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)

From 68577fbc4399b0e8333ad958959ac09e5c54033d Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Mon, 13 Mar 2023 11:12:22 +0800
Subject: [PATCH 461/503] [chatgpt]Fix examples (#3116)

* fix train_dummy

* fix train-prompts
---
 applications/ChatGPT/examples/train_dummy.py  | 26 ++++++++++---------
 .../ChatGPT/examples/train_prompts.py         | 25 ++++++++++--------
 2 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index 27ee7f0f1bd3..4c81f2f72688 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -38,19 +38,19 @@ def main(args):
     # configure model
     with strategy.model_init_context():
         if args.model == 'gpt2':
-            actor = GPTActor().cuda()
-            critic = GPTCritic().cuda()
+            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+            critic = GPTCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         elif args.model == 'bloom':
-            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
-            critic = BLOOMCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+            critic = BLOOMCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         elif args.model == 'opt':
-            actor = OPTActor().cuda()
-            critic = OPTCritic().cuda()
+            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+            critic = OPTCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         else:
             raise ValueError(f'Unsupported model "{args.model}"')
 
-        initial_model = deepcopy(actor).cuda()
-        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda()
+        initial_model = deepcopy(actor).to(torch.cuda.current_device())
+        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).to(torch.cuda.current_device())
 
     # configure optimizer
     if args.strategy.startswith('colossalai'):
@@ -114,12 +114,13 @@ def main(args):
                 max_timesteps=args.max_timesteps,
                 update_timesteps=args.update_timesteps)
 
-    # save model checkpoint after fitting on only rank0
+    # save model checkpoint after fitting
     strategy.save_model(actor, 'actor_checkpoint_dummy.pt', only_rank0=True)
     # save optimizer checkpoint on all ranks
-    strategy.save_optimizer(actor_optim,
-                            'actor_optim_checkpoint_dummy_%d.pt' % (torch.cuda.current_device()),
-                            only_rank0=False)
+    if args.need_optim_ckpt:
+        strategy.save_optimizer(actor_optim,
+                                'actor_optim_checkpoint_dummy_%d.pt' % (torch.cuda.current_device()),
+                                only_rank0=False)
 
 
 if __name__ == '__main__':
@@ -129,6 +130,7 @@ def main(args):
                         default='naive')
     parser.add_argument('--model', type=str, default='gpt2', choices=['gpt2', 'bloom', 'opt'])
     parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
     parser.add_argument('--num_episodes', type=int, default=50)
     parser.add_argument('--max_timesteps', type=int, default=10)
     parser.add_argument('--update_timesteps', type=int, default=10)
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index 576685234f27..49f0e2c4ae23 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -32,19 +32,20 @@ def main(args):
     # configure model
     with strategy.model_init_context():
         if args.model == 'gpt2':
-            actor = GPTActor().cuda()
-            critic = GPTCritic().cuda()
+            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+            critic = GPTCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         elif args.model == 'bloom':
-            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
-            critic = BLOOMCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+            critic = BLOOMCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         elif args.model == 'opt':
-            actor = OPTActor(lora_rank=args.lora_rank).cuda()
-            critic = OPTCritic(lora_rank=args.lora_rank).cuda()
+            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+            critic = OPTCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         else:
             raise ValueError(f'Unsupported model "{args.model}"')
 
         initial_model = deepcopy(actor)
-        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda()
+        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).to(torch.cuda.current_device())
+
 
     # configure optimizer
     if args.strategy.startswith('colossalai'):
@@ -100,12 +101,13 @@ def tokenize_fn(texts):
                 num_episodes=args.num_episodes,
                 max_timesteps=args.max_timesteps,
                 update_timesteps=args.update_timesteps)
-    # save model checkpoint after fitting on only rank0
+    # save model checkpoint after fitting 
     strategy.save_model(actor, 'actor_checkpoint_prompts.pt', only_rank0=True)
     # save optimizer checkpoint on all ranks
-    strategy.save_optimizer(actor_optim,
-                            'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                            only_rank0=False)
+    if args.need_optim_ckpt:
+        strategy.save_optimizer(actor_optim,
+                                'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
+                                only_rank0=False)
 
 
 if __name__ == '__main__':
@@ -116,6 +118,7 @@ def tokenize_fn(texts):
                         default='naive')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
     parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
     parser.add_argument('--num_episodes', type=int, default=10)
     parser.add_argument('--max_timesteps', type=int, default=10)
     parser.add_argument('--update_timesteps', type=int, default=10)

From 30dd13c45008ffe9e532f6cffb78f530933e6028 Mon Sep 17 00:00:00 2001
From: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Date: Mon, 13 Mar 2023 17:42:37 +0800
Subject: [PATCH 462/503] [autochunk] support complete benchmark (#3121)

* refact memory code

* dont log free var memory

* add memory align

* update chunk target

* update setting for new memory

* finish test

* update tracer

* update typo

* update test

* add unet test

* add bench

* update bench

* update bench

* init

* support vit

* move to cpu

* add cpu benchmark
---
 .../benchmark_autochunk_alphafold.py                        | 4 ++--
 .../test_autochunk_diffuser/benchmark_autochunk_diffuser.py | 6 +++---
 .../benchmark_autochunk_transformer.py                      | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py b/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
index 896751e40146..9a2240d62de4 100644
--- a/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
+++ b/tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py
@@ -23,7 +23,7 @@ def _benchmark_evoformer_stack_gm(
     get_data: Any,
 ) -> None:
     # build model and input
-    model = get_model()
+    model = get_model().cpu().eval()
     meta_args, concrete_args = get_data(*data_args)
     if concrete_args is None:
         concrete_args = []
@@ -35,7 +35,7 @@ def _benchmark_evoformer_stack_gm(
         concrete_args={k: v for k, v in concrete_args},
     )
     interp = MetaInfoProp(meta_graph)
-    meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args] + [i[1] for i in concrete_args]
+    meta_tensors = [MetaTensor(i[1], fake_device="cpu") for i in meta_args] + [i[1] for i in concrete_args]
     interp.propagate(*meta_tensors)
     codegen = AutoChunkCodeGen(
         meta_graph,
diff --git a/tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py b/tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py
index 5c127bd69980..6fb7efa7a8fc 100644
--- a/tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py
+++ b/tests/test_autochunk/test_autochunk_diffuser/benchmark_autochunk_diffuser.py
@@ -35,10 +35,9 @@ def _benchmark_autochunk_unet_gm(
         meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
         concrete_args={k: v for k, v in concrete_args},
     )
-    model = model.cuda().eval()
     interp = MetaInfoProp(meta_graph)
     meta_tensors = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
-    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
+    meta_tensors = [MetaTensor(i, fake_device="cpu") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
     interp.propagate(*meta_tensors)
     codegen = AutoChunkCodeGen(
         meta_graph,
@@ -142,6 +141,7 @@ def benchmark_autochunk_unet(batch=1, height=448, width=448):
         port=free_port(),
         backend="nccl",
     )
-    benchmark_autochunk_unet(batch=1, height=224 * 2, width=224 * 2)
     benchmark_autochunk_unet(batch=1, height=224 * 3, width=224 * 3)
     benchmark_autochunk_unet(batch=1, height=224 * 4, width=224 * 4)
+    benchmark_autochunk_unet(batch=1, height=224 * 5, width=224 * 5)
+    benchmark_autochunk_unet(batch=1, height=224 * 6, width=224 * 6)
diff --git a/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py b/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
index 5791af35124b..63490aaee7ff 100644
--- a/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
+++ b/tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py
@@ -22,7 +22,7 @@ def _benchmark_autochunk_gpt_gm(
     data: tuple,
     max_memory: int = None,
 ) -> None:
-    model = model.cuda().eval()
+    model = model.eval().cpu()
 
     # build model and input
     meta_args, concrete_args, sequence = data
@@ -37,7 +37,7 @@ def _benchmark_autochunk_gpt_gm(
     )
     interp = MetaInfoProp(meta_graph)
     meta_tensors = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
-    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
+    meta_tensors = [MetaTensor(i, fake_device="cpu") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
     interp.propagate(*meta_tensors)
     codegen = AutoChunkCodeGen(
         meta_graph,
@@ -58,7 +58,7 @@ def _benchmark_autochunk_gpt_gm(
     # init inputs
     inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
     inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
-    model.cuda().eval()
+    model.cuda()
 
     # bench
     para_mem = float(parameter_size(model)) / 1024**2 * 6

From 169ed4d24edce15e7b855799c592b58eaa0e64e7 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 14 Mar 2023 10:11:32 +0800
Subject: [PATCH 463/503] [workflow] purged extension cache before GPT test
 (#3128)

---
 .github/workflows/run_chatgpt_examples.yml   | 1 +
 .github/workflows/run_chatgpt_unit_tests.yml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index af59c8db2d6b..51bb9d074644 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -35,6 +35,7 @@ jobs:
       - name: Execute Examples
         run: |
           cd applications/ChatGPT
+          rm -rf ~/.cache/colossalai
           ./examples/test_ci.sh
         env:
           NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index 8dcf21fe2146..4e539bfe06fd 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -36,6 +36,7 @@ jobs:
       - name: Execute Unit Testing
         run: |
           cd applications/ChatGPT
+          rm -rf ~/.cache/colossalai
           pytest tests/
         env:
           NCCL_SHM_DISABLE: 1

From 23cd5e2ccf4ed84cde26ddc2236c20fc5d4ae38b Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Tue, 14 Mar 2023 11:01:17 +0800
Subject: [PATCH 464/503] [chatgpt]update ci (#3087)

* [chatgpt]update ci

* Update test_ci.sh

* Update test_ci.sh

* Update test_ci.sh

* test

* Update train_prompts.py

* Update train_dummy.py

* add save_path

* polish

* add save path

* polish

* add save path

* polish

* delete bloom-560m test

delete bloom-560m test because of oom

* add ddp test
---
 applications/ChatGPT/examples/test_ci.sh      | 58 +++++++++++++++++--
 applications/ChatGPT/examples/train_dummy.py  |  3 +-
 .../ChatGPT/examples/train_prompts.py         |  3 +-
 3 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/applications/ChatGPT/examples/test_ci.sh b/applications/ChatGPT/examples/test_ci.sh
index 8109db2260a0..0aa4a36fe514 100755
--- a/applications/ChatGPT/examples/test_ci.sh
+++ b/applications/ChatGPT/examples/test_ci.sh
@@ -15,11 +15,57 @@ export OMP_NUM_THREADS=8
 pip install -r ${BASE}/requirements.txt
 
 # train dummy
-for strategy in ddp colossalai_gemini colossalai_zero2; do
-    torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py --strategy ${strategy} --num_episodes 2 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --experience_batch_size 4 --train_batch_size 4
-done
+python ${BASE}/train_dummy.py --strategy naive --num_episodes 1 \
+                              --max_timesteps 2 --update_timesteps 2 \
+                              --max_epochs 1 --train_batch_size 2 --lora_rank 4
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py \
+         --strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2\
+         --pretrain 'facebook/opt-350m' --model opt --lora_rank 4\
+         --save_path ${BASE}/actor_checkpoint_dummy.pt
+python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_dummy.pt --pretrain 'facebook/opt-350m' --model opt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py \
+         --strategy ddp --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2\
+         --pretrain 'facebook/opt-350m' --model opt --lora_rank 4\
+         --save_path ${BASE}/actor_checkpoint_dummy.pt
+python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_dummy.pt --pretrain 'facebook/opt-350m' --model opt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py \
+         --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2\
+         --pretrain 'gpt2' --model gpt2 --lora_rank 4\
+         --save_path ${BASE}/actor_checkpoint_dummy.pt
+python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_dummy.pt --pretrain 'gpt2' --model gpt2
+
+rm -rf ${BASE}/actor_checkpoint_dummy.pt
 
 # train prompts
-for strategy in ddp colossalai_gemini colossalai_zero2; do
-    torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH --strategy ${strategy} --num_episodes 2 --max_timesteps 3 --update_timesteps 3 --max_epochs 3
-done
+python ${BASE}/train_prompts.py $PROMPT_PATH --strategy naive --num_episodes 1 \
+                                             --max_timesteps 2 --update_timesteps 2 \
+                                             --max_epochs 1 --train_batch_size 2 --lora_rank 4
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH \
+         --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2\
+         --pretrain 'facebook/opt-350m' --model opt --lora_rank 4\
+         --save_path ${BASE}/actor_checkpoint_prompts.pt
+python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_prompts.pt --pretrain 'facebook/opt-350m' --model opt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH \
+         --strategy ddp --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2\
+         --pretrain 'gpt2' --model gpt2 --lora_rank 4\
+         --save_path ${BASE}/actor_checkpoint_prompts.pt
+python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_prompts.pt --pretrain 'gpt2' --model gpt2
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH \
+         --strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2\
+         --pretrain 'gpt2' --model gpt2 --lora_rank 4\
+         --save_path ${BASE}/actor_checkpoint_prompts.pt
+python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_prompts.pt --pretrain 'gpt2' --model gpt2
+
+rm -rf ${BASE}/actor_checkpoint_prompts.pt
diff --git a/applications/ChatGPT/examples/train_dummy.py b/applications/ChatGPT/examples/train_dummy.py
index 4c81f2f72688..c0ebf8f9b7b6 100644
--- a/applications/ChatGPT/examples/train_dummy.py
+++ b/applications/ChatGPT/examples/train_dummy.py
@@ -115,7 +115,7 @@ def main(args):
                 update_timesteps=args.update_timesteps)
 
     # save model checkpoint after fitting
-    strategy.save_model(actor, 'actor_checkpoint_dummy.pt', only_rank0=True)
+    strategy.save_model(actor, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(actor_optim,
@@ -130,6 +130,7 @@ def main(args):
                         default='naive')
     parser.add_argument('--model', type=str, default='gpt2', choices=['gpt2', 'bloom', 'opt'])
     parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--save_path', type=str, default='actor_checkpoint_dummy.pt')
     parser.add_argument('--need_optim_ckpt', type=bool, default=False)
     parser.add_argument('--num_episodes', type=int, default=50)
     parser.add_argument('--max_timesteps', type=int, default=10)
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index 49f0e2c4ae23..d4f31e61eb75 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -102,7 +102,7 @@ def tokenize_fn(texts):
                 max_timesteps=args.max_timesteps,
                 update_timesteps=args.update_timesteps)
     # save model checkpoint after fitting 
-    strategy.save_model(actor, 'actor_checkpoint_prompts.pt', only_rank0=True)
+    strategy.save_model(actor, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(actor_optim,
@@ -118,6 +118,7 @@ def tokenize_fn(texts):
                         default='naive')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
     parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
     parser.add_argument('--need_optim_ckpt', type=bool, default=False)
     parser.add_argument('--num_episodes', type=int, default=10)
     parser.add_argument('--max_timesteps', type=int, default=10)

From 86ac782d7c8b20289fe42b70ab09dea86024b353 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 14 Mar 2023 14:29:18 +0800
Subject: [PATCH 465/503] [test] added timm models to test model zoo (#3129)

* [test] added timm models to test model zoo

* polish code

* polish code

* polish code

* polish code

* polish code
---
 tests/kit/__init__.py                         |   0
 tests/kit/model_zoo/__init__.py               |   4 +
 tests/kit/model_zoo/registry.py               |  63 +++++++
 tests/kit/model_zoo/timm/__init__.py          |   1 +
 tests/kit/model_zoo/timm/timm.py              | 159 ++++++++++++++++++
 .../test_timm_model/test_timm_model.py        |  70 ++++----
 6 files changed, 256 insertions(+), 41 deletions(-)
 create mode 100644 tests/kit/__init__.py
 create mode 100644 tests/kit/model_zoo/__init__.py
 create mode 100644 tests/kit/model_zoo/registry.py
 create mode 100644 tests/kit/model_zoo/timm/__init__.py
 create mode 100644 tests/kit/model_zoo/timm/timm.py

diff --git a/tests/kit/__init__.py b/tests/kit/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
new file mode 100644
index 000000000000..435daea2c7de
--- /dev/null
+++ b/tests/kit/model_zoo/__init__.py
@@ -0,0 +1,4 @@
+from . import timm
+from .registry import model_zoo
+
+__all__ = ['model_zoo']
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
new file mode 100644
index 000000000000..4e7dcb30f04d
--- /dev/null
+++ b/tests/kit/model_zoo/registry.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+from dataclasses import dataclass
+from typing import Callable
+
+__all__ = ['ModelZooRegistry', 'ModelAttributem', 'model_zoo']
+
+
+@dataclass
+class ModelAttribute:
+    """
+    Attributes of a model.
+    """
+    has_control_flow: bool = False
+
+
+class ModelZooRegistry(dict):
+    """
+    A registry to map model names to model and data generation functions.
+    """
+
+    def register(self,
+                 name: str,
+                 model_fn: Callable,
+                 data_gen_fn: Callable,
+                 output_transform_fn: Callable,
+                 model_attribute: ModelAttribute = None):
+        """
+        Register a model and data generation function.
+
+        Examples:
+        >>> # Register
+        >>> model_zoo = ModelZooRegistry()
+        >>> model_zoo.register('resnet18', resnet18, resnet18_data_gen, output_transform_fn)
+        >>> # Run the model
+        >>> data = resnet18_data_gen() # do not input any argument
+        >>> model = resnet18() # do not input any argument
+        >>> out = model(**data)
+
+        Args:
+            name (str): Name of the model.
+            model_fn (callable): A function that returns a model. **It must not contain any arguments.**
+            output_transform_fn (callable): A function that transforms the output of the model into Dict.
+            data_gen_fn (callable): A function that returns a data sample in the form of Dict. **It must not contain any arguments.**
+            model_attribute (ModelAttribute): Attributes of the model. Defaults to None.
+        """
+        self[name] = (model_fn, data_gen_fn, output_transform_fn, model_attribute)
+
+    def get_sub_registry(self, keyword: str):
+        """
+        Get a sub registry with models that contain the keyword.
+
+        Args:
+            keyword (str): Keyword to filter models.
+        """
+        new_dict = dict()
+
+        for k, v in self.items():
+            if keyword in k:
+                new_dict[k] = v
+        return new_dict
+
+
+model_zoo = ModelZooRegistry()
diff --git a/tests/kit/model_zoo/timm/__init__.py b/tests/kit/model_zoo/timm/__init__.py
new file mode 100644
index 000000000000..c9c85319448d
--- /dev/null
+++ b/tests/kit/model_zoo/timm/__init__.py
@@ -0,0 +1 @@
+from .timm import *
diff --git a/tests/kit/model_zoo/timm/timm.py b/tests/kit/model_zoo/timm/timm.py
new file mode 100644
index 000000000000..b29ac12a6b53
--- /dev/null
+++ b/tests/kit/model_zoo/timm/timm.py
@@ -0,0 +1,159 @@
+import timm.models as tm
+import torch
+
+from ..registry import ModelAttribute, model_zoo
+
+## ==============
+# Register models without control flow
+## ==============
+data_gen_fn = lambda: dict(x=torch.rand(2, 3, 224, 224))
+output_transform_fn = lambda x: dict(output=x)
+
+model_zoo.register(name='timm_resnet',
+                   model_fn=tm.resnest.resnest50d,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_beit',
+                   model_fn=tm.beit.beit_base_patch16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_cait',
+                   model_fn=tm.cait.cait_s24_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_convmixer',
+                   model_fn=tm.convmixer.convmixer_768_32,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_efficientnetv2',
+                   model_fn=tm.efficientnet.efficientnetv2_m,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_resmlp',
+                   model_fn=tm.resmlp_12_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_vision_transformer',
+                   model_fn=tm.vision_transformer.vit_base_patch16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_deit',
+                   model_fn=tm.deit_base_distilled_patch16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_beitv2',
+                   model_fn=tm.beitv2_base_patch16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_coat',
+                   model_fn=tm.coat.coat_lite_mini,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+
+model_zoo.register(name='timm_deit3',
+                   model_fn=tm.deit3_base_patch16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+
+model_zoo.register(name='timm_eca_nfnet',
+                   model_fn=tm.eca_nfnet_l0,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_efficientformer',
+                   model_fn=tm.efficientformer_l1,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_ese_vovnet19b_dw',
+                   model_fn=tm.ese_vovnet19b_dw,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_gmixer_12_224',
+                   model_fn=tm.gmixer_12_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_gmlp_b16_224',
+                   model_fn=tm.gmlp_b16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_hardcorenas_a',
+                   model_fn=tm.hardcorenas_a,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_hrnet_w18_small',
+                   model_fn=tm.hrnet_w18_small,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_inception_v3',
+                   model_fn=tm.inception_v3,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_mixer_b16_224',
+                   model_fn=tm.mixer_b16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_nf_ecaresnet101',
+                   model_fn=tm.nf_ecaresnet101,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_nf_regnet_b0',
+                   model_fn=tm.nf_regnet_b0,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_regnetv_040',
+                   model_fn=tm.regnetv_040,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_skresnet18',
+                   model_fn=tm.skresnet18,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_tnt_b_patch16_224',
+                   model_fn=tm.tnt_b_patch16_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_wide_resnet50_2',
+                   model_fn=tm.wide_resnet50_2,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_convit',
+                   model_fn=tm.convit_base,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='timm_dm_nfnet',
+                   model_fn=tm.dm_nfnet_f0,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+
+# ==============
+# Register models with control flow
+# ==============
+model_zoo.register(name='timm_convnext',
+                   model_fn=tm.convnext.convnext_base,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='timm_vgg',
+                   model_fn=tm.vgg.vgg11,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='timm_dpn',
+                   model_fn=tm.dpn.dpn68,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='timm_densenet',
+                   model_fn=tm.densenet.densenet121,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='timm_rexnet',
+                   model_fn=tm.rexnet.rexnet_100,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='timm_swin_transformer',
+                   model_fn=tm.swin_transformer.swin_base_patch4_window7_224,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py b/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py
index 28ec3d82556c..31baa3e89798 100644
--- a/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py
+++ b/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py
@@ -3,9 +3,10 @@
 import torch
 
 from colossalai.fx import symbolic_trace
+from tests.kit.model_zoo import model_zoo
 
 
-def trace_and_compare(model_cls, data, meta_args=None):
+def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
     # trace
     model = model_cls()
 
@@ -14,60 +15,47 @@ def trace_and_compare(model_cls, data, meta_args=None):
     # without this statement, the torch.nn.functional.batch_norm will always be in training mode
     model.eval()
 
+    # TODO: support the following models
+    # 1. ConViT
+    # 2. NormFreeNet
+    # as they are not supported, let's skip them
+    if model.__class__.__name__ in ['ConViT', 'NormFreeNet']:
+        return
+
     gm = symbolic_trace(model, meta_args=meta_args)
 
     # run forward
     with torch.no_grad():
-        fx_out = gm(data)
-        non_fx_out = model(data)
+        fx_out = gm(**data)
+        non_fx_out = model(**data)
 
     # compare output
-    if isinstance(fx_out, tuple):
-        # some models produce tuple as output
-        for v1, v2 in zip(fx_out, non_fx_out):
-            assert torch.allclose(v1, v2), f'{model.__class__.__name__} has inconsistent outputs, {v1} vs {v2}'
-    else:
-        assert torch.allclose(
-            fx_out, non_fx_out,
-            atol=1e-5), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
-
-
-def test_timm_models_without_control_flow():
-    torch.backends.cudnn.deterministic = True
-
-    MODEL_LIST = [
-        tm.resnest.resnest50d,
-        tm.beit.beit_base_patch16_224,
-        tm.cait.cait_s24_224,
-        tm.convmixer.convmixer_768_32,
-        tm.efficientnet.efficientnetv2_m,
-        tm.resmlp_12_224,
-        tm.vision_transformer.vit_base_patch16_224,
-        tm.deit_base_distilled_patch16_224,
-    ]
+    transformed_fx_out = output_transform_fn(fx_out)
+    transformed_non_fx_out = output_transform_fn(non_fx_out)
 
-    data = torch.rand(2, 3, 224, 224)
+    assert len(transformed_fx_out) == len(transformed_non_fx_out)
 
-    for model_cls in MODEL_LIST:
-        trace_and_compare(model_cls, data)
+    for key in transformed_fx_out.keys():
+        fx_output_val = transformed_fx_out[key]
+        non_fx_output_val = transformed_non_fx_out[key]
+        assert torch.allclose(fx_output_val, non_fx_output_val, atol=1e-5), \
+            f'{model.__class__.__name__} has inconsistent outputs, {fx_output_val} vs {non_fx_output_val}'
 
 
-def test_timm_models_with_control_flow():
+def test_timm_models():
     torch.backends.cudnn.deterministic = True
 
-    MODEL_LIST_WITH_CONTROL_FLOW = [
-        tm.convnext.convnext_base, tm.vgg.vgg11, tm.dpn.dpn68, tm.densenet.densenet121, tm.rexnet.rexnet_100,
-        tm.swin_transformer.swin_base_patch4_window7_224
-    ]
-
-    data = torch.rand(2, 3, 224, 224)
+    sub_model_zoo = model_zoo.get_sub_registry('timm')
 
-    meta_args = {'x': data.to('meta')}
+    for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in sub_model_zoo.items():
+        data = data_gen_fn()
+        if attribute is not None and attribute.has_control_flow:
+            meta_args = {k: v.to('meta') for k, v in data.items()}
+        else:
+            meta_args = None
 
-    for model_cls in MODEL_LIST_WITH_CONTROL_FLOW:
-        trace_and_compare(model_cls, data, meta_args)
+        trace_and_compare(model_fn, data, output_transform_fn, meta_args)
 
 
 if __name__ == '__main__':
-    test_timm_models_with_control_flow()
-    test_timm_models_without_control_flow()
+    test_timm_models()

From ed8f60b93b9dfe88635baf40501ac80de84fa18c Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Tue, 14 Mar 2023 15:37:12 +0800
Subject: [PATCH 466/503] [lazyinit] refactor lazy tensor and lazy init ctx
 (#3131)

* [lazyinit] refactor lazy tensor and lazy init ctx

* [lazyinit] polish docstr

* [lazyinit] polish docstr
---
 colossalai/utils/model/experimental.py | 384 +++++++++++--------------
 1 file changed, 170 insertions(+), 214 deletions(-)

diff --git a/colossalai/utils/model/experimental.py b/colossalai/utils/model/experimental.py
index 8291227b7ba2..b8eb742f8c71 100644
--- a/colossalai/utils/model/experimental.py
+++ b/colossalai/utils/model/experimental.py
@@ -1,17 +1,11 @@
-import contextlib
-import copy
-import gc
-import pprint
-from typing import Callable, List, Optional, Union
+from typing import Callable, Optional, Union
 
 import torch
 import torch.nn as nn
+from torch import Tensor
 from torch.utils._pytree import tree_map
 
-from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.profiler import MetaTensor
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
 
 # reference: https://pytorch.org/cppdocs/notes/tensor_creation.html
 _TorchFactoryMethod = [
@@ -30,9 +24,23 @@
     "tensor",
 ]
 
-orig_empty = torch.empty    # avoid override
+_EARLY_MATERIALIZED_OPS = ['__getitem__', 'split']
 
-scm = ShapeConsistencyManager()
+
+class _MyTensor(Tensor):
+    """This class is only for correctness verification.
+    """
+    _pre_op_fn: Callable[['LazyTensor'], None] = lambda *args: None
+
+    def __new__(cls, func, *args, dtype=None, device=None, **kwargs) -> '_MyTensor':
+        cls._pre_op_fn()
+        data = func(*args, dtype=dtype, device=device, **kwargs)
+        return Tensor._make_subclass(cls, data, require_grad=data.requires_grad)
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        cls._pre_op_fn()
+        return super().__torch_function__(func, types, args, kwargs)
 
 
 class LazyTensor(torch.Tensor):
@@ -50,140 +58,114 @@ class LazyTensor(torch.Tensor):
         tensor([[0., 1., 1.],
                 [1., 1., 1.]], device='cuda:0', dtype=torch.float16)
 
-        2. Generate ``MetaTensor`` from ``LazyTensor``
-        >>> x = LazyTensor(torch.zeros, 2, 3)
-        >>> x.reshape(3, 2)
-        >>> x = x.traceable()    # generate ``MetaTensor``
-        >>> print(x)
-        MetaTensor(..., size=(3, 2), device=cpu, dtype=torch.float32)
-
-        3. Use ``LazyTensor`` to generate sharded ``nn.Parameter``.
-        >>> x = LazyTensor(torch.zeros, 2, 3)
-        >>> x.spec = ...    # some ``ShardingSpec``
-        >>> x.distribute()    # distribute the tensor according to the ``ShardingSpec``
-
     Warnings:
         1. Cases that ``LazyTensor`` can't deal with.
         >>> x = LazyTensor(torch.ones, 2, 3)
         >>> x[0, 0] = -x[0, 0]    # this will cause infinite recursion
+        >>> y = x.clone()
+        >>> x.add_(1) # modifying origin tensor after cloning leads to wrong materialization
+        >>> z = x.tolist()
+        >>> x.zero_() # modifying the origin tensor after calling tolist is not allowed
+        >>> x.data = torch.rand(2, 3) # directly setting the data of a lazy tensor is not allowed
+
+        2. Cases that ``LazyTensor`` becomes eager (early materialization).
+        >>> b = a[:, 2:]  # getting a slice of a lazy tensor triggers early materialization
+        >>> chunks = a.split(3)  # this also triggers early materialization
 
-        2. ``LazyTensor.materialize()`` can't be called multiple times.
-        >>> x = LazyTensor(torch.ones, 2, 3)
-        >>> x.materialize()
-        >>> x.materialize()    # this is disallowed
     """
 
     _repr = True
     _meta_data: Optional[MetaTensor] = None    # shape, dtype, device
-    _cached_data: Optional[torch.Tensor] = None    # materialized data
+    _pre_op_fn: Callable[['LazyTensor'], None] = lambda *args: None
 
     @staticmethod
-    def __new__(cls, func, *args, dtype=None, device=None, **kwargs):
-        elem = func(*args, dtype=dtype, device='meta', **kwargs)
+    def __new__(cls, func, *args, meta_data=None, **kwargs):
+        if meta_data is None:
+            device = kwargs.get('device', 'cpu')
+            elem = func(*args, **{**kwargs, 'device': 'meta'})
+            meta_data = MetaTensor(elem, fake_device=device)
+        elem = meta_data._tensor
         r = torch.Tensor._make_wrapper_subclass(cls,
                                                 elem.size(),
                                                 strides=elem.stride(),
                                                 storage_offset=elem.storage_offset(),
                                                 dtype=elem.dtype,
                                                 layout=elem.layout,
-                                                device=device if device is not None else torch.device('cpu'),
+                                                device=elem.device,
                                                 requires_grad=elem.requires_grad)
-        r._meta_data = MetaTensor(elem, fake_device=device)
+        r._meta_data = meta_data
         return r
 
-    def __init__(self, func, *args, dtype=None, device=None, **kwargs):
-        self._factory_method = (func, args, {'dtype': dtype, 'device': device, **kwargs})    # (func, args, kwargs)
-        self._cached_buffer = list()    # (func, args, kwargs)
-        self._spec = None
-        self._data = self
-
-    def __repr__(self):
-        if self._repr:
-            # avoid recursive representation
-            self.__class__._repr = False
-            s = f'LazyTensor(..., size={tuple(self._meta_data.shape)}, device={self._meta_data.device}, dtype={self._meta_data.dtype})\n'\
-                f'factory method: {self._factory_method}\n'\
-                f'cached: {pprint.pformat(self._cached_buffer) if self._cached_data is None else self._cached_data}\n'\
-                f'spec: {self._spec}'
-            self.__class__._repr = True
-            return s
-        else:
-            return 'LazyTensor(...)'
+    def __init__(self, func, *args, meta_data=None, **kwargs):
+        self._factory_method = (func, args, kwargs)    # (func, args, kwargs)
+        self._op_buffer = []    # (func, args, kwargs, replace)
+        self._materialized_data: Optional[torch.Tensor] = None    # materialized data
 
     def materialize(self) -> torch.Tensor:
         """Materialize the ``LazyTensor`` to ``torch.Tensor``.
 
-        Warnings:
-            Calling ``self.materialize()`` will clear all cached sequence and factory method,
-            because we don't allow materialize the same ``LazyTensor`` twice.
-            This is mentioned in the paper: https://arxiv.org/pdf/2102.13267.pdf (Part 4.3).
-
         Returns:
             torch.Tensor: The materialized tensor.
         """
-        target = self._data._realize_cached_data()
+        target = self._materialize_data()
         if isinstance(self, nn.Parameter):
             target = nn.Parameter(target, requires_grad=self.requires_grad)
-        self._clear_all()
         return target
 
-    def traceable(self) -> MetaTensor:
-        """Generate ``MetaTensor`` from ``LazyTensor``. (Mostly for tracing)
-
-        Returns:
-            MetaTensor: The generated ``MetaTensor``.
+    def clean(self) -> None:
+        """Clean all stored operations, meta data and materialized data to prevent memory leaks. This should be called after all tensors are materialized.
         """
-        if isinstance(self, nn.Parameter):
-            return nn.Parameter(self._meta_data, requires_grad=self.requires_grad)
-        else:
-            return self._meta_data
+        self._factory_method = None
+        self._op_buffer = None
+        self._materialized_data = None
+        self._meta_data = None
 
-    def distribute(self) -> torch.Tensor:
-        """Distribute the ``LazyTensor`` according to the ``ShardingSpec``.
+    @staticmethod
+    def _replace_with_materialized(x):
+        if isinstance(x, LazyTensor):
+            return x._materialize_data()
+        return x
 
-        Returns:
-            torch.Tensor: The sharded tensor.
-        """
-        if self._spec is None:
-            raise RuntimeError('ShardingSpec is not set for\n{self}')
-        spec, device_mesh = self._spec, self._spec.device_mesh
-        target = self.materialize()
-
-        # TODO(some man): better not be coupled with auto-parallel
-        target.data = scm.apply_for_autoparallel_runtime(target.data, ShardingSpec(device_mesh, target.shape, {}),
-                                                         spec).detach().clone()
-        return target
+    def _materialize_data(self) -> torch.Tensor:
+        # self._materialized_data should be generated after the first call of this function
+        if self._materialized_data is None:
+            # apply factory method
+            func, args, kwargs = self._factory_method
 
-    def _realize_cached_data(self) -> torch.Tensor:
-        # self._cached_data should be generated after the first call of this function
-        if self._cached_data is None:
-            if self._factory_method is not None:
-                # apply factory method
-                func, args, kwargs = self._factory_method
+            # call the pre-op hook before invoking the factory method
+            self._pre_op_fn()
 
-                # apply cached sequence
-                self._cached_data = self._apply_cache_buffer(func(*args, **kwargs))
-            else:
-                # apply cached sequence only
-                self._cached_data = self._apply_cache_buffer()
-        return self._cached_data
+            try:
+                init_val = func(*tree_map(self._replace_with_materialized, args),
+                                **tree_map(self._replace_with_materialized, kwargs))
+            except TypeError as e:
+                print(f'init fn: {func.__name__}')
+                raise e
+
+            self._materialized_data = self._rerun_ops(init_val)
+        return self._materialized_data
+
+    def _rerun_ops(self, target=None) -> torch.Tensor:
+        """Do lazy execution by rerunning all (stored) related operations.
+
+        Args:
+            target (torch.Tensor, optional): Initial value of the target tensor (self). Defaults to None.
+        """
 
-    def _apply_cache_buffer(self, target=None) -> torch.Tensor:
-        # dump all cached sequence
-        # super-dainiu: support methods for single Tensor only
         def replace(x):
             if x is self:
                 return target
             elif isinstance(x, LazyTensor):
-                return x._realize_cached_data()
+                return x._materialize_data()
             return x
 
         packed = None
 
-        for (func, args, kwargs) in self._cached_buffer:
+        for (func, args, kwargs) in self._op_buffer:
             if func == torch.Tensor.requires_grad_:
                 packed = func, args, kwargs    # requires grad should be set at last
             else:
+                self._pre_op_fn()
                 o = func(*tree_map(replace, args), **tree_map(replace, kwargs))
                 target = o if isinstance(o, torch.Tensor) else target    # if func returns non-Tensor, discard the value
 
@@ -194,24 +176,23 @@ def replace(x):
 
         return target
 
-    # clear all means:
-    #   1. clear factory method
-    #   2. clear cached sequence
-    #   3. clear cached data
-    def _clear_all(self):
-        self._cached_data = None
-        self._cached_buffer = None
-        self._data = None
-        gc.collect()    # avoid memory leak
-
     # cache everything with __torch_function__
+
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         if kwargs is None:
             kwargs = {}
-        target = None
+        if func.__name__ in _EARLY_MATERIALIZED_OPS:
+            # These OPs cannot be lazy and related tensors should be early materialized
+            tree_map(cls._replace_with_materialized, args)
+            tree_map(cls._replace_with_materialized, kwargs)
+        is_inplace: bool = (func.__name__.endswith('_') and not (func.__name__.endswith('__'))
+                            or func.__name__ == "__setitem__")
 
         if isinstance(func, torch._C.ScriptMethod):
+            # FIXME(ver217): torch script functions are not verified
+
+            target = None
 
             def unwrap(x):
                 if isinstance(x, LazyTensor):
@@ -219,79 +200,83 @@ def unwrap(x):
                 return x
 
             target: LazyTensor = args[0].clone()
-            target._cached_buffer.append((func, args, kwargs))
+            target._op_buffer.append((func, args, kwargs))
             target._meta_data = getattr(target._meta_data, func.name)(*tree_map(unwrap, args[1:]),
                                                                       **tree_map(unwrap, kwargs))
-
+            return target
         else:
 
+            meta_to_lazy = {}
+
             def unwrap(x):
-                nonlocal target
                 if isinstance(x, LazyTensor):
-                    target = x if (func.__name__.endswith('_') and not (func.__name__.endswith('__'))
-                                   or func.__name__ == "__setitem__") else x.clone()
-                    target._cached_buffer.append((func, args, kwargs))
-                    return x._meta_data
+                    if x._materialized_data is not None:
+                        # for early materialized tensor, use its materialized data directly
+                        return x._materialized_data
+                    t = x if is_inplace else x.clone()
+                    t._op_buffer.append((func, args, kwargs))
+                    meta = x._meta_data.data
+                    meta_to_lazy[meta] = t
+                    return meta
                 return x
 
-            args = tree_map(unwrap, args)
-            kwargs = tree_map(unwrap, kwargs)
-            o = func(*args, **kwargs)
-
-        if isinstance(o, MetaTensor):
-            target._meta_data = o
-            return target
-        else:
-            return o
+            def wrap(y, i=None):
+                if isinstance(y, MetaTensor):
+                    if y in meta_to_lazy:
+                        # inplace op, just return origin lazy tensor
+                        return meta_to_lazy[y]
+                    else:
+                        # out of place op, create new lazy tensor
+                        fn = lambda *a, **kw: func(*a, **kw) if i is None else func(*a, **kw)[i]
+                        lazy_y = LazyTensor(fn, *args, meta_data=y, **kwargs)
+                        return lazy_y
+                elif type(y) is Tensor:
+                    # for early materialized tensor
+                    with torch._C.DisableTorchFunction():
+                        meta = MetaTensor(y.new_empty(y.shape, dtype=y.dtype, device='meta'), fake_device=y.device)
+                    lazy_y = LazyTensor(lambda: None, meta_data=meta)
+                    lazy_y._materialized_data = y
+                    return lazy_y
+                return y
+
+            o = func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))
+            if isinstance(o, (tuple, list)):
+                return type(o)(wrap(y, i=i) for i, y in enumerate(o))
+            return wrap(o)
 
     @classmethod
     def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
         pass    # skip
 
     def clone(self) -> "LazyTensor":
-        """Create a new ``LazyTensor`` with same cached sequence and factory method.
 
-        Returns:
-            LazyTensor: the new ``LazyTensor``
-        """
-        target = LazyTensor(orig_empty, 0, dtype=self._meta_data.dtype, device=self._meta_data.device)
-        target._factory_method = None
-        target._cached_buffer = list()
-        target._meta_data = self._meta_data.clone()
-        target._cached_data = self._cached_data.clone() if self._cached_data is not None else None
-        target._spec = copy.deepcopy(self._spec)
-        return target
+        def factory_fn():
+            return self.materialize().clone()
 
-    def detach(self) -> "LazyTensor":
-        target = self.clone()
-        target._cached_buffer.append((torch.Tensor.detach_, (self,), {}))
-        return target
+        target = LazyTensor(factory_fn, meta_data=self._meta_data)
 
-    @property
-    def spec(self) -> ShardingSpec:
-        return self._spec
+        return target
 
-    @spec.setter
-    def spec(self, other: ShardingSpec):
-        self._spec = other
+    def detach(self) -> Tensor:
+        return self
 
     @property
-    def data(self) -> "LazyTensor":
-        return self._data.detach()
+    def data(self):
+        return self
 
     @data.setter
-    def data(self, other: "LazyTensor") -> "LazyTensor":
-        """This avoid the following infinite recursion, which is very common in ``nn.Module`` initialization.
+    def data(self, other: 'LazyTensor'):
+        raise NotImplementedError
 
-        Usage:
-            >>> a = LazyTensor(torch.empty, 0, dtype=torch.float32, device='cpu')
-            >>> b = a.cuda()
-            >>> a.data = b
-        """
-        self._data = other
+    def tolist(self) -> list:
+        t = self.materialize()
+        return t.tolist()
 
+    def __hash__(self):
+        return id(self)
 
-class LazyInitContext():
+
+class LazyInitContext:
     """Context manager for lazy initialization. Enables initializing the model without allocating real memory.
 
     Usage:
@@ -319,16 +304,21 @@ class LazyInitContext():
             1. Quantization strategies can be applied before allocating real memory.
             2. Lazy initialization seems slower than normal initialization.
     """
+    _replaced: bool = False
 
-    def __init__(self):
+    def __init__(self, tensor_cls: Union[_MyTensor, LazyTensor] = LazyTensor):
         self.overrides = {}
+        self.tensor_cls = tensor_cls
 
     def __enter__(self):
+        if LazyInitContext._replaced:
+            raise RuntimeError(f'LazyInitContext is not reentrant')
+        LazyInitContext._replaced = True
 
         def wrap_factory_method(target):
             # factory functions (eg. torch.empty())
             def wrapper(*args, **kwargs):
-                return LazyTensor(target, *args, **kwargs)
+                return self.tensor_cls(target, *args, **kwargs)
 
             return wrapper, target
 
@@ -336,7 +326,7 @@ def wrap_factory_like_method(orig_target, target):
             # factory_like functions (eg. torch.empty_like())
             def wrapper(*args, **kwargs):
                 orig_t = args[0]
-                return LazyTensor(orig_target, *args[1:], device=orig_t.device, dtype=orig_t.dtype, **kwargs)
+                return self.tensor_cls(orig_target, *args[1:], device=orig_t.device, dtype=orig_t.dtype, **kwargs)
 
             return wrapper, target
 
@@ -356,85 +346,51 @@ def wrapper(*args, **kwargs):
             setattr(torch, name, wrapper)
 
     def __exit__(self, exc_type, exc_val, exc_tb):
+        LazyInitContext._replaced = False
         for name, (wrapper, orig) in self.overrides.items():
             setattr(torch, name, orig)
 
     @staticmethod
-    def materialize(module: torch.nn.Module):
+    def materialize(module: torch.nn.Module, verbose: bool = False):
         """Initialize all ``nn.Parameter`` from ``LazyTensor``.
 
         Args:
             module (torch.nn.Module): Target ``nn.Module``
+            verbose (bool): Whether to print lazy initialization rate. Defaults to False.
         """
+        if verbose:
+            param_cnt = 0
+            param_lazy_cnt = 0
+            buf_cnt = 0
+            buf_lazy_cnt = 0
 
         @torch.no_grad()
         def init_recursively(module: nn.Module):
+            nonlocal param_cnt, param_lazy_cnt, buf_cnt, buf_lazy_cnt
             # recursively initialize the module
             for mod in module.children():
                 init_recursively(mod)
 
             # initialize tensors directly attached to the current module
             for name, param in module.named_parameters(recurse=False):
+                if verbose:
+                    param_cnt += 1
+                    if param._materialized_data is None:
+                        param_lazy_cnt += 1
                 setattr(module, name, param.materialize())
+                param.clean()
 
             for name, buf in module.named_buffers(recurse=False):
+                if verbose:
+                    buf_cnt += 1
+                    if buf._materialized_data is None:
+                        buf_lazy_cnt += 1
                 setattr(module, name, buf.materialize())
+                buf.clean()
 
         init_recursively(module)
-        return module
-
-    @staticmethod
-    def distribute(module: torch.nn.Module):
-        """Initialize and shard all ``nn.Parameter`` from ``LazyTensor``.
-
-        Args:
-            module (torch.nn.Module): Sharded target ``nn.Module``
-        """
-
-        @torch.no_grad()
-        def init_recursively(module: nn.Module):
-            # recursively initialize the module
-            for mod in module.children():
-                init_recursively(mod)
-
-            # initialize tensors directly attached to the current module
-            for name, param in module.named_parameters(recurse=False):
-                setattr(module, name, param.distribute())
-
-            for name, buf in module.named_buffers(recurse=False):
-                setattr(module, name, buf.distribute())
 
-        init_recursively(module)
+        if verbose:
+            print(f'Param lazy rate: {param_lazy_cnt}/{param_cnt}')
+            print(f'Buffer lazy rate: {buf_lazy_cnt}/{buf_cnt}')
         return module
-
-    @staticmethod
-    @contextlib.contextmanager
-    def traceable(module: torch.nn.Module):
-        """Initialize all ``nn.Parameters`` as ``MetaTensor``. This enables ``ColoTracer`` with control flow.
-
-        Args:
-            module (torch.nn.Module): Traceable ``nn.Module`` with ``MetaTensor`` as parameters.
-        """
-        orig_val = dict()
-
-        def init_recursively(module: nn.Module):
-            # recursively initialize the module
-            for mod in module.children():
-                init_recursively(mod)
-
-            # initialize tensors directly attached to the current module
-            for name, param in module.named_parameters(recurse=False):
-                setattr(module, name, param.traceable())
-                orig_val[(module, name)] = param
-
-            for name, buf in module.named_buffers(recurse=False):
-                setattr(module, name, buf.traceable())
-                orig_val[(module, name)] = buf
-
-        init_recursively(module)
-
-        yield
-
-        # restore original values
-        for (module, name), val in orig_val.items():
-            setattr(module, name, val)

From 2eca4cd376918e6cd7b085b87483af92acf067bf Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Tue, 14 Mar 2023 16:25:47 +0800
Subject: [PATCH 467/503] [DTensor] refactor dtensor with new components
 (#3089)

* [DTensor] refactor dtensor with new components

* polish
---
 colossalai/tensor/d_tensor/d_tensor.py        | 44 ++++++-------------
 .../tensor/d_tensor/layout_converter.py       |  6 +--
 .../test_tensor/test_dtensor/test_dtensor.py  | 11 ++---
 3 files changed, 20 insertions(+), 41 deletions(-)

diff --git a/colossalai/tensor/d_tensor/d_tensor.py b/colossalai/tensor/d_tensor/d_tensor.py
index e311eb3ba241..c1fe9d50a048 100644
--- a/colossalai/tensor/d_tensor/d_tensor.py
+++ b/colossalai/tensor/d_tensor/d_tensor.py
@@ -3,12 +3,11 @@
 import torch
 from torch.utils._pytree import tree_map
 
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.d_tensor.layout import Layout
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager, to_global
-from colossalai.tensor.sharding_spec import ShardingSpec
+from .layout import Layout
+from .layout_converter import LayoutConverter, to_global
+from .sharding_spec import ShardingSpec
 
-shape_consistency_manager = ShapeConsistencyManager()
+layout_converter = LayoutConverter()
 
 
 class DTensor(torch.Tensor):
@@ -17,8 +16,6 @@ def __init__(self, local_tensor: torch.Tensor, dist_layout: Layout):
         self.local_tensor = local_tensor
         self.data_type = local_tensor.dtype
         self.entire_shape = local_tensor.shape
-        if dist_layout.entire_shape is None:
-            dist_layout.entire_shape = self.entire_shape
         self.dist_layout = dist_layout
         self._apply_layout()
 
@@ -36,20 +33,19 @@ def layout_convert(self, target_layout):
         '''
         Convert the layout of the tensor from source_spec to target_spec.
         '''
-        source_spec = convert_layout_to_sharding_spec(self.dist_layout)
-        target_spec = convert_layout_to_sharding_spec(target_layout)
-        self.local_tensor = shape_consistency_manager.apply_for_autoparallel_runtime(
-            self.local_tensor, source_spec, target_spec)
+        self.local_tensor = layout_converter.apply(self.local_tensor, self.dist_layout, target_layout)
         self.dist_layout = target_layout
 
     def _apply_layout(self):
         '''
         Apply the layout to the local tensor during initializing process.
         '''
-        source_spec = construct_default_sharding_spec(self.local_tensor, self.device_mesh)
-        target_spec = convert_layout_to_sharding_spec(self.dist_layout)
-        self.local_tensor = shape_consistency_manager.apply_for_autoparallel_runtime(
-            self.local_tensor, source_spec, target_spec)
+        source_spec = construct_default_sharding_spec(self.local_tensor)
+        source_layout = Layout(device_mesh=self.dist_layout.device_mesh,
+                               device_type=self.dist_layout.device_type,
+                               sharding_spec=source_spec,
+                               entire_shape=self.entire_shape)
+        self.local_tensor = layout_converter.apply(self.local_tensor, source_layout, self.dist_layout)
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
@@ -108,7 +104,7 @@ def to_global(self):
         will not change the layout of the DTensor. This function is mainly used for debugging or
         check the correctness of the distributed tensor.
         '''
-        return to_global(self.local_tensor, convert_layout_to_sharding_spec(self.dist_layout))
+        return to_global(self.local_tensor, self.dist_layout)
 
 
 def distribute_tensor(local_tensor: torch.Tensor, dist_layout: Layout) -> DTensor:
@@ -139,20 +135,8 @@ def distribute_module(module: torch.nn.Module, partition_fn: Optional[callable]
     return module
 
 
-def convert_layout_to_sharding_spec(layout: Layout) -> ShardingSpec:
-    '''
-    Convert the layout from Layout class to ShardingSpec class.
-    '''
-    return ShardingSpec(device_mesh=layout.device_mesh,
-                        entire_shape=layout.entire_shape,
-                        dim_partition_dict=layout.sharding_spec.dim_partition_dict)
-
-
-def construct_default_sharding_spec(
-    tensor: torch.Tensor,
-    device_mesh: DeviceMesh,
-) -> ShardingSpec:
+def construct_default_sharding_spec(tensor: torch.Tensor,) -> ShardingSpec:
     '''
     Construct the default sharding specification for the tensor.
     '''
-    return ShardingSpec(device_mesh=device_mesh, entire_shape=tensor.shape, dim_partition_dict={})
+    return ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={})
diff --git a/colossalai/tensor/d_tensor/layout_converter.py b/colossalai/tensor/d_tensor/layout_converter.py
index 22bbb1d2fe74..a4f4c9c2dd80 100644
--- a/colossalai/tensor/d_tensor/layout_converter.py
+++ b/colossalai/tensor/d_tensor/layout_converter.py
@@ -22,21 +22,21 @@
 @dataclass
 class LayoutConverterOptions:
     """
-    LayoutConverterOptions is a dataclass which specifies the preferences for shape consistency.
+    LayoutConverterOptions is a dataclass which specifies the preferences for layout converting.
     """
     # TODO: layout converter option is not implemented yet
     pass
 
 
 def to_global(distributed_tensor: torch.Tensor, layout: Layout) -> torch.Tensor:
-    shape_consistency_manager = LayoutConverter()
+    layout_converter = LayoutConverter()
     global_sharding_spec = ShardingSpec(distributed_tensor.dim(), {})
     global_layout = Layout(device_mesh=layout.device_mesh,
                            device_type=layout.device_type,
                            sharding_spec=global_sharding_spec,
                            entire_shape=layout.entire_shape)
     with torch.no_grad():
-        global_tensor = shape_consistency_manager.apply(distributed_tensor, layout, global_layout)
+        global_tensor = layout_converter.apply(distributed_tensor, layout, global_layout)
     return global_tensor
 
 
diff --git a/tests/test_tensor/test_dtensor/test_dtensor.py b/tests/test_tensor/test_dtensor/test_dtensor.py
index 80e275d9740e..a99ac6e41c5e 100644
--- a/tests/test_tensor/test_dtensor/test_dtensor.py
+++ b/tests/test_tensor/test_dtensor/test_dtensor.py
@@ -4,12 +4,11 @@
 import torch.multiprocessing as mp
 
 from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.tracer import ColoTracer
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.tensor.d_tensor.d_tensor import DTensor, distribute_tensor
 from colossalai.tensor.d_tensor.layout import Layout
-from colossalai.tensor.sharding_spec import ShardingSpec
+from colossalai.tensor.d_tensor.sharding_spec import ShardingSpec
 from colossalai.utils import free_port
 
 
@@ -34,9 +33,7 @@ def check_dtensor(rank, world_size, port):
     compare_output = test_model(original_tensor)
 
     device_mesh = DeviceMesh(torch.Tensor([0, 1, 2, 3]), (2, 2), init_process_group=True)
-    target_sharding_spec = ShardingSpec(device_mesh=device_mesh,
-                                        entire_shape=original_tensor.shape,
-                                        dim_partition_dict={0: [0]})
+    target_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict={0: [0]})
     layout = Layout(device_mesh=device_mesh,
                     device_type=torch.device('cuda'),
                     sharding_spec=target_sharding_spec,
@@ -62,9 +59,7 @@ def check_dtensor(rank, world_size, port):
     else:
         raise ValueError(f'rank {rank} is not in the device mesh')
 
-    new_sharding_spec = ShardingSpec(device_mesh=device_mesh,
-                                     entire_shape=original_tensor.shape,
-                                     dim_partition_dict={0: [0, 1]})
+    new_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict={0: [0, 1]})
     new_layout = Layout(device_mesh=device_mesh,
                         device_type=torch.device('cuda'),
                         sharding_spec=new_sharding_spec,

From 1a46e71e077acc021a96bd3b25b8bd6013cd95e0 Mon Sep 17 00:00:00 2001
From: Saurav Maheshkar <sauravvmaheshkar@gmail.com>
Date: Tue, 14 Mar 2023 08:28:06 +0000
Subject: [PATCH 468/503] [docker] Add opencontainers image-spec to
 `Dockerfile` (#3006)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(docker): Add opencontainers image-spec to `Dockerfile`

This PR makes a few changes to improve the overall quality of the docker image 🐳. For reference, more annotations can be found [here](https://github.com/opencontainers/image-spec/blob/main/annotations.md)

* feat(docker): add inline version declaration

* fix(docker): drop `org.opencontainers.image.version` LABEL
---
 docker/Dockerfile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 0faba17b9cee..49ff9b344268 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,5 +1,10 @@
 FROM hpcaitech/cuda-conda:11.3
 
+# metainformation (OCI annotations); keep key=value unspaced, else Docker uses the legacy "LABEL key value" form
+LABEL org.opencontainers.image.source="https://github.com/hpcaitech/ColossalAI"
+LABEL org.opencontainers.image.licenses="Apache License 2.0"
+LABEL org.opencontainers.image.base.name="docker.io/library/hpcaitech/cuda-conda:11.3"
+
 # install torch
 RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
 
@@ -22,4 +27,4 @@ RUN conda install cmake && \
     git clone https://github.com/hpcaitech/TensorNVMe.git && \
     cd TensorNVMe && \
     pip install -r requirements.txt && \
-    pip install -v --no-cache-dir .
\ No newline at end of file
+    pip install -v --no-cache-dir .

From 1216d1e7bdf223d831895e34c01fb40df36ea9c7 Mon Sep 17 00:00:00 2001
From: HELSON <c2h214748@gmail.com>
Date: Tue, 14 Mar 2023 17:20:28 +0800
Subject: [PATCH 469/503] [tests] diffuser models in model zoo (#3136)

* [tests] diffuser models in model zoo

* remove useless code

* [tests] add diffusers to requirement-test
---
 requirements/requirements-test.txt            |   1 +
 tests/kit/model_zoo/__init__.py               |   2 +-
 tests/kit/model_zoo/diffusers/__init__.py     |   1 +
 tests/kit/model_zoo/diffusers/diffusers.py    |  73 ++++++++
 .../test_hf_model/test_hf_diffuser.py         | 167 +++++++-----------
 5 files changed, 137 insertions(+), 107 deletions(-)
 create mode 100644 tests/kit/model_zoo/diffusers/__init__.py
 create mode 100644 tests/kit/model_zoo/diffusers/diffusers.py

diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 93055cd12109..05c0e6ac5e5c 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -1,3 +1,4 @@
+diffusers
 fbgemm-gpu==0.2.0
 pytest
 pytest-cov
diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 435daea2c7de..6d77fb850a4b 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,4 +1,4 @@
-from . import timm
+from . import diffusers, timm
 from .registry import model_zoo
 
 __all__ = ['model_zoo']
diff --git a/tests/kit/model_zoo/diffusers/__init__.py b/tests/kit/model_zoo/diffusers/__init__.py
new file mode 100644
index 000000000000..288f626a4539
--- /dev/null
+++ b/tests/kit/model_zoo/diffusers/__init__.py
@@ -0,0 +1 @@
+from .diffusers import *
diff --git a/tests/kit/model_zoo/diffusers/diffusers.py b/tests/kit/model_zoo/diffusers/diffusers.py
new file mode 100644
index 000000000000..8aa3f4c6741f
--- /dev/null
+++ b/tests/kit/model_zoo/diffusers/diffusers.py
@@ -0,0 +1,73 @@
+from functools import partial
+
+import diffusers
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+BATCH_SIZE = 2
+SEQ_LENGTH = 5
+HEIGHT = 224
+WIDTH = 224
+IN_CHANNELS = 3
+LATENTS_SHAPE = (BATCH_SIZE, IN_CHANNELS, HEIGHT // 7, WIDTH // 7)
+TIME_STEP = 3
+
+data_vae_fn = lambda: dict(sample=torch.randn(2, 3, 32, 32))
+data_unet_fn = lambda: dict(sample=torch.randn(2, 3, 32, 32), timestep=3)
+
+identity_output = lambda x: x
+
+
+def data_clip_model():
+    input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    position_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    pixel_values = torch.zeros((BATCH_SIZE, IN_CHANNELS, HEIGHT, WIDTH), dtype=torch.float32)
+    return dict(input_ids=input_ids,
+                pixel_values=pixel_values,
+                attention_mask=attention_mask,
+                position_ids=position_ids)
+
+
+def data_clip_text():
+    input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def data_clip_vision():
+    pixel_values = torch.zeros((BATCH_SIZE, IN_CHANNELS, HEIGHT, WIDTH), dtype=torch.float32)
+    return dict(pixel_values=pixel_values)
+
+
+model_zoo.register(name='diffusers_auto_encoder_kl',
+                   model_fn=diffusers.AutoencoderKL,
+                   data_gen_fn=data_vae_fn,
+                   output_transform_fn=identity_output)
+
+model_zoo.register(name='diffusers_vq_model',
+                   model_fn=diffusers.VQModel,
+                   data_gen_fn=data_vae_fn,
+                   output_transform_fn=identity_output)
+
+model_zoo.register(name='diffusers_clip_model',
+                   model_fn=partial(transformers.CLIPModel, config=transformers.CLIPConfig()),
+                   data_gen_fn=data_clip_model,
+                   output_transform_fn=identity_output)
+
+model_zoo.register(name='diffusers_clip_text_model',
+                   model_fn=partial(transformers.CLIPTextModel, config=transformers.CLIPTextConfig()),
+                   data_gen_fn=data_clip_text,
+                   output_transform_fn=identity_output)
+
+model_zoo.register(name='diffusers_clip_vision_model',
+                   model_fn=partial(transformers.CLIPVisionModel, config=transformers.CLIPVisionConfig()),
+                   data_gen_fn=data_clip_vision,
+                   output_transform_fn=identity_output)
+
+model_zoo.register(name='diffusers_unet2d_model',
+                   model_fn=diffusers.UNet2DModel,
+                   data_gen_fn=data_unet_fn,
+                   output_transform_fn=identity_output)
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_diffuser.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_diffuser.py
index 04e874becd00..92ece357bfed 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_diffuser.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_diffuser.py
@@ -1,114 +1,69 @@
 import pytest
 import torch
-import transformers
-from hf_tracer_utils import trace_model_and_compare_output
 
 from colossalai.fx import symbolic_trace
+from colossalai.testing.random import seed_all
+from tests.kit.model_zoo import model_zoo
 
-try:
-    import diffusers
-    HAS_DIFFUSERS = True
-except ImportError:
-    HAS_DIFFUSERS = False
-
-BATCH_SIZE = 2
-SEQ_LENGTH = 5
-HEIGHT = 224
-WIDTH = 224
-IN_CHANNELS = 3
-LATENTS_SHAPE = (BATCH_SIZE, IN_CHANNELS, HEIGHT // 8, WIDTH // 8)
-TIME_STEP = 2
-
-
-@pytest.mark.skipif(not HAS_DIFFUSERS, reason="diffusers has not been installed")
-def test_vae():
-    MODEL_LIST = [
-        diffusers.AutoencoderKL,
-        diffusers.VQModel,
-    ]
-
-    for model_cls in MODEL_LIST:
-        model = model_cls()
-        sample = torch.zeros(LATENTS_SHAPE)
-
-        gm = symbolic_trace(model)
-
-        model.eval()
-        gm.eval()
-
-        with torch.no_grad():
-            fx_out = gm(sample)
-            non_fx_out = model(sample)
-        assert torch.allclose(
-            fx_out['sample'],
-            non_fx_out['sample']), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
-
-
-def test_clip():
-    MODEL_LIST = [
-        transformers.CLIPModel,
-        transformers.CLIPTextModel,
-        transformers.CLIPVisionModel,
-    ]
-
-    CONFIG_LIST = [
-        transformers.CLIPConfig,
-        transformers.CLIPTextConfig,
-        transformers.CLIPVisionConfig,
-    ]
-
-    def data_gen():
-        if isinstance(model, transformers.CLIPModel):
-            input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-            attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-            position_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-            pixel_values = torch.zeros((BATCH_SIZE, IN_CHANNELS, HEIGHT, WIDTH), dtype=torch.float32)
-            kwargs = dict(input_ids=input_ids,
-                          attention_mask=attention_mask,
-                          position_ids=position_ids,
-                          pixel_values=pixel_values)
-        elif isinstance(model, transformers.CLIPTextModel):
-            input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-            attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-            kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)
-        elif isinstance(model, transformers.CLIPVisionModel):
-            pixel_values = torch.zeros((BATCH_SIZE, IN_CHANNELS, HEIGHT, WIDTH), dtype=torch.float32)
-            kwargs = dict(pixel_values=pixel_values)
-        return kwargs
-
-    for model_cls, config in zip(MODEL_LIST, CONFIG_LIST):
-        model = model_cls(config=config())
-        trace_model_and_compare_output(model, data_gen)
-
-
-@pytest.mark.skipif(not HAS_DIFFUSERS, reason="diffusers has not been installed")
-@pytest.mark.skip(reason='cannot pass the test yet')
-def test_unet():
-    MODEL_LIST = [
-        diffusers.UNet2DModel,
-        diffusers.UNet2DConditionModel,
-    ]
-
-    for model_cls in MODEL_LIST:
-        model = model_cls()
-        sample = torch.zeros(LATENTS_SHAPE)
-
-        gm = symbolic_trace(model)
-
-        model.eval()
-        gm.eval()
-
-        with torch.no_grad():
-            fx_out = gm(sample, TIME_STEP)
-            non_fx_out = model(sample, TIME_STEP)
-        assert torch.allclose(
-            fx_out['sample'],
-            non_fx_out['sample']), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
 
+def assert_dict(da, db, assert_fn):
+    assert len(da) == len(db)
+    for k, v in da.items():
+        assert k in db
+        if not torch.is_tensor(v):
+            continue
+        u = db.get(k)
+        assert_fn(u, v)
 
-if __name__ == "__main__":
-    test_vae()
-    test_clip()
 
-    # skip because of failure
-    # test_unet()
+def trace_and_compare(model_cls, data, output_fn):
+    model = model_cls()
+    model.eval()
+
+    concrete_args = {k: v for k, v in data.items() if not torch.is_tensor(v)}
+    meta_args = {k: v.to('meta') for k, v in data.items() if torch.is_tensor(v)}
+    gm = symbolic_trace(model, concrete_args=concrete_args, meta_args=meta_args)
+
+    # run forward
+    with torch.no_grad():
+        fx_out = gm(**data)
+        non_fx_out = model(**data)
+
+    # compare output
+    transformed_fx_out = output_fn(fx_out)
+    transformed_non_fx_out = output_fn(non_fx_out)
+
+    def assert_fn(ta, tb):
+        assert torch.equal(ta, tb)
+
+    assert_dict(transformed_fx_out, transformed_non_fx_out, assert_fn)
+
+
+@pytest.mark.skip(reason='cannot pass this test yet')
+def test_diffusers():
+    seed_all(9091, cuda_deterministic=True)
+
+    sub_model_zoo = model_zoo.get_sub_registry('diffusers')
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in sub_model_zoo.items():
+        data = data_gen_fn()
+        trace_and_compare(model_fn, data, output_transform_fn)
+        torch.cuda.synchronize()
+        print(f"{name:40s} √")
+
+
+def test_torch_diffusers():
+    seed_all(65535, cuda_deterministic=True)
+
+    sub_model_zoo = model_zoo.get_sub_registry('diffusers')
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in sub_model_zoo.items():
+        data = data_gen_fn()
+        model = model_fn()
+        output = model(**data)
+        torch.cuda.synchronize()
+        print(f"{name:40s} √")
+
+
+if __name__ == "__main__":
+    test_torch_diffusers()

From a674c6334846aa4af71703961d68907e8d0611b2 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 15 Mar 2023 10:42:07 +0800
Subject: [PATCH 470/503] [test] added torchvision models to test model zoo
 (#3132)

* [test] added torchvision models to test model zoo

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 tests/kit/model_zoo/__init__.py               |   2 +-
 tests/kit/model_zoo/registry.py               |   5 +
 tests/kit/model_zoo/torchvision/__init__.py   |   1 +
 .../kit/model_zoo/torchvision/torchvision.py  | 131 ++++++++++++++++++
 .../test_torchvision_model.py                 |  49 ++++---
 5 files changed, 162 insertions(+), 26 deletions(-)
 create mode 100644 tests/kit/model_zoo/torchvision/__init__.py
 create mode 100644 tests/kit/model_zoo/torchvision/torchvision.py

diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 6d77fb850a4b..abe18ebfad8f 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,4 +1,4 @@
-from . import diffusers, timm
+from . import diffusers, timm, torchvision
 from .registry import model_zoo
 
 __all__ = ['model_zoo']
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index 4e7dcb30f04d..7470327a65b6 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -9,8 +9,13 @@
 class ModelAttribute:
     """
     Attributes of a model.
+
+    Args:
+        has_control_flow (bool): Whether the model contains branching in its forward method.
+        has_stochastic_depth_prob (bool): Whether the model contains stochastic depth probability. Often seen in the torchvision models.
     """
     has_control_flow: bool = False
+    has_stochastic_depth_prob: bool = False
 
 
 class ModelZooRegistry(dict):
diff --git a/tests/kit/model_zoo/torchvision/__init__.py b/tests/kit/model_zoo/torchvision/__init__.py
new file mode 100644
index 000000000000..55d58f97b5d4
--- /dev/null
+++ b/tests/kit/model_zoo/torchvision/__init__.py
@@ -0,0 +1 @@
+from .torchvision import *
diff --git a/tests/kit/model_zoo/torchvision/torchvision.py b/tests/kit/model_zoo/torchvision/torchvision.py
new file mode 100644
index 000000000000..62bda93d5a75
--- /dev/null
+++ b/tests/kit/model_zoo/torchvision/torchvision.py
@@ -0,0 +1,131 @@
+from collections import namedtuple
+
+import torch
+import torchvision
+import torchvision.models as tm
+from packaging import version
+
+from ..registry import ModelAttribute, model_zoo
+
+data_gen_fn = lambda: dict(x=torch.rand(4, 3, 224, 224))
+output_transform_fn = lambda x: dict(output=x)
+
+# special data gen fn
+inception_v3_data_gen_fn = lambda: dict(x=torch.rand(4, 3, 299, 299))
+
+
+# special model fn
+def swin_s():
+    from torchvision.models.swin_transformer import Swin_T_Weights, _swin_transformer
+
+    # NOTE(review): despite the name, this uses Swin_T_Weights and depths [2, 2, 6, 2]; looks adapted from torchvision's swin_t builder - confirm
+    weights = None
+    weights = Swin_T_Weights.verify(weights)
+    progress = True
+
+    return _swin_transformer(
+        patch_size=[4, 4],
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=[7, 7],
+        stochastic_depth_prob=0,    # it is originally 0.2, but we set it to 0 to make it deterministic
+        weights=weights,
+        progress=progress,
+    )
+
+
+# special output transform fn
+google_net_output_transform_fn = lambda x: dict(output=x.logits) if isinstance(x, torchvision.models.GoogLeNetOutputs
+                                                                              ) else dict(output=x)
+swin_s_output_output_transform_fn = lambda x: {f'output{idx}': val
+                                               for idx, val in enumerate(x)} if isinstance(x, tuple) else dict(output=x)
+inception_v3_output_transform_fn = lambda x: dict(output=x.logits) if isinstance(x, torchvision.models.InceptionOutputs
+                                                                                ) else dict(output=x)
+
+model_zoo.register(name='torchvision_alexnet',
+                   model_fn=tm.alexnet,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_densenet121',
+                   model_fn=tm.densenet121,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_efficientnet_b0',
+                   model_fn=tm.efficientnet_b0,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_stochastic_depth_prob=True))
+model_zoo.register(name='torchvision_googlenet',
+                   model_fn=tm.googlenet,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=google_net_output_transform_fn)
+model_zoo.register(name='torchvision_inception_v3',
+                   model_fn=tm.inception_v3,
+                   data_gen_fn=inception_v3_data_gen_fn,
+                   output_transform_fn=inception_v3_output_transform_fn)
+model_zoo.register(name='torchvision_mobilenet_v2',
+                   model_fn=tm.mobilenet_v2,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_mobilenet_v3_small',
+                   model_fn=tm.mobilenet_v3_small,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_mnasnet0_5',
+                   model_fn=tm.mnasnet0_5,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_resnet18',
+                   model_fn=tm.resnet18,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_regnet_x_16gf',
+                   model_fn=tm.regnet_x_16gf,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_resnext50_32x4d',
+                   model_fn=tm.resnext50_32x4d,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_shufflenet_v2_x0_5',
+                   model_fn=tm.shufflenet_v2_x0_5,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_squeezenet1_0',
+                   model_fn=tm.squeezenet1_0,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+
+model_zoo.register(name='torchvision_vgg11',
+                   model_fn=tm.vgg11,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+model_zoo.register(name='torchvision_wide_resnet50_2',
+                   model_fn=tm.wide_resnet50_2,
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
+
+if version.parse(torchvision.__version__) >= version.parse('0.12.0'):
+    model_zoo.register(name='torchvision_vit_b_16',
+                       model_fn=tm.vit_b_16,
+                       data_gen_fn=data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+    model_zoo.register(name='torchvision_convnext_base',
+                       model_fn=tm.convnext_base,
+                       data_gen_fn=data_gen_fn,
+                       output_transform_fn=output_transform_fn,
+                       model_attribute=ModelAttribute(has_stochastic_depth_prob=True))
+
+if version.parse(torchvision.__version__) >= version.parse('0.13.0'):
+    model_zoo.register(
+        name='torchvision_swin_s',
+        model_fn=swin_s,
+        data_gen_fn=data_gen_fn,
+        output_transform_fn=swin_s_output_output_transform_fn,
+    )
+    model_zoo.register(name='torchvision_efficientnet_v2_s',
+                       model_fn=tm.efficientnet_v2_s,
+                       data_gen_fn=data_gen_fn,
+                       output_transform_fn=output_transform_fn,
+                       model_attribute=ModelAttribute(has_stochastic_depth_prob=True))
diff --git a/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py b/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py
index 2a6c6ae1674b..455638818463 100644
--- a/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py
+++ b/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py
@@ -1,44 +1,43 @@
 import torch
-import torchvision
-import torchvision.models as tm
-from packaging import version
 
 from colossalai.fx import symbolic_trace
+from tests.kit.model_zoo import model_zoo
 
 
 def test_torchvision_models():
-    MODEL_LIST = [
-        tm.vgg11, tm.resnet18, tm.densenet121, tm.mobilenet_v3_small, tm.resnext50_32x4d, tm.wide_resnet50_2,
-        tm.regnet_x_16gf, tm.mnasnet0_5, tm.efficientnet_b0
-    ]
-
-    RANDOMIZED_MODELS = [tm.efficientnet_b0]
-
-    if version.parse(torchvision.__version__) >= version.parse('0.12.0'):
-        MODEL_LIST.extend([tm.vit_b_16, tm.convnext_small])
-        RANDOMIZED_MODELS.append(tm.convnext_small)
-
     torch.backends.cudnn.deterministic = True
+    tv_sub_registry = model_zoo.get_sub_registry('torchvision')
 
-    data = torch.rand(2, 3, 224, 224)
+    for name, (model_fn, data_gen_fn, output_transform_fn, model_attribute) in tv_sub_registry.items():
+        data = data_gen_fn()
 
-    for model_cls in MODEL_LIST:
-        if model_cls in RANDOMIZED_MODELS:
-            # remove the impact of randomicity
-            model = model_cls(stochastic_depth_prob=0)
+        if model_attribute is not None and model_attribute.has_stochastic_depth_prob:
+            model = model_fn(stochastic_depth_prob=0)
         else:
-            model = model_cls()
+            model = model_fn()
 
         gm = symbolic_trace(model)
 
         model.eval()
         gm.eval()
 
-        with torch.no_grad():
-            fx_out = gm(data)
-            non_fx_out = model(data)
-        assert torch.allclose(
-            fx_out, non_fx_out), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+        try:
+            with torch.no_grad():
+                fx_out = gm(**data)
+                non_fx_out = model(**data)
+                transformed_out = output_transform_fn(fx_out)
+                transformed_non_fx_out = output_transform_fn(non_fx_out)
+
+            assert len(transformed_out) == len(transformed_non_fx_out)
+
+            for key in transformed_out.keys():
+                fx_val = transformed_out[key]
+                non_fx_val = transformed_non_fx_out[key]
+                assert torch.allclose(
+                    fx_val,
+                    non_fx_val), f'{model.__class__.__name__} has inconsistent outputs, {fx_val} vs {non_fx_val}'
+        except Exception as e:
+            print(name, e)
 
 
 if __name__ == '__main__':

From 6d48eb0560159624014dc17b019e6319376b4ed6 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 15 Mar 2023 11:26:10 +0800
Subject: [PATCH 471/503] [test] added transformers models to test model zoo
 (#3135)

---
 tests/kit/model_zoo/__init__.py               |  2 +-
 tests/kit/model_zoo/transformers/__init__.py  |  5 ++
 tests/kit/model_zoo/transformers/albert.py    | 85 ++++++++++++++++++
 tests/kit/model_zoo/transformers/bert.py      | 88 +++++++++++++++++++
 tests/kit/model_zoo/transformers/gpt.py       | 49 +++++++++++
 tests/kit/model_zoo/transformers/opt.py       | 35 ++++++++
 tests/kit/model_zoo/transformers/t5.py        | 46 ++++++++++
 .../test_hf_model/test_hf_albert.py           | 64 ++------------
 .../test_tracer/test_hf_model/test_hf_bert.py | 68 ++------------
 .../test_tracer/test_hf_model/test_hf_gpt.py  | 28 ++----
 .../test_tracer/test_hf_model/test_hf_opt.py  | 25 ++----
 .../test_tracer/test_hf_model/test_hf_t5.py   | 37 ++------
 12 files changed, 339 insertions(+), 193 deletions(-)
 create mode 100644 tests/kit/model_zoo/transformers/__init__.py
 create mode 100644 tests/kit/model_zoo/transformers/albert.py
 create mode 100644 tests/kit/model_zoo/transformers/bert.py
 create mode 100644 tests/kit/model_zoo/transformers/gpt.py
 create mode 100644 tests/kit/model_zoo/transformers/opt.py
 create mode 100644 tests/kit/model_zoo/transformers/t5.py

diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index abe18ebfad8f..7f14d04c0910 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,4 +1,4 @@
-from . import diffusers, timm, torchvision
+from . import diffusers, timm, torchvision, transformers
 from .registry import model_zoo
 
 __all__ = ['model_zoo']
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
new file mode 100644
index 000000000000..f56ff7ad84eb
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -0,0 +1,5 @@
+from .albert import *
+from .bert import *
+from .gpt import *
+from .opt import *
+from .t5 import *
diff --git a/tests/kit/model_zoo/transformers/albert.py b/tests/kit/model_zoo/transformers/albert.py
new file mode 100644
index 000000000000..e85f564e376a
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/albert.py
@@ -0,0 +1,85 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence ALBERT
+# ===============================
+BATCH_SIZE = 2
+SEQ_LENGTH = 16
+
+
+def data_gen_fn():
+    input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    token_type_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    return dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+
+
+output_transform_fn = lambda x: x
+
+config = transformers.AlbertConfig(embedding_size=128,
+                                   hidden_size=128,
+                                   num_hidden_layers=2,
+                                   num_attention_heads=4,
+                                   intermediate_size=256)
+
+model_zoo.register(name='transformers_albert',
+                   model_fn=lambda: transformers.AlbertModel(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_albert_for_pretraining',
+                   model_fn=lambda: transformers.AlbertForPreTraining(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_albert_for_masked_lm',
+                   model_fn=lambda: transformers.AlbertForMaskedLM(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_albert_for_sequence_classification',
+                   model_fn=lambda: transformers.AlbertForSequenceClassification(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_albert_for_token_classification',
+                   model_fn=lambda: transformers.AlbertForTokenClassification(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+# ===============================
+# Register multi-sentence ALBERT
+# ===============================
+
+
+def data_gen_for_qa():
+    question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
+    inputs = tokenizer(question, text, return_tensors="pt")
+    return inputs
+
+
+def data_gen_for_mcq():
+    prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+    choice0 = "It is eaten with a fork and a knife."
+    choice1 = "It is eaten while held in the hand."
+    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
+    encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
+    encoding = {k: v.unsqueeze(0) for k, v in encoding.items()}
+    return encoding
+
+
+model_zoo.register(name='transformers_albert_for_question_answering',
+                   model_fn=lambda: transformers.AlbertForQuestionAnswering(config),
+                   data_gen_fn=data_gen_for_qa,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_albert_for_multiple_choice',
+                   model_fn=lambda: transformers.AlbertForMultipleChoice(config),
+                   data_gen_fn=data_gen_for_mcq,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/kit/model_zoo/transformers/bert.py b/tests/kit/model_zoo/transformers/bert.py
new file mode 100644
index 000000000000..99135704da70
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/bert.py
@@ -0,0 +1,88 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence BERT
+# ===============================
+BATCH_SIZE = 2
+SEQ_LENGTH = 16
+
+
+def data_gen_fn():
+    input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    token_type_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
+    return dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+
+
+output_transform_fn = lambda x: x
+
+config = transformers.BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4, intermediate_size=256)
+
+# register the BERT variants
+model_zoo.register(name='transformers_bert',
+                   model_fn=lambda: transformers.BertModel(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_bert_for_pretraining',
+                   model_fn=lambda: transformers.BertForPreTraining(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_bert_lm_head_model',
+                   model_fn=lambda: transformers.BertLMHeadModel(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_bert_for_masked_lm',
+                   model_fn=lambda: transformers.BertForMaskedLM(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_bert_for_sequence_classification',
+                   model_fn=lambda: transformers.BertForSequenceClassification(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_bert_for_token_classification',
+                   model_fn=lambda: transformers.BertForTokenClassification(config),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+
+# ===============================
+# Register multi-sentence BERT
+# ===============================
+def data_gen_for_next_sentence():
+    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
+    prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+    next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+    encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+    return encoding
+
+
+def data_gen_for_mcq():
+    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
+    prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+    choice0 = "It is eaten with a fork and a knife."
+    choice1 = "It is eaten while held in the hand."
+    encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
+    encoding = {k: v.unsqueeze(0) for k, v in encoding.items()}
+    return encoding
+
+
+# register the following models
+model_zoo.register(name='transformers_bert_for_next_sentence',
+                   model_fn=lambda: transformers.BertForNextSentencePrediction(config),
+                   data_gen_fn=data_gen_for_next_sentence,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_bert_for_mcq',
+                   model_fn=lambda: transformers.BertForMultipleChoice(config),
+                   data_gen_fn=data_gen_for_mcq,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/kit/model_zoo/transformers/gpt.py b/tests/kit/model_zoo/transformers/gpt.py
new file mode 100644
index 000000000000..a92a46e36f0b
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/gpt.py
@@ -0,0 +1,49 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence GPT
+# ===============================
+BATCH_SIZE = 2
+SEQ_LENGTH = 16
+
+
+def data_gen():
+    """Return zero-filled int64 input_ids, token_type_ids and attention_mask, each (BATCH_SIZE, SEQ_LENGTH)."""
+    shape = (BATCH_SIZE, SEQ_LENGTH)
+    keys = ('input_ids', 'token_type_ids', 'attention_mask')
+    return {key: torch.zeros(shape, dtype=torch.int64) for key in keys}
+
+
+output_transform_fn = lambda x: x
+
+config = transformers.GPT2Config(n_positions=64, n_layer=2, n_head=4)  # the keyword is `n_positions` (plural)
+
+# register the following models
+model_zoo.register(name='transformers_gpt',
+                   model_fn=lambda: transformers.GPT2Model(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_gpt_lm',
+                   model_fn=lambda: transformers.GPT2LMHeadModel(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_gpt_double_heads',
+                   model_fn=lambda: transformers.GPT2DoubleHeadsModel(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_gpt_for_token_classification',
+                   model_fn=lambda: transformers.GPT2ForTokenClassification(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_gpt_for_sequence_classification',
+                   model_fn=lambda: transformers.GPT2ForSequenceClassification(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/kit/model_zoo/transformers/opt.py b/tests/kit/model_zoo/transformers/opt.py
new file mode 100644
index 000000000000..d9c4a0b3c23c
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/opt.py
@@ -0,0 +1,35 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence OPT
+# ===============================
+BATCH_SIZE = 2
+SEQ_LENGTH = 16
+
+
+def data_gen():
+    """Return zero-filled int64 input_ids and attention_mask, each of shape (BATCH_SIZE, SEQ_LENGTH)."""
+    shape = (BATCH_SIZE, SEQ_LENGTH)
+    return {key: torch.zeros(shape, dtype=torch.int64) for key in ('input_ids', 'attention_mask')}
+
+
+output_transform_fn = lambda x: x
+
+config = transformers.OPTConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4)
+
+# register the following models
+# transformers.OPTModel,
+# transformers.OPTForCausalLM,
+model_zoo.register(name='transformers_opt',
+                   model_fn=lambda: transformers.OPTModel(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_opt_for_causal_lm',
+                   model_fn=lambda: transformers.OPTForCausalLM(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/kit/model_zoo/transformers/t5.py b/tests/kit/model_zoo/transformers/t5.py
new file mode 100644
index 000000000000..b81bcad90db8
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/t5.py
@@ -0,0 +1,46 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence T5
+# ===============================
+BATCH_SIZE = 2
+SEQ_LENGTH = 16
+
+
+def data_gen():
+    """Return zero-filled int64 input_ids and decoder_input_ids, each of shape (BATCH_SIZE, SEQ_LENGTH)."""
+    shape = (BATCH_SIZE, SEQ_LENGTH)
+    return {key: torch.zeros(shape, dtype=torch.int64) for key in ('input_ids', 'decoder_input_ids')}
+
+
+def data_gen_for_encoder_only():
+    """Return only a zero-filled int64 input_ids of shape (BATCH_SIZE, SEQ_LENGTH); no decoder inputs."""
+    return {'input_ids': torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)}
+
+
+output_transform_fn = lambda x: x
+
+config = transformers.T5Config(d_model=128, num_layers=2)
+
+# register the following models
+# transformers.T5Model,
+# transformers.T5ForConditionalGeneration,
+# transformers.T5EncoderModel,
+model_zoo.register(name='transformers_t5',
+                   model_fn=lambda: transformers.T5Model(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_t5_for_conditional_generation',
+                   model_fn=lambda: transformers.T5ForConditionalGeneration(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_t5_encoder_model',
+                   model_fn=lambda: transformers.T5EncoderModel(config),
+                   data_gen_fn=data_gen_for_encoder_only,
+                   output_transform_fn=output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
index 9c36b0c9cc96..b1c9c211a9a0 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
@@ -1,66 +1,18 @@
-import pytest
-import torch
-import transformers
 from hf_tracer_utils import trace_model_and_compare_output
 
+from tests.kit.model_zoo import model_zoo
+
 BATCH_SIZE = 2
 SEQ_LENGTH = 16
 
 
-def test_single_sentence_albert():
-    MODEL_LIST = [
-        transformers.AlbertModel,
-        transformers.AlbertForPreTraining,
-        transformers.AlbertForMaskedLM,
-        transformers.AlbertForSequenceClassification,
-        transformers.AlbertForTokenClassification,
-    ]
-
-    config = transformers.AlbertConfig(embedding_size=128,
-                                       hidden_size=128,
-                                       num_hidden_layers=2,
-                                       num_attention_heads=4,
-                                       intermediate_size=256)
-
-    def data_gen():
-        input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        token_type_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        meta_args = dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
-        return meta_args
-
-    for model_cls in MODEL_LIST:
-        model = model_cls(config=config)
-        trace_model_and_compare_output(model, data_gen)
-
-
-def test_multi_sentence_albert():
-    config = transformers.AlbertConfig(hidden_size=128,
-                                       num_hidden_layers=2,
-                                       num_attention_heads=4,
-                                       intermediate_size=256)
-    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
-
-    def data_gen_for_qa():
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        inputs = tokenizer(question, text, return_tensors="pt")
-        return inputs
-
-    model = transformers.AlbertForQuestionAnswering(config)
-    trace_model_and_compare_output(model, data_gen_for_qa)
-
-    def data_gen_for_mcq():
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
-        encoding = {k: v.unsqueeze(0) for k, v in encoding.items()}
-        return encoding
+def test_albert():
+    sub_registry = model_zoo.get_sub_registry('transformers_albert')
 
-    model = transformers.AlbertForMultipleChoice(config)
-    trace_model_and_compare_output(model, data_gen_for_mcq)
+    for name, (model_fn, data_gen_fn, _, _) in sub_registry.items():
+        model = model_fn()
+        trace_model_and_compare_output(model, data_gen_fn)
 
 
 if __name__ == '__main__':
-    test_single_sentence_albert()
-    test_multi_sentence_albert()
+    test_albert()
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py
index 62273e2d51c9..1bf4947c31a0 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py
@@ -1,69 +1,15 @@
-import pytest
-import torch
-import transformers
 from hf_tracer_utils import trace_model_and_compare_output
 
-BATCH_SIZE = 2
-SEQ_LENGTH = 16
+from tests.kit.model_zoo import model_zoo
 
 
-def test_single_sentence_bert():
-    MODEL_LIST = [
-        transformers.BertModel,
-        transformers.BertForPreTraining,
-        transformers.BertLMHeadModel,
-        transformers.BertForMaskedLM,
-        transformers.BertForSequenceClassification,
-        transformers.BertForTokenClassification,
-    ]
+def test_bert():
+    sub_registry = model_zoo.get_sub_registry('transformers_bert')
 
-    config = transformers.BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4, intermediate_size=256)
-
-    def data_gen():
-        input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        token_type_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        meta_args = dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
-        return meta_args
-
-    for model_cls in MODEL_LIST:
-        model = model_cls(config=config)
-        trace_model_and_compare_output(model, data_gen)
-
-
-def test_multi_sentence_bert():
-    config = transformers.BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4, intermediate_size=256)
-    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
-
-    def data_gen_for_next_sentence():
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
-        return encoding
-
-    model = transformers.BertForNextSentencePrediction(config)
-    trace_model_and_compare_output(model, data_gen_for_next_sentence)
-
-    def data_gen_for_qa():
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        inputs = tokenizer(question, text, return_tensors="pt")
-        return inputs
-
-    model = transformers.BertForQuestionAnswering(config)
-    trace_model_and_compare_output(model, data_gen_for_qa)
-
-    def data_gen_for_mcq():
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
-        encoding = {k: v.unsqueeze(0) for k, v in encoding.items()}
-        return encoding
-
-    model = transformers.BertForMultipleChoice(config)
-    trace_model_and_compare_output(model, data_gen_for_mcq)
+    for name, (model_fn, data_gen_fn, _, _) in sub_registry.items():
+        model = model_fn()
+        trace_model_and_compare_output(model, data_gen_fn)
 
 
 if __name__ == '__main__':
-    test_single_sentence_bert()
-    test_multi_sentence_bert()
+    test_bert()
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
index ad4c9684dc42..67a3178fae1b 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
@@ -1,35 +1,17 @@
 import pytest
-import torch
-import transformers
 from hf_tracer_utils import trace_model_and_compare_output
 
-BATCH_SIZE = 1
-SEQ_LENGTH = 16
+from tests.kit.model_zoo import model_zoo
 
 
 # TODO: remove this skip once we handle the latest gpt model
 @pytest.mark.skip
 def test_gpt():
-    MODEL_LIST = [
-        transformers.GPT2Model,
-        transformers.GPT2LMHeadModel,
-        transformers.GPT2DoubleHeadsModel,
-        transformers.GPT2ForTokenClassification,
-    # transformers.GPT2ForSequenceClassification, # not supported yet
-    ]
+    sub_registry = model_zoo.get_sub_registry('transformers_gpt')
 
-    config = transformers.GPT2Config(n_position=64, n_layer=2, n_head=4)
-
-    def data_gen():
-        input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        token_type_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        kwargs = dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
-        return kwargs
-
-    for model_cls in MODEL_LIST:
-        model = model_cls(config=config)
-        trace_model_and_compare_output(model, data_gen)
+    for name, (model_fn, data_gen_fn, _, _) in sub_registry.items():
+        model = model_fn()
+        trace_model_and_compare_output(model, data_gen_fn)
 
 
 if __name__ == '__main__':
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py
index 06260176ec6f..740f5a9f0c57 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py
@@ -1,29 +1,14 @@
-import pytest
-import torch
-import transformers
 from hf_tracer_utils import trace_model_and_compare_output
 
-BATCH_SIZE = 1
-SEQ_LENGTH = 16
+from tests.kit.model_zoo import model_zoo
 
 
 def test_opt():
-    MODEL_LIST = [
-        transformers.OPTModel,
-        transformers.OPTForCausalLM,
-    ]
+    sub_registry = model_zoo.get_sub_registry('transformers_opt')
 
-    config = transformers.OPTConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4)
-
-    def data_gen():
-        input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        attention_mask = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)
-        return kwargs
-
-    for model_cls in MODEL_LIST:
-        model = model_cls(config=config)
-        trace_model_and_compare_output(model, data_gen)
+    for name, (model_fn, data_gen_fn, _, _) in sub_registry.items():
+        model = model_fn()
+        trace_model_and_compare_output(model, data_gen_fn)
 
 
 if __name__ == '__main__':
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py
index 71e782fddc76..7073fd63470b 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py
@@ -1,41 +1,14 @@
-import pytest
-import torch
-import transformers
 from hf_tracer_utils import trace_model_and_compare_output
 
-BATCH_SIZE = 1
-SEQ_LENGTH = 16
+from tests.kit.model_zoo import model_zoo
 
 
 def test_t5():
-    MODEL_LIST = [
-        transformers.T5Model,
-        transformers.T5ForConditionalGeneration,
-        transformers.T5EncoderModel,
-    ]
+    sub_registry = model_zoo.get_sub_registry('transformers_t5')
 
-    config = transformers.T5Config(d_model=128, num_layers=2)
-
-    def data_gen():
-        input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        decoder_input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        kwargs = dict(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
-        return kwargs
-
-    def data_gen_for_encoder_only():
-        input_ids = torch.zeros((BATCH_SIZE, SEQ_LENGTH), dtype=torch.int64)
-        kwargs = dict(input_ids=input_ids)
-        return kwargs
-
-    for model_cls in MODEL_LIST:
-        model = model_cls(config=config)
-
-        if isinstance(model, transformers.T5EncoderModel):
-            data_gen_func = data_gen_for_encoder_only
-        else:
-            data_gen_func = data_gen
-
-        trace_model_and_compare_output(model, data_gen_func)
+    for name, (model_fn, data_gen_fn, _, _) in sub_registry.items():
+        model = model_fn()
+        trace_model_and_compare_output(model, data_gen_fn)
 
 
 if __name__ == '__main__':

From 14a115000b29ffe0680e3241d4dbb045389eb56e Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Wed, 15 Mar 2023 11:51:16 +0800
Subject: [PATCH 472/503] [tests] model zoo add torchaudio models (#3138)

* [tests] model zoo add torchaudio models

* [tests] refactor torchaudio wavernn

* [tests] refactor fx torchaudio tests
---
 tests/kit/model_zoo/__init__.py               |   2 +-
 tests/kit/model_zoo/torchaudio/__init__.py    |   1 +
 tests/kit/model_zoo/torchaudio/torchaudio.py  | 130 ++++++++++++++++
 .../test_torchaudio_general.py                | 145 ------------------
 .../test_torchaudio_model.py                  |  22 +++
 .../test_torchaudio_tacotron.py               |  57 -------
 .../test_torchaudio_transformer.py            |  67 --------
 .../test_torchaudio_wave2vec.py               |  50 ------
 .../test_torchaudio_model/torchaudio_utils.py |  23 ++-
 9 files changed, 165 insertions(+), 332 deletions(-)
 create mode 100644 tests/kit/model_zoo/torchaudio/__init__.py
 create mode 100644 tests/kit/model_zoo/torchaudio/torchaudio.py
 delete mode 100644 tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_general.py
 create mode 100644 tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py
 delete mode 100644 tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_tacotron.py
 delete mode 100644 tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_transformer.py
 delete mode 100644 tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_wave2vec.py

diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 7f14d04c0910..82a61626b6c2 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,4 +1,4 @@
-from . import diffusers, timm, torchvision, transformers
+from . import diffusers, timm, torchaudio, torchvision, transformers
 from .registry import model_zoo
 
 __all__ = ['model_zoo']
diff --git a/tests/kit/model_zoo/torchaudio/__init__.py b/tests/kit/model_zoo/torchaudio/__init__.py
new file mode 100644
index 000000000000..082eb9ebb89c
--- /dev/null
+++ b/tests/kit/model_zoo/torchaudio/__init__.py
@@ -0,0 +1 @@
+from .torchaudio import *
diff --git a/tests/kit/model_zoo/torchaudio/torchaudio.py b/tests/kit/model_zoo/torchaudio/torchaudio.py
new file mode 100644
index 000000000000..74611720292f
--- /dev/null
+++ b/tests/kit/model_zoo/torchaudio/torchaudio.py
@@ -0,0 +1,130 @@
+import torch
+import torchaudio.models as tm
+
+from ..registry import ModelAttribute, model_zoo
+
+INPUT_DIM = 80
+IN_FEATURES = 16
+N_TIME = 20
+KERNEL_SIZE = 5
+HOP_LENGTH = 20
+N_CLASSES = 10
+N_FREQ = 16
+N_MELS = 80
+
+
+def conformer_data_gen_fn():
+    """Draw 4 integer lengths in [1, 400) and a random (4, max(lengths), INPUT_DIM) input tensor."""
+    lengths = torch.randint(1, 400, (4,))
+    return dict(input=torch.rand(4, int(lengths.max()), INPUT_DIM), lengths=lengths)
+
+
+transformer_output_transform_fn = lambda outputs: dict(frames=outputs[0], lengths=outputs[1])
+
+model_zoo.register(name='torchaudio_conformer',
+                   model_fn=lambda: tm.Conformer(
+                       input_dim=INPUT_DIM, num_heads=4, ffn_dim=128, num_layers=4, depthwise_conv_kernel_size=31),
+                   data_gen_fn=conformer_data_gen_fn,
+                   output_transform_fn=transformer_output_transform_fn)
+
+single_output_transform_fn = lambda output: dict(output=output)
+
+model_zoo.register(name='torchaudio_convtasnet',
+                   model_fn=tm.ConvTasNet,
+                   data_gen_fn=lambda: dict(input=torch.rand(4, 1, 8)),
+                   output_transform_fn=single_output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name='torchaudio_deepspeech',
+                   model_fn=lambda: tm.DeepSpeech(IN_FEATURES, n_hidden=128, n_class=4),
+                   data_gen_fn=lambda: dict(x=torch.rand(4, 1, 10, IN_FEATURES)),
+                   output_transform_fn=single_output_transform_fn)
+
+
+def emformer_data_gen_fn():
+    """Build a random (4, 400, IN_FEATURES) input tensor and 4 integer lengths drawn from [1, 200)."""
+    features = torch.rand(4, 400, IN_FEATURES)
+    return dict(input=features, lengths=torch.randint(1, 200, (4,)))
+
+
+model_zoo.register(
+    name='torchaudio_emformer',
+    model_fn=lambda: tm.Emformer(input_dim=IN_FEATURES, num_heads=4, ffn_dim=128, num_layers=4, segment_length=4),
+    data_gen_fn=emformer_data_gen_fn,
+    output_transform_fn=transformer_output_transform_fn,
+    model_attribute=ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name='torchaudio_wav2letter_waveform',
+                   model_fn=lambda: tm.Wav2Letter(input_type='waveform', num_features=40),
+                   data_gen_fn=lambda: dict(x=torch.rand(4, 40, 400)),
+                   output_transform_fn=single_output_transform_fn)
+
+model_zoo.register(name='torchaudio_wav2letter_mfcc',
+                   model_fn=lambda: tm.Wav2Letter(input_type='mfcc', num_features=40),
+                   data_gen_fn=lambda: dict(x=torch.rand(4, 40, 400)),
+                   output_transform_fn=single_output_transform_fn)
+
+
+def wavernn_data_gen_fn():
+    """Build a random waveform/specgram pair sized from N_TIME, KERNEL_SIZE, HOP_LENGTH and N_FREQ."""
+    n_samples = (N_TIME - KERNEL_SIZE + 1) * HOP_LENGTH
+    return dict(waveform=torch.rand(4, 1, n_samples), specgram=torch.rand(4, 1, N_FREQ, N_TIME))
+
+
+model_zoo.register(name='torchaudio_wavernn',
+                   model_fn=lambda: tm.WaveRNN(upsample_scales=[2, 2, 5],
+                                               n_classes=N_CLASSES,
+                                               hop_length=HOP_LENGTH,
+                                               kernel_size=KERNEL_SIZE,
+                                               n_freq=N_FREQ,
+                                               n_res_block=2,
+                                               n_rnn=64,
+                                               n_fc=64,
+                                               n_hidden=16,
+                                               n_output=16),
+                   data_gen_fn=wavernn_data_gen_fn,
+                   output_transform_fn=single_output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+
+def tacotron_data_gen_fn():
+    """Build random tokens and mel spectrograms for a batch of 4, with every sample at full length.
+
+    Both length tensors hold the maximum length for every sample. They are floating-point
+    tensors, because they are produced by scaling `torch.ones` created with its default dtype.
+    """
+    n_batch, max_text_length, max_mel_specgram_length = 4, 100, 300
+    ones = torch.ones((n_batch,))
+    return dict(tokens=torch.randint(0, 148, (n_batch, max_text_length)),
+                token_lengths=max_text_length * ones,
+                mel_specgram=torch.rand(n_batch, N_MELS, max_mel_specgram_length),
+                mel_specgram_lengths=max_mel_specgram_length * ones)
+
+
+model_zoo.register(
+    name='torchaudio_tacotron',
+    model_fn=lambda: tm.Tacotron2(n_mels=N_MELS),
+    data_gen_fn=tacotron_data_gen_fn,
+    output_transform_fn=lambda outputs: dict(
+        spectrogram_before=outputs[0], spectrogram_after=outputs[1], stop_tokens=outputs[2], attn_weights=outputs[3]),
+    model_attribute=ModelAttribute(has_control_flow=True))
+
+
+def wav2vec_data_gen_fn():
+    """Build a random normal (4, 400) waveform batch and 4 integer lengths drawn from [0, 400)."""
+    batch_size, num_frames = 4, 400
+    return dict(waveforms=torch.randn(batch_size, num_frames),
+                lengths=torch.randint(0, num_frames, (batch_size,)))
+
+
+model_zoo.register(name='torchaudio_wav2vec2_base',
+                   model_fn=tm.wav2vec2_base,
+                   data_gen_fn=wav2vec_data_gen_fn,
+                   output_transform_fn=transformer_output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name='torchaudio_hubert_base',
+                   model_fn=tm.hubert_base,
+                   data_gen_fn=wav2vec_data_gen_fn,
+                   output_transform_fn=transformer_output_transform_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_general.py b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_general.py
deleted file mode 100644
index b2fa8c6c0bbb..000000000000
--- a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_general.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import torch
-from torchaudio_utils import trace_and_compare
-from torchaudio.models import ConvTasNet, DeepSpeech, Wav2Letter, WaveRNN
-from torchaudio.models.wavernn import MelResNet, UpsampleNetwork
-import pytest
-
-
-def test_wave2letter_waveform():
-    batch_size = 2
-    num_features = 1
-    num_classes = 40
-    input_length = 320
-
-    model = Wav2Letter(num_classes=num_classes, num_features=num_features)
-
-    def data_gen():
-        x = torch.rand(batch_size, num_features, input_length)
-        return dict(x=x)
-
-    trace_and_compare(model, data_gen, need_meta=False, need_concrete=False)
-
-
-def test_wave2letter_mfcc():
-    batch_size = 2
-    num_features = 13
-    num_classes = 40
-    input_length = 2
-
-    model = Wav2Letter(num_classes=num_classes, input_type="mfcc", num_features=num_features)
-
-    def data_gen():
-        x = torch.rand(batch_size, num_features, input_length)
-        return dict(x=x)
-
-    trace_and_compare(model, data_gen, need_meta=False, need_concrete=False)
-
-
-def test_melresnet_waveform():
-    n_batch = 2
-    n_time = 200
-    n_freq = 100
-    n_output = 128
-    n_res_block = 10
-    n_hidden = 128
-    kernel_size = 5
-
-    model = MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size)
-
-    def data_gen():
-        x = torch.rand(n_batch, n_freq, n_time)
-        return dict(specgram=x)
-
-    trace_and_compare(model, data_gen, need_meta=False, need_concrete=False)
-
-
-def test_upsample_network_waveform():
-    upsample_scales = [5, 5, 8]
-    n_batch = 2
-    n_time = 200
-    n_freq = 100
-    n_output = 64
-    n_res_block = 10
-    n_hidden = 32
-    kernel_size = 5
-
-    total_scale = 1
-    for upsample_scale in upsample_scales:
-        total_scale *= upsample_scale
-
-    model = UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
-
-    def data_gen():
-        x = torch.rand(n_batch, n_freq, n_time)
-        return dict(specgram=x)
-
-    trace_and_compare(model, data_gen, need_meta=False, need_concrete=False)
-
-
-def test_wavernn_waveform():
-    upsample_scales = [2, 2, 5]
-    n_rnn = 16
-    n_fc = 16
-    n_classes = 10
-    hop_length = 20
-    n_batch = 2
-    n_time = 20
-    n_freq = 10
-    n_output = 16
-    n_res_block = 3
-    n_hidden = 16
-    kernel_size = 5
-
-    model = WaveRNN(upsample_scales, n_classes, hop_length, n_res_block, n_rnn, n_fc, kernel_size, n_freq, n_hidden,
-                    n_output)
-
-    def data_gen():
-        x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
-        mels = torch.rand(n_batch, 1, n_freq, n_time)
-        return dict(waveform=x, specgram=mels)
-
-    trace_and_compare(model, data_gen, need_meta=True, need_concrete=False)
-
-
-def test_convtasnet_config():
-    batch_size = 32
-    num_frames = 800
-
-    model = ConvTasNet()
-
-    def data_gen():
-        tensor = torch.rand(batch_size, 1, num_frames)
-        return dict(input=tensor)
-
-    trace_and_compare(model, data_gen, need_meta=True, need_concrete=False)
-
-
-def test_deepspeech():
-    n_batch = 2
-    n_feature = 1
-    n_channel = 1
-    n_class = 40
-    n_time = 32
-
-    model = DeepSpeech(n_feature=n_feature, n_class=n_class)
-
-    def data_gen():
-        x = torch.rand(n_batch, n_channel, n_time, n_feature)
-        return dict(x=x)
-
-    trace_and_compare(model, data_gen, need_meta=False, need_concrete=False)
-
-
-if __name__ == '__main__':
-    TEST_LIST = [
-        test_wave2letter_waveform,
-        test_wave2letter_mfcc,
-        test_melresnet_waveform,
-        test_upsample_network_waveform,
-        test_wavernn_waveform,
-        test_convtasnet_config,
-        test_deepspeech,
-    ]
-
-    for test_fn in TEST_LIST:
-        test_fn()
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py
new file mode 100644
index 000000000000..bf6c7ae551ab
--- /dev/null
+++ b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py
@@ -0,0 +1,22 @@
+import re
+
+import torch
+from torchaudio_utils import trace_and_compare
+
+from tests.kit.model_zoo import model_zoo
+
+
+def test_torchaudio_models():
+    """Run `trace_and_compare` on every model in the 'torchaudio' sub-registry of the model zoo."""
+    torch.backends.cudnn.deterministic = True
+
+    sub_model_zoo = model_zoo.get_sub_registry('torchaudio')
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in sub_model_zoo.items():
+        # FIXME(ver217): temporarily skip these models
+        if re.search(r'(conformer|emformer|tacotron|wav2vec2_base|hubert_base)', name):
+            continue
+        model = model_fn()
+        # `attribute` may be None, hence the explicit check before reading `has_control_flow`.
+        need_meta = attribute is not None and attribute.has_control_flow
+        trace_and_compare(model, data_gen_fn, output_transform_fn, need_meta=need_meta)
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_tacotron.py b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_tacotron.py
deleted file mode 100644
index 2073c46897f4..000000000000
--- a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_tacotron.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import torch
-from torchaudio.models import Tacotron2
-from torchaudio_utils import trace_and_compare
-import pytest
-
-
-def _get_tacotron2_model(n_mels, decoder_max_step=2000, gate_threshold=0.5):
-    return Tacotron2(
-        mask_padding=False,
-        n_mels=n_mels,
-        n_symbol=20,
-        n_frames_per_step=1,
-        symbol_embedding_dim=32,
-        encoder_embedding_dim=32,
-        encoder_n_convolution=3,
-        encoder_kernel_size=5,
-        decoder_rnn_dim=32,
-        decoder_max_step=decoder_max_step,
-        decoder_dropout=0.1,
-        decoder_early_stopping=True,
-        attention_rnn_dim=32,
-        attention_hidden_dim=32,
-        attention_location_n_filter=32,
-        attention_location_kernel_size=31,
-        attention_dropout=0.1,
-        prenet_dim=32,
-        postnet_n_convolution=5,
-        postnet_kernel_size=5,
-        postnet_embedding_dim=512,
-        gate_threshold=gate_threshold,
-    )
-
-
-@pytest.mark.skip("Tracing failed")
-def test_tacotron_model():
-    n_mels = 80
-    n_batch = 3
-    max_mel_specgram_length = 300
-    max_text_length = 100
-
-    model = _get_tacotron2_model(n_mels)
-
-    def data_gen():
-        text = torch.randint(0, 148, (n_batch, max_text_length))
-        text_lengths = max_text_length * torch.ones((n_batch,))
-        mel_specgram = torch.rand(n_batch, n_mels, max_mel_specgram_length)
-        mel_specgram_lengths = max_mel_specgram_length * torch.ones((n_batch,))
-        return dict(tokens=text,
-                    token_lengths=text_lengths,
-                    mel_specgram=mel_specgram,
-                    mel_specgram_lengths=mel_specgram_lengths)
-
-    trace_and_compare(model, data_gen, need_meta=True, need_concrete=False)
-
-
-if __name__ == "__main__":
-    test_tacotron_model()
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_transformer.py b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_transformer.py
deleted file mode 100644
index fbe24a8cd91f..000000000000
--- a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_transformer.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import torch
-from torchaudio_utils import trace_and_compare
-from torchaudio.models import Emformer, Conformer
-import pytest
-
-
-def test_conformer():
-    input_dim = 80
-    batch_size = 10
-    num_frames = 400
-    num_heads = 4
-    ffn_dim = 128
-    num_layers = 4
-    depthwise_conv_kernel_size = 31
-
-    model = Conformer(
-        input_dim=input_dim,
-        num_heads=num_heads,
-        ffn_dim=ffn_dim,
-        num_layers=num_layers,
-        depthwise_conv_kernel_size=depthwise_conv_kernel_size,
-    )
-
-    def data_gen():
-        lengths = torch.randint(1, num_frames, (batch_size,))
-        input = torch.rand(batch_size, int(lengths.max()), input_dim)
-        return dict(input=input, lengths=lengths)
-
-    def kwargs_transform(data):
-        new_data = {}
-
-        for k, v in data.items():
-            new_data[f'{k}_1'] = v
-        return new_data
-
-    trace_and_compare(model, data_gen, need_meta=False, need_concrete=True, kwargs_transform=kwargs_transform)
-
-
-@pytest.mark.skip("Tracing failed")
-def test_emformer():
-    input_dim = 128
-    batch_size = 10
-    num_heads = 8
-    ffn_dim = 256
-    num_layers = 3
-    segment_length = 4
-    num_frames = 400
-    right_context_length = 1
-
-    model = Emformer(input_dim, num_heads, ffn_dim, num_layers, segment_length, right_context_length)
-
-    def data_gen():
-        lengths = torch.randint(1, num_frames, (batch_size,))
-        input = torch.rand(batch_size, num_frames, input_dim)
-        return dict(input=input, lengths=lengths)
-
-    trace_and_compare(model, data_gen, need_meta=True, need_concrete=False)
-
-
-@pytest.mark.skip
-def test_torchaudio_transformers():
-    test_conformer()
-    test_emformer()
-
-
-if __name__ == "__main__":
-    test_torchaudio_transformers()
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_wave2vec.py b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_wave2vec.py
deleted file mode 100644
index e8729b83fba0..000000000000
--- a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_wave2vec.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import torch
-from torchaudio.models.wav2vec2 import (
-    hubert_base,
-    hubert_large,
-    hubert_xlarge,
-    wav2vec2_base,
-    wav2vec2_large,
-    wav2vec2_large_lv60k,
-)
-from torchaudio_utils import trace_and_compare
-import pytest
-
-MODEL_LIST = [
-    hubert_base,
-    hubert_large,
-    hubert_xlarge,
-    wav2vec2_base,
-    wav2vec2_large,
-    wav2vec2_large_lv60k,
-]
-
-
-def _smoke_test(model, device):
-    model = model.to(device=device)
-
-    batch_size, num_frames = 3, 1024
-
-    def data_gen():
-        waveforms = torch.randn(batch_size, num_frames, device=device)
-        lengths = torch.randint(
-            low=0,
-            high=num_frames,
-            size=[
-                batch_size,
-            ],
-            device=device,
-        )
-        return dict(waveforms=waveforms, lengths=lengths)
-
-    trace_and_compare(model, data_gen, need_meta=True, need_concrete=False)
-
-
-@pytest.mark.skip("Tracing failed")
-def test_wav2vec():
-    for model_fn in MODEL_LIST:
-        _smoke_test(model_fn(), 'cpu')
-
-
-if __name__ == "__main__":
-    test_wav2vec()
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py b/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py
index 702c5f8f6a24..18d86fc05941 100644
--- a/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py
+++ b/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py
@@ -3,7 +3,7 @@
 from colossalai.fx import symbolic_trace
 
 
-def trace_and_compare(model, data_gen, need_meta=False, need_concrete=False, kwargs_transform=False):
+def trace_and_compare(model, data_gen, output_transform_fn, need_meta=False, need_concrete=False):
     data = data_gen()
     concrete_args = data if need_concrete else {}
     meta_args = {k: v.to('meta') for k, v in data.items()} if need_meta else {}
@@ -14,16 +14,15 @@ def trace_and_compare(model, data_gen, need_meta=False, need_concrete=False, kwa
 
     with torch.no_grad():
         non_fx_out = model(**data)
+        fx_out = gm(**data)
 
-        if kwargs_transform:
-            data = kwargs_transform(data)
+    # compare output
+    transformed_fx_out = output_transform_fn(fx_out)
+    transformed_non_fx_out = output_transform_fn(non_fx_out)
 
-        fx_out = gm(**data)
-    if isinstance(fx_out, tuple):
-        for non_fx, fx in zip(non_fx_out, fx_out):
-            assert torch.allclose(
-                non_fx, fx, atol=1e-5), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
-    else:
-        assert torch.allclose(
-            fx_out, non_fx_out,
-            atol=1e-5), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+    assert len(transformed_fx_out) == len(transformed_non_fx_out)
+
+    for key, fx_output_val in transformed_fx_out.items():
+        non_fx_output_val = transformed_non_fx_out[key]
+        assert torch.allclose(fx_output_val, non_fx_output_val, atol=1e-5), \
+            f'{model.__class__.__name__} has inconsistent outputs, {fx_output_val} vs {non_fx_output_val}'

From ecd643f1e4e0100c08bd0765337fe5d2287f07dd Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 15 Mar 2023 13:46:04 +0800
Subject: [PATCH 473/503] [test] add torchrec models to test model zoo (#3139)

---
 tests/kit/model_zoo/__init__.py               |   3 +-
 tests/kit/model_zoo/torchrec/torchrec.py      |  97 +++++++++++
 .../test_torchrec_model/test_deepfm_model.py  | 126 +++++++--------
 .../test_torchrec_model/test_dlrm_model.py    | 152 +++++++-----------
 4 files changed, 209 insertions(+), 169 deletions(-)
 create mode 100644 tests/kit/model_zoo/torchrec/torchrec.py

diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 82a61626b6c2..710038ffa387 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,4 +1,5 @@
-from . import diffusers, timm, torchaudio, torchvision, transformers
+from . import diffusers, timm, torchaudio, torchrec, torchvision, transformers
+
 from .registry import model_zoo
 
 __all__ = ['model_zoo']
diff --git a/tests/kit/model_zoo/torchrec/torchrec.py b/tests/kit/model_zoo/torchrec/torchrec.py
new file mode 100644
index 000000000000..014e9218b226
--- /dev/null
+++ b/tests/kit/model_zoo/torchrec/torchrec.py
@@ -0,0 +1,97 @@
+from collections import namedtuple
+from functools import partial
+
+import torch
+
+try:
+    from torchrec.models import deepfm, dlrm
+    from torchrec.modules.embedding_configs import EmbeddingBagConfig
+    from torchrec.modules.embedding_modules import EmbeddingBagCollection
+    from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
+    NO_TORCHREC = False
+except ImportError:
+    NO_TORCHREC = True
+
+from ..registry import ModelAttribute, model_zoo
+
+
+def register_torchrec_models():
+    BATCH = 2
+    SHAPE = 10
+    # KeyedTensor
+    KT = KeyedTensor(keys=["f1", "f2"], length_per_key=[SHAPE, SHAPE], values=torch.rand((BATCH, 2 * SHAPE)))
+
+    # KeyedJaggedTensor
+    KJT = KeyedJaggedTensor.from_offsets_sync(keys=["f1", "f2"],
+                                              values=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
+                                              offsets=torch.tensor([0, 2, 4, 6, 8]))
+
+    data_gen_fn = lambda: dict(features=torch.rand((BATCH, SHAPE)))
+
+    interaction_arch_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KT)
+
+    simple_dfm_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KJT)
+
+    sparse_arch_data_gen_fn = lambda: dict(features=KJT)
+
+    output_transform_fn = lambda x: dict(output=x)
+
+    def get_ebc():
+        # EmbeddingBagCollection
+        eb1_config = EmbeddingBagConfig(name="t1", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f1"])
+        eb2_config = EmbeddingBagConfig(name="t2", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f2"])
+        return EmbeddingBagCollection(tables=[eb1_config, eb2_config])
+
+    model_zoo.register(name='deepfm_densearch',
+                       model_fn=partial(deepfm.DenseArch, SHAPE, SHAPE, SHAPE),
+                       data_gen_fn=data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='deepfm_interactionarch',
+                       model_fn=partial(deepfm.FMInteractionArch, SHAPE * 3, ["f1", "f2"], SHAPE),
+                       data_gen_fn=interaction_arch_data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='deepfm_overarch',
+                       model_fn=partial(deepfm.OverArch, SHAPE),
+                       data_gen_fn=data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='deepfm_simpledeepfmnn',
+                       model_fn=partial(deepfm.SimpleDeepFMNN, SHAPE, get_ebc(), SHAPE, SHAPE),
+                       data_gen_fn=simple_dfm_data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='deepfm_sparsearch',
+                       model_fn=partial(deepfm.SparseArch, get_ebc()),
+                       data_gen_fn=sparse_arch_data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='dlrm',
+                       model_fn=partial(dlrm.DLRM, get_ebc(), SHAPE, [SHAPE, SHAPE], [5, 1]),
+                       data_gen_fn=simple_dfm_data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='dlrm_densearch',
+                       model_fn=partial(dlrm.DenseArch, SHAPE, [SHAPE, SHAPE]),
+                       data_gen_fn=data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='dlrm_interactionarch',
+                       model_fn=partial(dlrm.InteractionArch, 2),
+                       data_gen_fn=interaction_arch_data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='dlrm_overarch',
+                       model_fn=partial(dlrm.OverArch, SHAPE, [5, 1]),
+                       data_gen_fn=data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+    model_zoo.register(name='dlrm_sparsearch',
+                       model_fn=partial(dlrm.SparseArch, get_ebc()),
+                       data_gen_fn=sparse_arch_data_gen_fn,
+                       output_transform_fn=output_transform_fn)
+
+
+if not NO_TORCHREC:
+    register_torchrec_models()
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
index dbe8a62e7c59..6cbca343d134 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
@@ -2,85 +2,69 @@
 import torch
 
 from colossalai.fx import symbolic_trace
-
-try:
-    from torchrec.models import deepfm
-    from torchrec.modules.embedding_configs import EmbeddingBagConfig
-    from torchrec.modules.embedding_modules import EmbeddingBagCollection
-    from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
-    NOT_TORCHREC = False
-except ImportError:
-    NOT_TORCHREC = True
+from tests.kit.model_zoo import model_zoo
 
 BATCH = 2
 SHAPE = 10
 
+deepfm_models = model_zoo.get_sub_registry('deepfm')
+NOT_DFM = False
+if not deepfm_models:
+    NOT_DFM = True
+
+
+def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
+    # trace
+    model = model_cls()
+
+    # convert to eval for inference
+    # it is important to set it to eval mode before tracing
+    # without this statement, the torch.nn.functional.batch_norm will always be in training mode
+    model.eval()
+
+    gm = symbolic_trace(model, meta_args=meta_args)
+    gm.eval()
+    # run forward
+    with torch.no_grad():
+        fx_out = gm(**data)
+        non_fx_out = model(**data)
+
+    # compare output
+    transformed_fx_out = output_transform_fn(fx_out)
+    transformed_non_fx_out = output_transform_fn(non_fx_out)
+
+    assert len(transformed_fx_out) == len(transformed_non_fx_out)
+    if torch.is_tensor(fx_out):
+        assert torch.allclose(
+            fx_out, non_fx_out), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+    else:
+        assert torch.allclose(
+            fx_out.values(),
+            non_fx_out.values()), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+    for key in transformed_fx_out.keys():
+        fx_output_val = transformed_fx_out[key]
+        non_fx_output_val = transformed_non_fx_out[key]
+        if torch.is_tensor(fx_output_val):
+            assert torch.allclose(fx_output_val, non_fx_output_val, atol=1e-5), \
+                f'{model.__class__.__name__} has inconsistent outputs, {fx_output_val} vs {non_fx_output_val}'
+        else:
+            assert torch.allclose(fx_output_val.values(), non_fx_output_val.values()
+                                 ), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
 
-@pytest.mark.skipif(NOT_TORCHREC, reason='torchrec is not installed')
-def test_torchrec_deepfm_models():
-    MODEL_LIST = [deepfm.DenseArch, deepfm.FMInteractionArch, deepfm.OverArch, deepfm.SimpleDeepFMNN, deepfm.SparseArch]
-
-    # Data Preparation
-    # EmbeddingBagCollection
-    eb1_config = EmbeddingBagConfig(name="t1", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f1"])
-    eb2_config = EmbeddingBagConfig(name="t2", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f2"])
-
-    ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
-    keys = ["f1", "f2"]
-
-    # KeyedTensor
-    KT = KeyedTensor(keys=keys, length_per_key=[SHAPE, SHAPE], values=torch.rand((BATCH, 2 * SHAPE)))
-
-    # KeyedJaggedTensor
-    KJT = KeyedJaggedTensor.from_offsets_sync(keys=keys,
-                                              values=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
-                                              offsets=torch.tensor([0, 2, 4, 6, 8]))
-
-    # Dense Features
-    features = torch.rand((BATCH, SHAPE))
-
-    for model_cls in MODEL_LIST:
-        # Initializing model
-        if model_cls == deepfm.DenseArch:
-            model = model_cls(SHAPE, SHAPE, SHAPE)
-        elif model_cls == deepfm.FMInteractionArch:
-            model = model_cls(SHAPE * 3, keys, SHAPE)
-        elif model_cls == deepfm.OverArch:
-            model = model_cls(SHAPE)
-        elif model_cls == deepfm.SimpleDeepFMNN:
-            model = model_cls(SHAPE, ebc, SHAPE, SHAPE)
-        elif model_cls == deepfm.SparseArch:
-            model = model_cls(ebc)
-
-        # Setup GraphModule
-        gm = symbolic_trace(model)
-
-        model.eval()
-        gm.eval()
 
-        # Aligned Test
-        with torch.no_grad():
-            if model_cls == deepfm.DenseArch or model_cls == deepfm.OverArch:
-                fx_out = gm(features)
-                non_fx_out = model(features)
-            elif model_cls == deepfm.FMInteractionArch:
-                fx_out = gm(features, KT)
-                non_fx_out = model(features, KT)
-            elif model_cls == deepfm.SimpleDeepFMNN:
-                fx_out = gm(features, KJT)
-                non_fx_out = model(features, KJT)
-            elif model_cls == deepfm.SparseArch:
-                fx_out = gm(KJT)
-                non_fx_out = model(KJT)
+@pytest.mark.skipif(NOT_DFM, reason='torchrec is not installed')
+def test_torchrec_deepfm_models(deepfm_models):
+    torch.backends.cudnn.deterministic = True
 
-        if torch.is_tensor(fx_out):
-            assert torch.allclose(
-                fx_out, non_fx_out), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+    for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in deepfm_models.items():
+        data = data_gen_fn()
+        if attribute is not None and attribute.has_control_flow:
+            meta_args = {k: v.to('meta') for k, v in data.items()}
         else:
-            assert torch.allclose(
-                fx_out.values(),
-                non_fx_out.values()), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+            meta_args = None
+
+        trace_and_compare(model_fn, data, output_transform_fn, meta_args)
 
 
 if __name__ == "__main__":
-    test_torchrec_deepfm_models()
+    test_torchrec_deepfm_models(deepfm_models)
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
index 2f9fd8fe5982..7aa868265f15 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
@@ -1,112 +1,70 @@
+import pytest
 import torch
 
 from colossalai.fx import symbolic_trace
-
-try:
-    from torchrec.models import dlrm
-    from torchrec.modules.embedding_configs import EmbeddingBagConfig
-    from torchrec.modules.embedding_modules import EmbeddingBagCollection
-    from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
-    NOT_TORCHREC = False
-except ImportError:
-    NOT_TORCHREC = True
-
-import pytest
+from tests.kit.model_zoo import model_zoo
 
 BATCH = 2
 SHAPE = 10
 
-
-@pytest.mark.skipif(NOT_TORCHREC, reason='torchrec is not installed')
-def test_torchrec_dlrm_models():
-    MODEL_LIST = [
-        dlrm.DLRM,
-        dlrm.DenseArch,
-        dlrm.InteractionArch,
-        dlrm.InteractionV2Arch,
-        dlrm.OverArch,
-        dlrm.SparseArch,
-    # dlrm.DLRMV2
-    ]
-
-    # Data Preparation
-    # EmbeddingBagCollection
-    eb1_config = EmbeddingBagConfig(name="t1", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f1"])
-    eb2_config = EmbeddingBagConfig(name="t2", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f2"])
-
-    ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
-    keys = ["f1", "f2"]
-
-    # KeyedTensor
-    KT = KeyedTensor(keys=keys, length_per_key=[SHAPE, SHAPE], values=torch.rand((BATCH, 2 * SHAPE)))
-
-    # KeyedJaggedTensor
-    KJT = KeyedJaggedTensor.from_offsets_sync(keys=keys,
-                                              values=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
-                                              offsets=torch.tensor([0, 2, 4, 6, 8]))
-
-    # Dense Features
-    dense_features = torch.rand((BATCH, SHAPE))
-
-    # Sparse Features
-    sparse_features = torch.rand((BATCH, len(keys), SHAPE))
-
-    for model_cls in MODEL_LIST:
-        # Initializing model
-        if model_cls == dlrm.DLRM:
-            model = model_cls(ebc, SHAPE, [SHAPE, SHAPE], [5, 1])
-        elif model_cls == dlrm.DenseArch:
-            model = model_cls(SHAPE, [SHAPE, SHAPE])
-        elif model_cls == dlrm.InteractionArch:
-            model = model_cls(len(keys))
-        elif model_cls == dlrm.InteractionV2Arch:
-            I1 = dlrm.DenseArch(3 * SHAPE, [3 * SHAPE, 3 * SHAPE])
-            I2 = dlrm.DenseArch(3 * SHAPE, [3 * SHAPE, 3 * SHAPE])
-            model = model_cls(len(keys), I1, I2)
-        elif model_cls == dlrm.OverArch:
-            model = model_cls(SHAPE, [5, 1])
-        elif model_cls == dlrm.SparseArch:
-            model = model_cls(ebc)
-        elif model_cls == dlrm.DLRMV2:
-            # Currently DLRMV2 cannot be traced
-            model = model_cls(ebc, SHAPE, [SHAPE, SHAPE], [5, 1], [4 * SHAPE, 4 * SHAPE], [4 * SHAPE, 4 * SHAPE])
-
-        # Setup GraphModule
-        if model_cls == dlrm.InteractionV2Arch:
-            concrete_args = {"dense_features": dense_features, "sparse_features": sparse_features}
-            gm = symbolic_trace(model, concrete_args=concrete_args)
+dlrm_models = model_zoo.get_sub_registry('dlrm')
+NOT_DLRM = False
+if not dlrm_models:
+    NOT_DLRM = True
+
+
+def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
+    # trace
+    model = model_cls()
+
+    # convert to eval for inference
+    # it is important to set it to eval mode before tracing
+    # without this statement, the torch.nn.functional.batch_norm will always be in training mode
+    model.eval()
+
+    gm = symbolic_trace(model, meta_args=meta_args)
+    gm.eval()
+    # run forward
+    with torch.no_grad():
+        fx_out = gm(**data)
+        non_fx_out = model(**data)
+
+    # compare output
+    transformed_fx_out = output_transform_fn(fx_out)
+    transformed_non_fx_out = output_transform_fn(non_fx_out)
+
+    assert len(transformed_fx_out) == len(transformed_non_fx_out)
+    if torch.is_tensor(fx_out):
+        assert torch.allclose(
+            fx_out, non_fx_out), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+    else:
+        assert torch.allclose(
+            fx_out.values(),
+            non_fx_out.values()), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+    for key in transformed_fx_out.keys():
+        fx_output_val = transformed_fx_out[key]
+        non_fx_output_val = transformed_non_fx_out[key]
+        if torch.is_tensor(fx_output_val):
+            assert torch.allclose(fx_output_val, non_fx_output_val, atol=1e-5), \
+                f'{model.__class__.__name__} has inconsistent outputs, {fx_output_val} vs {non_fx_output_val}'
         else:
-            gm = symbolic_trace(model)
+            assert torch.allclose(fx_output_val.values(), non_fx_output_val.values()
+                                 ), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
 
-        model.eval()
-        gm.eval()
 
-        # Aligned Test
-        with torch.no_grad():
-            if model_cls == dlrm.DLRM or model_cls == dlrm.DLRMV2:
-                fx_out = gm(dense_features, KJT)
-                non_fx_out = model(dense_features, KJT)
-            elif model_cls == dlrm.DenseArch:
-                fx_out = gm(dense_features)
-                non_fx_out = model(dense_features)
-            elif model_cls == dlrm.InteractionArch or model_cls == dlrm.InteractionV2Arch:
-                fx_out = gm(dense_features, sparse_features)
-                non_fx_out = model(dense_features, sparse_features)
-            elif model_cls == dlrm.OverArch:
-                fx_out = gm(dense_features)
-                non_fx_out = model(dense_features)
-            elif model_cls == dlrm.SparseArch:
-                fx_out = gm(KJT)
-                non_fx_out = model(KJT)
+@pytest.mark.skipif(NOT_DLRM, reason='torchrec is not installed')
+def test_torchrec_dlrm_models(dlrm_models):
+    torch.backends.cudnn.deterministic = True
 
-        if torch.is_tensor(fx_out):
-            assert torch.allclose(
-                fx_out, non_fx_out), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+    for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in dlrm_models.items():
+        data = data_gen_fn()
+        if attribute is not None and attribute.has_control_flow:
+            meta_args = {k: v.to('meta') for k, v in data.items()}
         else:
-            assert torch.allclose(
-                fx_out.values(),
-                non_fx_out.values()), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
+            meta_args = None
+
+        trace_and_compare(model_fn, data, output_transform_fn, meta_args)
 
 
 if __name__ == "__main__":
-    test_torchrec_dlrm_models()
+    test_torchrec_dlrm_models(dlrm_models)

From ed19290560430ce8ede56bfdc9116b7b3f6c42df Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 17 Mar 2023 11:00:15 +0800
Subject: [PATCH 474/503] [booster] implemented mixed precision class (#3151)

* [booster] implemented mixed precision class

* polish code
---
 colossalai/booster/__init__.py                |   1 -
 colossalai/booster/booster.py                 |  84 ++++++++++--
 colossalai/booster/interface/__init__.py      |   3 +
 colossalai/booster/interface/optimizer.py     | 121 +++++++++++++++++
 .../booster/mixed_precision/__init__.py       |  33 +++++
 colossalai/booster/mixed_precision/bf16.py    |   5 +
 .../booster/mixed_precision/fp16_apex.py      |   5 +
 .../booster/mixed_precision/fp16_torch.py     | 122 ++++++++++++++++++
 colossalai/booster/mixed_precision/fp8.py     |   5 +
 .../mixed_precision/mixed_precision_base.py   |  21 +++
 colossalai/booster/precision.py               |  25 ----
 tests/kit/model_zoo/transformers/gpt.py       |   2 +-
 .../test_mixed_precision/test_fp16_torch.py   |  23 ++++
 13 files changed, 410 insertions(+), 40 deletions(-)
 create mode 100644 colossalai/booster/interface/__init__.py
 create mode 100644 colossalai/booster/interface/optimizer.py
 create mode 100644 colossalai/booster/mixed_precision/__init__.py
 create mode 100644 colossalai/booster/mixed_precision/bf16.py
 create mode 100644 colossalai/booster/mixed_precision/fp16_apex.py
 create mode 100644 colossalai/booster/mixed_precision/fp16_torch.py
 create mode 100644 colossalai/booster/mixed_precision/fp8.py
 create mode 100644 colossalai/booster/mixed_precision/mixed_precision_base.py
 delete mode 100644 colossalai/booster/precision.py
 create mode 100644 tests/test_booster/test_mixed_precision/test_fp16_torch.py

diff --git a/colossalai/booster/__init__.py b/colossalai/booster/__init__.py
index d475676ba06a..3b3f45bb0fe2 100644
--- a/colossalai/booster/__init__.py
+++ b/colossalai/booster/__init__.py
@@ -2,4 +2,3 @@
 from .booster import Booster
 from .environment_table import EnvironmentTable
 from .plugin import Plugin
-from .precision import Precision
diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 4aae200a0607..7b351ae343d2 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -1,37 +1,95 @@
 from contextlib import contextmanager
-from typing import Callable, Iterator, List, Optional, Tuple, Union
+from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
+from torch import Tensor
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
 
+from .mixed_precision import MixedPrecision, mixed_precision_factory
 from .plugin import Plugin
 
 __all__ = ['Booster']
 
 
 class Booster:
+    """
+    Booster is a high-level API for training neural networks. It provides a unified interface for
+    training with different precisio, accelerator, and plugin.
+
+    Examples:
+        >>> colossalai.launch(...)
+        >>> plugin = GeminiPlugin(stage=3, ...)
+        >>> booster = Booster(precision='fp16', plugin=plugin)
+        >>>
+        >>> model = GPT2()
+        >>> optimizer = Adam(model.parameters())
+        >>> dataloader = Dataloader(Dataset)
+        >>> lr_scheduler = LinearWarmupScheduler()
+        >>> criterion = GPTLMLoss()
+        >>>
+        >>> model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader)
+        >>>
+        >>> for epoch in range(max_epochs):
+        >>>     for input_ids, attention_mask in dataloader:
+        >>>         outputs = model(input_ids, attention_mask)
+        >>>         loss = criterion(outputs.logits, input_ids)
+        >>>         booster.backward(loss, optimizer)
+        >>>         optimizer.step()
+        >>>         lr_scheduler.step()
+        >>>         optimizer.zero_grad()
+
+
+    Args:
+        device (str or torch.device): The device to run the training. Default: 'cuda'.
+        mixed_precision (str or MixedPrecision): The mixed precision to run the training. Default: None.
+                                If the argument is a string, it can be 'fp16', 'fp16_apex', 'bf16', or 'fp8'.
+                                'fp16' would use PyTorch AMP while `fp16_apex` would use Nvidia Apex.
+        plugin (Plugin): The plugin to run the training. Default: None.
+    """
 
     def __init__(self,
                  device: Union[str, torch.device] = 'cuda',
-                 precision: str = 'fp32',
-                 grad_clipping_type: str = 'norm',
-                 grad_clipping_value: float = 0.0,
+                 mixed_precision: Union[MixedPrecision, str] = None,
                  plugin: Optional[Plugin] = None) -> None:
-        # TODO: implement this method
-        pass
+        # validate and set precision
+        if isinstance(MixedPrecision, str):
+            # the user will take the default arguments for amp training
+            self.mixed_precision = mixed_precision_factory(mixed_precision)
+        elif isinstance(mixed_precision, MixedPrecision):
+            # the user can customize the arguments by passing the precision object
+            self.mixed_precision = mixed_precision
+        else:
+            raise ValueError(
+                f'Expected the argument mixed_precision to be a string or an instance of Precision, but got {type(mixed_precision)}.'
+            )
 
-    def boost(
-        self, *args: Union[nn.Module, Optimizer, LRScheduler, DataLoader]
-    ) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
-        # TODO: implement this method
-        pass
+    def boost(self, model: nn.Module, optimizer: Optimizer, criterion: Callable, lr_scheduler: LRScheduler,
+              dataloader: DataLoader) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
+        """
+        Boost the model, optimizer, criterion, lr_scheduler, and dataloader.
+
+        Args:
+            model (nn.Module): The model to be boosted.
+            optimizer (Optimizer): The optimizer to be boosted.
+            criterion (Callable): The criterion to be boosted.
+            lr_scheduler (LRScheduler): The lr_scheduler to be boosted.
+            dataloader (DataLoader): The dataloader to be boosted.
+        """
+        # TODO(FrankLeeeee): consider multi-model and multi-optimizer case
+        # TODO(lsg): Add plugin control logic
+        # e.g.
+        # if self.plugin is not None and self.plugin.control_boost:
+        #    ...
+        # transform model for mixed precision
+        model, optimizer, criterion = self.mixed_precision.configure(model, optimizer, criterion)
+        return model, optimizer, criterion, lr_scheduler, dataloader
 
     def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:
-        # TODO: implement this method
-        pass
+        # TODO: implement this method with plugin
+        optimizer.backward(loss)
 
     def execute_pipeline(self,
                          data_iter: Iterator,
diff --git a/colossalai/booster/interface/__init__.py b/colossalai/booster/interface/__init__.py
new file mode 100644
index 000000000000..8892a13e1814
--- /dev/null
+++ b/colossalai/booster/interface/__init__.py
@@ -0,0 +1,3 @@
+from .optimizer import OptimizerWrapper
+
+__all__ = ['OptimizerWrapper']
diff --git a/colossalai/booster/interface/optimizer.py b/colossalai/booster/interface/optimizer.py
new file mode 100644
index 000000000000..dd9acab17584
--- /dev/null
+++ b/colossalai/booster/interface/optimizer.py
@@ -0,0 +1,121 @@
+from typing import Union
+
+import torch.nn as nn
+from torch import Tensor
+from torch.optim import Optimizer
+
+
+class OptimizerWrapper:
+    """
+    A standard interface for optimizers wrapped by the Booster.
+
+    Args:
+        optim (Optimizer): The optimizer to be wrapped.
+    """
+
+    def __init__(self, optim: Optimizer):
+        self.optim = optim
+
+    @property
+    def parameters(self):
+        params = []
+
+        for group in self.param_groups:
+            params += group['params']
+        return params
+
+    @property
+    def param_groups(self):
+        return self.optim.param_groups
+
+    @property
+    def defaults(self):
+        return self.optim.defaults
+
+    def add_param_group(self, *args, **kwargs):
+        return self.optim.add_param_group(*args, **kwargs)
+
+    def step(self, *args, **kwargs):
+        """
+        Performs a single optimization step.
+        """
+        return self.optim.step(*args, **kwargs)
+
+    def zero_grad(self, *args, **kwargs):
+        """
+        Clears the gradients of all optimized `torch.Tensor`.
+        """
+        self.optim.zero_grad(*args, **kwargs)
+
+    def backward(self, loss: Tensor, *args, **kwargs):
+        """
+        Performs a backward pass on the loss.
+        """
+        loss.backward(*args, **kwargs)
+
+    def state_dict(self):
+        """
+        Returns the optimizer state.
+        """
+        return self.optim.state_dict()
+
+    def load_state_dict(self, *args, **kwargs):
+        """
+        Loads the optimizer state.
+        """
+        self.optim.load_state_dict(*args, **kwargs)
+
+    def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
+        """
+        Clips gradient of an iterable of parameters at specified min and max values.
+
+        Args:
+            clip_value (float or int): maximum allowed value of the gradients. Gradients are clipped in the range [-clip_value, clip_value].
+
+        Note:
+            In PyTorch 2.0 and above, you can pass in foreach=True as kwargs to clip_grad_value_ to use the
+            faster implementation. Please refer to the PyTorch documentation for more details.
+        """
+        nn.utils.clip_grad_value_(self.parameters, clip_value, *args, **kwargs)
+
+    def clip_grad_by_norm(self,
+                          max_norm: Union[float, int],
+                          norm_type: Union[float, int] = 2.0,
+                          error_if_nonfinite: bool = False,
+                          *args,
+                          **kwargs) -> Tensor:
+        """
+        Clips gradient norm of an iterable of parameters.
+
+        Args:
+            max_norm (float or int): max norm of the gradients
+            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
+            error_if_nonfinite (bool): if True, an error is raised if the total norm is non-finite. Default: False
+
+        Note:
+            In PyTorch 2.0 and above, you can pass in foreach=True as kwargs to clip_grad_norm_ to use the
+            faster implementation. Please refer to the PyTorch documentation for more details.
+        """
+        norm = nn.utils.clip_grad_norm_(self.parameters, max_norm, norm_type, error_if_nonfinite, *args, **kwargs)
+        return norm
+
+    def scale_loss(self, loss: Tensor):
+        """
+        Scales the loss for mixed precision training.
+
+        Note: Only available for optimizers with mixed precision training.
+
+        Args:
+            loss (Tensor): The loss to be scaled.
+        """
+        raise NotImplementedError(
+            "The method scale_loss is only available for optimizers with mixed precision training")
+
+    def unscale_grad(self):
+        """
+        Unscale the gradients for mixed precision training.
+
+        Note: Only available for optimizers with mixed precision training.
+        """
+        raise NotImplementedError(
+            "The method unscale_grad is only available for optimizers with mixed precision training")
diff --git a/colossalai/booster/mixed_precision/__init__.py b/colossalai/booster/mixed_precision/__init__.py
new file mode 100644
index 000000000000..3cf0ad28cdbe
--- /dev/null
+++ b/colossalai/booster/mixed_precision/__init__.py
@@ -0,0 +1,33 @@
+from .bf16 import BF16MixedPrecision
+from .fp8 import FP8MixedPrecision
+from .fp16_apex import FP16ApexMixedPrecision
+from .fp16_torch import FP16TorchMixedPrecision
+from .mixed_precision_base import MixedPrecision
+
+__all__ = [
+    'MixedPrecision', 'mixed_precision_factory', 'FP16ApexMixedPrecision', 'FP16TorchMixedPrecision',
+    'BF16MixedPrecision', 'FP8MixedPrecision'
+]
+
+_mixed_precision_mapping = {
+    'fp16': FP16TorchMixedPrecision,
+    'fp16_apex': FP16ApexMixedPrecision,
+    'bf16': BF16MixedPrecision,
+    'fp8': FP8MixedPrecision
+}
+
+
+def mixed_precision_factory(mixed_precision_type: str) -> MixedPrecision:
+    """
+    Factory method to create mixed precision object
+
+    Args:
+        mixed_precision_type (str): mixed precision type, one of 'fp16', 'fp16_apex', 'bf16', and 'fp8'.
+    """
+
+    if mixed_precision_type in _mixed_precision_mapping:
+        return _mixed_precision_mapping[mixed_precision_type]()
+    else:
+        raise ValueError(
+            f'Mixed precision type {mixed_precision_type} is not supported, support types include {list(_mixed_precision_mapping.keys())}'
+        )
diff --git a/colossalai/booster/mixed_precision/bf16.py b/colossalai/booster/mixed_precision/bf16.py
new file mode 100644
index 000000000000..4a840fea69ea
--- /dev/null
+++ b/colossalai/booster/mixed_precision/bf16.py
@@ -0,0 +1,5 @@
+from .mixed_precision_base import MixedPrecision
+
+
+class BF16MixedPrecision(MixedPrecision):
+    pass
diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py
new file mode 100644
index 000000000000..266a750734b1
--- /dev/null
+++ b/colossalai/booster/mixed_precision/fp16_apex.py
@@ -0,0 +1,5 @@
+from .mixed_precision_base import MixedPrecision
+
+
+class FP16ApexMixedPrecision(MixedPrecision):
+    pass
diff --git a/colossalai/booster/mixed_precision/fp16_torch.py b/colossalai/booster/mixed_precision/fp16_torch.py
new file mode 100644
index 000000000000..054f78d2e226
--- /dev/null
+++ b/colossalai/booster/mixed_precision/fp16_torch.py
@@ -0,0 +1,125 @@
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.optim import Optimizer
+
+from ..interface import OptimizerWrapper
+from .mixed_precision_base import MixedPrecision
+
+__all__ = ['FP16TorchMixedPrecision', 'TorchAMPOptimizer', 'TorchAMPModule']
+
+
+class TorchAMPOptimizer(OptimizerWrapper):
+    """
+    Optimizer wrapper for mixed precision training in FP16 using PyTorch AMP.
+
+    Args:
+        optim (Optimizer): Optimizer to wrap.
+        init_scale (float): Initial scale factor. Default: 2**16.
+        growth_factor (float): Factor by which the scale is multiplied during
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
+            this iteration. Default: 2.0.
+        backoff_factor (float): Factor by which the scale is multiplied during
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
+            this iteration. Default: 0.5.
+        growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
+            calls that may cause the scale to increase. Default: 2000.
+    """
+
+    def __init__(self,
+                 optim: Optimizer,
+                 init_scale: float = 2.**16,
+                 growth_factor: float = 2.0,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 2000) -> None:
+        super().__init__(optim)
+        self.scaler = torch.cuda.amp.GradScaler(init_scale=init_scale,
+                                                growth_factor=growth_factor,
+                                                backoff_factor=backoff_factor,
+                                                growth_interval=growth_interval)
+
+    def backward(self, loss: Tensor, *args, **kwargs) -> None:
+        scaled_loss = self.scale_loss(loss)
+        scaled_loss.backward(*args, **kwargs)
+
+    def step(self, *args, **kwargs) -> Optional[float]:
+        out = self.scaler.step(self.optim, *args, **kwargs)
+        # GradScaler requires update() after every step(): it adjusts the scale and resets per-optimizer state
+        self.scaler.update()
+        return out
+
+    def scale_loss(self, loss: Tensor) -> Tensor:
+        return self.scaler.scale(loss)
+
+    def unscale_grad(self) -> None:
+        self.scaler.unscale_(self.optim)
+
+    def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
+        self.unscale_grad()
+        super().clip_grad_by_value(clip_value, *args, **kwargs)
+
+    def clip_grad_by_norm(self,
+                          max_norm: Union[float, int],
+                          norm_type: Union[float, int] = 2.0,
+                          error_if_nonfinite: bool = False,
+                          *args,
+                          **kwargs) -> Tensor:
+        self.unscale_grad()
+        return super().clip_grad_by_norm(max_norm, norm_type, error_if_nonfinite, *args, **kwargs)
+
+
+class TorchAMPModule(nn.Module):
+    """
+    Module wrapper for mixed precision training in FP16 using PyTorch AMP.
+
+    Args:
+        module (nn.Module): Module to wrap.
+    """
+
+    def __init__(self, module: nn.Module):
+        super().__init__()
+        self.module = module
+
+    def forward(self, *args, **kwargs):
+        with torch.cuda.amp.autocast():
+            return self.module(*args, **kwargs)
+
+
+class FP16TorchMixedPrecision(MixedPrecision):
+    """
+    Precision for mixed precision training in FP16 using PyTorch AMP.
+
+    Args:
+        init_scale (float): Initial scale factor. Default: 2**16.
+        growth_factor (float): Factor by which the scale is multiplied during
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
+            this iteration. Default: 2.0.
+        backoff_factor (float): Factor by which the scale is multiplied during
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
+            this iteration. Default: 0.5.
+        growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
+            calls that may cause the scale to increase. Default: 2000.
+    """
+
+    def __init__(self,
+                 init_scale: float = 2.**16,
+                 growth_factor: float = 2.0,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 2000) -> None:
+        super().__init__()
+        self.torch_amp_kwargs = dict(init_scale=init_scale,
+                                     growth_factor=growth_factor,
+                                     backoff_factor=backoff_factor,
+                                     growth_interval=growth_interval)
+
+    def configure(self,
+                  model: nn.Module,
+                  optimizer: Optimizer,
+                  criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
+        model = TorchAMPModule(model)
+        optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)
+        if criterion is not None:
+            criterion = TorchAMPModule(criterion)
+        return model, optimizer, criterion
diff --git a/colossalai/booster/mixed_precision/fp8.py b/colossalai/booster/mixed_precision/fp8.py
new file mode 100644
index 000000000000..28847345d91d
--- /dev/null
+++ b/colossalai/booster/mixed_precision/fp8.py
@@ -0,0 +1,5 @@
+from .mixed_precision_base import MixedPrecision
+
+
+class FP8MixedPrecision(MixedPrecision):
+    pass
diff --git a/colossalai/booster/mixed_precision/mixed_precision_base.py b/colossalai/booster/mixed_precision/mixed_precision_base.py
new file mode 100644
index 000000000000..d1e8acc82cc6
--- /dev/null
+++ b/colossalai/booster/mixed_precision/mixed_precision_base.py
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import Callable, Tuple
+
+import torch.nn as nn
+from torch.optim import Optimizer
+
+from ..interface import OptimizerWrapper
+
+
+class MixedPrecision(ABC):
+    """
+    An abstract class for mixed precision training.
+    """
+
+    @abstractmethod
+    def configure(self,
+                  model: nn.Module,
+                  optimizer: Optimizer,
+                  criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
+        # TODO: implement this method
+        pass
diff --git a/colossalai/booster/precision.py b/colossalai/booster/precision.py
deleted file mode 100644
index 8a391d9e4c88..000000000000
--- a/colossalai/booster/precision.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.optim import Optimizer
-
-__all__ = ['Precision']
-
-
-class Precision:
-
-    def __init__(self, precision_type: torch.dtype, grad_clipping_type: str, grad_clipping_value: float):
-        self.precision_type = precision_type
-        self.grad_clipping_type = grad_clipping_type
-        self.grad_clipping_value = grad_clipping_value
-
-    def setup_model(self, model: nn.Module) -> nn.Module:
-        # TODO: implement this method
-        pass
-
-    def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
-        # TODO: implement this method
-        # inject grad clipping and unscale loss
-        pass
-
-    def scale_loss(self, loss: torch.Tensor) -> torch.Tensor:
-        pass
diff --git a/tests/kit/model_zoo/transformers/gpt.py b/tests/kit/model_zoo/transformers/gpt.py
index a92a46e36f0b..2a100c981dea 100644
--- a/tests/kit/model_zoo/transformers/gpt.py
+++ b/tests/kit/model_zoo/transformers/gpt.py
@@ -6,7 +6,7 @@
 # ===============================
 # Register single-sentence GPT
 # ===============================
-BATCH_SIZE = 2
+BATCH_SIZE = 1    # it can only be 1 as GPT cannot handle batch sizes > 1 if no padding token is defined.
 SEQ_LENGTH = 16
 
 
diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
new file mode 100644
index 000000000000..c56fcae58a60
--- /dev/null
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -0,0 +1,23 @@
+import torch
+from torch.optim import Adam
+
+from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
+from tests.kit.model_zoo import model_zoo
+
+
+def test_torch_amp():
+    for name, (model_fn, data_gen_fn, output_transform_fn, _) in model_zoo.items():
+        model = model_fn().cuda()
+        optimizer = Adam(model.parameters(), lr=1e-3)
+        criterion = lambda x: x.mean()
+        data = data_gen_fn()
+        data = {k: v.cuda() if torch.is_tensor(v) else v for k, v in data.items()}
+        mixed_precision = FP16TorchMixedPrecision()
+        model, optimizer, criterion = mixed_precision.configure(model, optimizer, criterion)
+        output = model(**data)
+        output = output_transform_fn(output)
+        output_key = list(output.keys())[0]
+        loss = criterion(output[output_key])
+        optimizer.backward(loss)
+        optimizer.clip_grad_by_norm(1.0)
+        optimizer.step()

From 3c01280a56353a363a031a630d7f4b6eb44b17f5 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Fri, 17 Mar 2023 11:07:24 +0800
Subject: [PATCH 475/503] [doc] add community contribution guide (#3153)

* [doc] update contribution guide

* [doc] update contribution guide

* [doc] add community contribution guide
---
 README-zh-Hans.md                    |  9 ++++++++-
 README.md                            |  9 +++++++--
 applications/ChatGPT/README.md       | 10 +++++-----
 examples/README.md                   | 12 ++++++++++++
 examples/images/diffusion/README.md  | 13 +++++++++++++
 examples/images/dreambooth/README.md | 13 +++++++++++++
 6 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 283cc27cb9c2..81c45abfd833 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -11,6 +11,7 @@
    <a href="https://github.com/hpcaitech/ColossalAI/discussions"> 论坛 </a> |
    <a href="https://medium.com/@hpcaitech"> 博客 </a></h3>
 
+   [![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers)
    [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
    [![Documentation](https://readthedocs.org/projects/colossalai/badge/?version=latest)](https://colossalai.readthedocs.io/en/latest/?badge=latest)
    [![CodeFactor](https://www.codefactor.io/repository/github/hpcaitech/colossalai/badge)](https://www.codefactor.io/repository/github/hpcaitech/colossalai)
@@ -375,7 +376,13 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash
 
 ## 做出贡献
 
-欢迎为该项目做出贡献，请参阅[贡献指南](./CONTRIBUTING.md)。
+参考社区的成功案例，如 [BLOOM](https://bigscience.huggingface.co/) 和 [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion) 等，
+无论是个人开发者，还是算力、数据、模型等可能合作方，都欢迎参与共建 Colossal-AI 社区，拥抱大模型时代！
+
+您可通过以下方式联系或参与：
+1. [留下Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) 展现你的喜爱和支持，非常感谢!
+2. 发布 [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), 或者在GitHub根据[贡献指南](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md) 提交一个 PR。
+3. 发送你的正式合作提案到 contact@hpcaitech.com
 
 真诚感谢所有贡献者！
 
diff --git a/README.md b/README.md
index 602193f76def..3b55649b44bb 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@
    <a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
    <a href="https://medium.com/@hpcaitech"> Blog </a></h3>
 
+   [![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers)
    [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
    [![Documentation](https://readthedocs.org/projects/colossalai/badge/?version=latest)](https://colossalai.readthedocs.io/en/latest/?badge=latest)
    [![CodeFactor](https://www.codefactor.io/repository/github/hpcaitech/colossalai/badge)](https://www.codefactor.io/repository/github/hpcaitech/colossalai)
@@ -374,9 +375,13 @@ Join the Colossal-AI community on [Forum](https://github.com/hpcaitech/ColossalA
 [Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
 and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your suggestions, feedback, and questions with our engineering team.
 
-## Contributing
+## Invitation to open-source contribution
+Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!
 
-If you wish to contribute to this project, please follow the guideline in [Contributing](./CONTRIBUTING.md).
+You may contact us or participate in the following ways:
+1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
+2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub following the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
+3. Sending your official proposal by email to contact@hpcaitech.com
 
 Thanks so much to all of our amazing contributors!
 
diff --git a/applications/ChatGPT/README.md b/applications/ChatGPT/README.md
index 23c6aa3726ce..206ede5f1843 100644
--- a/applications/ChatGPT/README.md
+++ b/applications/ChatGPT/README.md
@@ -156,15 +156,15 @@ You will find our progress in github project broad
 [Open ChatGPT](https://github.com/orgs/hpcaitech/projects/17/views/1)
 
 ## Invitation to open-source contribution
-Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build an ecosystem with Colossal-AI, making efforts towards the era of big AI models from the starting point of replicating ChatGPT!
+Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models from the starting point of replicating ChatGPT!
 
 You may contact us or participate in the following ways:
-1. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) or submitting a [PR](https://github.com/hpcaitech/ColossalAI/pulls) on GitHub
-2. Join the Colossal-AI community on
+1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
+2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub following the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
+3. Join the Colossal-AI community on
 [Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
 and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
-3. Check out and fill in the [cooperation proposal](https://www.hpc-ai.tech/partners)
-4. Send your proposal to email contact@hpcaitech.com
+4. Send your official proposal by email to contact@hpcaitech.com
 
 Thanks so much to all of our amazing contributors!
 
diff --git a/examples/README.md b/examples/README.md
index 78facea5406d..710ced101768 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -24,6 +24,18 @@ This folder provides several examples accelerated by Colossal-AI. The `tutorial`
       └─ ...
   └─ ...
 ```
+## Invitation to open-source contribution
+Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!
+
+You may contact us or participate in the following ways:
+1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
+2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub following the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
+3. Join the Colossal-AI community on
+[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
+and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
+4. Send your official proposal by email to contact@hpcaitech.com
+
+Thanks so much to all of our amazing contributors!
 
 ## Integrate Your Example With Testing
 
diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index ff468f4f4acc..2a99094b703a 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -237,6 +237,19 @@ optional arguments:
                         evaluate at this precision
 ```
 
+## Invitation to open-source contribution
+Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!
+
+You may contact us or participate in the following ways:
+1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
+2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub following the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
+3. Join the Colossal-AI community on
+[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
+and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
+4. Send your official proposal by email to contact@hpcaitech.com
+
+Thanks so much to all of our amazing contributors!
+
 ## Comments
 
 - Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md
index 83b7e4c06d54..14ed66c8d45b 100644
--- a/examples/images/dreambooth/README.md
+++ b/examples/images/dreambooth/README.md
@@ -105,3 +105,16 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
 
 image.save("dog-bucket.png")
 ```
+
+## Invitation to open-source contribution
+Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!
+
+You may contact us or participate in the following ways:
+1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
+2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub following the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
+3. Join the Colossal-AI community on
+[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
+and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
+4. Send your official proposal by email to contact@hpcaitech.com
+
+Thanks so much to all of our amazing contributors!

From 6ae8ed04070c2654b5b4d1aaab7efb42aaa2b030 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 17 Mar 2023 13:49:04 +0800
Subject: [PATCH 476/503] [lazyinit] add correctness verification (#3147)

* [lazyinit] fix shared module

* [tests] add lazy init test utils

* [tests] add torchvision for lazy init

* [lazyinit] fix pre op fn

* [lazyinit] handle legacy constructor

* [tests] refactor lazy init test models

* [tests] refactor lazy init test utils

* [lazyinit] fix ops don't support meta

* [tests] lazy init test timm models

* [lazyinit] fix set data

* [lazyinit] handle apex layers

* [tests] lazy init test transformers models

* [tests] lazy init test torchaudio models

* [lazyinit] fix import path

* [tests] lazy init test torchrec models

* [tests] update torch version in CI

* [tests] revert torch version in CI

* [tests] skip lazy init test
---
 colossalai/utils/model/experimental.py        | 165 ++++++++++++++----
 tests/kit/model_zoo/torchrec/__init__.py      |   1 +
 .../test_utils/test_lazy_init/test_models.py  |  23 +++
 tests/test_utils/test_lazy_init/utils.py      |  69 ++++++++
 4 files changed, 226 insertions(+), 32 deletions(-)
 create mode 100644 tests/kit/model_zoo/torchrec/__init__.py
 create mode 100644 tests/test_utils/test_lazy_init/test_models.py
 create mode 100644 tests/test_utils/test_lazy_init/utils.py

diff --git a/colossalai/utils/model/experimental.py b/colossalai/utils/model/experimental.py
index b8eb742f8c71..00cb532d9c1d 100644
--- a/colossalai/utils/model/experimental.py
+++ b/colossalai/utils/model/experimental.py
@@ -1,17 +1,16 @@
-from typing import Callable, Optional, Union
+from typing import Callable, List, Optional, Union
 
 import torch
 import torch.nn as nn
 from torch import Tensor
 from torch.utils._pytree import tree_map
 
-from colossalai.fx.profiler import MetaTensor
+from colossalai.fx.profiler.tensor import MetaTensor
 
 # reference: https://pytorch.org/cppdocs/notes/tensor_creation.html
-_TorchFactoryMethod = [
+_NORMAL_FACTORY = [
     "arange",
     "empty",
-    "eye",
     "full",
     "linspace",
     "logspace",
@@ -24,17 +23,39 @@
     "tensor",
 ]
 
+# factory function that does not support meta tensor backend
+_NO_META_FACTORY = [
+    "eye",
+]
+
 _EARLY_MATERIALIZED_OPS = ['__getitem__', 'split']
 
+_LEGACY_TENSOR_CONSTRUCTOR = {
+    'FloatTensor': torch.float,
+    'DoubleTensor': torch.double,
+    'HalfTensor': torch.half,
+    'BFloat16Tensor': torch.bfloat16,
+    'ByteTensor': torch.uint8,
+    'CharTensor': torch.int8,
+    'ShortTensor': torch.short,
+    'IntTensor': torch.int,
+    'LongTensor': torch.long,
+    'BoolTensor': torch.bool,
+}
+
 
 class _MyTensor(Tensor):
     """This class is only for correctness verification.
     """
     _pre_op_fn: Callable[['LazyTensor'], None] = lambda *args: None
 
-    def __new__(cls, func, *args, dtype=None, device=None, **kwargs) -> '_MyTensor':
+    def __new__(cls, func, *args, concrete_data=None, **kwargs) -> '_MyTensor':
         cls._pre_op_fn()
-        data = func(*args, dtype=dtype, device=device, **kwargs)
+        if concrete_data is not None:
+            # uniform api as LazyTensor
+            data = concrete_data
+        else:
+            data = func(*args, **kwargs)
         return Tensor._make_subclass(cls, data, require_grad=data.requires_grad)
 
     @classmethod
@@ -66,11 +87,13 @@ class LazyTensor(torch.Tensor):
         >>> x.add_(1) # modifying origin tensor after cloning leads to wrong materialization
         >>> z = x.tolist()
         >>> x.zeros_() # modifying origin tensor after cloning tolist is not allowed
-        >>> x.data = torch.rand(2, 3) # directly set data of a lazy tensor is not allowed
+        >>> nn.utils.weight_norm(self.conv, name="weight", dim=2) # applying weight norm on a lazy tensor is not allowed
+
 
         2. Cases that ``LazyTensor`` becomes eager (early materialization).
         >>> b = a[:, 2:]  # get a slice of a lazy tensor triggers early materialization
         >>> chunks = a.split(3)  # this also triggers early materialization
+        >>> x.data = torch.rand(2, 3) # directly setting data of a lazy tensor triggers early materialization
 
     """
 
@@ -79,12 +102,16 @@ class LazyTensor(torch.Tensor):
     _pre_op_fn: Callable[['LazyTensor'], None] = lambda *args: None
 
     @staticmethod
-    def __new__(cls, func, *args, meta_data=None, **kwargs):
-        if meta_data is None:
-            device = kwargs.get('device', 'cpu')
-            elem = func(*args, **{**kwargs, 'device': 'meta'})
-            meta_data = MetaTensor(elem, fake_device=device)
-        elem = meta_data._tensor
+    def __new__(cls, func, *args, meta_data=None, concrete_data=None, **kwargs):
+        if concrete_data is not None:
+            # some ops don't support meta backend and should have concrete data
+            elem = concrete_data
+        else:
+            if meta_data is None:
+                device = kwargs.get('device', 'cpu')
+                elem = func(*args, **{**kwargs, 'device': 'meta'})
+                meta_data = MetaTensor(elem, fake_device=device)
+            elem = meta_data._tensor
         r = torch.Tensor._make_wrapper_subclass(cls,
                                                 elem.size(),
                                                 strides=elem.stride(),
@@ -96,10 +123,10 @@ def __new__(cls, func, *args, meta_data=None, **kwargs):
         r._meta_data = meta_data
         return r
 
-    def __init__(self, func, *args, meta_data=None, **kwargs):
+    def __init__(self, func, *args, meta_data=None, concrete_data=None, **kwargs):
         self._factory_method = (func, args, kwargs)    # (func, args, kwargs)
         self._op_buffer = []    # (func, args, kwargs, replace)
-        self._materialized_data: Optional[torch.Tensor] = None    # materialized data
+        self._materialized_data: Optional[torch.Tensor] = concrete_data    # materialized data
 
     def materialize(self) -> torch.Tensor:
         """Materialize the ``LazyTensor`` to ``torch.Tensor``.
@@ -212,7 +239,7 @@ def unwrap(x):
                 if isinstance(x, LazyTensor):
                     if x._materialized_data is not None:
                         # for early materialized tensor, use its materialized data directly
-                        return x._materialized_data
+                        return x._materialized_data.data
                     t = x if is_inplace else x.clone()
                     t._op_buffer.append((func, args, kwargs))
                     meta = x._meta_data.data
@@ -232,13 +259,10 @@ def wrap(y, i=None):
                         return lazy_y
                 elif type(y) is Tensor:
                     # for early materialized tensor
-                    with torch._C.DisableTorchFunction():
-                        meta = MetaTensor(y.new_empty(y.shape, dtype=y.dtype, device='meta'), fake_device=y.device)
-                    lazy_y = LazyTensor(lambda: None, meta_data=meta)
-                    lazy_y._materialized_data = y
-                    return lazy_y
+                    return LazyTensor(lambda: None, concrete_data=y)
                 return y
 
+            cls._pre_op_fn()
             o = func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))
             if isinstance(o, (tuple, list)):
                 return type(o)(wrap(y, i=i) for i, y in enumerate(o))
@@ -266,7 +290,10 @@ def data(self):
 
     @data.setter
     def data(self, other: 'LazyTensor'):
-        raise NotImplementedError
+        if other is self:
+            return
+        # TODO(ver217): to avoid infinite recursion, do early materialization
+        self._materialized_data = other._materialize_data()
 
     def tolist(self) -> list:
         t = self.materialize()
@@ -330,18 +357,61 @@ def wrapper(*args, **kwargs):
 
             return wrapper, target
 
+        def wrap_legacy_constructor(target, dtype):
+            # legacy constructor (e.g. torch.LongTensor()); returns (wrapper, target) like the other wrap_* helpers
+            def wrapper(*args, **kwargs):
+                if len(args) == 1 and isinstance(args[0], torch.Tensor):
+                    # (Tensor other): the given tensor is returned as-is, no copy or dtype change
+                    return args[0]
+                elif len(args) == 1:
+                    # (object data, *, torch.device device) - NOTE(review): a lone int, e.g. LongTensor(5), also lands here; confirm intended
+                    kwargs = {**kwargs, 'dtype': dtype}
+                    replaced, orig = self.overrides['tensor']
+                    return replaced(*args, **kwargs)
+                elif _is_int_tuple(args):
+                    # (tuple of ints size, *, torch.device device): forwarded to the overridden torch.empty
+                    kwargs = {**kwargs, 'dtype': dtype}
+                    replaced, orig = self.overrides['empty']
+                    return replaced(*args, **kwargs)
+                else:
+                    raise TypeError(
+                        f'new() received an invalid combination of arguments - got {tuple(type(x) for x in args)}, but expected one of:\n * (Tensor other)\n * (tuple of ints size, *, torch.device device)\n * (object data, *, torch.device device)'
+                    )
+
+            return wrapper, target
+
+        def wrap_no_meta_factory(target):
+            # factory functions without meta backend support: call the real factory and wrap its result as concrete data
+            def wrapper(*args, **kwargs):
+                tensor = target(*args, **kwargs)
+                return self.tensor_cls(lambda: None, concrete_data=tensor)
+
+            return wrapper, target
+
         self.overrides = {
             target: wrap_factory_method(getattr(torch, target))
-            for target in _TorchFactoryMethod
+            for target in _NORMAL_FACTORY
             if callable(getattr(torch, target, None))
         }
 
         self.overrides.update({
             target + '_like': wrap_factory_like_method(getattr(torch, target), getattr(torch, target + '_like'))
-            for target in _TorchFactoryMethod
+            for target in _NORMAL_FACTORY
             if callable(getattr(torch, target + '_like', None))
         })
 
+        self.overrides.update({
+            target: wrap_legacy_constructor(getattr(torch, target), dtype)
+            for target, dtype in _LEGACY_TENSOR_CONSTRUCTOR.items()
+            if callable(getattr(torch, target, None))
+        })
+
+        self.overrides.update({
+            target: wrap_no_meta_factory(getattr(torch, target))
+            for target in _NO_META_FACTORY
+            if callable(getattr(torch, target, None))
+        })
+
         for name, (wrapper, orig) in self.overrides.items():
             setattr(torch, name, wrapper)
 
@@ -363,34 +433,65 @@ def materialize(module: torch.nn.Module, verbose: bool = False):
             param_lazy_cnt = 0
             buf_cnt = 0
             buf_lazy_cnt = 0
+            non_lazy_numel = 0
+
+        # do post cleaning to handle shared parameter
+        visited_lazy_tensors: List[LazyTensor] = []
+        # handle shared module
+        visited_modules = set()
 
         @torch.no_grad()
         def init_recursively(module: nn.Module):
-            nonlocal param_cnt, param_lazy_cnt, buf_cnt, buf_lazy_cnt
+            nonlocal param_cnt, param_lazy_cnt, buf_cnt, buf_lazy_cnt, non_lazy_numel
             # recursively initialize the module
             for mod in module.children():
-                init_recursively(mod)
+                if id(mod) not in visited_modules:
+                    visited_modules.add(id(mod))
+                    init_recursively(mod)
 
             # initialize tensors directly attached to the current module
             for name, param in module.named_parameters(recurse=False):
                 if verbose:
                     param_cnt += 1
-                    if param._materialized_data is None:
+                    if getattr(param, '_materialized_data', False) is None:
+                        # still lazy: _materialized_data is None (a missing attr means non-lazy and yields False)
                         param_lazy_cnt += 1
-                setattr(module, name, param.materialize())
-                param.clean()
+                    else:
+                        non_lazy_numel += param.numel()
+                if hasattr(param, 'materialize'):
+                    # TODO(ver217): apex layers cannot be captured
+                    visited_lazy_tensors.append(param)
+                    setattr(module, name, param.materialize())
 
             for name, buf in module.named_buffers(recurse=False):
                 if verbose:
                     buf_cnt += 1
-                    if buf._materialized_data is None:
+                    if getattr(buf, "_materialized_data", False) is None:
+                        # still lazy: _materialized_data is None (a missing attr means non-lazy and yields False)
                         buf_lazy_cnt += 1
-                setattr(module, name, buf.materialize())
-                buf.clean()
+                    else:
+                        non_lazy_numel += buf.numel()
+                if hasattr(buf, 'materialize'):
+                    # TODO(ver217): apex layers cannot be captured
+                    visited_lazy_tensors.append(buf)
+                    setattr(module, name, buf.materialize())
 
         init_recursively(module)
 
+        for t in visited_lazy_tensors:
+            t.clean()
+
         if verbose:
             print(f'Param lazy rate: {param_lazy_cnt}/{param_cnt}')
             print(f'Buffer lazy rate: {buf_lazy_cnt}/{buf_cnt}')
+            print(f'Non-lazy numel: {non_lazy_numel} ({non_lazy_numel/1024**2:.3f} M)')
         return module
+
+
+def _is_int_tuple(args) -> bool:
+    """Return True iff ``args`` is a tuple whose elements are all ``int`` instances."""
+    # anything that is not a tuple is rejected outright
+    if not isinstance(args, tuple):
+        return False
+    # an empty tuple passes: there is no element to fail the check
+    return all(isinstance(item, int) for item in args)
diff --git a/tests/kit/model_zoo/torchrec/__init__.py b/tests/kit/model_zoo/torchrec/__init__.py
new file mode 100644
index 000000000000..43952e6998cf
--- /dev/null
+++ b/tests/kit/model_zoo/torchrec/__init__.py
@@ -0,0 +1 @@
+from .torchrec import *
diff --git a/tests/test_utils/test_lazy_init/test_models.py b/tests/test_utils/test_lazy_init/test_models.py
new file mode 100644
index 000000000000..9faddecbaca4
--- /dev/null
+++ b/tests/test_utils/test_lazy_init/test_models.py
@@ -0,0 +1,23 @@
+import pytest
+
+from tests.kit.model_zoo import model_zoo
+
+# FIXME(ver217): uncomment this line
+# from utils import check_lazy_init
+
+
+# FIXME(ver217): temporarily skip this test since torch 1.11 does not fully support meta tensor
+@pytest.mark.skip
+@pytest.mark.parametrize('subset', ['torchvision', 'diffusers', 'timm', 'transformers', 'torchaudio', 'deepfm', 'dlrm'])
+def test_torchvision_models_lazy_init(subset):
+    sub_model_zoo = model_zoo.get_sub_registry(subset)
+    for name, entry in sub_model_zoo.items():
+        # TODO(ver217): lazy init does not support weight norm, so these models are skipped
+        if name in ('torchaudio_wav2vec2_base', 'torchaudio_hubert_base'):
+            continue
+        # FIXME(ver217): uncomment the call below; until then this loop checks nothing
+        # check_lazy_init(entry, verbose=True)
+
+
+if __name__ == '__main__':
+    test_torchvision_models_lazy_init('torchvision')
diff --git a/tests/test_utils/test_lazy_init/utils.py b/tests/test_utils/test_lazy_init/utils.py
new file mode 100644
index 000000000000..47ba534bc434
--- /dev/null
+++ b/tests/test_utils/test_lazy_init/utils.py
@@ -0,0 +1,69 @@
+import random
+from typing import Any, Callable, Optional, Tuple
+
+import numpy as np
+import torch
+
+from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor
+from tests.kit.model_zoo.registry import ModelAttribute
+
+# model_fn, data_gen_fn, output_transform_fn, model_attr
+TestingEntry = Tuple[Callable[[], torch.nn.Module], Callable[[], dict], Callable[[], dict], Optional[ModelAttribute]]
+
+
+def set_seed(seed: int) -> None:
+    # seed the Python, NumPy and PyTorch global RNGs, in that order, with the same value
+    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
+        seed_rng(seed)
+
+
+def assert_model_eqaual(m1: torch.nn.Module, m2: torch.nn.Module) -> None:
+    """Assert both modules have state dicts with the same length, entry names and tensor values."""
+    state1, state2 = m1.state_dict(), m2.state_dict()
+
+    assert len(state1) == len(state2), f'len {len(state1)} vs {len(state2)}'
+
+    for (n1, t1), (n2, t2) in zip(state1.items(), state2.items()):
+        assert n1 == n2
+        assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}'
+
+
+def assert_forward_equal(m1: torch.nn.Module, m2: torch.nn.Module, data_gen_fn: Callable[[], dict],
+                         output_transform_fn: Callable[[Any], dict]) -> None:
+    """Assert ``m1`` and ``m2`` give matching outputs (``atol=1e-5``) on one generated batch."""
+    batch = data_gen_fn()
+
+    # both models are switched to eval mode before the forward passes
+    m1.eval()
+    m2.eval()
+    with torch.no_grad():
+        raw1 = m1(**batch)
+        raw2 = m2(**batch)
+
+    # compare the transformed outputs key by key
+    result1 = output_transform_fn(raw1)
+    result2 = output_transform_fn(raw2)
+    assert len(result1) == len(result2)
+
+    for key in result1:
+        out1, out2 = result1[key], result2[key]
+        assert torch.allclose(out1, out2, atol=1e-5), \
+            f'{m1.__class__.__name__} has inconsistent outputs, {out1} vs {out2}'
+
+
+def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False, check_forward: bool = False) -> None:
+    """Check that a lazily built, then materialized model equals one built under ``_MyTensor``."""
+    model_fn, data_gen_fn, output_transform_fn, model_attr = entry
+    _MyTensor._pre_op_fn = lambda *args: set_seed(seed)
+    LazyTensor._pre_op_fn = lambda *args: set_seed(seed)
+    with LazyInitContext(tensor_cls=_MyTensor):
+        model = model_fn()
+    lazy_ctx = LazyInitContext()
+    with lazy_ctx:
+        deferred_model = model_fn()
+    deferred_model = lazy_ctx.materialize(deferred_model, verbose=verbose)
+    assert_model_eqaual(model, deferred_model)
+    if check_forward:
+        assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn)
+    if verbose:
+        print(f'{model.__class__.__name__} pass')

From c474fda282da90672527d5ad638d23bc4eec1ff8 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 17 Mar 2023 15:41:47 +0800
Subject: [PATCH 477/503] [chatgpt] fix ppo training hanging problem with
 gemini (#3162)

* [chatgpt] fix generation early stopping

* [chatgpt] fix train prompts example
---
 applications/ChatGPT/chatgpt/models/generation.py | 11 ++++++++++-
 applications/ChatGPT/examples/train_prompts.py    |  7 ++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/models/generation.py b/applications/ChatGPT/chatgpt/models/generation.py
index 4ee797561f7f..eb30c36d0f84 100644
--- a/applications/ChatGPT/chatgpt/models/generation.py
+++ b/applications/ChatGPT/chatgpt/models/generation.py
@@ -1,6 +1,7 @@
 from typing import Any, Callable, Optional
 
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 
 try:
@@ -27,6 +28,14 @@ def prepare_logits_processor(top_k: Optional[int] = None,
     return processor_list
 
 
+def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
+    if dist.is_initialized() and dist.get_world_size() > 1:
+        # data parallel: all-reduce a copy so the caller's tensor is left untouched
+        unfinished_sequences = unfinished_sequences.clone()
+        dist.all_reduce(unfinished_sequences)
+    return bool(unfinished_sequences.max() == 0)    # plain bool, as annotated, not a 0-dim tensor
+
+
 def sample(model: nn.Module,
            input_ids: torch.Tensor,
            max_length: int,
@@ -74,7 +83,7 @@ def sample(model: nn.Module,
             unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
 
         # stop when each sentence is finished if early_stopping=True
-        if early_stopping and unfinished_sequences.max() == 0:
+        if early_stopping and _is_sequence_finished(unfinished_sequences):
             break
 
     return input_ids
diff --git a/applications/ChatGPT/examples/train_prompts.py b/applications/ChatGPT/examples/train_prompts.py
index d4f31e61eb75..8f48a11c33e8 100644
--- a/applications/ChatGPT/examples/train_prompts.py
+++ b/applications/ChatGPT/examples/train_prompts.py
@@ -46,7 +46,6 @@ def main(args):
         initial_model = deepcopy(actor)
         reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).to(torch.cuda.current_device())
 
-
     # configure optimizer
     if args.strategy.startswith('colossalai'):
         actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
@@ -70,7 +69,9 @@ def main(args):
     dataset = pd.read_csv(args.prompt_path)['prompt']
 
     def tokenize_fn(texts):
-        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding=True, truncation=True)
+        # MUST pad to max length to ensure inputs of all ranks have the same length
+        # Different lengths may lead to a hang when using gemini, as ranks would run different numbers of generation steps
+        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
         return {k: v.cuda() for k, v in batch.items()}
 
     (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
@@ -101,7 +102,7 @@ def tokenize_fn(texts):
                 num_episodes=args.num_episodes,
                 max_timesteps=args.max_timesteps,
                 update_timesteps=args.update_timesteps)
-    # save model checkpoint after fitting 
+    # save model checkpoint after fitting
     strategy.save_model(actor, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:

From 1e58d31bb75ce973f1fea7d4f061444138e35654 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 17 Mar 2023 17:31:22 +0800
Subject: [PATCH 478/503] [chatgpt] fix trainer generate kwargs (#3166)

---
 applications/ChatGPT/chatgpt/trainer/ppo.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/trainer/ppo.py b/applications/ChatGPT/chatgpt/trainer/ppo.py
index 789e0c2f8f1e..dacab4784039 100644
--- a/applications/ChatGPT/chatgpt/trainer/ppo.py
+++ b/applications/ChatGPT/chatgpt/trainer/ppo.py
@@ -63,6 +63,7 @@ def __init__(self,
                  **generate_kwargs) -> None:
         experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef)
         replay_buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
+        generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
         super().__init__(strategy, experience_maker, replay_buffer, experience_batch_size, max_epochs, tokenizer,
                          sample_replay_buffer, dataloader_pin_memory, callbacks, **generate_kwargs)
         self.actor = actor
@@ -73,7 +74,6 @@ def __init__(self,
 
         self.actor_optim = actor_optim
         self.critic_optim = critic_optim
-        self._set_default_generate_kwargs(generate_kwargs, actor)
 
     def training_step(self, experience: Experience) -> Dict[str, float]:
         self.actor.train()
@@ -102,11 +102,15 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
 
         return {'actor_loss': actor_loss.item(), 'critic_loss': critic_loss.item()}
 
-    def _set_default_generate_kwargs(self, generate_kwargs: dict, actor: Actor) -> None:
-        origin_model = self.strategy._unwrap_actor(actor)
-        # use huggingface models method directly
-        if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
-            generate_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
 
-        if 'update_model_kwargs_fn' not in generate_kwargs:
-            generate_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn
+def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> dict:
+    """Return a copy of ``generate_kwargs`` with default generation hooks filled in where missing."""
+    origin_model = strategy._unwrap_actor(actor)
+    new_kwargs = {**generate_kwargs}
+    # use huggingface models method directly
+    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
+        new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
+    # fall back to the module-level update_model_kwargs_fn unless the caller supplied one
+    if 'update_model_kwargs_fn' not in generate_kwargs:
+        new_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn
+    return new_kwargs

From 7548ca5a54ed117f03247dcb43ec1dd962ae04e0 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Mon, 20 Mar 2023 09:59:06 +0800
Subject: [PATCH 479/503] [chatgpt]Reward Model Training  Process update
 (#3133)

* add normalize function to value_head in bloom rm

* add normalization to value_function in gpt_rm

* add normalization to value_head of opt_rm

* add Anthropic/hh-rlhf dataset

* Update __init__.py

* Add LogExpLoss in RM training

* Update __init__.py

* update rm trainer to use acc as target

* update example/train_rm

* Update train_rm.sh

* code style

* Update README.md

* Update README.md

* add rm test to ci

* fix tokenier

* fix typo

* change batchsize to avoid oom in ci

* Update test_ci.sh
---
 .../ChatGPT/chatgpt/dataset/__init__.py       |   4 +-
 .../ChatGPT/chatgpt/dataset/reward_dataset.py |  65 +++++++++-
 .../ChatGPT/chatgpt/models/__init__.py        |   4 +-
 .../ChatGPT/chatgpt/models/bloom/bloom_rm.py  |   1 +
 .../ChatGPT/chatgpt/models/gpt/gpt_rm.py      |   1 +
 applications/ChatGPT/chatgpt/models/loss.py   |  14 ++-
 .../ChatGPT/chatgpt/models/opt/opt_rm.py      |   1 +
 applications/ChatGPT/chatgpt/trainer/rm.py    | 111 +++++++++++-------
 applications/ChatGPT/examples/README.md       |  41 +++++--
 applications/ChatGPT/examples/test_ci.sh      |  20 ++++
 .../ChatGPT/examples/train_reward_model.py    |  93 ++++++++++-----
 applications/ChatGPT/examples/train_rm.sh     |  26 ++--
 12 files changed, 270 insertions(+), 111 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/dataset/__init__.py b/applications/ChatGPT/chatgpt/dataset/__init__.py
index b4599c82ba75..83393098775f 100644
--- a/applications/ChatGPT/chatgpt/dataset/__init__.py
+++ b/applications/ChatGPT/chatgpt/dataset/__init__.py
@@ -1,4 +1,4 @@
-from .reward_dataset import RewardDataset
+from .reward_dataset import RmStaticDataset, HhRlhfDataset
 from .utils import is_rank_0
 
-__all__ = ['RewardDataset', 'is_rank_0']
+__all__ = ['RmStaticDataset', 'HhRlhfDataset','is_rank_0']
diff --git a/applications/ChatGPT/chatgpt/dataset/reward_dataset.py b/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
index 8bc850f2d52d..9ee13490b893 100644
--- a/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
+++ b/applications/ChatGPT/chatgpt/dataset/reward_dataset.py
@@ -5,8 +5,8 @@
 
 from .utils import is_rank_0
 
-
-class RewardDataset(Dataset):
+# Dahoas/rm-static
+class RmStaticDataset(Dataset):
     """
     Dataset for reward model
 
@@ -14,16 +14,71 @@ class RewardDataset(Dataset):
         dataset: dataset for reward model
         tokenizer: tokenizer for reward model
         max_length: max length of input
+        special_token: special token at the end of sentence
     """
 
-    def __init__(self, dataset, tokenizer: Callable, max_length: int) -> None:
+    def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
         super().__init__()
         self.chosen = []
         self.reject = []
+        if special_token is None:
+            self.end_token = tokenizer.eos_token
+        else:
+            self.end_token = special_token
         for data in tqdm(dataset, disable=not is_rank_0()):
             prompt = data['prompt']
 
-            chosen = prompt + data['chosen'] + "<|endoftext|>"
+            chosen = prompt + data['chosen'] + self.end_token
+            chosen_token = tokenizer(chosen,
+                                     max_length=max_length,
+                                     padding="max_length",
+                                     truncation=True,
+                                     return_tensors="pt")
+            self.chosen.append({
+                "input_ids": chosen_token['input_ids'],
+                "attention_mask": chosen_token['attention_mask']
+            })
+
+            reject = prompt + data['rejected'] + self.end_token
+            reject_token = tokenizer(reject,
+                                     max_length=max_length,
+                                     padding="max_length",
+                                     truncation=True,
+                                     return_tensors="pt")
+            self.reject.append({
+                "input_ids": reject_token['input_ids'],
+                "attention_mask": reject_token['attention_mask']
+            })
+
+    def __len__(self):
+        length = len(self.chosen)
+        return length
+
+    def __getitem__(self, idx):
+        return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
+            "input_ids"], self.reject[idx]["attention_mask"]
+
+# Anthropic/hh-rlhf
+class HhRlhfDataset(Dataset):
+    """
+    Dataset for reward model
+
+    Args:
+        dataset: dataset for reward model
+        tokenizer: tokenizer for reward model
+        max_length: max length of input
+        special_token: special token at the end of sentence
+    """
+    def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
+        super().__init__()
+        self.chosen = []
+        self.reject = []
+        if special_token is None:
+            self.end_token = tokenizer.eos_token
+        else:
+            self.end_token = special_token
+        for data in tqdm(dataset, disable=not is_rank_0()):
+            chosen = data['chosen'] + self.end_token
             chosen_token = tokenizer(chosen,
                                      max_length=max_length,
                                      padding="max_length",
@@ -34,7 +89,7 @@ def __init__(self, dataset, tokenizer: Callable, max_length: int) -> None:
                 "attention_mask": chosen_token['attention_mask']
             })
 
-            reject = prompt + data['rejected'] + "<|endoftext|>"
+            reject = data['rejected'] + self.end_token
             reject_token = tokenizer(reject,
                                      max_length=max_length,
                                      padding="max_length",
diff --git a/applications/ChatGPT/chatgpt/models/__init__.py b/applications/ChatGPT/chatgpt/models/__init__.py
index 376fed8de792..b274188a21df 100644
--- a/applications/ChatGPT/chatgpt/models/__init__.py
+++ b/applications/ChatGPT/chatgpt/models/__init__.py
@@ -1,4 +1,4 @@
 from .base import Actor, Critic, RewardModel
-from .loss import PairWiseLoss, PolicyLoss, PPOPtxActorLoss, ValueLoss
+from .loss import PolicyLoss, PPOPtxActorLoss, ValueLoss, LogSigLoss, LogExpLoss
 
-__all__ = ['Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'PPOPtxActorLoss', 'PairWiseLoss']
+__all__ = ['Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'PPOPtxActorLoss', 'LogSigLoss', 'LogExpLoss']
diff --git a/applications/ChatGPT/chatgpt/models/bloom/bloom_rm.py b/applications/ChatGPT/chatgpt/models/bloom/bloom_rm.py
index 4dc2646e36ae..2dba227ff7d0 100644
--- a/applications/ChatGPT/chatgpt/models/bloom/bloom_rm.py
+++ b/applications/ChatGPT/chatgpt/models/bloom/bloom_rm.py
@@ -33,4 +33,5 @@ def __init__(self,
         if checkpoint:
             model.gradient_checkpointing_enable()
         value_head = nn.Linear(model.config.hidden_size, 1)
+        value_head.weight.data.normal_(mean=0.0, std=1/(model.config.hidden_size + 1))
         super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/models/gpt/gpt_rm.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_rm.py
index 0132dbf27ffc..19d673de6825 100644
--- a/applications/ChatGPT/chatgpt/models/gpt/gpt_rm.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_rm.py
@@ -35,4 +35,5 @@ def __init__(self,
             model.gradient_checkpointing_enable()
 
         value_head = nn.Linear(model.config.n_embd, 1)
+        value_head.weight.data.normal_(mean=0.0, std=1/(model.config.n_embd + 1))
         super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/models/loss.py b/applications/ChatGPT/chatgpt/models/loss.py
index 0ebcfea061b0..c5b1ccc93228 100644
--- a/applications/ChatGPT/chatgpt/models/loss.py
+++ b/applications/ChatGPT/chatgpt/models/loss.py
@@ -93,13 +93,23 @@ def forward(self,
         return policy_loss + self.pretrain_coef * lm_loss
 
 
-class PairWiseLoss(nn.Module):
+class LogSigLoss(nn.Module):
     """
     Pairwise Loss for Reward Model
+    Details: https://arxiv.org/abs/2203.02155
     """
-
     def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
         probs = torch.sigmoid(chosen_reward - reject_reward)
         log_probs = torch.log(probs)
         loss = -log_probs.mean()
         return loss
+
+
+class LogExpLoss(nn.Module):
+    """
+    Pairwise loss for reward model: mean of log(1 + exp(reject_reward - chosen_reward)).
+    Details: https://arxiv.org/abs/2204.05862
+    """
+    def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
+        # softplus(x) == log(1 + exp(x)), but it does not overflow to inf for large x
+        return nn.functional.softplus(reject_reward - chosen_reward).mean()
diff --git a/applications/ChatGPT/chatgpt/models/opt/opt_rm.py b/applications/ChatGPT/chatgpt/models/opt/opt_rm.py
index 7ad7b3887e53..ef7f0fb16fd1 100644
--- a/applications/ChatGPT/chatgpt/models/opt/opt_rm.py
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_rm.py
@@ -34,4 +34,5 @@ def __init__(self,
             model.gradient_checkpointing_enable()
 
         value_head = nn.Linear(model.config.word_embed_proj_dim, 1)
+        value_head.weight.data.normal_(mean=0.0, std=1/(model.config.word_embed_proj_dim + 1))
         super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/trainer/rm.py b/applications/ChatGPT/chatgpt/trainer/rm.py
index c07d65f84ca5..7fa87a64968b 100644
--- a/applications/ChatGPT/chatgpt/trainer/rm.py
+++ b/applications/ChatGPT/chatgpt/trainer/rm.py
@@ -1,13 +1,12 @@
 from abc import ABC
-
+import pandas as pd
 import loralib as lora
 import torch
-from chatgpt.dataset import RewardDataset
-from chatgpt.models.loss import PairWiseLoss
-from torch.optim import Adam, Optimizer
-from torch.utils.data import DataLoader
+from datetime import datetime
+from torch.optim import Optimizer, lr_scheduler
+from torch.utils.data import DataLoader, Dataset
 from tqdm import tqdm
-
+ 
 from .strategies import Strategy
 from .utils import is_rank_0
 
@@ -20,11 +19,12 @@ class RewardModelTrainer(ABC):
         model (torch.nn.Module): the model to train
         strategy (Strategy): the strategy to use for training
         optim(Optimizer): the optimizer to use for training
-        train_dataset (RewardDataset): the dataset to use for training
-        eval_dataset (RewardDataset): the dataset to use for evaluation
+        loss_fn (callable): the loss function to use for training
+        train_dataset (Dataset): the dataset to use for training
+        valid_dataset (Dataset): the dataset to use for validation
+        eval_dataset (Dataset): the dataset to use for evaluation
         batch_size (int, defaults to 1): the batch size while training
         max_epochs (int, defaults to 2): the number of epochs to train
-        optim_kwargs (dict, defaults to {'lr':1e-4}): the kwargs to use while initializing optimizer
     """
 
     def __init__(
@@ -32,24 +32,52 @@ def __init__(
         model,
         strategy: Strategy,
         optim: Optimizer,
-        train_dataset: RewardDataset,
-        eval_dataset: RewardDataset,
+        loss_fn,
+        train_dataset: Dataset,
+        valid_dataset: Dataset,
+        eval_dataset: Dataset,
         batch_size: int = 1,
-        max_epochs: int = 2,
+        max_epochs: int = 1,
     ) -> None:
         super().__init__()
         self.strategy = strategy
         self.epochs = max_epochs
-        self.train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
-        self.eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)
-
+        self.train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+        self.valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
+        self.eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=True)
+        
         self.model = strategy.setup_model(model)
-        if "DDP" in str(self.strategy):
-            self.model = self.model.module
-        self.loss_fn = PairWiseLoss()
+        self.loss_fn = loss_fn
         self.optimizer = strategy.setup_optimizer(optim, self.model)
+        self.scheduler = lr_scheduler.CosineAnnealingLR(self.optimizer, self.train_dataloader.__len__()//100)
+
 
-    def fit(self, use_lora):
+    def eval_acc(self, dataloader):
+        dist = 0    # running sum of per-batch mean (chosen - reject) reward gaps
+        on = 0    # number of pairs whose chosen reward is strictly greater than the rejected one
+        cnt = 0    # total number of pairs seen
+        self.model.eval()
+        with torch.no_grad():
+            for chosen_ids, c_mask, reject_ids, r_mask in dataloader:
+                chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
+                c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
+                reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
+                r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
+                chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
+                reject_reward = self.model(reject_ids, attention_mask=r_mask)
+                for i in range(len(chosen_reward)):
+                    cnt += 1
+                    if chosen_reward[i] > reject_reward[i]:
+                        on += 1
+                dist += (chosen_reward - reject_reward).mean().item()
+            dist_mean = dist / len(dataloader)    # mean over batches, not over individual pairs
+            acc = on / cnt    # NOTE(review): raises ZeroDivisionError if the dataloader yields no samples
+        self.model.train()    # always returns to train mode, whatever mode the model was in before
+        return dist_mean, acc
+    
+
+    def fit(self):
+        time = datetime.now()
         epoch_bar = tqdm(range(self.epochs), desc='Train epoch', disable=not is_rank_0())
         for epoch in range(self.epochs):
             step_bar = tqdm(range(self.train_dataloader.__len__()),
@@ -57,37 +85,36 @@ def fit(self, use_lora):
                             disable=not is_rank_0())
             # train
             self.model.train()
+            cnt = 0
+            acc = 0
+            dist = 0
             for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
-                chosen_ids = chosen_ids.squeeze(1).cuda()
-                c_mask = c_mask.squeeze(1).cuda()
-                reject_ids = reject_ids.squeeze(1).cuda()
-                r_mask = r_mask.squeeze(1).cuda()
+                chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
+                c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
+                reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
+                r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
                 chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
                 reject_reward = self.model(reject_ids, attention_mask=r_mask)
                 loss = self.loss_fn(chosen_reward, reject_reward)
                 self.strategy.backward(loss, self.model, self.optimizer)
                 self.strategy.optimizer_step(self.optimizer)
                 self.optimizer.zero_grad()
+                cnt += 1
+                if cnt == 100:
+                    self.scheduler.step()
+                    dist, acc = self.eval_acc(self.valid_dataloader)
+                    cnt = 0
+                    if is_rank_0():
+                        log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]], columns=['step', 'loss', 'dist', 'acc'])
+                        log.to_csv('log_%s.csv' % time, mode='a', header=False, index=False)
                 step_bar.update()
-                step_bar.set_postfix({'loss': loss.item()})
-
+                step_bar.set_postfix({'dist': dist, 'acc': acc})
+                
             # eval
-            self.model.eval()
-            with torch.no_grad():
-                dist = 0
-                loss_sum = 0
-                for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
-                    chosen_ids = chosen_ids.squeeze(1).cuda()
-                    c_mask = c_mask.squeeze(1).cuda()
-                    reject_ids = reject_ids.squeeze(1).cuda()
-                    r_mask = r_mask.squeeze(1).cuda()
-                    chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
-                    reject_reward = self.model(reject_ids, attention_mask=r_mask)
-                    dist += (chosen_reward - reject_reward).mean().item()
-                    loss = self.loss_fn(chosen_reward, reject_reward)
-                    loss_sum += loss.item()
-                dist_mean = dist / self.eval_dataloader.__len__()
-                loss_mean = loss_sum / self.eval_dataloader.__len__()
+            dist, acc = self.eval_acc(self.eval_dataloader)
+            if is_rank_0():
+                    log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]], columns=['step', 'loss', 'dist', 'acc'])
+                    log.to_csv('log.csv', mode='a', header=False, index=False)
             epoch_bar.update()
-            step_bar.set_postfix({'loss': loss_mean, 'dist_mean': dist_mean})
+            step_bar.set_postfix({'dist': dist, 'acc': acc})
             step_bar.close()
diff --git a/applications/ChatGPT/examples/README.md b/applications/ChatGPT/examples/README.md
index 3876d20f02d7..ce73a5407944 100644
--- a/applications/ChatGPT/examples/README.md
+++ b/applications/ChatGPT/examples/README.md
@@ -7,26 +7,42 @@ pip install -r requirements.txt
 ```
 
 ## Train the reward model (Stage 2)
-We use [rm-static](https://huggingface.co/datasets/Dahoas/rm-static) as dataset to train our reward model. It is a dataset of chosen & rejected response of the same prompt.
-
-You can download the dataset from huggingface automatically.
-
 Use these code to train your reward model.
-
 ```shell
-# Naive reward model training
-python train_reward_model.py --pretrain <your model path> --model <your model type> --strategy naive
+# Take naive reward model training with opt-350m as example
+python train_reward_model.py --pretrain "facebook/opt-350m" --model 'opt' --strategy naive
 # use colossalai_zero2
-torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain <your model path> --model <your model type> --strategy colossalai_zero2
+torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain "facebook/opt-350m" --model 'opt' --strategy colossalai_zero2
 ```
 
+### Features and tricks in RM training
+- We support the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) and [rm-static](https://huggingface.co/datasets/Dahoas/rm-static) datasets.
+- We support two loss functions: 'log_sig' (used by OpenAI) and 'log_exp' (used by Anthropic).
+- We report valid_acc and pair_dist instead of the loss to monitor progress during training.
+- We add a special token to the end of the sequence to get better results.
+- We use a cosine-annealing lr scheduler for RM training.
+- We set value_head as 1 linear layer and initialize the weight of value_head using the N(0, 1/(d_model + 1)) distribution.
+- We train a Bloom-560m reward model for 1 epoch and find that the test accuracy of the model achieves the performance mentioned in [Anthropic's paper](https://arxiv.org/abs/2112.00861).
+
+### Experiment result
+Model performance in [Anthropic's paper](https://arxiv.org/abs/2112.00861):
+
+<div align=center> <img width="512" alt="image" src="https://user-images.githubusercontent.com/70618399/225263321-8d64c3a8-6877-4cc8-9b61-0e1c52d3d94f.png">
+
+<div align=left>Our training & test result of bloom-560m for 1 epoch:
+
+<div align=center> <img width="512" alt="image" src="https://user-images.githubusercontent.com/70618399/225262950-a7f0a686-25de-44ec-98f2-11b83ea86674.png">
+
+<div align=left>
+
 ## Train with dummy prompt data (Stage 3)
 
-This script supports 3 strategies:
+This script supports 4 kinds of strategies:
 
 - naive
 - ddp
-- colossalai
+- colossalai_zero2
+- colossalai_gemini
 
 It uses random generated prompt data.
 
@@ -53,7 +69,7 @@ We use [awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-cha
 
 You should download `prompts.csv` first.
 
-This script also supports 3 strategies.
+This script also supports 4 strategies.
 
 ```shell
 # display cli help
@@ -75,6 +91,9 @@ python inference.py --model_path <your actor model path> --model <your model typ
 python inference.py --model_path ./actor_checkpoint_prompts.pt --pretrain bigscience/bloom-560m --model bloom
 ```
 
+## Attention
+The examples are just a demo for testing our progress on RM and PPO training.
+
 
 #### data
 - [x] [rm-static](https://huggingface.co/datasets/Dahoas/rm-static)
diff --git a/applications/ChatGPT/examples/test_ci.sh b/applications/ChatGPT/examples/test_ci.sh
index 0aa4a36fe514..abc43ab1ee9e 100755
--- a/applications/ChatGPT/examples/test_ci.sh
+++ b/applications/ChatGPT/examples/test_ci.sh
@@ -69,3 +69,23 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH \
 python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_prompts.pt --pretrain 'gpt2' --model gpt2
 
 rm -rf ${BASE}/actor_checkpoint_prompts.pt
+
+# train rm
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                             --pretrain 'facebook/opt-350m' --model 'opt' \
+                             --strategy colossalai_zero2 --loss_fn 'log_sig'\
+                             --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base'\
+                             --test True --lora_rank 4
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                             --pretrain 'gpt2' --model 'gpt2' \
+                             --strategy colossalai_gemini --loss_fn 'log_exp'\
+                             --dataset 'Dahoas/rm-static' --test True --lora_rank 4
+                             
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                             --pretrain 'bigscience/bloom-560m' --model 'bloom' \
+                             --strategy colossalai_zero2 --loss_fn 'log_sig'\
+                             --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base'\
+                             --test True --lora_rank 4
+
+rm -rf ${BASE}/rm_ckpt.pt
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index 19b20b0847cc..47dd988b8117 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -2,7 +2,8 @@
 
 import loralib as lora
 import torch
-from chatgpt.dataset import RewardDataset
+from chatgpt.dataset import HhRlhfDataset, RmStaticDataset
+from chatgpt.models import LogSigLoss, LogExpLoss
 from chatgpt.models.base import RewardModel
 from chatgpt.models.bloom import BLOOMRM
 from chatgpt.models.gpt import GPTRM
@@ -10,13 +11,13 @@
 from chatgpt.trainer import RewardModelTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from datasets import load_dataset
+from random import randint
 from torch.optim import Adam
 from transformers import AutoTokenizer, BloomTokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
 from colossalai.nn.optimizer import HybridAdam
 
-
 def train(args):
     # configure strategy
     if args.strategy == 'naive':
@@ -33,57 +34,85 @@ def train(args):
     # configure model
     with strategy.model_init_context():
         if args.model == 'bloom':
-            model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+            model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         elif args.model == 'opt':
-            model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+            model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         elif args.model == 'gpt2':
-            model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+            model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         else:
             raise ValueError(f'Unsupported model "{args.model}"')
-
+        
+        if args.model_path is not None:
+            state_dict = torch.load(args.model_path)
+            model.load_state_dict(state_dict)
+        
     # configure tokenizer
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'bloom':
-        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
-        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
     elif args.model == 'opt':
         tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
-    tokenizer.pad_token = tokenizer.eos_token
-
-    max_len = 512
+    max_len = args.max_len
 
     # configure optimizer
     if args.strategy.startswith('colossalai'):
-        optim = HybridAdam(model.parameters(), lr=5e-5)
+        optim = HybridAdam(model.parameters(), lr=1.5e-5)
     else:
-        optim = Adam(model.parameters(), lr=5e-5)
-
+        optim = Adam(model.parameters(), lr=1.5e-5)
+    
+    # configure loss function
+    if args.loss_fn == 'log_sig':
+        loss_fn = LogSigLoss()
+    elif args.loss_fn == 'log_exp':
+        loss_fn = LogExpLoss()
+    else:
+        raise ValueError(f'Unsupported loss function "{args.loss_fn}"')
+    
     # prepare for data and dataset
-    data = load_dataset(args.dataset)
-    train_data = data["train"]
-    eval_data = data['test']
-    train_dataset = RewardDataset(train_data, tokenizer, max_len)
-    eval_dataset = RewardDataset(eval_data, tokenizer, max_len)
-
+    if args.subset is not None:
+        data = load_dataset(args.dataset, data_dir=args.subset)
+    else:
+        data = load_dataset(args.dataset)
+    
+    if args.test:
+        train_data = data['train'].select(range(100))
+        eval_data = data['test'].select(range(10)) 
+    else:
+        train_data = data['train']
+        eval_data = data['test']
+    valid_data = data['test'].select((randint(0, len(eval_data) - 1) for _ in range(len(eval_data)//10)))
+    
+    if args.dataset == 'Dahoas/rm-static':
+        train_dataset = RmStaticDataset(train_data, tokenizer, max_len)
+        valid_dataset = RmStaticDataset(valid_data, tokenizer, max_len)
+        eval_dataset = RmStaticDataset(eval_data, tokenizer, max_len)
+    elif args.dataset == 'Anthropic/hh-rlhf':
+        train_dataset = HhRlhfDataset(train_data, tokenizer, max_len)
+        valid_dataset = HhRlhfDataset(valid_data, tokenizer, max_len)
+        eval_dataset = HhRlhfDataset(eval_data, tokenizer, max_len)
+    else:
+        raise ValueError(f'Unsupported dataset "{args.dataset}"')
+    
     trainer = RewardModelTrainer(model=model,
                                  strategy=strategy,
                                  optim=optim,
+                                 loss_fn = loss_fn,
                                  train_dataset=train_dataset,
+                                 valid_dataset=valid_dataset,
                                  eval_dataset=eval_dataset,
                                  batch_size=args.batch_size,
                                  max_epochs=args.max_epochs)
 
-    trainer.fit(use_lora=args.lora_rank)
-
+    trainer.fit()
     # save model checkpoint after fitting on only rank0
-    strategy.save_model(model, 'rm_checkpoint.pt', only_rank0=True)
+    strategy.save_model(trainer.model, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
-    strategy.save_optimizer(optim, 'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()), only_rank0=False)
-
+    if args.need_optim_ckpt:
+        strategy.save_optimizer(trainer.optimizer, 'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()), only_rank0=False)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -92,10 +121,18 @@ def train(args):
                         default='naive')
     parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt'], default='bloom')
     parser.add_argument('--pretrain', type=str, default=None)
-    parser.add_argument('--dataset', type=str, default='Dahoas/rm-static')
-    parser.add_argument('--save_path', type=str, default='rm_ckpt.pth')
+    parser.add_argument('--model_path', type=str, default=None)
+    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
+    parser.add_argument('--dataset', type=str,
+                        choices=['Anthropic/hh-rlhf', 'Dahoas/rm-static'],
+                        default='Dahoas/rm-static')
+    parser.add_argument('--subset', type=str, default=None)
+    parser.add_argument('--save_path', type=str, default='rm_ckpt.pt')
     parser.add_argument('--max_epochs', type=int, default=1)
-    parser.add_argument('--batch_size', type=int, default=4)
+    parser.add_argument('--batch_size', type=int, default=1)
+    parser.add_argument('--max_len', type=int, default=512)
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument('--loss_fn', type=str, default='log_sig', choices=['log_sig', 'log_exp'])
+    parser.add_argument('--test', type=bool, default=False)
     args = parser.parse_args()
     train(args)
diff --git a/applications/ChatGPT/examples/train_rm.sh b/applications/ChatGPT/examples/train_rm.sh
index 6e11a148bfbe..981b7a15fcd4 100755
--- a/applications/ChatGPT/examples/train_rm.sh
+++ b/applications/ChatGPT/examples/train_rm.sh
@@ -1,20 +1,8 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
-        | tail -n +2 \
-        | nl -v 0 \
-        | tee /dev/tty \
-        | sort -g -k 2 \
-        | awk '{print $1}' \
-        | head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
+set_n_least_used_CUDA_VISIBLE_DEVICES 1
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-
-# torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain 'bigscience/bloomz-560m' --model 'bloom' --strategy colossalai_zero2
-torchrun --standalone --nproc_per_node=2 train_reward_model.py  --model 'gpt2' --strategy colossalai_zero2
-# torchrun --standalone --nproc_per_node=2 train_reward_model.py --pretrain "facebook/opt-350m" --model 'opt' --strategy colossalai_zero2
+python train_reward_model.py --pretrain '/home/lczht/data2/bloom-560m' \
+                             --model 'bloom' \
+                             --strategy naive \
+                             --loss_fn 'log_exp'\
+                             --save_path 'rmstatic.pt' \
+                             --test True

From 20d1c99444fc902bc673b9a98f9af2f77901d5e4 Mon Sep 17 00:00:00 2001
From: Saurav Maheshkar <sauravvmaheshkar@gmail.com>
Date: Mon, 20 Mar 2023 02:52:01 +0000
Subject: [PATCH 480/503] [refactor] update docs (#3174)

* refactor: README-zh-Hans

* refactor: REFERENCE

* docs: update paths in README
---
 README.md                                   | 4 ++--
 README-zh-Hans.md => docs/README-zh-Hans.md | 0
 REFERENCE.md => docs/REFERENCE.md           | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename README-zh-Hans.md => docs/README-zh-Hans.md (100%)
 rename REFERENCE.md => docs/REFERENCE.md (100%)

diff --git a/README.md b/README.md
index 3b55649b44bb..5ce18650fb41 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
    [![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&amp)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png)
 
 
-   | [English](README.md) | [中文](README-zh-Hans.md) |
+   | [English](README.md) | [中文](docs/README-zh-Hans.md) |
 
 </div>
 
@@ -399,7 +399,7 @@ We leverage the power of [GitHub Actions](https://github.com/features/actions) t
 
 ## Cite Us
 
-This project is inspired by some related projects (some by our team and some by other organizations). We would like to credit these amazing projects as listed in the [Reference List](./REFERENCE.md).
+This project is inspired by some related projects (some by our team and some by other organizations). We would like to credit these amazing projects as listed in the [Reference List](./docs/REFERENCE.md).
 
 To cite this project, you can use the following BibTeX citation.
 
diff --git a/README-zh-Hans.md b/docs/README-zh-Hans.md
similarity index 100%
rename from README-zh-Hans.md
rename to docs/README-zh-Hans.md
diff --git a/REFERENCE.md b/docs/REFERENCE.md
similarity index 100%
rename from REFERENCE.md
rename to docs/REFERENCE.md

From 1ad3a636b17c4730bce4517a78b4056f0626fe50 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 20 Mar 2023 11:40:25 +0800
Subject: [PATCH 481/503] [test] fixed torchrec model test (#3167)

* [test] fixed torchrec model test

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
---
 tests/kit/model_zoo/torchrec/torchrec.py      | 139 +++++++++---------
 .../test_mixed_precision/test_fp16_torch.py   |   8 +-
 .../test_torchrec_model/test_deepfm_model.py  |  12 +-
 .../test_torchrec_model/test_dlrm_model.py    |  17 ++-
 tests/test_gemini/update/test_fwd_bwd.py      |  14 +-
 5 files changed, 96 insertions(+), 94 deletions(-)

diff --git a/tests/kit/model_zoo/torchrec/torchrec.py b/tests/kit/model_zoo/torchrec/torchrec.py
index 014e9218b226..03d95a06a89b 100644
--- a/tests/kit/model_zoo/torchrec/torchrec.py
+++ b/tests/kit/model_zoo/torchrec/torchrec.py
@@ -2,96 +2,95 @@
 from functools import partial
 
 import torch
-
-try:
-    from torchrec.models import deepfm, dlrm
-    from torchrec.modules.embedding_configs import EmbeddingBagConfig
-    from torchrec.modules.embedding_modules import EmbeddingBagCollection
-    from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
-    NO_TORCHREC = False
-except ImportError:
-    NO_TORCHREC = True
+from torchrec.models import deepfm, dlrm
+from torchrec.modules.embedding_configs import EmbeddingBagConfig
+from torchrec.modules.embedding_modules import EmbeddingBagCollection
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
 
 from ..registry import ModelAttribute, model_zoo
 
+BATCH = 2
+SHAPE = 10
+# KeyedTensor
+KT = KeyedTensor(keys=["f1", "f2"], length_per_key=[SHAPE, SHAPE], values=torch.rand((BATCH, 2 * SHAPE)))
 
-def register_torchrec_models():
-    BATCH = 2
-    SHAPE = 10
-    # KeyedTensor
-    KT = KeyedTensor(keys=["f1", "f2"], length_per_key=[SHAPE, SHAPE], values=torch.rand((BATCH, 2 * SHAPE)))
+# KeyedJaggedTensor
+KJT = KeyedJaggedTensor.from_offsets_sync(keys=["f1", "f2"],
+                                          values=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
+                                          offsets=torch.tensor([0, 2, 4, 6, 8]))
 
-    # KeyedJaggedTensor
-    KJT = KeyedJaggedTensor.from_offsets_sync(keys=["f1", "f2"],
-                                              values=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
-                                              offsets=torch.tensor([0, 2, 4, 6, 8]))
+data_gen_fn = lambda: dict(features=torch.rand((BATCH, SHAPE)))
 
-    data_gen_fn = lambda: dict(features=torch.rand((BATCH, SHAPE)))
+interaction_arch_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KT)
 
-    interaction_arch_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KT)
+simple_dfm_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KJT)
 
-    simple_dfm_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KJT)
+sparse_arch_data_gen_fn = lambda: dict(features=KJT)
 
-    sparse_arch_data_gen_fn = lambda: dict(features=KJT)
 
-    output_transform_fn = lambda x: dict(output=x)
+def output_transform_fn(x):
+    if isinstance(x, KeyedTensor):
+        output = dict()
+        for key in x.keys():
+            output[key] = x[key]
+        return output
+    else:
+        return dict(output=x)
 
-    def get_ebc():
-        # EmbeddingBagCollection
-        eb1_config = EmbeddingBagConfig(name="t1", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f1"])
-        eb2_config = EmbeddingBagConfig(name="t2", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f2"])
-        return EmbeddingBagCollection(tables=[eb1_config, eb2_config])
 
-    model_zoo.register(name='deepfm_densearch',
-                       model_fn=partial(deepfm.DenseArch, SHAPE, SHAPE, SHAPE),
-                       data_gen_fn=data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+def get_ebc():
+    # EmbeddingBagCollection
+    eb1_config = EmbeddingBagConfig(name="t1", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f1"])
+    eb2_config = EmbeddingBagConfig(name="t2", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f2"])
+    return EmbeddingBagCollection(tables=[eb1_config, eb2_config])
 
-    model_zoo.register(name='deepfm_interactionarch',
-                       model_fn=partial(deepfm.FMInteractionArch, SHAPE * 3, ["f1", "f2"], SHAPE),
-                       data_gen_fn=interaction_arch_data_gen_fn,
-                       output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='deepfm_overarch',
-                       model_fn=partial(deepfm.OverArch, SHAPE),
-                       data_gen_fn=data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='deepfm_densearch',
+                   model_fn=partial(deepfm.DenseArch, SHAPE, SHAPE, SHAPE),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='deepfm_simpledeepfmnn',
-                       model_fn=partial(deepfm.SimpleDeepFMNN, SHAPE, get_ebc(), SHAPE, SHAPE),
-                       data_gen_fn=simple_dfm_data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='deepfm_interactionarch',
+                   model_fn=partial(deepfm.FMInteractionArch, SHAPE * 3, ["f1", "f2"], SHAPE),
+                   data_gen_fn=interaction_arch_data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='deepfm_sparsearch',
-                       model_fn=partial(deepfm.SparseArch, get_ebc()),
-                       data_gen_fn=sparse_arch_data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='deepfm_overarch',
+                   model_fn=partial(deepfm.OverArch, SHAPE),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='dlrm',
-                       model_fn=partial(dlrm.DLRM, get_ebc(), SHAPE, [SHAPE, SHAPE], [5, 1]),
-                       data_gen_fn=simple_dfm_data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='deepfm_simpledeepfmnn',
+                   model_fn=partial(deepfm.SimpleDeepFMNN, SHAPE, get_ebc(), SHAPE, SHAPE),
+                   data_gen_fn=simple_dfm_data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='dlrm_densearch',
-                       model_fn=partial(dlrm.DenseArch, SHAPE, [SHAPE, SHAPE]),
-                       data_gen_fn=data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='deepfm_sparsearch',
+                   model_fn=partial(deepfm.SparseArch, get_ebc()),
+                   data_gen_fn=sparse_arch_data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='dlrm_interactionarch',
-                       model_fn=partial(dlrm.InteractionArch, 2),
-                       data_gen_fn=interaction_arch_data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='dlrm',
+                   model_fn=partial(dlrm.DLRM, get_ebc(), SHAPE, [SHAPE, SHAPE], [5, 1]),
+                   data_gen_fn=simple_dfm_data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='dlrm_overarch',
-                       model_fn=partial(dlrm.OverArch, SHAPE, [5, 1]),
-                       data_gen_fn=data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='dlrm_densearch',
+                   model_fn=partial(dlrm.DenseArch, SHAPE, [SHAPE, SHAPE]),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-    model_zoo.register(name='dlrm_sparsearch',
-                       model_fn=partial(dlrm.SparseArch, get_ebc()),
-                       data_gen_fn=sparse_arch_data_gen_fn,
-                       output_transform_fn=output_transform_fn)
+model_zoo.register(name='dlrm_interactionarch',
+                   model_fn=partial(dlrm.InteractionArch, 2),
+                   data_gen_fn=interaction_arch_data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
+model_zoo.register(name='dlrm_overarch',
+                   model_fn=partial(dlrm.OverArch, SHAPE, [5, 1]),
+                   data_gen_fn=data_gen_fn,
+                   output_transform_fn=output_transform_fn)
 
-if not NO_TORCHREC:
-    register_torchrec_models()
+model_zoo.register(name='dlrm_sparsearch',
+                   model_fn=partial(dlrm.SparseArch, get_ebc()),
+                   data_gen_fn=sparse_arch_data_gen_fn,
+                   output_transform_fn=output_transform_fn)
diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index c56fcae58a60..98d00cd2caca 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -7,11 +7,17 @@
 
 def test_torch_amp():
     for name, (model_fn, data_gen_fn, output_transform_fn, _) in model_zoo.items():
+        # dlrm_interactionarch has no parameters, so skip
+        if name == 'dlrm_interactionarch':
+            continue
+
         model = model_fn().cuda()
         optimizer = Adam(model.parameters(), lr=1e-3)
         criterion = lambda x: x.mean()
         data = data_gen_fn()
-        data = {k: v.cuda() if torch.is_tensor(v) else v for k, v in data.items()}
+        data = {
+            k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
+        }
         mixed_precision = FP16TorchMixedPrecision()
         model, optimizer, criterion = mixed_precision.configure(model, optimizer, criterion)
         output = model(**data)
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
index 6cbca343d134..a30139f26d29 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
@@ -7,11 +7,6 @@
 BATCH = 2
 SHAPE = 10
 
-deepfm_models = model_zoo.get_sub_registry('deepfm')
-NOT_DFM = False
-if not deepfm_models:
-    NOT_DFM = True
-
 
 def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
     # trace
@@ -52,8 +47,9 @@ def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
                                  ), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
 
 
-@pytest.mark.skipif(NOT_DFM, reason='torchrec is not installed')
-def test_torchrec_deepfm_models(deepfm_models):
+@pytest.mark.skip('unknown error')
+def test_torchrec_deepfm_models():
+    deepfm_models = model_zoo.get_sub_registry('deepfm')
     torch.backends.cudnn.deterministic = True
 
     for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in deepfm_models.items():
@@ -67,4 +63,4 @@ def test_torchrec_deepfm_models(deepfm_models):
 
 
 if __name__ == "__main__":
-    test_torchrec_deepfm_models(deepfm_models)
+    test_torchrec_deepfm_models()
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
index 7aa868265f15..27a88291397e 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
@@ -7,11 +7,6 @@
 BATCH = 2
 SHAPE = 10
 
-dlrm_models = model_zoo.get_sub_registry('dlrm')
-NOT_DLRM = False
-if not dlrm_models:
-    NOT_DLRM = True
-
 
 def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
     # trace
@@ -52,12 +47,18 @@ def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
                                  ), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
 
 
-@pytest.mark.skipif(NOT_DLRM, reason='torchrec is not installed')
-def test_torchrec_dlrm_models(dlrm_models):
+@pytest.mark.skip('unknown error')
+def test_torchrec_dlrm_models():
     torch.backends.cudnn.deterministic = True
+    dlrm_models = model_zoo.get_sub_registry('dlrm')
 
     for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in dlrm_models.items():
         data = data_gen_fn()
+
+        # dlrm_interactionarch is not supported
+        if name == 'dlrm_interactionarch':
+            continue
+
         if attribute is not None and attribute.has_control_flow:
             meta_args = {k: v.to('meta') for k, v in data.items()}
         else:
@@ -67,4 +68,4 @@ def test_torchrec_dlrm_models(dlrm_models):
 
 
 if __name__ == "__main__":
-    test_torchrec_dlrm_models(dlrm_models)
+    test_torchrec_dlrm_models()
diff --git a/tests/test_gemini/update/test_fwd_bwd.py b/tests/test_gemini/update/test_fwd_bwd.py
index 0d35ba83d2e9..2821dc78d984 100644
--- a/tests/test_gemini/update/test_fwd_bwd.py
+++ b/tests/test_gemini/update/test_fwd_bwd.py
@@ -34,17 +34,17 @@ def check_grad(model: ZeroDDP, torch_model: torch.nn.Module):
         assert_close(p0, p1.grad, rtol=1e-3, atol=5e-5)
 
 
-@parameterize('init_device', [get_current_device()])
 @parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
 @parameterize('keep_gather', [False, True])
 @parameterize('model_name', ['gpt2', 'bert', 'albert'])
 @parameterize('use_grad_checkpoint', [False, True])
-def exam_gpt_fwd_bwd(placement_policy,
-                     keep_gather,
-                     model_name: str,
-                     use_grad_checkpoint: bool = False,
-                     init_device=get_current_device()):
-
+def exam_gpt_fwd_bwd(
+    placement_policy,
+    keep_gather,
+    model_name: str,
+    use_grad_checkpoint: bool = False,
+):
+    init_device = get_current_device()
     get_components_func = non_distributed_component_funcs.get_callable(model_name)
     model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
 

From a9b8402d93ac69bb9a8b46e21cfe3697409972fe Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 20 Mar 2023 13:59:24 +0800
Subject: [PATCH 482/503] [booster] added the accelerator implementation
 (#3159)

---
 colossalai/booster/accelerator.py             | 48 +++++++++++++++++--
 colossalai/booster/booster.py                 | 15 +++++-
 tests/test_booster/test_accelerator.py        | 13 +++++
 .../test_torchrec_model/test_dlrm_model.py    |  1 +
 4 files changed, 72 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_booster/test_accelerator.py

diff --git a/colossalai/booster/accelerator.py b/colossalai/booster/accelerator.py
index 63ba193e3e4f..fc2c4a40068b 100644
--- a/colossalai/booster/accelerator.py
+++ b/colossalai/booster/accelerator.py
@@ -3,12 +3,52 @@
 
 __all__ = ['Accelerator']
 
+_supported_devices = [
+    'cpu',
+    'cuda',
+
+    # To be supported
+    # 'xpu',
+    # 'npu',
+    # 'tpu',
+]
+
 
 class Accelerator:
+    """
+    Accelerator is an abstraction for the hardware device that is used to run the model.
+
+    Args:
+        device (str): The device to be used. Currently only 'cpu' and 'cuda' are supported.
+    """
 
-    def __init__(self, device: torch.device):
+    def __init__(self, device: str):
         self.device = device
 
-    def setup_model(self, model: nn.Module) -> nn.Module:
-        # TODO: implement this method
-        pass
+        assert self.device in _supported_devices, f"Device {self.device} is not supported yet, supported devices include {_supported_devices}"
+
+    def bind(self):
+        """
+        Set the default device for the current process.
+        """
+        if self.device == 'cpu':
+            pass
+        elif self.device == 'cuda':
+            # TODO(FrankLeeeee): use global environment to check if it is a dist job
+            # if is_distributed:
+            #     local_rank = EnvTable().get_local_rank()
+            #     torch.cuda.set_device(torch.device(f'cuda:{local_rank}'))
+            torch.cuda.set_device(torch.device('cuda'))
+            pass
+        else:
+            raise ValueError(f"Device {self.device} is not supported yet")
+
+    def configure_model(self, model: nn.Module) -> nn.Module:
+        """
+        Move the model to the device.
+
+        Args:
+            model (nn.Module): The model to be moved.
+        """
+        model = model.to(torch.device(self.device))
+        return model
diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 7b351ae343d2..7d7f21ca6cf2 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -8,6 +8,7 @@
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
 
+from .accelerator import Accelerator
 from .mixed_precision import MixedPrecision, mixed_precision_factory
 from .plugin import Plugin
 
@@ -51,9 +52,16 @@ class Booster:
     """
 
     def __init__(self,
-                 device: Union[str, torch.device] = 'cuda',
+                 device: str = 'cuda',
                  mixed_precision: Union[MixedPrecision, str] = None,
                  plugin: Optional[Plugin] = None) -> None:
+        # TODO(FrankLeeeee): add plugin control logic
+        # if self.plugin is not None and self.plugin.control_accelerator:
+        #     ...
+        # create accelerator (attribute spelling kept as-is: boost() reads self.acceleartor)
+        self.acceleartor = Accelerator(device)
+        self.acceleartor.bind()    # Accelerator exposes bind() to set the process default device
+
         # validate and set precision
         if isinstance(MixedPrecision, str):
             # the user will take the default arguments for amp training
@@ -78,6 +86,11 @@ def boost(self, model: nn.Module, optimizer: Optimizer, criterion: Callable, lr_
             lr_scheduler (LRScheduler): The lr_scheduler to be boosted.
             dataloader (DataLoader): The dataloader to be boosted.
         """
+        # TODO(FrankLeeeee): add plugin control logic
+        # if self.plugin is not None and self.plugin.control_accelerator:
+        #     ...
+        model = self.acceleartor.configure_model(model)
+
         # TODO(FrankLeeeee): consider multi-model and multi-optimizer case
         # TODO(lsg): Add plugin control logic
         # e.g.
diff --git a/tests/test_booster/test_accelerator.py b/tests/test_booster/test_accelerator.py
new file mode 100644
index 000000000000..4bfa3fd0631e
--- /dev/null
+++ b/tests/test_booster/test_accelerator.py
@@ -0,0 +1,13 @@
+import pytest
+import torch.nn as nn
+from torchvision.models import resnet18
+
+from colossalai.booster.accelerator import Accelerator
+
+
+@pytest.mark.parametrize('device', ['cpu', 'cuda'])
+def test_accelerator(device):
+    acceleartor = Accelerator(device)
+    model = nn.Linear(8, 8)
+    model = acceleartor.configure_model(model)
+    assert next(model.parameters()).device.type == device
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
index 27a88291397e..71ecf7fca53e 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
@@ -56,6 +56,7 @@ def test_torchrec_dlrm_models():
         data = data_gen_fn()
 
         # dlrm_interactionarch is not supported
+        # TODO(FrankLeeeee): support this model
         if name == 'dlrm_interactionarch':
             continue
 

From 4e921cfbd68c0399a7576243e02ccb9b31fb5c94 Mon Sep 17 00:00:00 2001
From: NatalieC323 <127177614+NatalieC323@users.noreply.github.com>
Date: Mon, 20 Mar 2023 14:19:05 +0800
Subject: [PATCH 483/503] [examples] Solving the diffusion issue of
 incompatibility issue#3169 (#3170)

* Update requirements.txt

* Update environment.yaml

* Update README.md

* Update environment.yaml
---
 examples/images/diffusion/README.md        | 50 ++++++++++++++++------
 examples/images/diffusion/requirements.txt |  4 +-
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index 2a99094b703a..22970ced064e 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -40,8 +40,7 @@ This project is in rapid development.
 ### Option #1: install from source
 #### Step 1: Requirements
 
-A suitable [conda](https://conda.io/) environment named `ldm` can be created
-and activated with:
+To begin with, make sure your operating system has a CUDA version suitable for this exciting training session, which is CUDA 11.6/11.8. For your convenience, we have set up the rest of the packages here. You can create and activate a suitable [conda](https://conda.io/) environment named `ldm`:
 
 ```
 conda env create -f environment.yaml
@@ -55,11 +54,34 @@ conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit
 pip install transformers diffusers invisible-watermark
 ```
 
-#### Step 2:Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
+#### Step 2: install lightning
+
+Install Lightning version later than 2022.01.04. We suggest you install lightning from source. Notice that the default download path of pip should be within the conda environment, or you may need to specify using 'which pip' and redirect the path into conda environment. 
+
+##### From Source
+```
+git clone https://github.com/Lightning-AI/lightning.git
+pip install -r requirements.txt
+python setup.py install
+```
 
 ##### From pip
 
-For example, you can install  v0.2.0 from our official website.
+```
+pip install pytorch-lightning
+```
+
+#### Step 3:Install [Colossal-AI](https://colossalai.org/download/) From Our Official Website
+
+You can install the latest version (0.2.7) from our official website or from source. Note that the suitable version for this training is colossalai 0.2.5, which corresponds to torch 1.12.1.
+
+##### Download the suggested version for this training
+
+```
+pip install colossalai==0.2.5
+```
+
+##### Download the latest version from pip for latest torch version
 
 ```
 pip install colossalai
@@ -75,10 +97,12 @@ cd ColossalAI
 CUDA_EXT=1 pip install .
 ```
 
-#### Step 3:Accelerate with flash attention by xformers(Optional)
+#### Step 4:Accelerate with flash attention by xformers(Optional)
+
+Note that xformers accelerates the training process at the cost of extra disk space. The suitable version of xformers for this training process is 0.0.12. You can download xformers directly via pip. For more release versions, feel free to check its official website: [XFormers](https://pypi.org/project/xformers/)
 
 ```
-pip install xformers
+pip install xformers==0.0.12
 ```
 
 ### Option #2: Use Docker
@@ -94,7 +118,7 @@ docker build -t hpcaitech/diffusion:0.2.0  .
 docker pull hpcaitech/diffusion:0.2.0
 ```
 
-Once you have the image ready, you can launch the image with the following command:
+Once you have the image ready, you can launch the image with the following command
 
 ```bash
 ########################
@@ -157,10 +181,9 @@ you should the change the `data.file_path` in the `config/train_colossalai.yaml`
 
 ## Training
 
-We provide the script `train_colossalai.sh` to run the training task with colossalai,
-and can also use `train_ddp.sh` to run the training task with ddp to compare.
+We provide the script `train_colossalai.sh` to run the training task with colossalai. Meanwhile, we also support other training processes such as the DDP model in PyTorch. You can also use `train_ddp.sh` to run the training task with ddp and compare the corresponding performance.
 
-In `train_colossalai.sh` the main command is:
+In `train_colossalai.sh` the main command is
 
 ```
 python main.py --logdir /tmp/ --train --base configs/train_colossalai.yaml --ckpt 512-base-ema.ckpt
@@ -176,9 +199,10 @@ python main.py --logdir /tmp/ --train --base configs/train_colossalai.yaml --ckp
 
 You can change the trainging config in the yaml file
 
-- devices: device number used for training, default 8
-- max_epochs: max training epochs, default 2
-- precision: the precision type used in training, default 16 (fp16), you must use fp16 if you want to apply colossalai
+- devices: device number used for training, default = 8
+- max_epochs: max training epochs, default = 2
+- precision: the precision type used in training, default = 16 (fp16), you must use fp16 if you want to apply colossalai
+- placement_policy: the training strategy supported by Colossal AI, default = 'cuda', which refers to loading all the parameters into cuda memory. On the other hand, 'cpu' refers to the 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI.
 - more information about the configuration of ColossalAIStrategy can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/advanced/model_parallel.html#colossal-ai)
 
 
diff --git a/examples/images/diffusion/requirements.txt b/examples/images/diffusion/requirements.txt
index d0af35353b66..59d027fcf60f 100644
--- a/examples/images/diffusion/requirements.txt
+++ b/examples/images/diffusion/requirements.txt
@@ -1,10 +1,10 @@
 albumentations==1.3.0
-opencv-python==4.6.0
+opencv-python==4.6.0.66
 pudb==2019.2
 prefetch_generator
 imageio==2.9.0
 imageio-ffmpeg==0.4.2
-torchmetrics==0.6
+torchmetrics==0.7
 omegaconf==2.1.1
 test-tube>=0.7.5
 streamlit>=0.73.1

From 085e7f4eff832f2510d8023a9821206ab1894b2e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 20 Mar 2023 16:19:06 +0800
Subject: [PATCH 484/503] [test] fixed torchrec registration in model zoo
 (#3177)

* [test] fixed torchrec registration in model zoo

* polish code

* polish code

* polish code
---
 tests/kit/model_zoo/torchrec/torchrec.py      | 72 +++++++++++++++----
 .../test_torchrec_model/test_deepfm_model.py  |  1 -
 .../test_torchrec_model/test_dlrm_model.py    |  1 -
 3 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/tests/kit/model_zoo/torchrec/torchrec.py b/tests/kit/model_zoo/torchrec/torchrec.py
index 03d95a06a89b..dda563155fca 100644
--- a/tests/kit/model_zoo/torchrec/torchrec.py
+++ b/tests/kit/model_zoo/torchrec/torchrec.py
@@ -11,21 +11,47 @@
 
 BATCH = 2
 SHAPE = 10
-# KeyedTensor
-KT = KeyedTensor(keys=["f1", "f2"], length_per_key=[SHAPE, SHAPE], values=torch.rand((BATCH, 2 * SHAPE)))
+
+
+def gen_kt():
+    KT = KeyedTensor(keys=["f1", "f2"], length_per_key=[SHAPE, SHAPE], values=torch.rand((BATCH, 2 * SHAPE)))
+    return KT
+
 
 # KeyedJaggedTensor
-KJT = KeyedJaggedTensor.from_offsets_sync(keys=["f1", "f2"],
-                                          values=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
-                                          offsets=torch.tensor([0, 2, 4, 6, 8]))
+def gen_kjt():
+    KJT = KeyedJaggedTensor.from_offsets_sync(keys=["f1", "f2"],
+                                              values=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
+                                              offsets=torch.tensor([0, 2, 4, 6, 8]))
+    return KJT
+
 
 data_gen_fn = lambda: dict(features=torch.rand((BATCH, SHAPE)))
 
-interaction_arch_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KT)
 
-simple_dfm_data_gen_fn = lambda: dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KJT)
+def interaction_arch_data_gen_fn():
+    KT = gen_kt()
+    return dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KT)
+
+
+def simple_dfm_data_gen_fn():
+    KJT = gen_kjt()
+    return dict(dense_features=torch.rand((BATCH, SHAPE)), sparse_features=KJT)
+
 
-sparse_arch_data_gen_fn = lambda: dict(features=KJT)
+def sparse_arch_data_gen_fn():
+    KJT = gen_kjt()
+    return dict(features=KJT)
+
+
+def output_transform_fn(x):
+    if isinstance(x, KeyedTensor):
+        output = dict()
+        for key in x.keys():
+            output[key] = x[key]
+        return output
+    else:
+        return dict(output=x)
 
 
 def output_transform_fn(x):
@@ -42,7 +68,27 @@ def get_ebc():
     # EmbeddingBagCollection
     eb1_config = EmbeddingBagConfig(name="t1", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f1"])
     eb2_config = EmbeddingBagConfig(name="t2", embedding_dim=SHAPE, num_embeddings=SHAPE, feature_names=["f2"])
-    return EmbeddingBagCollection(tables=[eb1_config, eb2_config])
+    return EmbeddingBagCollection(tables=[eb1_config, eb2_config], device=torch.device('cpu'))
+
+
+def sparse_arch_model_fn():
+    ebc = get_ebc()
+    return deepfm.SparseArch(ebc)
+
+
+def simple_deep_fmnn_model_fn():
+    ebc = get_ebc()
+    return deepfm.SimpleDeepFMNN(SHAPE, ebc, SHAPE, SHAPE)
+
+
+def dlrm_model_fn():
+    ebc = get_ebc()
+    return dlrm.DLRM(ebc, SHAPE, [SHAPE, SHAPE], [5, 1])
+
+
+def dlrm_sparsearch_model_fn():
+    ebc = get_ebc()
+    return dlrm.SparseArch(ebc)
 
 
 model_zoo.register(name='deepfm_densearch',
@@ -61,17 +107,17 @@ def get_ebc():
                    output_transform_fn=output_transform_fn)
 
 model_zoo.register(name='deepfm_simpledeepfmnn',
-                   model_fn=partial(deepfm.SimpleDeepFMNN, SHAPE, get_ebc(), SHAPE, SHAPE),
+                   model_fn=simple_deep_fmnn_model_fn,
                    data_gen_fn=simple_dfm_data_gen_fn,
                    output_transform_fn=output_transform_fn)
 
 model_zoo.register(name='deepfm_sparsearch',
-                   model_fn=partial(deepfm.SparseArch, get_ebc()),
+                   model_fn=sparse_arch_model_fn,
                    data_gen_fn=sparse_arch_data_gen_fn,
                    output_transform_fn=output_transform_fn)
 
 model_zoo.register(name='dlrm',
-                   model_fn=partial(dlrm.DLRM, get_ebc(), SHAPE, [SHAPE, SHAPE], [5, 1]),
+                   model_fn=dlrm_model_fn,
                    data_gen_fn=simple_dfm_data_gen_fn,
                    output_transform_fn=output_transform_fn)
 
@@ -91,6 +137,6 @@ def get_ebc():
                    output_transform_fn=output_transform_fn)
 
 model_zoo.register(name='dlrm_sparsearch',
-                   model_fn=partial(dlrm.SparseArch, get_ebc()),
+                   model_fn=dlrm_sparsearch_model_fn,
                    data_gen_fn=sparse_arch_data_gen_fn,
                    output_transform_fn=output_transform_fn)
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
index a30139f26d29..a4e847dbcfcd 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
@@ -47,7 +47,6 @@ def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
                                  ), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
 
 
-@pytest.mark.skip('unknown error')
 def test_torchrec_deepfm_models():
     deepfm_models = model_zoo.get_sub_registry('deepfm')
     torch.backends.cudnn.deterministic = True
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
index 71ecf7fca53e..ac377ff1d5f8 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
@@ -47,7 +47,6 @@ def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
                                  ), f'{model.__class__.__name__} has inconsistent outputs, {fx_out} vs {non_fx_out}'
 
 
-@pytest.mark.skip('unknown error')
 def test_torchrec_dlrm_models():
     torch.backends.cudnn.deterministic = True
     dlrm_models = model_zoo.get_sub_registry('dlrm')

From 7bc0afc901f2f0ce187cab9a0b1587740094d7b5 Mon Sep 17 00:00:00 2001
From: zbian <kurisusnowdeng@gmail.com>
Date: Fri, 17 Mar 2023 15:09:47 +0800
Subject: [PATCH 485/503] updated flash attention usage

---
 LICENSE                                       |  70 ++++++
 .../kernel/cuda_native/flash_attention.py     | 207 +++++++++++++-----
 tests/test_utils/test_flash_attention.py      | 200 +++++++----------
 3 files changed, 307 insertions(+), 170 deletions(-)

diff --git a/LICENSE b/LICENSE
index 394791da2771..c7a5bb16880e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -326,3 +326,73 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR Flash Attention ----------------
+
+   BSD 3-Clause License
+
+   Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+   * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR Facebook xFormers ----------------
+
+   From xFormers:
+
+   Copyright (c) Facebook, Inc. and its affiliates
+
+
+   ===
+
+   BSD 3-Clause License
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+      and IDIAP Research Institute nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
diff --git a/colossalai/kernel/cuda_native/flash_attention.py b/colossalai/kernel/cuda_native/flash_attention.py
index 907fa640d826..d793815ed681 100644
--- a/colossalai/kernel/cuda_native/flash_attention.py
+++ b/colossalai/kernel/cuda_native/flash_attention.py
@@ -1,12 +1,6 @@
 """
-The triton-based flash attention implementation is copied from the OpenAI/triton repository
-
-You can find the repository in Triton https://github.com/openai/triton
-You can find the source file in https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py
-
-Reference:
-1. Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf
-2. Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf
+A general attention module using the flash attention kernels from xformers:
+https://github.com/facebookresearch/xformers/tree/main/xformers/ops/fmha
 """
 
 import math
@@ -15,6 +9,159 @@
 
 import torch
 
+try:
+    from xformers.ops.fmha import memory_efficient_attention
+    HAS_MEM_EFF_ATTN = True
+except ImportError:
+    HAS_MEM_EFF_ATTN = False
+    print('please install xformers from https://github.com/facebookresearch/xformers')
+
+if HAS_MEM_EFF_ATTN:
+
+    from typing import Optional
+
+    from einops import rearrange
+    from xformers.ops.fmha import MemoryEfficientAttentionCutlassOp
+    from xformers.ops.fmha.attn_bias import BlockDiagonalMask, LowerTriangularMask, LowerTriangularMaskWithTensorBias
+
+    from .scaled_softmax import AttnMaskType
+
+    allow_alibi = True
+    for op in MemoryEfficientAttentionCutlassOp:
+        allow_alibi = allow_alibi & (LowerTriangularMaskWithTensorBias in op.SUPPORTED_ATTN_BIAS_TYPES)
+
+    class Unpad(torch.autograd.Function):
+        """
+        Adapted from
+        https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/bert_padding.py
+        """
+
+        @staticmethod
+        def forward(ctx, tensor: torch.Tensor, indices: torch.Tensor):
+            ctx.save_for_backward(indices)
+            # [b, s, ...]
+            assert tensor.ndim >= 3
+            ctx.bsz = tensor.shape[0]
+            out = rearrange(tensor, 'b s ... -> (b s) ...')
+            ctx.shape = out.shape
+            # [1, ntokens, ...]
+            return out[indices].unsqueeze(0)
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            indices, = ctx.saved_tensors
+            # [b*s, ...]
+            grad = torch.zeros(ctx.shape, dtype=grad_output.dtype, device=grad_output.device)
+            grad[indices] = grad_output.squeeze(0)
+            grad = rearrange(grad, '(b s) ... -> b s ...', b=ctx.bsz)
+            # [b, s, ...]
+            return grad, None
+
+    class Repad(torch.autograd.Function):
+        """
+        Adapted from
+        https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/bert_padding.py
+        """
+
+        @staticmethod
+        def forward(ctx, tensor: torch.Tensor, indices: torch.Tensor, batch_size: int, seq_len: int):
+            ctx.save_for_backward(indices)
+            # [ntokens, ...]
+            tensor = tensor.squeeze(0)
+            out = torch.zeros((batch_size * seq_len, *tensor.shape[1:]), dtype=tensor.dtype, device=tensor.device)
+            # [b*s, ...]
+            out[indices] = tensor
+            # [b, s, ...]
+            out = rearrange(out, '(b s) ... -> b s ...', b=batch_size)
+            return out
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            indices, = ctx.saved_tensors
+            # [b*s, ...]
+            grad_output = rearrange(grad_output, 'b s ... -> (b s) ...')
+            grad = grad_output[indices]
+            # [1, ntokens, ...]
+            return grad.unsqueeze(0), None, None, None
+
+    class ColoAttention(torch.nn.Module):
+
+        def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
+            super().__init__()
+            assert embed_dim % num_heads == 0, \
+                f"the embed dim ({embed_dim}) is not divisible by the number of attention heads ({num_heads})."
+            self.scale = 1 / math.sqrt(embed_dim // num_heads)
+            self.dropout = dropout
+
+        @staticmethod
+        def get_seq_info_from_mask(attn_mask: torch.Tensor):
+            indices = torch.nonzero(attn_mask.flatten(), as_tuple=False).flatten()
+            seqlens = attn_mask.sum(dim=-1, dtype=torch.int32).flatten().tolist()
+            return indices, seqlens
+
+        @staticmethod
+        def unpad(tensor: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
+            return Unpad.apply(tensor, indices)
+
+        @staticmethod
+        def repad(tensor: torch.Tensor, indices: torch.Tensor, batch_size: int, seq_len: int) -> torch.Tensor:
+            return Repad.apply(tensor, indices, batch_size, seq_len)
+
+        def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
+                    attn_mask: Optional[torch.Tensor] = None, attn_mask_type: Optional[AttnMaskType] = None,
+                    bias: Optional[torch.Tensor] = None):
+            """Run xformers memory-efficient attention and merge the output to 'b s (h d)'.
+
+            ``attn_mask`` must be a 2-D (batch_size, seq_len) mask for ``AttnMaskType.padding``;
+            ``bias`` (alibi / relative position embedding) is only accepted for causal attention.
+            """
+            batch_size, tgt_len, src_len = query.shape[0], query.shape[1], key.shape[1]
+            attn_bias = None
+            if attn_mask_type == AttnMaskType.padding:    # bert style
+                assert attn_mask is not None, \
+                    f"attention mask {attn_mask} is not valid for attention mask type {attn_mask_type}."
+                assert attn_mask.dim() == 2, \
+                    "attention mask is supposed to have shape (batch_size, seq_len), " + \
+                    f"but got {attn_mask.dim()} dimensions."
+                if tgt_len == src_len:
+                    q_indices, q_seqlen = self.get_seq_info_from_mask(attn_mask)
+                    kv_seqlen = None
+                    if batch_size > 1:
+                        query, key, value = self.unpad(torch.stack([query, key, value], dim=2), q_indices).unbind(dim=2)
+                else:
+                    q_indices = torch.arange(batch_size * tgt_len, dtype=torch.long, device=query.device)    # int64 index
+                    q_seqlen = [tgt_len] * batch_size    # plain ints, same form as get_seq_info_from_mask returns
+                    kv_indices, kv_seqlen = self.get_seq_info_from_mask(attn_mask)
+                    if batch_size > 1:
+                        query = rearrange(query, "b s ... -> c (b s) ...", c=1)
+                        key, value = self.unpad(torch.stack([key, value], dim=2), kv_indices).unbind(dim=2)
+                attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen, kv_seqlen)
+            elif attn_mask_type == AttnMaskType.causal:    # gpt style
+                attn_bias = LowerTriangularMask()
+
+            if bias is not None:    # alibi / relative position embedding
+                assert allow_alibi, "flash attention with bias is not supported in this system."
+                assert attn_mask_type == AttnMaskType.causal, \
+                    "attention with bias is only supported for causal attention so far."
+                attn_bias = attn_bias.add_bias(bias)
+
+            out = memory_efficient_attention(query, key, value, attn_bias=attn_bias, p=self.dropout, scale=self.scale)
+
+            if attn_mask_type == AttnMaskType.padding and batch_size > 1:
+                out = self.repad(out, q_indices, batch_size, tgt_len)
+
+            return rearrange(out, 'b s h d -> b s (h d)')
+
+
+##########################################################################
+# the flash attention functions below that are copied
+# from the OpenAI/triton repository will be deprecated
+# You can find the repository in Triton https://github.com/openai/triton
+# You can find the source file in https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py
+# Reference:
+# 1. Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf
+# 2. Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf
+
 
 def triton_cuda_check():
     cuda_home = os.getenv("CUDA_HOME", default="/usr/local/cuda")
@@ -52,13 +199,6 @@ def triton_cuda_check():
     HAS_FLASH_ATTN = False
     print('please install flash_attn from https://github.com/HazyResearch/flash-attention')
 
-try:
-    from xformers.ops.fmha import memory_efficient_attention
-    HAS_MEM_EFF_ATTN = True
-except ImportError:
-    HAS_MEM_EFF_ATTN = False
-    print('please install xformers from https://github.com/facebookresearch/xformers')
-
 if HAS_TRITON:
     # the following functions are adapted from the OpenAI Triton tutorial
     # https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py
@@ -422,25 +562,6 @@ def triton_flash_attention(q, k, v, sm_scale):
 
 if HAS_FLASH_ATTN:
 
-    from einops import rearrange
-
-    class MaskedFlashAttention(torch.nn.Module):
-
-        def __init__(self, num_attention_heads: int, attention_head_size: int, attention_dropout: float) -> None:
-            super().__init__()
-            self.num_attention_heads = num_attention_heads
-            self.attention_head_size = attention_head_size
-            self.attention_func = FlashAttention(softmax_scale=math.sqrt(attention_head_size),
-                                                 attention_dropout=attention_dropout)
-
-        def forward(self, query_key_value: torch.Tensor, attention_mask: torch.Tensor, causal=False):
-            if attention_mask.dtype is not torch.bool:
-                attention_mask = attention_mask.bool()
-            qkv = rearrange(query_key_value, 'b s (three h d) -> b s three h d', three=3, h=self.num_attention_heads)
-            context, _ = self.attention_func(qkv, key_padding_mask=attention_mask, causal=causal)
-            context = rearrange(context, 'b s h d -> b s (h d)')
-            return context
-
     def flash_attention_qkv(qkv, sm_scale, batch_size, seq_len, dropout_p=0., causal=False):
         """
         Arguments:
@@ -511,20 +632,4 @@ def flash_attention_q_k_v(q, k, v, sm_scale, batch_size, q_seqlen, kv_seqlen, dr
                                         causal)
 
 
-if HAS_MEM_EFF_ATTN:
-
-    from einops import rearrange
-    from xformers.ops.fmha import LowerTriangularMask
-
-    class MemoryEfficientAttention(torch.nn.Module):
-
-        def __init__(self, hidden_size: int, num_attention_heads: int, attention_dropout: float = 0.0):
-            super().__init__()
-            attention_head_size = hidden_size // num_attention_heads
-            self.scale = 1 / attention_head_size**0.5
-            self.dropout = attention_dropout
-
-        def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor):
-            context = memory_efficient_attention(query, key, value, attention_mask, self.dropout, self.scale)
-            context = rearrange(context, 'b s h d -> b s (h d)')
-            return context
+##########################################################################
diff --git a/tests/test_utils/test_flash_attention.py b/tests/test_utils/test_flash_attention.py
index 58e3b21d97eb..441cbbb22ce7 100644
--- a/tests/test_utils/test_flash_attention.py
+++ b/tests/test_utils/test_flash_attention.py
@@ -1,22 +1,13 @@
+import random
+
 import pytest
 import torch
 from einops import rearrange
 
-from colossalai.kernel.cuda_native.flash_attention import HAS_FLASH_ATTN, HAS_MEM_EFF_ATTN, HAS_TRITON
-
-if HAS_FLASH_ATTN:
-    from colossalai.kernel.cuda_native.flash_attention import (
-        MaskedFlashAttention,
-        flash_attention_q_k_v,
-        flash_attention_q_kv,
-        flash_attention_qkv,
-    )
-
-if HAS_TRITON:
-    from colossalai.kernel.cuda_native.flash_attention import triton_flash_attention
+from colossalai.kernel.cuda_native.flash_attention import HAS_MEM_EFF_ATTN
 
 if HAS_MEM_EFF_ATTN:
-    from colossalai.kernel.cuda_native.flash_attention import LowerTriangularMask, MemoryEfficientAttention
+    from colossalai.kernel.cuda_native.flash_attention import AttnMaskType, ColoAttention
 
 
 def baseline_attention(Z, N_CTX, H, q, k, v, sm_scale):
@@ -30,117 +21,88 @@ def baseline_attention(Z, N_CTX, H, q, k, v, sm_scale):
     return ref_out
 
 
-@pytest.mark.skipif(HAS_TRITON == False, reason="triton is not available")
-@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(3, 4, 2, 16)])
-def test_triton_flash_attention(Z, H, N_CTX, D_HEAD, dtype=torch.float16):
-    torch.manual_seed(20)
-    q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    sm_scale = 0.3
-    dout = torch.randn_like(q)
-
-    ref_out = baseline_attention(Z, N_CTX, H, q, k, v, sm_scale)
-    ref_out.backward(dout)
-    ref_dv, v.grad = v.grad.clone(), None
-    ref_dk, k.grad = k.grad.clone(), None
-    ref_dq, q.grad = q.grad.clone(), None
-
-    # triton implementation
-    tri_out = triton_flash_attention(q, k, v, sm_scale)
-    tri_out.backward(dout)
-    tri_dv, v.grad = v.grad.clone(), None
-    tri_dk, k.grad = k.grad.clone(), None
-    tri_dq, q.grad = q.grad.clone(), None
-    # compare
-    assert torch.allclose(ref_out, tri_out, atol=1e-3)
-    assert torch.allclose(ref_dv, tri_dv, atol=1e-3)
-    assert torch.allclose(ref_dk, tri_dk, atol=1e-3)
-    assert torch.allclose(ref_dq, tri_dq, atol=1e-3)
-
-
-@pytest.mark.skipif(HAS_FLASH_ATTN == False, reason="flash is not available")
-@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(3, 4, 2, 16)])
-def test_flash_attention(Z, H, N_CTX, D_HEAD, dtype=torch.float16):
-    torch.manual_seed(20)
-    q = torch.randn((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    k = torch.randn((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    v = torch.randn((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    sm_scale = 0.3
-    dout = torch.randn_like(q)
-
-    # reference implementation
-    ref_out = baseline_attention(Z, N_CTX, H, q, k, v, sm_scale)
-    ref_out.backward(dout)
-    ref_dv, v.grad = v.grad.clone(), None
-    ref_dk, k.grad = k.grad.clone(), None
-    ref_dq, q.grad = q.grad.clone(), None
-
-    # flash implementation
-    q, k, v = map(lambda x: rearrange(x, 'z h n d -> (z n) h d'), [q, k, v])
-    dout = rearrange(dout, 'z h n d -> (z n) h d').detach()
-    for i in range(3):
-        if i == 0:
-            tri_out = flash_attention_q_k_v(q, k, v, sm_scale, Z, N_CTX, N_CTX, causal=True)
-        elif i == 1:
-            kv = torch.cat((k.unsqueeze(1), v.unsqueeze(1)), dim=1)
-            tri_out = flash_attention_q_kv(q, kv, sm_scale, Z, N_CTX, N_CTX, causal=True)
-        else:
-            qkv = torch.cat((q.unsqueeze(1), k.unsqueeze(1), v.unsqueeze(1)), dim=1)
-            tri_out = flash_attention_qkv(qkv, sm_scale, Z, N_CTX, causal=True)
-
-        tri_out.backward(dout, retain_graph=True)
-
-        if i == 0:
-            tri_dq, tri_dk, tri_dv, = torch.autograd.grad(tri_out, (q, k, v), dout)
-            tri_out, tri_dq, tri_dk, tri_dv = map(lambda x: rearrange(x, '(z n) h d -> z h n d', z=Z),
-                                                  (tri_out, tri_dq, tri_dk, tri_dv))
-        elif i == 1:
-            tri_dq, tri_dkv, = torch.autograd.grad(tri_out, (q, kv), dout)
-            tri_dk, tri_dv = torch.chunk(tri_dkv, 2, dim=1)
-            tri_out, tri_dq, tri_dk, tri_dv = map(lambda x: rearrange(x, '(z n) h d -> z h n d', z=Z),
-                                                  (tri_out, tri_dq, tri_dk.squeeze(1), tri_dv.squeeze(1)))
-        else:
-            tri_dqkv, = torch.autograd.grad(tri_out, (qkv), dout)
-            tri_dq, tri_dk, tri_dv = torch.chunk(tri_dqkv, 3, dim=1)
-            tri_out, tri_dq, tri_dk, tri_dv = map(lambda x: rearrange(x, '(z n) h d -> z h n d', z=Z),
-                                                  (tri_out, tri_dq.squeeze(1), tri_dk.squeeze(1), tri_dv.squeeze(1)))
-
-        # compare
-        assert torch.allclose(ref_out, tri_out, atol=1e-3)
-        assert torch.allclose(ref_dv, tri_dv, atol=1e-3)
-        assert torch.allclose(ref_dk, tri_dk, atol=1e-3)
-        assert torch.allclose(ref_dq, tri_dq, atol=1e-3)
-
-
-@pytest.mark.skipif(HAS_FLASH_ATTN == False, reason="flash is not available")
-@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(3, 4, 2, 16)])
-def test_masked_flash_attention(Z, H, N_CTX, D_HEAD, dtype=torch.float16):
-    attn = MaskedFlashAttention(N_CTX, D_HEAD, 0.1)
-
-    qkv = torch.randn((Z, H, 3 * N_CTX * D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    attention_mask = torch.randint(2, (Z, H)).cuda().bool()
-
-    out = attn(qkv, attention_mask)
-
-    dout = torch.rand_like(out)
-    out.backward(dout)
+@pytest.mark.skipif(HAS_MEM_EFF_ATTN == False, reason="xformers is not available")
+@pytest.mark.parametrize('B, S, H, D_HEAD', [(6, 8, 4, 16)])
+def test_attention_gpt(B, S, H, D_HEAD, dtype=torch.float16):
+    D = H * D_HEAD
+
+    c_attn = torch.nn.Linear(D, 3 * D, dtype=dtype, device="cuda")
+    attn = ColoAttention(D, H, dropout=0.1)
+
+    x = torch.randn((B, S, D), dtype=dtype, device="cuda")
+
+    qkv = c_attn(x)
+    q, k, v = rearrange(qkv, 'b s (n h d) -> n b s h d', n=3, h=H)
+    y = attn(q, k, v, attn_mask_type=AttnMaskType.causal)
+
+    assert list(y.shape) == [B, S, D]
+
+    dy = torch.rand_like(y)
+    y.backward(dy)
 
 
 @pytest.mark.skipif(HAS_MEM_EFF_ATTN == False, reason="xformers is not available")
-@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(6, 8, 4, 16)])
-def test_memory_efficient_attention(Z, H, N_CTX, D_HEAD, dtype=torch.float16):
-    attn = MemoryEfficientAttention(N_CTX * D_HEAD, N_CTX, 0.1)
+@pytest.mark.parametrize('B, S, H, D_HEAD', [(6, 8, 4, 16)])
+def test_attention_bert(B, S, H, D_HEAD, dtype=torch.float16):
+    D = H * D_HEAD
+
+    c_attn = torch.nn.Linear(D, 3 * D, dtype=dtype, device="cuda")
+    attn = ColoAttention(D, H, dropout=0.1)
+
+    x = torch.randn((B, S, D), dtype=dtype, device="cuda")
+    # attention mask of shape [B, S] with zero padding to max length S
+    mask = [torch.ones(S - i, dtype=dtype, device="cuda") for i in range(B)]
+    mask = torch.nn.utils.rnn.pad_sequence(mask, batch_first=True)
+
+    qkv = c_attn(x)
+    q, k, v = rearrange(qkv, 'b s (n h d) -> b s n h d', n=3, h=H).unbind(dim=2)
+    y = attn(q, k, v, attn_mask=mask, attn_mask_type=AttnMaskType.padding)
+
+    assert list(y.shape) == [B, S, D]
+
+    dy = torch.rand_like(y)
+    y.backward(dy)
+
+
+@pytest.mark.skipif(HAS_MEM_EFF_ATTN == False, reason="xformers is not available")
+@pytest.mark.parametrize('B, S, H, D_HEAD', [(6, 8, 4, 16)])
+def test_attention_no_mask(B, S, H, D_HEAD, dtype=torch.float16):
+    D = H * D_HEAD
+
+    c_attn = torch.nn.Linear(D, 3 * D, dtype=dtype, device="cuda")
+    attn = ColoAttention(D, H, dropout=0.1)
+
+    x = torch.randn((B, S, D), dtype=dtype, device="cuda")
+    qkv = c_attn(x)
+    q, k, v = rearrange(qkv, 'b s (n h d) -> b s n h d', n=3, h=H).unbind(dim=2)
+    y = attn(q, k, v)
+
+    assert list(y.shape) == [B, S, D]
+
+    dy = torch.rand_like(y)
+    y.backward(dy)
+
+
+@pytest.mark.skipif(HAS_MEM_EFF_ATTN == False, reason="xformers is not available")
+@pytest.mark.parametrize('B, S, T, H, D_HEAD', [(6, 24, 8, 4, 16)])
+def test_cross_attention(B, S, T, H, D_HEAD, dtype=torch.float16):
+    D = H * D_HEAD
+
+    q_attn = torch.nn.Linear(D, D, dtype=dtype, device="cuda")
+    kv_attn = torch.nn.Linear(D, 2 * D, dtype=dtype, device="cuda")
 
-    q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
+    attn = ColoAttention(D, H, dropout=0.1)
 
-    out = attn(q, k, v, attention_mask=LowerTriangularMask())
+    src = torch.randn((B, S, D), dtype=dtype, device="cuda")
+    tgt = torch.randn((B, T, D), dtype=dtype, device="cuda")
 
-    dout = torch.rand_like(out)
-    out.backward(dout)
+    q = q_attn(tgt)
+    kv = kv_attn(src)
+    q = rearrange(q, 'b s (h d) -> b s h d', h=H)
+    k, v = rearrange(kv, 'b s (n h d) -> b s n h d', n=2, h=H).unbind(dim=2)
+    y = attn(q, k, v, attn_mask_type=AttnMaskType.causal)
 
+    assert list(y.shape) == [B, T, D]
 
-if __name__ == '__main__':
-    test_flash_attention(3, 4, 2, 16)
+    dy = torch.rand_like(y)
+    y.backward(dy)

From 9d644ff09f2b044c984328e08357a68d98ab17f3 Mon Sep 17 00:00:00 2001
From: YH <100389977+yhna940@users.noreply.github.com>
Date: Tue, 21 Mar 2023 12:48:21 +0900
Subject: [PATCH 486/503] Fix docstr for zero statedict (#3185)

---
 colossalai/zero/sharded_model/sharded_model_v2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py
index 094f7d76a86d..12e8f65d4a35 100644
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -494,6 +494,7 @@ def _colo_load_from_state_dict(self,
             error_msgs (list of str): error messages should be added to this
                 list, and will be reported together in
                 :meth:`~torch.nn.Module.load_state_dict`
+            shard_strategy (Optional[BaseShardStrategy], optional): A shard strategy to manage shard behavior. Defaults to None.
         """
         for hook in self._load_state_dict_pre_hooks.values():
             hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)

From 80aed29cd3835587052b9271e3ac70175a599771 Mon Sep 17 00:00:00 2001
From: YH <100389977+yhna940@users.noreply.github.com>
Date: Tue, 21 Mar 2023 13:36:47 +0900
Subject: [PATCH 487/503] [zero] Refactor ZeroContextConfig class using
 dataclass (#3186)

---
 colossalai/zero/init_ctx/init_context.py | 39 ++++++++++++------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/colossalai/zero/init_ctx/init_context.py b/colossalai/zero/init_ctx/init_context.py
index 572ddd9e4e3f..b40b69962cf7 100644
--- a/colossalai/zero/init_ctx/init_context.py
+++ b/colossalai/zero/init_ctx/init_context.py
@@ -1,46 +1,45 @@
 import contextlib
 import functools
-from typing import Optional
 from contextlib import AbstractContextManager
+from dataclasses import dataclass
+from typing import Optional
 
 import torch
-import torch.nn as nn
 import torch.distributed as dist
+import torch.nn as nn
 
 from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
 from colossalai.context.singleton_meta import SingletonMeta
+from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
+from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
 from colossalai.zero.shard_utils import BaseShardStrategy
 from colossalai.zero.sharded_model._utils import cast_tensor_to_fp16
 from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2
 from colossalai.zero.sharded_param import ShardedParamV2
-from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
 
 
-class ZeroContextConfig(object):
+@dataclass
+class ZeroContextConfig:
     """The configuration used to control zero context initialization.
 
     Args:
         target_device (torch.device): The device where param data are after exiting the context.
-        replicated (bool, optional): Whether the param is replicated across data parallel group.
+        is_replicated (bool, optional): Whether the param is replicated across data parallel group.
             Some parameters are not replicated, e.g. parameters in MOE experts.
         shard_param (bool, optional): Is param sharded after exiting the context. Defaults to False.
     """
 
-    def __init__(self, target_device: torch.device, replicated: bool = True, shard_param: bool = False):
-        super().__init__()
+    target_device: torch.device
+    is_replicated: bool = True
+    shard_param: bool = False
 
-        if shard_param:
-            assert replicated, "Non-replicated parameters can't be sharded."
+    def __post_init__(self):
+        if self.shard_param:
+            assert self.is_replicated, "Non-replicated parameters can't be sharded."
 
-        # replicated no-shard parameters should locate in cuda, since we will broadcast them soon
-        if replicated and not shard_param:
-            assert target_device.type == 'cuda', "Replicated no-shard paramters should locate in cuda."
-
-        self.target_device = target_device
-        self.is_replicated: bool = replicated
-        self.shard_param: bool = shard_param
+        if self.is_replicated and not self.shard_param:
+            assert self.target_device.type == 'cuda', "Replicated no-shard parameters should be located in cuda."
 
 
 class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
@@ -74,7 +73,7 @@ def __init__(self,
         self.seed = seed
         self.dp_process_group = gpc.get_group(ParallelMode.DATA)
 
-        self.config = ZeroContextConfig(target_device=target_device, replicated=True, shard_param=shard_param)
+        self.config = ZeroContextConfig(target_device=target_device, is_replicated=True, shard_param=shard_param)
 
         ZeroContextMgr().current_context = self
 
@@ -124,7 +123,7 @@ def calc_fanin_fanout(tensor: torch.Tensor):
         return fan_in, fan_out
 
     def _pre_context_exec(self):
-        """ 
+        """
         The Callback function when entering the context
         """
         self.logger = get_dist_logger("ZeroInitContext")
@@ -248,7 +247,7 @@ def hijack_context_config(self, **kwargs):
 
 def no_shard_zero_context(is_replicated: bool = True) -> AbstractContextManager:
     return ZeroContextMgr().hijack_context_config(target_device=torch.device('cuda', torch.cuda.current_device()),
-                                                  replicated=is_replicated,
+                                                  is_replicated=is_replicated,
                                                   shard_param=False)
 
 
From 258b43317c4a5cafb8d3da0ff63c8843443bc448 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Tue, 21 Mar 2023 13:24:18 +0800
Subject: [PATCH 488/503] [hotfix] layout converting issue (#3188)

---
 colossalai/tensor/d_tensor/layout_converter.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/colossalai/tensor/d_tensor/layout_converter.py b/colossalai/tensor/d_tensor/layout_converter.py
index a4f4c9c2dd80..cf02aac309f4 100644
--- a/colossalai/tensor/d_tensor/layout_converter.py
+++ b/colossalai/tensor/d_tensor/layout_converter.py
@@ -10,7 +10,7 @@
 from colossalai.context.singleton_meta import SingletonMeta
 from colossalai.tensor.d_tensor.comm_spec import *
 from colossalai.tensor.d_tensor.layout import Layout
-from colossalai.tensor.sharding_spec import ShardingSpecException
+from colossalai.tensor.d_tensor.misc import LayoutException
 from colossalai.tensor.utils import all_gather_simulator, all_to_all_simulator, shard_simulator
 
 from .sharding_spec import ShardingSpec
@@ -145,7 +145,7 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
                                     entire_shape=source_layout.entire_shape)
 
                 valid_spec_dict[new_layout] = comm_spec
-            except ShardingSpecException:
+            except LayoutException:
                 pass
         return valid_spec_dict
 
@@ -255,7 +255,7 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
                                         device_type=source_layout.device_type,
                                         entire_shape=source_layout.entire_shape)
                     valid_spec_dict[new_layout] = comm_spec
-                except ShardingSpecException:
+                except LayoutException:
                     pass
 
         return valid_spec_dict
@@ -343,7 +343,7 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
                                         device_type=source_layout.device_type,
                                         entire_shape=source_layout.entire_shape)
                     valid_spec_dict[new_layout] = comm_spec
-                except ShardingSpecException:
+                except LayoutException:
                     pass
         return valid_spec_dict
 

From 18dbe76caeef1f8bb0cd4c9bd332d50b5abc6e38 Mon Sep 17 00:00:00 2001
From: Zihao <804673818@qq.com>
Date: Tue, 21 Mar 2023 14:17:41 +0800
Subject: [PATCH 489/503] [auto-parallel] add auto-offload feature (#3154)

* add auto-offload feature

* polish code

* fix syn offload runtime pass bug

* add offload example

* fix offload testing bug

* fix example testing bug
---
 colossalai/auto_parallel/offload/__init__.py  |   0
 .../auto_parallel/offload/amp_optimizer.py    | 177 ++++++
 .../offload/base_offload_module.py            | 109 ++++
 .../auto_parallel/offload/mem_optimize.py     |  49 ++
 colossalai/auto_parallel/offload/region.py    | 144 +++++
 .../auto_parallel/offload/region_manager.py   | 526 ++++++++++++++++++
 colossalai/auto_parallel/offload/runtime.py   | 253 +++++++++
 colossalai/auto_parallel/offload/solver.py    | 523 +++++++++++++++++
 .../offload/training_simulator.py             | 458 +++++++++++++++
 colossalai/auto_parallel/offload/util.py      |  90 +++
 .../gpt/experiments/auto_offload/README.md    |  37 ++
 .../gpt/experiments/auto_offload/model_zoo.py |  65 +++
 .../experiments/auto_offload/requirements.txt |   2 +
 .../gpt/experiments/auto_offload/run.sh       |   8 +
 .../auto_offload/train_gpt_offload.py         |  94 ++++
 .../test_offload/model_utils.py               |  86 +++
 .../test_offload/test_perf.py                 | 150 +++++
 .../test_offload/test_solver.py               |  62 +++
 18 files changed, 2833 insertions(+)
 create mode 100644 colossalai/auto_parallel/offload/__init__.py
 create mode 100644 colossalai/auto_parallel/offload/amp_optimizer.py
 create mode 100644 colossalai/auto_parallel/offload/base_offload_module.py
 create mode 100644 colossalai/auto_parallel/offload/mem_optimize.py
 create mode 100644 colossalai/auto_parallel/offload/region.py
 create mode 100644 colossalai/auto_parallel/offload/region_manager.py
 create mode 100644 colossalai/auto_parallel/offload/runtime.py
 create mode 100644 colossalai/auto_parallel/offload/solver.py
 create mode 100644 colossalai/auto_parallel/offload/training_simulator.py
 create mode 100644 colossalai/auto_parallel/offload/util.py
 create mode 100644 examples/language/gpt/experiments/auto_offload/README.md
 create mode 100644 examples/language/gpt/experiments/auto_offload/model_zoo.py
 create mode 100644 examples/language/gpt/experiments/auto_offload/requirements.txt
 create mode 100644 examples/language/gpt/experiments/auto_offload/run.sh
 create mode 100644 examples/language/gpt/experiments/auto_offload/train_gpt_offload.py
 create mode 100644 tests/test_auto_parallel/test_offload/model_utils.py
 create mode 100644 tests/test_auto_parallel/test_offload/test_perf.py
 create mode 100644 tests/test_auto_parallel/test_offload/test_solver.py

diff --git a/colossalai/auto_parallel/offload/__init__.py b/colossalai/auto_parallel/offload/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/auto_parallel/offload/amp_optimizer.py b/colossalai/auto_parallel/offload/amp_optimizer.py
new file mode 100644
index 000000000000..a79e5006e7d2
--- /dev/null
+++ b/colossalai/auto_parallel/offload/amp_optimizer.py
@@ -0,0 +1,177 @@
+from typing import Dict, Tuple
+from enum import Enum
+import torch
+from torch.optim import Optimizer
+
+from colossalai.logging import get_dist_logger
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
+from colossalai.utils import get_current_device
+
+from .base_offload_module import BaseOffloadModule
+from .region_manager import RegionManager
+from .region import Region
+
+
+class OptimState(Enum):
+    SCALED = 0
+    UNSCALED = 1
+
+class AMPOptimizer(ColossalaiOptimizer):
+
+    """
+    A wrapper for Optimizer.
+    Code reference: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/optimizer/zero_optimizer.py
+
+    Args:
+        optimizer (Optimizer): An Optimizer instance.
+        module (BaseOffloadModule): A ``BaseOffloadModule`` instance.
+        initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**16.
+        growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
+        backoff_factor (float, optional): backoff_factor used by DynamicGradScaler. Defaults to 0.5.
+        growth_interval (int, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
+        hysteresis (int, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
+        min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
+        max_scale (float, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
+        norm_type (float, optional): norm_type used for `clip_grad_norm`. Defaults to 2.0; only the L2 norm (2.0) is supported when clipping_norm > 0.
+    """
+
+    def __init__(self,
+                 optimizer: Optimizer,
+                 module: BaseOffloadModule,
+                 initial_scale: float = 2**16,
+                 growth_factor: float = 2,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 1000,
+                 hysteresis: int = 2,
+                 min_scale: float = 1,
+                 max_scale: float = 2**32,
+                 clipping_norm: float = 0.0,
+                 norm_type: float = 2.0):
+
+        super().__init__(optimizer)
+
+        self.module = module
+        self.optim_state = OptimState.UNSCALED
+        self.clipping_flag = clipping_norm > 0.0
+        self.max_norm = clipping_norm
+
+        self.region_manager: RegionManager = self.module.region_manager
+        self.param_to_range: Dict[torch.nn.Parameter, Tuple[int, int]] = dict()
+        self.param_to_region: Dict[torch.nn.Parameter, Region] = dict()
+
+        self.fp32_to_fp16_params: Dict[torch.Tensor, torch.nn.Parameter] = dict()
+
+        if self.clipping_flag:
+            assert norm_type == 2.0, "AMPOptimizer only supports L2 norm now"
+
+        self.__init__optimizer()
+
+        # Grad scaler
+        self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale,
+                                             min_scale=min_scale,
+                                             growth_factor=growth_factor,
+                                             backoff_factor=backoff_factor,
+                                             growth_interval=growth_interval,
+                                             hysteresis=hysteresis,
+                                             max_scale=max_scale)
+        self._found_overflow: torch.Tensor = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+        self._logger = get_dist_logger()
+
+    def _set_grad_ptr(self):
+        for group in self.param_groups:
+            for fake_param in group['params']:
+                region = self.param_to_region[fake_param]
+                begin, end = self.param_to_range[fake_param]
+
+                fake_param.data = region.cpu_grad[begin:end]
+                fake_param.grad = fake_param.data
+                fake_param.data = region.fp32_data[begin:end]
+
+    def _update_fp16_params(self):
+        none_tensor = torch.empty([0])
+        for group in self.param_groups:
+            for fake_param in group['params']:
+                assert fake_param.grad is None
+                fake_param.data = none_tensor
+                self.param_to_region[fake_param].cpu_grad = None
+
+    def _check_overflow(self):
+        # refresh the overflow flag from the module's overflow counter
+        self._found_overflow.fill_(self.module.overflow_counter.item())
+        return self._found_overflow.item() > 0
+
+    def _get_combined_scale(self):
+        loss_scale = 1
+
+        if self.optim_state == OptimState.SCALED:
+            loss_scale = self.loss_scale
+            self.optim_state = OptimState.UNSCALED
+
+        combined_scale = loss_scale
+
+        if combined_scale == 1:
+            return -1
+        else:
+            return combined_scale
+
+    @property
+    def loss_scale(self):
+        return self.grad_scaler.scale.item()
+
+    def zero_grad(self, *args, **kwargs):
+        self.module.overflow_counter = torch.cuda.IntTensor([0])
+        return self.optim.zero_grad(set_to_none=True)
+
+    def step(self, *args, **kwargs):
+        # Bind each fake param's .grad to its region's cpu_grad slice and its .data to the fp32_data slice.
+        self._set_grad_ptr()
+
+        found_inf = self._check_overflow()
+        if found_inf:
+            self.optim_state = OptimState.UNSCALED    # no need to unscale grad
+            self.grad_scaler.update(found_inf)    # update gradient scaler
+            self._logger.info(f'Found overflow. Skip step')
+            self.zero_grad()    # reset all gradients
+            self._update_fp16_params()
+            return
+
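+        # get combined scale. it is currently just the loss scale (clipping norm is not folded in; -1 is returned when it equals 1)
+        # and is passed to the wrapped optimizer as div_scale, so that gradient = gradient / combined scale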
+        combined_scale = self._get_combined_scale()
+        self.grad_scaler.update(found_inf)
+
+        ret = self.optim.step(div_scale=combined_scale, *args, **kwargs)
+        self.zero_grad()
+        self._update_fp16_params()
+        return ret
+
+    def clip_grad_norm(self, model: torch.nn.Module, max_norm: float, norm_type: float = 2.0):
+        raise NotImplementedError
+
+    def backward(self, loss: torch.Tensor):
+        loss = self.loss_scale * loss
+        self.optim_state = OptimState.SCALED
+        self.module.backward(loss)
+
+    def __init__optimizer(self):
+
+        for group in self.optim.param_groups:
+            fake_params_list = list()
+
+            for param in group['params']:
+                region = self.region_manager.get_region(param)
+                fake_param = torch.nn.Parameter(torch.empty([0]))
+                self.param_to_range[fake_param] = region.param_to_range[param]
+                self.param_to_region[fake_param] = region
+                fake_params_list.append(fake_param)
+
+                # Reset existing state dict key to the new main param.
+                if param in self.optim.state:
+                    self.optim.state[fake_param] = self.optim.state.pop(param)
+
+            group['params'] = fake_params_list
+
+        # Leverage state_dict() and load_state_dict() to
+        # recast preexisting per-param state tensors
+        self.optim.load_state_dict(self.optim.state_dict())
\ No newline at end of file
diff --git a/colossalai/auto_parallel/offload/base_offload_module.py b/colossalai/auto_parallel/offload/base_offload_module.py
new file mode 100644
index 000000000000..59cea4ece266
--- /dev/null
+++ b/colossalai/auto_parallel/offload/base_offload_module.py
@@ -0,0 +1,109 @@
+from typing import Optional, Set
+from functools import partial
+import torch
+import torch.nn as nn
+
+from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.gemini.tensor_utils import free_storage
+
+from .region_manager import RegionManager
+from .util import GlobalRuntimeInfo
+
+
+class BaseOffloadModule:
+    """
+    BaseOffloadModule: A model wrapper for parameter offloading.
+
+    Args:
+        model (nn.Module): model to apply offloading.
+        region_manager (RegionManager): a ``RegionManager`` instance.
+        is_sync (bool): synchronous mode or not.
+    """
+
+    def __init__(self,
+                 model: nn.Module,
+                 region_manager: RegionManager,
+                 is_sync=True):
+
+        self.model = model
+        self.region_manager = region_manager
+        self.grad_hook_list = []
+        self.overflow_counter = torch.cuda.IntTensor([0])
+
+        self.grad_offload_stream = torch.cuda.current_stream() if is_sync else GlobalRuntimeInfo.d2h_stream
+
+        self._cast_buffers()
+
+    def register_grad_hook(self):
+        for p in self.model.parameters():
+            if p.requires_grad:
+                self.grad_hook_list.append(p.register_hook(partial(self.grad_handle, p)))
+
+    def remove_grad_hook(self):
+        while self.grad_hook_list:    # drain the list so stale handles do not pile up across iterations
+            self.grad_hook_list.pop().remove()
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def _pre_forward(self):
+        self.register_grad_hook()
+        for region in self.region_manager.region_list:
+            region.cpu_grad = None
+
+    def forward(self, *args, **kwargs):
+        args, kwargs = _cast_float(args, torch.half), _cast_float(kwargs, torch.half)
+        self.model.zero_grad(set_to_none=True)
+        self._pre_forward()
+        outputs = self.model(*args, **kwargs)
+        return outputs
+
+    def backward(self, loss):
+        loss.backward()
+        self._post_backward()
+
+    def _post_backward(self):
+        torch.cuda.synchronize()
+        self.remove_grad_hook()
+
+        for p in self.model.parameters():
+            p.grad = None
+
+        GlobalRuntimeInfo.fwd_prefetch_event_map.clear()
+        GlobalRuntimeInfo.bwd_prefetch_event_map.clear()
+
+    def grad_handle(self, p, grad):
+        empty_grad = torch.empty_like(grad)
+        free_storage(empty_grad)
+        with torch._C.DisableTorchFunction():
+            region = self.region_manager.get_region(p)
+            region.copy_grad_to_region_slice(p, grad)
+            if region.can_release:
+                self.overflow_counter += region.has_inf_or_nan
+                master_stream = torch.cuda.current_stream()
+                with torch.cuda.stream(self.grad_offload_stream):
+                    GlobalRuntimeInfo.d2h_stream.wait_stream(master_stream)
+                    region.move_grad_to_cpu()
+        return empty_grad
+
+    def _cast_buffers(self):
+        for buffer in self.model.buffers():
+            buffer.data = buffer.cuda()
+
+    def parameters(self, recurse: bool = True):
+        return self.model.parameters(recurse)
+
+    def named_parameters(self, prefix: str = '', recurse: bool = True):
+        return self.model.named_parameters(prefix, recurse)
+
+    def named_buffers(self, prefix: str = '', recurse: bool = True):
+        return self.model.named_buffers(prefix, recurse)
+
+    def named_children(self):
+        return self.model.named_children()
+
+    def named_modules(self,
+                      memo: Optional[Set[torch.nn.Module]] = None,
+                      prefix: str = '',
+                      remove_duplicate: bool = True):
+        return self.model.named_modules(memo, prefix, remove_duplicate)
diff --git a/colossalai/auto_parallel/offload/mem_optimize.py b/colossalai/auto_parallel/offload/mem_optimize.py
new file mode 100644
index 000000000000..02778696a106
--- /dev/null
+++ b/colossalai/auto_parallel/offload/mem_optimize.py
@@ -0,0 +1,49 @@
+from typing import Dict
+import torch
+import torch.fx
+from torch.fx import GraphModule
+from torch.utils._pytree import tree_map
+
+from colossalai.fx import ColoTracer, is_compatible_with_meta
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+
+from .region_manager import RegionManager
+from .runtime import runtime_syn_offload_apply_pass, runtime_asyn_offload_apply_pass
+from .base_offload_module import BaseOffloadModule
+from .util import compute_max_param_mem, compute_total_param_mem, compute_act_peak_mem, GlobalRuntimeInfo
+
+def memory_optimize(model: torch.nn.Module,
+                    inps: Dict[str, torch.Tensor],
+                    memory_budget: float = -1.0,
+                    solver_name: str = 'asyn'):
+    """Trace ``model`` (cast to fp16 on CPU) with meta copies of ``inps``, build regions under ``memory_budget`` and return a ``BaseOffloadModule``; ``solver_name`` must be 'syn' or 'asyn', else TypeError."""
+    model = model.cpu().half()
+    tracer = ColoTracer()
+    assert is_compatible_with_meta()
+    wrap_fn = lambda x: x.to("meta") if isinstance(x, torch.Tensor) else x
+    meta_args = tree_map(wrap_fn, inps)
+    graph = tracer.trace(model, meta_args=meta_args)
+    gm = GraphModule(model, graph, model.__class__.__name__)
+    interp = MetaInfoProp(gm)
+    interp.propagate(*meta_args.values())
+    # build the regions; the resulting list is also published through GlobalRuntimeInfo
+    region_manager = RegionManager(graph, solver_name=solver_name, memory_budget=memory_budget)
+    region_manager._build_regions()
+    GlobalRuntimeInfo.region_list = region_manager.region_list
+    # report memory statistics, converted from bytes to MB
+    act_peak_mem = compute_act_peak_mem(region_manager.region_list) / 1024 ** 2
+    max_param_mem = compute_max_param_mem(region_manager.region_list) / 1024 ** 2
+    total_param_mem = compute_total_param_mem(region_manager.region_list) / 1024 ** 2
+    print(
+        f"act_peak_mem={act_peak_mem:.3f} MB | max_param_mem={max_param_mem:.3f} MB | total_param_mem={total_param_mem:.3f} MB")
+    # apply the runtime pass that matches the chosen solver
+    if solver_name == 'syn':
+        gm = runtime_syn_offload_apply_pass(gm, region_manager.region_list)
+    elif solver_name == 'asyn':
+        gm = runtime_asyn_offload_apply_pass(gm, region_manager.region_list)
+    else:
+        raise TypeError(f"Unknown solver name {solver_name}!")
+
+    gm.recompile()
+    optimized_model = BaseOffloadModule(gm, region_manager, solver_name=='syn')
+    return optimized_model
diff --git a/colossalai/auto_parallel/offload/region.py b/colossalai/auto_parallel/offload/region.py
new file mode 100644
index 000000000000..e6907cc4b81d
--- /dev/null
+++ b/colossalai/auto_parallel/offload/region.py
@@ -0,0 +1,144 @@
+from typing import List, Dict, Tuple
+import torch
+from torch.fx import Node
+from colossalai.gemini.tensor_utils import alloc_storage, free_storage
+
+class Region:
+    """
+    Region: A container owning a piece of contiguous nodes in the DNN computing graph.
+
+    Args:
+        r_id (int): the index of the region in the computing graph.
+    """
+
+    def __init__(self, r_id: int = 0) -> None:
+        self.r_id: int = r_id
+        self.fp16_params: List[torch.nn.Parameter] = []
+        self.param_size: int = 0
+        self.shared_rid: int = self.r_id
+
+        self.param_num: int = 0
+        self.grad_num: int = 0
+        self.fp16_data = None
+        self.fp32_data = None
+        self.cpu_grad = None
+        self.temp_fp32_data = None
+        self.param_to_range: Dict[torch.nn.Parameter, Tuple[int, int]] = dict()
+
+        self.need_offload: bool = False
+        self.is_syn: bool = False
+        self.nodes: List[Node] = []
+        self.fwd_prefetch_region = None
+        self.bwd_prefetch_region = None
+
+        self.in_mem_pool_flag: bool = False
+
+    @property
+    def can_release(self) -> bool:
+        """
+        Check if the region can be released, i.e. ``grad_num`` has reached ``param_num``.
+        """
+        return self.grad_num == self.param_num
+
+    @property
+    def has_inf_or_nan(self) -> bool:
+        """Check whether the region's own slice of ``fp16_data`` holds inf or nan values.
+        Only the first ``param_num`` elements are inspected, because ``fp16_data`` may be a pre-allocated buffer.
+        The result is a 0-dim bool tensor rather than a Python bool."""
+        return ~torch.isfinite(self.fp16_data[:self.param_num]).all()
+
+    def init_param_data(self, pre_alloc_tensor: torch.Tensor = None):
+        """
+        Map the parameters in the region to a contiguous memory space.
+        """
+
+        self.fp16_data = torch.zeros(
+            self.param_num, dtype=torch.half, device='cuda')
+        offset = 0
+        for param in self.fp16_params:
+            param.data = param.data.cuda()
+            p_num = param.data.numel()
+            self.fp16_data[offset:offset + p_num].copy_(param.data.flatten())
+            param.data = self.fp16_data[offset:offset +
+                                               p_num].view(param.data.shape)
+            self.param_to_range[param] = (offset, offset + p_num)
+            offset += p_num
+
+        self.fp32_data = self.fp16_data.float().cpu().pin_memory()
+        free_storage(self.fp16_data)
+        if self.in_mem_pool_flag and pre_alloc_tensor is not None:
+            self.fp16_data = pre_alloc_tensor
+
+    def move_param_to_cuda(self):
+        """
+        Move parameters from CPU to GPU.
+        It first moves float32 parameters to GPU and
+        then transforms float32 parameters to half-precision on the GPU.
+        The reason is that the performance of precision conversion on the CPU
+        is much slower than the data transfer overhead.
+        """
+
+        self.temp_fp32_data.copy_(self.fp32_data, non_blocking=True)
+        self.temp_fp32_data.record_stream(torch.cuda.current_stream())
+        if not self.in_mem_pool_flag:
+            alloc_storage(self.fp16_data)
+        self.fp16_data[:self.param_num].copy_(self.temp_fp32_data)
+        self.fp16_data.record_stream(torch.cuda.current_stream())
+
+        self.__update_params_ptr()
+
+    def move_grad_to_cpu(self):
+        """
+        Move gradients from GPU to CPU.
+        """
+
+        self.cpu_grad = torch.empty(self.param_num, dtype=torch.half, pin_memory=True)
+        self.cpu_grad.copy_(self.fp16_data[:self.param_num], non_blocking=True)
+        self.fp16_data.record_stream(torch.cuda.current_stream())
+        if not self.in_mem_pool_flag:
+            self.free_cuda_data()
+
+        self.grad_num = 0
+
+    def free_cuda_data(self):
+        free_storage(self.fp16_data)
+
+        # torch.cuda.empty_cache()
+
+    def copy_grad_to_region_slice(self, param: torch.nn.Parameter, data_slice: torch.Tensor) -> None:
+        """
+        Copy a data slice into the part of ``fp16_data`` that ``param_to_range`` assigns to the given parameter.
+
+        Args:
+            param (torch.nn.Parameter): the param used to retrieve its (begin, end) range in the region
+            data_slice (torch.Tensor): the tensor to be copied to the region
+        """
+
+        begin, end = self.param_to_range[param]
+        self.fp16_data[begin:end].copy_(data_slice.data.flatten())
+        param.data = self.fp16_data[begin:end].view(param.data.shape)    # re-point the param at the region buffer
+
+        self.grad_num += data_slice.numel()    # ``can_release`` compares this counter with ``param_num``
+
+    def split(self, cut_node_idx: int, cut_param_idx: int):
+        """
+        Split the region into two and return the latter.
+        """
+        new_reg = Region(r_id=self.r_id + 1)
+        new_reg.nodes = self.nodes[cut_node_idx:]
+        new_reg.fp16_params = self.fp16_params[cut_param_idx:]
+        for p in new_reg.fp16_params:
+            new_reg.param_size += p.data.numel() * p.data.element_size()
+            new_reg.param_num += p.data.numel()
+
+        self.nodes = self.nodes[:cut_node_idx]
+        self.fp16_params = self.fp16_params[:cut_param_idx]
+        self.param_size -= new_reg.param_size
+        self.param_num -= new_reg.param_num
+
+        return new_reg
+
+    def __update_params_ptr(self) -> None:
+        for param in self.fp16_params:
+            begin, end = self.param_to_range[param]
+            param.data = self.fp16_data[begin:end].view(param.data.shape)
\ No newline at end of file
diff --git a/colossalai/auto_parallel/offload/region_manager.py b/colossalai/auto_parallel/offload/region_manager.py
new file mode 100644
index 000000000000..30bfaf00d493
--- /dev/null
+++ b/colossalai/auto_parallel/offload/region_manager.py
@@ -0,0 +1,526 @@
+from typing import List, Any, Dict, Tuple
+import torch
+from torch.fx import Graph, Node
+
+from .solver import SolverFactory
+from .training_simulator import TrainingSimulator
+from .region import Region
+from .util import NodeInfo
+
+
+class RegionManager:
+    """
+    RegionManager is used to construct and manage the offload plan for the model execution.
+
+    Args:
+        graph (Graph): a Graph object used for analysis and strategy generation.
+        solver_name (str): a solver name which specifies the preferences for plan searching.
+        memory_budget (float): the given memory budget.
+        cnode (List[str], optional): Common node List, should be the subset of input.
+    """
+
+    def __init__(self,
+                 graph: Graph,
+                 solver_name: str = 'asyn',
+                 memory_budget: float = -1.0,
+                 cnode: List[str] = None):
+
+        self.graph = graph
+        assert graph.owning_module is not None, 'The given graph is not associated with a owning_module'
+        self.root_module = self.graph.owning_module
+        self.nodes = list(graph.nodes)
+        self.cnode = cnode
+        self.only_param_ops = []
+        self.param_region_map: Dict[torch.nn.Parameter, Region] = dict()
+        self.shared_region_pairs: List[Tuple[Region, Region]] = list()
+        self.region_list: List[Region] = list()
+        self.rid_in_pool: List[int] = list()
+        self.mem_block_size: int = 0
+        self.memory_budget = memory_budget
+
+        self.solver_name = solver_name
+        self.require_pool: bool = solver_name == 'asyn'
+
+        self.reg_to_block: Dict[int, int] = dict()
+
+    def _build_regions(self):
+        """
+        1. Pre-processing, mainly contains linearized computing graph and
+            merge smaller regions into larger ones.
+        2. Construct a solver to search for an efficient offload strategy.
+        3. Post-processing, mainly contains early region placement if using asynchronous mode,
+            and initialize region data.
+        """
+
+        self._pre_process()
+
+        solver_cls = SolverFactory.create(self.solver_name)
+        solver = solver_cls(self.region_list, self.memory_budget)
+        solver._call_solver()
+
+        self._post_process(solver.best_ts)
+
+    def _pre_process(self):
+
+        init_region_list = self._linearize_graph()
+
+        if len(self.shared_region_pairs) > 1:
+            raise NotImplementedError(
+                'The current version only considers at most one pair of parameter sharing.')
+
+        elif len(self.shared_region_pairs) == 1:
+            shared_regs = self.shared_region_pairs[0]
+            assert shared_regs[0].shared_rid == shared_regs[1].r_id \
+                   and shared_regs[1].shared_rid == shared_regs[0].r_id
+            fst_id = shared_regs[0].r_id
+            lst_id = shared_regs[1].r_id
+            regs_left_out = init_region_list[:fst_id + 1]
+            regs_right_out = init_region_list[lst_id:]
+            hold_regs = init_region_list[fst_id + 1:lst_id]
+        else:
+            regs_left_out = []
+            regs_right_out = []
+            hold_regs = init_region_list
+
+        self.mem_block_size = self._search_block_size(hold_regs)
+        hold_regs = self._merge_small_regions(hold_regs)
+
+        if self.require_pool:
+            for reg in hold_regs:
+                reg.in_mem_pool_flag = True
+                self.rid_in_pool.append(reg.r_id)
+
+        self.region_list.extend(regs_left_out)
+        self.region_list.extend(hold_regs)
+
+        for reg in regs_right_out:
+            reg.r_id = self.region_list[-1].r_id + 1
+            self.region_list[reg.shared_rid].shared_rid = reg.r_id
+            self.region_list.append(reg)
+
+        self._process_shared_region()
+
+        self.max_param_num = max([reg.param_num for reg in self.region_list])
+        self.memory_budget -= self.max_param_num * torch.tensor([], dtype=torch.float32).element_size()
+
+    def _post_process(self, ts: TrainingSimulator = None):
+        if self.require_pool:
+            self._early_region_placement(ts)
+        self._init_region_data()
+
+    def _early_region_placement(self, ts: TrainingSimulator):
+        """
+        Implemented the early region placement strategy to avoid GPU memory fragmentation.
+        It maps all region data into a contiguous memory space and
+        reuses the same memory space for regions that do not coexist.
+
+        Args:
+            ts (TrainingSimulator): the best training simulator, which records region execution flow.
+
+        Raises:
+            NotImplementedError: due to the naive implementation,
+                it may not find a suitable region placement strategy for the given execution flow.
+        """
+
+        reg_flow = torch.cat(
+            [ts.fwd_reg_flow, ts.bwd_reg_flow], dim=0)
+        mem_block_num = torch.max(
+            torch.sum(reg_flow[:, self.rid_in_pool], dim=1))
+        coexist_matrix = torch.logical_or(
+            ts.fwd_reg_flow, ts.bwd_reg_flow)
+
+        block_to_regs = {}
+        for block_idx in range(mem_block_num):
+            block_to_regs[block_idx] = []
+        for reg in self.region_list:
+            if reg.r_id in self.rid_in_pool:
+                cur_reg_appears = coexist_matrix[:, reg.r_id]
+                cur_reg_coexists = torch.sum(
+                    coexist_matrix[cur_reg_appears], dim=0).bool()
+                for block_idx in range(mem_block_num):
+                    if not any(cur_reg_coexists[block_to_regs[block_idx]]):
+                        block_to_regs[block_idx].append(reg.r_id)
+                        self.reg_to_block[reg.r_id] = block_idx
+                        break
+
+                if reg.r_id not in self.reg_to_block:
+                    raise NotImplementedError(
+                        f'can not find a block from the memory pool to store parameters of the region')
+        self.memory_pool = torch.chunk(torch.zeros(int(
+            mem_block_num * self.mem_block_size / 2), dtype=torch.half, device='cuda'), chunks=int(mem_block_num))
+
+    def _merge_small_regions(self, orig_reg_list: List[Region]) -> List[Region]:
+        """
+        Merge smaller regions into larger ones for better bandwidth utilization and easier management.
+        It is inspired by Gemini.
+
+        Args:
+            orig_reg_list (List[Region]): original region list.
+
+        Returns:
+            List[Region]: region list after merging.
+        """
+
+        r_id = orig_reg_list[0].r_id
+        region = Region(r_id=r_id)
+        region_list = [region]
+
+        for orig_reg in orig_reg_list:
+            if region_list[-1].param_size + orig_reg.param_size > self.mem_block_size:
+                r_id += 1
+                region = Region(r_id=r_id)
+                region_list.append(region)
+            region.param_size += orig_reg.param_size
+            region.param_num += orig_reg.param_num
+            region.nodes.extend(orig_reg.nodes)
+            region.fp16_params.extend(orig_reg.fp16_params)
+            self.__update_param_region_map(orig_reg.fp16_params, region)
+
+        return region_list
+
+    def _search_block_size(self,
+                           region_list: List[Region],
+                           search_interval_byte: int = 1024,
+                           search_range_byte: int = 128 * 1024 ** 2) -> int:
+        """
+        Search for the memory block size that wastes the least space when regions are packed in order.
+
+        Args:
+            region_list (List[Region]): region list; regions whose ``shared_rid`` differs from ``r_id`` are ignored.
+            search_interval_byte (int): searching interval in byte.
+            search_range_byte (int): searching range in byte, counted from the largest region size.
+
+        Returns:
+            int: the best memory block size (the smallest one among equally wasteful candidates).
+        """
+
+        def _get_wasted_mem(size_list: List[int], blk_size: int):
+            """
+            Get wasted byte for a certain block size, filling blocks greedily in list order.
+            """
+            acc_wasted = 0
+            left = 0    # bytes already occupied in the block currently being filled
+            for s in size_list:
+                if left + s > blk_size:
+                    acc_wasted += blk_size - left
+                    left = 0    # open a fresh block; ``s`` is added exactly once below
+                left += s
+            acc_wasted += blk_size - left
+            return acc_wasted
+
+        param_size_list = [
+            region.param_size for region in region_list if region.r_id == region.shared_rid]
+
+        start_size = max(param_size_list)
+        min_mem_waste = float('+inf')
+        best_block_size = start_size
+
+        for block_size in range(start_size, start_size + search_range_byte + 1, search_interval_byte):
+            # strict ``<`` below keeps the smallest block size among ties
+            temp_waste = _get_wasted_mem(param_size_list, block_size)
+            if temp_waste < min_mem_waste:
+                min_mem_waste = temp_waste
+                best_block_size = block_size
+
+        return best_block_size
+
+    def _init_region_data(self):
+        """
+        Initialize region data, which maps the parameters in the region to a contiguous memory space.
+        """
+
+        self.temp_fp32_data = torch.zeros(self.max_param_num, device='cuda', dtype=torch.float32)
+
+        for region in self.region_list:
+            pre_alloc_tensor = None
+            if self.require_pool and region.r_id in self.rid_in_pool:
+                block_idx = self.reg_to_block[region.r_id]
+                pre_alloc_tensor = self.memory_pool[block_idx]
+
+            if region.r_id <= region.shared_rid:
+                region.init_param_data(pre_alloc_tensor)
+            else:
+                shared_region = self.region_list[region.shared_rid]
+                region.fp16_data = shared_region.fp16_data
+                region.fp32_data = shared_region.fp32_data
+                region.param_to_range = shared_region.param_to_range
+            region.temp_fp32_data = self.temp_fp32_data[:region.param_num].detach(
+            )
+
+        torch.cuda.empty_cache()
+
+    def _process_shared_region(self):
+        """
+        Special processing for the shared region, which uses GPT2 and Bert case as a priori knowledge.
+        """
+
+        if len(self.shared_region_pairs):
+            assert len(self.shared_region_pairs) <= 1
+            former_reg, latter_reg = self.shared_region_pairs[0]
+            assert latter_reg.param_num >= former_reg.param_num
+            embedding_node = former_reg.nodes[-1]
+            assert embedding_node.op == 'call_module' and isinstance(
+                self.root_module.get_submodule(embedding_node.target), torch.nn.Embedding)
+            if latter_reg.param_num > former_reg.param_num:
+                for idx, n in enumerate(latter_reg.nodes):
+                    if (n.op == 'call_module' and isinstance(self.root_module.get_submodule(n.target),
+                                                             torch.nn.Linear)) or \
+                            (n.op == 'call_function' and n.target is torch.nn.functional.linear):
+                        cut_node_idx = idx + 1
+                        break
+                assert len(latter_reg.fp16_params) == 2
+                new_reg = latter_reg.split(cut_node_idx, 1)
+                for p in new_reg.fp16_params:
+                    self.param_region_map[p] = new_reg
+                self.region_list.insert(new_reg.r_id, new_reg)
+                for reg in self.region_list[new_reg.r_id + 1:]:
+                    reg.r_id += 1
+            latter_reg.shared_rid = former_reg.r_id
+            former_reg.shared_rid = latter_reg.r_id
+
+    def _linearize_graph(self) -> List[Region]:
+        """Linearizing the graph
+
+        Args:
+            graph (Graph): The computing graph to be optimized.
+
+        Returns:
+            List[Region]: each region contains the actual 'node' in linearized manner.
+
+        Remarks:
+            Do merge the inplace ops and shape-consistency ops into the previous node.
+        """
+
+        # List of target name that could be seen as common node
+        common_ops = ["getattr", "getitem", "size"]
+
+        def _is_cop(target: Any) -> bool:
+            """Check if an op could be seen as common node
+
+            Args:
+                target (Any): node target
+
+            Returns:
+                bool
+            """
+
+            if isinstance(target, str):
+                return target in common_ops
+            else:
+                return target.__name__ in common_ops
+
+        def _is_act(data: Any) -> bool:
+            """Check if an op could be seen as parameter computation start
+
+            Args:
+                data (Any): meta_data
+
+            Returns:
+                bool
+            """
+
+            label = False
+            if isinstance(data, torch.Tensor):
+                return True
+            elif isinstance(data, (tuple, list)):
+                for d in data:
+                    label = label or _is_act(d)
+            return label
+
+        def _maybe_param_comp_start() -> bool:
+            """Check if an op could be seen as parameter computation start
+
+            Args:
+                n (Node): node
+
+            Returns:
+                bool
+            """
+
+            label = False
+            if n.op == "get_attr":
+                label = True
+            elif n.op == "call_module":
+                target = n.target
+                submod = self.root_module.get_submodule(target)
+                if (
+                        len(list(submod.named_parameters(recurse=False))) != 0
+                        or len(list(submod.named_buffers(recurse=False))) != 0
+                ):
+                    label = True
+
+            return label and not sum([v for _, v in param_op_deps.items()])
+
+        def _is_param_comp_end() -> bool:
+            """Check if an op could be seen as parameter computation end
+
+            Args:
+                n (Node): node
+
+            Returns:
+                bool
+            """
+
+            def _is_inplace(n: Node):
+                """Get the inplace argument from ``torch.fx.Node``
+                """
+                inplace = False
+                if n.op == "call_function":
+                    inplace = n.kwargs.get("inplace", False)
+                elif n.op == "call_module":
+                    inplace = getattr(n.graph.owning_module.get_submodule(
+                        n.target), "inplace", False)
+                return inplace
+
+            label = False
+
+            if n.op == "call_module":
+                target = n.target
+                submod = self.root_module.get_submodule(target)
+                if (
+                        len(list(submod.named_parameters(recurse=False))) != 0
+                        or len(list(submod.named_buffers(recurse=False))) != 0
+                ):
+                    label = True
+
+            elif n.op == "call_function":
+                label = any(map(lambda x: x.name in self.only_param_ops, n.all_input_nodes)) and any(
+                    map(lambda x: x.name not in self.only_param_ops and not _is_cop(n.target), n.all_input_nodes))
+
+            return label and not sum([v for _, v in param_op_deps.items()]) and not any(map(_is_inplace, n.users))
+
+        def _exception_node_handling():
+            # TODO meta info prop bug
+            if n.name.__contains__("transpose") and n.meta['fwd_out'][0].dim() <= 2:
+                n.meta['fwd_out'] = []
+
+        # make sure that item in cnode is valid
+        if self.cnode:
+            for name in self.cnode:
+                try:
+                    assert next(node for node in self.graph.nodes if node.name == name).op == "placeholder", \
+                        f"Common node {name} is not an input of the model."
+                except StopIteration:
+                    raise ValueError(f"Common node name {name} not in graph.")
+        else:
+            self.cnode = []
+
+        node_id = 0
+        region_id = 0
+
+        param_op_deps = {}
+
+        deps = {}
+        region_list = []
+        region = Region(r_id=region_id)
+
+        act_n = None
+
+        for n in self.graph.nodes:
+            if n.op != "placeholder" and n.op != "output":
+                for n_par in n.all_input_nodes:
+                    if n_par.op != "placeholder" and n_par.name not in self.cnode:
+                        deps[n_par] -= 1
+                    if n_par.op != "placeholder" and n_par.name in self.only_param_ops:
+                        param_op_deps[n_par] -= 1
+
+                if act_n in region.nodes and _maybe_param_comp_start():
+                    ns = []
+                    border_n_idx = region.nodes.index(act_n)
+                    if border_n_idx < len(region.nodes):
+                        ns = region.nodes[border_n_idx + 1:]
+                        region.nodes = region.nodes[:border_n_idx + 1]
+                    region_list.append(region)
+                    region_id += 1
+                    region = Region(r_id=region_id)
+                    region.nodes = ns
+
+                _exception_node_handling()
+                region.nodes.append(n)
+                self._set_node_and_region_info(node_id, n, region)
+                node_id += 1
+
+                # if the node could free all dependencies in graph
+                # we could begin a new region
+                if _is_param_comp_end():
+                    region_list.append(region)
+                    region_id += 1
+                    region = Region(r_id=region_id)
+
+                # propagate common node attr if possible
+                if len(n.all_input_nodes) == len([node for node in n.all_input_nodes if node.name in self.cnode
+                                                  ]) or _is_cop(n.target):
+                    self.cnode.append(n.name)
+                else:
+                    deps[n] = len(
+                        [user for user in n.users if user.op != "output"])
+
+                # propagate param node attr if possible
+                if len(n.all_input_nodes) == len([node for node in n.all_input_nodes if node.name in self.only_param_ops
+                                                  ]) or n.op == "get_attr":
+                    self.only_param_ops.append(n.name)
+                    param_op_deps[n] = len(
+                        [user for user in n.users if user.op != "output"])
+
+                # record last activation node
+                if _is_act(n._meta_data):
+                    act_n = n
+
+        if len(region.nodes):
+            region_list.append(region)
+
+        return region_list
+
+    def _set_node_and_region_info(self, node_id: int, cur_n: Node, cur_reg: Region):
+
+        cur_n.node_info = NodeInfo(node_id)
+
+        if cur_n.op == 'call_module':
+            target = cur_n.target
+            submod = self.root_module.get_submodule(target)
+            for p in list(submod.parameters(recurse=False)):
+
+                if p in self.param_region_map:
+                    cur_reg.shared_rid = self.param_region_map[p].r_id
+                    self.param_region_map[p].shared_rid = cur_reg.r_id
+                    self.shared_region_pairs.append(
+                        (self.param_region_map[p], cur_reg))
+                else:
+                    self.param_region_map[p] = cur_reg
+
+                cur_reg.fp16_params.append(p)
+                cur_reg.param_num += p.data.numel()
+                cur_reg.param_size += p.data.numel() * p.data.element_size()
+
+        elif cur_n.op == "get_attr":
+            attr_itr = self.root_module
+            atoms = cur_n.target.split(".")
+            for atom in atoms:
+                attr_itr = getattr(attr_itr, atom)
+
+            if isinstance(attr_itr, torch.nn.Parameter):
+
+                if attr_itr in self.param_region_map:
+                    cur_reg.shared_rid = self.param_region_map[attr_itr].r_id
+                    self.param_region_map[attr_itr].shared_rid = cur_reg.r_id
+                    self.shared_region_pairs.append(
+                        (self.param_region_map[attr_itr], cur_reg))
+                else:
+                    self.param_region_map[attr_itr] = cur_reg
+
+                cur_reg.fp16_params.append(attr_itr)
+                cur_reg.param_num += attr_itr.data.numel()
+                cur_reg.param_size += attr_itr.data.numel() * attr_itr.data.element_size()
+
+    def get_region(self, param: torch.nn.Parameter) -> Region:
+        """
+        Return the region owning the parameter.
+
+        Args:
+            param (torch.nn.Parameter): a torch parameter object
+        """
+        return self.param_region_map[param]
+
+    def __update_param_region_map(self, params: List[torch.nn.Parameter], region: Region):
+        for p in params:
+            self.param_region_map[p] = region
diff --git a/colossalai/auto_parallel/offload/runtime.py b/colossalai/auto_parallel/offload/runtime.py
new file mode 100644
index 000000000000..91c7945bd65f
--- /dev/null
+++ b/colossalai/auto_parallel/offload/runtime.py
@@ -0,0 +1,253 @@
+from typing import List
+import torch
+from torch.fx.node import Node
+
+from .region import Region
+from .util import GlobalRuntimeInfo, requires_upload_p_in_fwd
+
+
+class SynPreFwdPostBwdOP(torch.autograd.Function):
+    """
+    Identity autograd op whose side effect is to free / upload region parameters synchronously.
+
+    Args:
+        input_: input tensor, returned unchanged by forward.
+        fwd_info: dict read in forward; ``'d2h_rid'`` names the region whose CUDA data
+            is freed and ``'h2d_rid'`` the region whose parameters are moved to CUDA.
+        bwd_info: dict read in backward; ``'h2d_rid'`` names the region whose
+            parameters are moved to CUDA before the gradient is passed through.
+    """
+
+    @staticmethod
+    def forward(ctx, input_, fwd_info, bwd_info):
+        ctx.bwd_info = bwd_info
+        # free the region scheduled for offload first ...
+        rid_to_free = fwd_info.get('d2h_rid')
+        if rid_to_free is not None:
+            region_to_free = GlobalRuntimeInfo.region_list[rid_to_free]
+            assert isinstance(region_to_free, Region)
+            region_to_free.free_cuda_data()
+
+        # ... then upload the region scheduled for this point
+        rid_to_upload = fwd_info.get('h2d_rid')
+        if rid_to_upload is not None:
+            region_to_upload = GlobalRuntimeInfo.region_list[rid_to_upload]
+            assert isinstance(region_to_upload, Region)
+            region_to_upload.move_param_to_cuda()
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        rid_to_upload = ctx.bwd_info.get('h2d_rid')
+        if rid_to_upload is not None:
+            region_to_upload = GlobalRuntimeInfo.region_list[rid_to_upload]
+            assert isinstance(region_to_upload, Region)
+            region_to_upload.move_param_to_cuda()
+        # the gradient of input_ is passed through; the two info dicts get no gradient
+        return grad_output, None, None
+
+
+class AsynPreFwdPostBwdOP(torch.autograd.Function):
+    """
+    A customized prefetch and offload operation.
+
+    Args:
+        input_: input tensor.
+        fwd_info: information dict, which contains region indices
+            that need to be prefetched, waited, or freed during forward pass.
+        bwd_info: information dict, which contains region indices
+            that need to be prefetched or waited during backward pass.
+    """
+
+    @staticmethod
+    def forward(ctx, input_, fwd_info, bwd_info):
+        ctx.bwd_info = bwd_info
+        # wait for the forward prefetch of region 'sync_rid', if an event was recorded for it
+        sync_rid = fwd_info.get('sync_rid', None)
+        if sync_rid is not None:
+            prefetch_event = GlobalRuntimeInfo.fwd_prefetch_event_map.get(
+                sync_rid, None)
+            if prefetch_event:
+                prefetch_event.wait()
+        # start moving region 'h2d_rid' to CUDA on the dedicated h2d stream
+        h2d_rid = fwd_info.get('h2d_rid', None)
+        if h2d_rid is not None:
+            pref_region = GlobalRuntimeInfo.region_list[h2d_rid]
+            assert isinstance(pref_region, Region)
+            master_stream = torch.cuda.current_stream()
+            with torch.cuda.stream(GlobalRuntimeInfo.h2d_stream):
+                GlobalRuntimeInfo.h2d_stream.wait_stream(master_stream)
+                pref_region.move_param_to_cuda()
+            # record an event on the h2d stream so that a later forward call can wait for this copy
+            prefetch_event = torch.cuda.Event()
+            prefetch_event.record(GlobalRuntimeInfo.h2d_stream)
+            GlobalRuntimeInfo.fwd_prefetch_event_map[h2d_rid] = prefetch_event
+        # NOTE(review): a 'd2h_rid' entry in fwd_info is not read by this op - confirm it is handled elsewhere
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # region 'sync_rid' is needed now: wait for its backward prefetch event, else upload it directly
+        sync_rid = ctx.bwd_info.get('sync_rid', None)
+        if sync_rid is not None:
+            wait_region = GlobalRuntimeInfo.region_list[sync_rid]
+            assert isinstance(wait_region, Region)
+            prefetch_event = GlobalRuntimeInfo.bwd_prefetch_event_map.get(
+                sync_rid, None)
+            if prefetch_event:
+                prefetch_event.wait()
+            else:
+                wait_region.move_param_to_cuda()
+        # start moving region 'h2d_rid' to CUDA on the dedicated h2d stream
+        h2d_rid = ctx.bwd_info.get('h2d_rid', None)
+        if h2d_rid is not None:
+            pref_region = GlobalRuntimeInfo.region_list[h2d_rid]
+            assert isinstance(pref_region, Region)
+            master_stream = torch.cuda.current_stream()
+            with torch.cuda.stream(GlobalRuntimeInfo.h2d_stream):
+                GlobalRuntimeInfo.h2d_stream.wait_stream(master_stream)
+                pref_region.move_param_to_cuda()
+            # record an event on the h2d stream so that a later backward call can wait for this copy
+            prefetch_event = torch.cuda.Event()
+            prefetch_event.record(GlobalRuntimeInfo.h2d_stream)
+            GlobalRuntimeInfo.bwd_prefetch_event_map[h2d_rid] = prefetch_event
+        return grad_output, None, None
+
+
+def convert_fwd_upload_bwd_offload_to_action(tensor, fwd_info, bwd_info):
+    """
+    Wrap ``tensor`` in the synchronous upload/offload runtime action.
+
+    Args:
+        tensor(torch.Tensor): input tensor.
+        fwd_info(dict): region indices to upload or free during the forward pass.
+        bwd_info(dict): region indices to upload during the backward pass.
+
+    Returns:
+        The result of ``SynPreFwdPostBwdOP.apply(tensor, fwd_info, bwd_info)``.
+    """
+    with torch._C.DisableTorchFunction():
+        return SynPreFwdPostBwdOP.apply(tensor, fwd_info, bwd_info)
+
+def convert_fwd_prefetch_bwd_offload_to_action(tensor, fwd_info, bwd_info):
+    """
+    Wrap ``tensor`` in the asynchronous prefetch/offload runtime action.
+
+    Args:
+        tensor(torch.Tensor): input tensor.
+        fwd_info(dict): region indices to prefetch, wait for, or free during the forward pass.
+        bwd_info(dict): region indices to prefetch or wait for during the backward pass.
+
+    Returns:
+        The result of ``AsynPreFwdPostBwdOP.apply(tensor, fwd_info, bwd_info)``.
+    """
+    with torch._C.DisableTorchFunction():
+        return AsynPreFwdPostBwdOP.apply(tensor, fwd_info, bwd_info)
+
+
+def replace_node_users(orig_node: Node, inserted_node: Node, rep_user_nodes: List[Node] = None):
+    """
+    Redirect users of ``orig_node`` so that they consume ``inserted_node`` instead.
+
+    Args:
+        orig_node (Node): the node whose uses are rewritten.
+        inserted_node (Node): the replacement input; it is skipped if it is itself a user.
+        rep_user_nodes (List[Node], optional): users to rewrite; defaults to all users of ``orig_node``.
+    """
+    # snapshot the users first: rewriting a user's inputs mutates ``orig_node.users``
+    user_list = list(orig_node.users.keys()) if rep_user_nodes is None else rep_user_nodes
+    for user in user_list:
+        if user is inserted_node:
+            # the inserted node must keep reading from the original node
+            continue
+        # Node.replace_input_with rewrites every positional and keyword use, including nested
+        # ones; the former lookup by str(orig_node) never matched kwargs keyed by parameter name
+        user.replace_input_with(orig_node, inserted_node)
+
+
+def runtime_syn_offload_apply_pass(gm: torch.fx.GraphModule, region_list: List[Region]):
+    """
+    Insert the synchronous upload/offload action nodes into the graph of ``gm``.
+
+    For each region at most one ``convert_fwd_upload_bwd_offload_to_action`` call is inserted after
+    the last node of the preceding region (after the first graph node for the first region).
+    """
+    graph = gm.graph
+    anchor = tuple(graph.nodes)[0]
+
+    for idx, region in enumerate(region_list):
+        prev_region = region_list[idx - 1] if idx > 0 else None
+        offload_prev = prev_region is not None and prev_region.need_offload
+        # forward: upload this region's parameters, free the previous region
+        fwd_info = {}
+        if requires_upload_p_in_fwd(region_list[region.shared_rid]):
+            fwd_info['h2d_rid'] = region.r_id
+        if offload_prev:
+            fwd_info['d2h_rid'] = idx - 1
+
+        # backward: upload the previous region again
+        bwd_info = {'h2d_rid': prev_region.r_id} if offload_prev else {}
+
+        if fwd_info or bwd_info:
+            with graph.inserting_after(anchor):
+                action_node = graph.create_node('call_function', convert_fwd_upload_bwd_offload_to_action,
+                                                args=(anchor, fwd_info, bwd_info))
+            replace_node_users(anchor, action_node)
+        anchor = region.nodes[-1]
+
+    return gm
+
+
+def runtime_asyn_offload_apply_pass(gm: torch.fx.GraphModule, region_list: List[Region]):
+    """
+    Insert the asynchronous prefetch/offload action nodes into the graph of ``gm``.
+
+    A synchronous upload of the first region that owns parameters is inserted after the first
+    graph node; then each region may get one prefetch action, and a final action is appended
+    when the last region carries a backward prefetch.
+    """
+    graph = gm.graph
+
+    # the first region with parameters is uploaded synchronously right after the first graph node
+    anchor = tuple(graph.nodes)[0]
+    first_param_region = [r for r in region_list if r.param_size][0]
+    with graph.inserting_after(anchor):
+        upload_node = graph.create_node('call_function', convert_fwd_upload_bwd_offload_to_action,
+                                        args=(anchor, {"h2d_rid": first_param_region.r_id}, {}))
+    replace_node_users(anchor, upload_node)
+    anchor = upload_node
+
+    for idx, region in enumerate(region_list):
+        prev_region = region_list[idx - 1] if idx > 0 else None
+        # forward: wait for this region, prefetch another one, free the previous one
+        fwd_info = {}
+        if region.param_size:
+            fwd_info['sync_rid'] = region.r_id
+        to_prefetch = region.fwd_prefetch_region
+        if to_prefetch and requires_upload_p_in_fwd(region_list[to_prefetch.shared_rid]):
+            fwd_info['h2d_rid'] = to_prefetch.r_id
+        if prev_region is not None and prev_region.need_offload:
+            fwd_info['d2h_rid'] = idx - 1
+
+        # backward: wait for the previous region and start the prefetch it hosts
+        bwd_info = {}
+        if prev_region is not None and prev_region.need_offload:
+            bwd_info['sync_rid'] = idx - 1
+        if prev_region is not None and prev_region.bwd_prefetch_region:
+            bwd_info['h2d_rid'] = prev_region.bwd_prefetch_region.r_id
+
+        if fwd_info or bwd_info:
+            with graph.inserting_after(anchor):
+                action_node = graph.create_node('call_function', convert_fwd_prefetch_bwd_offload_to_action,
+                                                args=(anchor, fwd_info, bwd_info))
+            replace_node_users(anchor, action_node)
+        anchor = region.nodes[-1]
+
+    # the backward prefetch hosted by the last region gets its own trailing action node
+    last_region = region_list[-1]
+    if last_region.bwd_prefetch_region:
+        with graph.inserting_after(anchor):
+            tail_node = graph.create_node('call_function', convert_fwd_prefetch_bwd_offload_to_action,
+                                          args=(anchor, {}, {'h2d_rid': last_region.bwd_prefetch_region.r_id}))
+        replace_node_users(anchor, tail_node)
+    return gm
diff --git a/colossalai/auto_parallel/offload/solver.py b/colossalai/auto_parallel/offload/solver.py
new file mode 100644
index 000000000000..161f7ff86898
--- /dev/null
+++ b/colossalai/auto_parallel/offload/solver.py
@@ -0,0 +1,523 @@
+import time
+from typing import List, Dict, Type
+from abc import ABC, abstractmethod
+
+NOT_NVML = False
+try:
+    from pynvml import *
+except ImportError:    # only a missing pynvml is tolerated; any other failure should surface
+    NOT_NVML = True
+
+import torch
+from torch.fx.node import Node
+from colossalai.utils.cuda import get_current_device
+
+from .training_simulator import TrainingSimulator, SynTrainingSimulator, AsynTrainingSimulator
+from .region import Region
+from .util import NodeInfo, NvDevicePower
+
+
+def benchmark_func(func, number=1, repeat=1, warmup=3):
+    """
+    Return the mean wall-clock seconds per ``func()`` call.
+
+    ``warmup`` untimed calls run first; then ``repeat`` batches of ``number`` calls are timed,
+    each bracketed by ``torch.cuda.synchronize()``.
+    """
+    for _ in range(warmup):
+        func()
+    costs = []
+    for _ in range(repeat):
+        torch.cuda.synchronize()
+        # perf_counter is monotonic and high-resolution, unlike time.time()
+        begin = time.perf_counter()
+        for _ in range(number):
+            func()
+        torch.cuda.synchronize()
+        costs.append((time.perf_counter() - begin) / number)
+    return sum(costs) / len(costs)
+
+
+class Solver(ABC):
+    """
+    The parameter offload solver.
+
+    Args:
+        region_list (List[Region]): represents the linearized DNN computing graph.
+        memory_budget (float): the given memory budget.
+        error_factor (float): the error factor.
+            It is used to reduce the memory budget. Due to some errors in the estimation of peak memory and execution time.
+    """
+
+    def __init__(self,
+                 region_list: List[Region],
+                 memory_budget: float = -1.0,
+                 error_factor: float = 0.95) -> None:
+        # a budget that is not positive means "use the total memory of the current device"
+        self.region_list = region_list
+        self.error_factor: float = error_factor
+
+        if not memory_budget > 0:
+            memory_budget = torch.cuda.get_device_properties(get_current_device()).total_memory
+        # shrink the budget by the error factor to leave headroom for estimation errors
+        self.memory_budget = memory_budget * self.error_factor
+
+        # hardware characteristics: profiled link bandwidth and looked-up computing power
+        self.link_to_bandwidth: Dict[str, Dict[float, float]] = self._profile_bandwidth()
+        self.comp_power: float = self._extract_computing_power()
+
+    @abstractmethod
+    def _call_solver(self):    # subclass hook: run the search for an offload strategy
+        raise NotImplementedError
+
+    @abstractmethod
+    def _try_to_offload(self, *args):    # subclass hook: tentatively apply one offload choice
+        raise NotImplementedError
+
+    @abstractmethod
+    def _eval_one_choice(self, *args):    # subclass hook: simulate a choice and compute its profit
+        raise NotImplementedError
+
+    def _compute_offload_profit(self, total_mem_saving: float, peak_mem_saving: float, extra_cost: float):
+        """
+        Pack the savings of an offload strategy into a tuple that can be compared with others.
+
+        Args:
+            total_mem_saving (float): the total memory saving of the offload strategy.
+            peak_mem_saving (float): the peak memory saving of the offload strategy.
+            extra_cost (float): extra data transfer cost.
+
+        Returns:
+            tuple: ``(saving per unit of cost, total_mem_saving, peak_mem_saving)``; the first
+                term is ``inf`` when the cost is zero, i.e. the transfer is fully overlapped.
+        """
+        if extra_cost != 0:
+            saving_rate = total_mem_saving / extra_cost
+        else:
+            saving_rate = float('inf')
+        return (saving_rate, total_mem_saving, peak_mem_saving)
+
+    def _compare_profit(self, profit_a: tuple, profit_b: tuple) -> bool:
+        """
+        Tell whether ``profit_a`` beats ``profit_b`` under lexicographic (dictionary) order.
+
+        Args:
+            profit_a (tuple): the profit of a offload strategy.
+            profit_b (tuple): the profit of another offload strategy.
+
+        Returns:
+            bool: whether profit_a is greater than profit_b.
+        """
+        # zip stops at the shorter tuple, so only the common prefix is compared;
+        # the first differing position decides, and equal prefixes yield False
+        differing = ((lhs, rhs) for lhs, rhs in zip(profit_a, profit_b) if lhs != rhs)
+        lhs, rhs = next(differing, (0, 0))
+        return lhs > rhs
+
+    def _update_state(self, best_ts: TrainingSimulator):
+        """
+        Adopt ``best_ts`` as the current best simulation and copy its per-node memory onto the nodes.
+        """
+        # the stored simulator is the baseline that later candidate strategies are compared with
+        self.best_ts = best_ts
+        self._update_node_mem_info(best_ts.fwd_node_mem, best_ts.bwd_node_mem)
+
+    def _update_node_mem_info(self,
+                              fwd_mem_info: Dict[Node, float],
+                              bwd_mem_info: Dict[Node, float]):
+        """
+        Write the simulated runtime memory back onto each node's ``node_info``.
+
+        Args:
+            fwd_mem_info (Dict[Node, float]): the runtime memory of each node in forward pass.
+            bwd_mem_info (Dict[Node, float]): the runtime memory of each node in backward pass.
+        """
+        # forward entries are written first, then backward ones
+        passes = (('runtime_fwd_mem', fwd_mem_info),
+                  ('runtime_bwd_mem', bwd_mem_info))
+        for attr_name, mem_info in passes:
+            for node, mem in mem_info.items():
+                # every simulated node must carry a NodeInfo record
+                info = getattr(node, 'node_info', None)
+                assert isinstance(info, NodeInfo)
+                setattr(info, attr_name, mem)
+
+    def _extract_computing_power(self):
+        """
+        Return the FP16 computing performance of NVIDIA GPU 0, looked up by its NVML device name.
+
+        Raises:
+            ImportError: pynvml could not be imported. TypeError: Unknown NVIDIA GPU device.
+        """
+        if NOT_NVML:    # otherwise the nvmlInit call below would fail with a NameError
+            raise ImportError('pynvml is required to detect the computing power of the GPU')
+        nvmlInit()
+        device_name = nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(0))
+        if isinstance(device_name, bytes):    # some pynvml versions return the name as bytes
+            device_name = device_name.decode()
+        units = 1e12
+        if "RTX 3080" in device_name:
+            return NvDevicePower.RTX3080_FP16 * units
+        elif "RTX 3090" in device_name:
+            return NvDevicePower.RTX3090_FP16 * units
+        elif 'V100' in device_name:
+            return NvDevicePower.V100_FP16 * units
+        elif "A100" in device_name:
+            return NvDevicePower.A100_FP16 * units
+        raise TypeError(f'Unknown NVIDIA GPU device name {device_name}')
+
+    def _profile_bandwidth(self):
+        """
+        Measure CPU<->GPU copy bandwidth in both directions for power-of-two sizes from 1KB to 1GB.
+
+        Every measurement copies between a pinned host tensor and a CUDA tensor and is timed
+        with ``benchmark_func``; progress is printed for each size.
+
+        Returns:
+            Dict[str, Dict[int, float]]: ``{'h2d' | 'd2h': {size in bytes: bytes per second}}``.
+        """
+
+        print('profiling bandwidth ......')
+        # int8 elements make the element count equal to the size in bytes
+        host_opts = dict(dtype=torch.int8, pin_memory=True)
+        cuda_opts = dict(dtype=torch.int8, device='cuda')
+
+        link_to_bandwidth = {}
+        for link in ('h2d', 'd2h'):
+            src_opts, dst_opts = (host_opts, cuda_opts) if link == 'h2d' else (cuda_opts, host_opts)
+            size_to_bandwidth = {}
+
+            # 21 doublings: 1KB, 2KB, ..., 1GB
+            for t_size in (1024 << shift for shift in range(21)):
+                src_tensor = torch.ones(int(t_size), **src_opts)
+                dst_tensor = torch.ones(int(t_size), **dst_opts)
+
+                # benchmark_func calls this repeatedly; it closes over the two tensors above
+                def func():
+                    dst_tensor.copy_(src_tensor)
+
+                # bandwidth = bytes moved / mean seconds per copy
+                size_to_bandwidth[t_size] = t_size / benchmark_func(func, number=5, repeat=3)
+                print(f'size: {t_size / 1024 ** 2:.3f} MB, '
+                      f'{src_tensor.device.type}-to-{dst_tensor.device.type} '
+                      f'bandwidth: {size_to_bandwidth[t_size] / 1024 ** 3:.3f} GB/s')
+
+            link_to_bandwidth[link] = size_to_bandwidth
+
+        return link_to_bandwidth
+
+
+class SynGreedySolver(Solver):
+
+    def __init__(self,
+                 region_list: List[Region],
+                 memory_budget: float = -1.0) -> None:
+        super().__init__(region_list, memory_budget)
+        # placeholder only: _init_state() below replaces it with the first simulation result
+        self.best_ts: SynTrainingSimulator = None
+        self._init_state()
+
+    def _init_state(self):
+        """
+        Initialize the solver state for the case without offloading.
+        """
+        # simulate the region list with its current flags; the result becomes the baseline
+        ts = SynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
+        ts.execute()
+        self._update_state(ts)
+
+    def _call_solver(self):
+        """
+        Greedily mark regions for synchronous offload until the simulated peak memory fits the budget.
+
+        Each round tries every region that has parameters and is not offloaded yet (the last
+        region is never offloaded) and commits the one with the highest profit.
+
+        Raises:
+            NotImplementedError: Unable to find a solution for the given memory budget.
+        """
+
+        print("search offloading strategy ......")
+        while self.best_ts.peak_mem > self.memory_budget:
+            chosen_region, chosen_ts = None, None
+            best_profit = (0,)
+
+            # the (0,) start value means a choice is only taken if its first profit term is positive
+            for region in self.region_list[:-1]:
+                if not region.param_size or region.need_offload:
+                    continue
+                candidate_ts, profit = self._try_to_offload(region)
+                if self._compare_profit(profit, best_profit):
+                    chosen_region, chosen_ts, best_profit = region, candidate_ts, profit
+
+            if chosen_region is None or chosen_ts is None:
+                raise NotImplementedError(
+                    f"can't find the offload strategy met the memory budget {self.memory_budget / 1024 ** 2} MB, "
+                    f"it needs {self.best_ts.peak_mem / 1024 ** 2:.3f} MB at least!")
+
+            # commit the winner and make its simulation the new baseline
+            chosen_region.need_offload = True
+            chosen_region.is_syn = True
+            self._update_state(chosen_ts)
+
+    def _call_solver_l2l(self):
+        """
+        The layer-wise offload strategy.
+        """
+        # no search: every region except the last one is marked for synchronous offload
+        for region in self.region_list[:-1]:
+            region.need_offload = True
+            region.is_syn = True
+
+    def _try_to_offload(self, offload_region: Region):
+        """
+        Evaluate offloading ``offload_region`` without committing to it; returns ``(ts, profit)``.
+        """
+        orig_need_offload = offload_region.need_offload
+        assert not orig_need_offload
+        offload_region.need_offload = True
+        try:
+            return self._eval_one_choice(offload_region)
+        finally:
+            # restore previous information, even if the simulation raises
+            offload_region.need_offload = orig_need_offload
+
+    def _eval_one_choice(self, offload_region: Region):
+        """
+        Simulate training with the current region flags and rate the offload of ``offload_region``.
+
+        Args:
+            offload_region (Region): the offload region of current choice.
+
+        Returns:
+            SynTrainingSimulator: the training simulator corresponding to the current strategy.
+            tuple: contains memory saving and cost information of the current strategy.
+        """
+
+        simulator = SynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
+        simulator.execute()
+
+        # 2x the 'h2d' cost of the parameters (presumably offload + upload - confirm)
+        transfer_cost = simulator._get_communication_overhead('h2d', offload_region.param_size)
+        comm_cost = 2.0 * transfer_cost
+        # the shared region needs to be moved twice
+        if offload_region.r_id < offload_region.shared_rid:
+            comm_cost *= 2.0
+
+        peak_saving = self.best_ts.peak_mem - simulator.peak_mem
+        return simulator, self._compute_offload_profit(simulator.total_mem_saving, peak_saving, comm_cost)
+
+
+class AsynGreedySolver(Solver):
+
+    def __init__(self,
+                 region_list: List[Region],
+                 memory_budget: float = -1.0,
+                 search_window_size: int = 3):
+        super().__init__(region_list, memory_budget)
+        # how many regions after an offloaded region are tried as hosts of its backward prefetch
+        self.search_window_size = search_window_size
+        # committed prefetches: offloaded region id -> host region that prefetches it in backward
+        self.region_to_region_map = {}
+        self.best_ts: AsynTrainingSimulator = None
+
+        self._init_state()
+
+    def _init_state(self):
+        """
+        Initialize the solver state for the case without offloading.
+        """
+        # simulate the region list with its current flags; the result becomes the baseline
+        ts = AsynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
+        ts.execute()
+        self._update_state(ts)
+        print("init peak memory", self.best_ts.peak_mem / 1024 ** 2, "MB")
+
+    def _call_solver(self):
+        """
+        Call the solver to search an efficient parameter offloading strategy for the linearized graph.
+        The solver adopts greedy algorithm.
+
+        Raises:
+            NotImplementedError: Unable to find a solution for the given memory budget.
+        """
+
+        print("search for offloading strategy ......")
+        # per-round scratch map: candidate offload region id -> best host region found so far
+        region_to_region_map = {}
+        while self.best_ts.peak_mem > self.memory_budget:
+            region_to_offload = None
+            max_offload_profit = (0,)
+            best_offl_ts = None
+
+            # search which region should be offloaded,
+            # the last region does not need to be offloaded
+            for region in self.region_list[:-1]:
+                if region.param_size and not region.need_offload:
+                    max_prefetch_profit = (0,)
+                    best_pref_ts = None
+                    # search when to prefetch the region offloaded: candidate hosts are the next
+                    # search_window_size regions; hosts that already carry a prefetch are skipped
+                    for host_region in self.region_list[region.r_id + 1:region.r_id + 1 + self.search_window_size]:
+                        if host_region.bwd_prefetch_region is not None:
+                            continue
+
+                        temp_ts, profit = self._try_to_offload(
+                            host_region, region)
+
+                        if self._compare_profit(profit, max_prefetch_profit):
+                            region_to_region_map[region.r_id] = host_region
+                            max_prefetch_profit = profit
+                            best_pref_ts = temp_ts
+                            if profit[0] == float('inf'):
+                                break    # zero extra cost: stop searching further hosts
+                    # keep this region if its best host beats the best region found so far
+                    if self._compare_profit(max_prefetch_profit, max_offload_profit):
+                        region_to_offload = region
+                        max_offload_profit = max_prefetch_profit
+                        best_offl_ts = best_pref_ts
+            # commit the best (region, host) pair found in this round
+            if (region_to_offload is not None) and (best_offl_ts is not None):
+                region_to_offload.need_offload = True
+                if region_to_region_map[region_to_offload.r_id] == region_to_offload:
+                    region_to_offload.is_syn = True
+                else:
+                    region_to_region_map[region_to_offload.r_id].bwd_prefetch_region = region_to_offload
+                    self.region_to_region_map[region_to_offload.r_id] = region_to_region_map[region_to_offload.r_id]
+
+                self._update_state(best_offl_ts)
+            # nothing profitable was found: try turning a committed prefetch into a synchronous upload
+            elif self.region_to_region_map.__len__() > 0:
+                self._repair_strategy()
+            else:
+                raise NotImplementedError(
+                    f"can't find the offload strategy met the memory budget {self.memory_budget / 1024 ** 2} MB, "
+                    f"it needs {self.best_ts.peak_mem / 1024 ** 2:.3f} MB at least!")
+            # the scratch map is rebuilt in every round
+            region_to_region_map.clear()
+
+    def _try_to_offload(self, host_region: Region, offload_region: Region):
+        """
+        Attempts to offload the region and prefetch it in backward pass.
+
+        The region flags are set only while ``_eval_one_choice`` runs and are always restored;
+        if ``host_region`` is ``offload_region`` it is marked synchronous instead of prefetched.
+        """
+        # record previous information
+        orig_prefetch = host_region.bwd_prefetch_region
+        orig_is_syn = offload_region.is_syn
+        orig_need_offload = offload_region.need_offload
+
+        if host_region == offload_region:
+            offload_region.is_syn = True
+        else:
+            host_region.bwd_prefetch_region = offload_region
+        offload_region.need_offload = True
+        try:
+            return self._eval_one_choice()
+        finally:
+            # restore previous information, even if the simulation raises
+            host_region.bwd_prefetch_region = orig_prefetch
+            offload_region.is_syn = orig_is_syn
+            offload_region.need_offload = orig_need_offload
+
+    def _try_convert_to_syn_upload(self, host_region: Region, offload_region: Region):
+        """
+        Attempts to convert asynchronous prefetch into synchronous upload operations.
+
+        The change is applied only while ``_eval_one_choice`` runs and is always undone.
+        """
+
+        # record previous information
+        orig_prefetch = host_region.bwd_prefetch_region
+        orig_is_syn = offload_region.is_syn
+        assert orig_prefetch is not None and not orig_is_syn
+
+        host_region.bwd_prefetch_region = None
+        offload_region.is_syn = True
+        try:
+            return self._eval_one_choice()
+        finally:
+            # restore previous information, even if the simulation raises
+            host_region.bwd_prefetch_region = orig_prefetch
+            offload_region.is_syn = orig_is_syn
+
+    def _repair_strategy(self):
+        """
+        Repair offload strategy.
+        It attempts to convert asynchronous prefetch into synchronous upload operations and selects the best one.
+        The repair process does not end until peak memory is reduced or there is no asynchronous prefetch operation.
+        Returns the simulator of the last conversion (None if there was nothing to convert);
+        raises NotImplementedError when no conversion has a positive profit.
+        """
+        print("repair strategy ......")
+        # defined up front so that the final return is valid even if the loop body never runs
+        best_ts = None
+        peak_mem_saving = 0
+        while len(self.region_to_region_map) and peak_mem_saving <= 0:
+            max_profit = (0,)
+            best_ts = None
+            undo_host_region = None
+            undo_offload_region = None
+
+            for offload_region_id, host_region in self.region_to_region_map.items():
+                offload_region = self.region_list[offload_region_id]
+                assert host_region.bwd_prefetch_region == offload_region
+                assert offload_region.need_offload
+                assert not offload_region.is_syn
+
+                ts, profit = self._try_convert_to_syn_upload(host_region, offload_region)
+
+                if self._compare_profit(profit, max_profit):
+                    undo_host_region = host_region
+                    undo_offload_region = offload_region
+                    max_profit = profit
+                    best_ts = ts
+
+            if best_ts is None:
+                raise NotImplementedError('repair error!')
+
+            assert not undo_offload_region.is_syn
+            undo_offload_region.is_syn = True
+            undo_host_region.bwd_prefetch_region = None
+
+            peak_mem_saving = self.best_ts.peak_mem - best_ts.peak_mem
+            self._update_state(best_ts)
+            self.region_to_region_map.pop(undo_offload_region.r_id)
+
+        return best_ts
+
+    def _eval_one_choice(self):
+        """
+        Simulate training with the current region flags and rate the result against ``self.best_ts``.
+
+        Returns:
+            AsynTrainingSimulator: the training simulator corresponding to the current strategy.
+            tuple: contains memory saving and cost information of the current strategy.
+        """
+        simulator = AsynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
+        simulator.execute()
+
+        # only a later iteration end counts as cost; an earlier one is clamped to zero
+        slowdown = simulator.iter_end_time - self.best_ts.iter_end_time
+        extra_cost = max(slowdown, 0)
+        peak_saving = self.best_ts.peak_mem - simulator.peak_mem
+
+        return simulator, self._compute_offload_profit(simulator.total_mem_saving, peak_saving, extra_cost)
+
+
+class SolverFactory:
+    # registry: offload policy name -> solver class
+    solvers: Dict[str, Type[Solver]] = {'syn': SynGreedySolver, 'asyn': AsynGreedySolver}
+
+    @staticmethod
+    def create(solver_name: str) -> Type[Solver]:
+        """Return the solver class (not an instance) registered under ``solver_name``."""
+        solver_cls = SolverFactory.solvers.get(solver_name)
+        if solver_cls is None:
+            raise TypeError(f"Unknown parameter offload policy {solver_name}")
+        return solver_cls
+
+    @staticmethod
+    def get_solver_names():
+        return tuple(SolverFactory.solvers)    # registered names, in registration order
diff --git a/colossalai/auto_parallel/offload/training_simulator.py b/colossalai/auto_parallel/offload/training_simulator.py
new file mode 100644
index 000000000000..f277c183a912
--- /dev/null
+++ b/colossalai/auto_parallel/offload/training_simulator.py
@@ -0,0 +1,458 @@
+import bisect
+from typing import List, Dict
+from collections import OrderedDict
+from abc import ABC, abstractmethod
+
+from torch.fx.node import Node
+
+from .region import Region
+from .util import *
+
+
+@dataclass
+class ExecutionPeriod:    # one [start_time, end_time] interval recorded by the simulators in this file
+    start_time: float = 0    # when the period begins; the time unit is not fixed here — TODO confirm
+    end_time: float = 0    # when the period ends
+
+
+class TrainingSimulator(ABC):
+    """
+    The Training Simulator is used to simulate the training process.
+    It records computation, communication, and runtime memory during forward and backward passes.
+
+    Args:
+        region_list (List[Region]): represents the linearized DNN computing graph.
+        comp_power (float): the NVIDIA GPU FP16 compuing power.
+        link_to_bw (Dict[str, Dict[float, float]]): communication links and the corresponding bandwidth.
+    """
+
+    def __init__(self,
+                 region_list: List[Region],
+                 comp_power: float,
+                 link_to_bw: Dict[str, Dict[float, float]]) -> None:
+        self.region_list = region_list
+        self.region_num = len(region_list)
+
+        self.runtime_mem: int = 0
+        self.peak_mem: int = 0
+        self.total_mem_saving: int = 0
+
+        self.fwd_node_mem: Dict[Node, float] = {}
+        self.bwd_node_mem: Dict[Node, float] = {}
+
+        # Node dependencies in backward pass
+        self.bwd_node_deps: Dict[Node, int] = {}
+
+        self.comp_power: float = comp_power
+        self.link_to_bandwidth: Dict[str, Dict[float, float]] = link_to_bw
+
+    @abstractmethod
+    def execute(self):
+        raise NotImplementedError
+
+    @abstractmethod
+    def _eval_fwd_mem_per_region(self, region: Region):
+        raise NotImplementedError
+
+    @abstractmethod
+    def _eval_bwd_mem_per_region(self, region: Region):
+        raise NotImplementedError
+
+    def _get_bandwidth(self, link: str, comm_volumn: float) -> float:
+        """
+        Get the data transfer bandwidth.
+
+        Args:
+            link (str): the data transfer link.
+            comm_volumn (float): the amount of data transferred.
+
+        Returns:
+            float: the data transfer bandwidth.
+        """
+
+        assert len(self.link_to_bandwidth)
+        if link not in self.link_to_bandwidth:
+            raise TypeError(f"Unknown data transfer link {link}")
+
+        # size_list = sorted(list(map(float, self.link_to_bandwidth[link].keys())))
+        size_list = sorted(self.link_to_bandwidth[link].keys())
+        d_idx = bisect.bisect_left(size_list, comm_volumn)
+        return self.link_to_bandwidth[link][size_list[d_idx]]
+
+    def _get_communication_overhead(self, link: str, comm_volumn: float) -> float:
+        return comm_volumn / self._get_bandwidth(link, comm_volumn)
+
+    def _get_computing_overhead(self, flop: float) -> float:
+        return flop / self.comp_power
+
+
+class SynTrainingSimulator(TrainingSimulator):
+
+    def __init__(self,
+                 region_list: List[Region],
+                 comp_power: float,
+                 link_to_bw: Dict[str, Dict[float, float]]) -> None:
+        super().__init__(region_list, comp_power, link_to_bw)
+
+    def execute(self):
+        """
+        Simulate synchronous training process.
+        """
+
+        for reg in self.region_list:
+            self._eval_fwd_mem_per_region(reg)
+
+        for reg in self.region_list.__reversed__():
+            self._eval_bwd_mem_per_region(reg)
+
+    def _eval_fwd_mem_per_region(self, region: Region):
+        """
+        Evaluate the runtime and peak memory when the forward execution reaches the current region.
+        """
+
+        # upload parameters of the current region
+        if requires_upload_p_in_fwd(self.region_list[region.shared_rid]):
+            self.runtime_mem += region.param_size
+
+        for node in region.nodes:
+            self.runtime_mem += calculate_fwd_tmp(node) + \
+                                calculate_fwd_out(node)
+            self.fwd_node_mem[node] = self.runtime_mem
+            self.peak_mem = max(self.runtime_mem, self.peak_mem)
+            self.total_mem_saving += node.node_info.runtime_fwd_mem - self.runtime_mem
+
+        if region.need_offload:
+            self.runtime_mem -= region.param_size
+
+    def _eval_bwd_mem_per_region(self, region: Region):
+        """
+        Evaluate the runtime and peak memory when the backward execution reaches the current region.
+        """
+
+        # upload parameters of the current region
+        if region.need_offload:
+            self.runtime_mem += region.param_size
+
+        # add the gradient of the parameter
+        if region.r_id < region.shared_rid:
+            # gradient accumulation is required for shared parameters
+            self.runtime_mem += 2.0 * region.param_size
+        else:
+            self.runtime_mem += region.param_size
+
+        for node in region.nodes.__reversed__():
+
+            self.runtime_mem -= calculate_fwd_out(node)
+            self.runtime_mem += node.meta['bwd_mem_tmp'] + \
+                                node.meta['bwd_mem_out']
+            self.peak_mem = max(self.runtime_mem, self.peak_mem)
+
+            # The memory savings of a node may be negative due to parameter prefetch.
+            self.total_mem_saving += node.node_info.runtime_bwd_mem - self.runtime_mem
+            self.bwd_node_mem[node] = self.runtime_mem
+
+            self.runtime_mem -= (node.meta['bwd_mem_tmp'] +
+                                 calculate_fwd_tmp(node))
+
+            # free bwd_mem_out
+            self.bwd_node_deps[node] = len(node.all_input_nodes)
+            for user_node in node.users:
+                if user_node in self.bwd_node_deps:
+                    self.bwd_node_deps[user_node] -= 1
+                    if self.bwd_node_deps[user_node] <= 0:
+                        self.runtime_mem -= user_node.meta['bwd_mem_out']
+
+            if self.runtime_mem < 0:
+                raise ValueError(f"region id: {region.r_id}, node name: {node.name}, "
+                                 f"runtime_mem: {self.runtime_mem / 1024 ** 2:.3f}MB ---"
+                                 f"runtime memory computed less than 0, which is miscalculated!")
+
+        # release parameter and offload gradient in region
+        if region.r_id == region.shared_rid:
+            self.runtime_mem -= 2.0 * region.param_size
+        elif region.r_id < region.shared_rid:
+            self.runtime_mem -= 3.0 * region.param_size
+        elif self.region_list[region.shared_rid].need_offload:
+            self.runtime_mem -= region.param_size
+
+
+class AsynTrainingSimulator(TrainingSimulator):
+    """Simulator for the asynchronous case: tracks computation, h2d and d2h periods as well as memory."""
+    def __init__(self,
+                 region_list: List[Region],
+                 comp_power: float,
+                 link_to_bw: Dict[str, Dict[float, float]]) -> None:
+        super().__init__(region_list, comp_power, link_to_bw)
+
+        self.iter_end_time: float = 0
+        # the last computation execution period
+        self.last_comp: ExecutionPeriod = ExecutionPeriod(
+            start_time=0, end_time=0)
+        # the last parameter prefetch execution period
+        self.last_h2d: ExecutionPeriod = ExecutionPeriod(
+            start_time=0, end_time=0)
+        # the last gradient offload execution period
+        self.last_d2h: ExecutionPeriod = ExecutionPeriod(
+            start_time=0, end_time=0)
+        # the forward computation execution period of the region
+        self.fwd_reg_to_comp: OrderedDict[int, ExecutionPeriod] = OrderedDict()
+        # the forward parameter prefetch execution period of the region
+        self.fwd_reg_to_pref: OrderedDict[int, ExecutionPeriod] = OrderedDict()
+        # the backward computation execution period of the region
+        self.bwd_reg_to_comp: OrderedDict[int, ExecutionPeriod] = OrderedDict()
+        # the backward parameter prefetch execution period of the region
+        self.bwd_reg_to_pref: OrderedDict[int, ExecutionPeriod] = OrderedDict()
+        # the gradient offload execution period of the region
+        # which is divided into those that are waiting and those that have been released
+        self.bwd_reg_to_offl_waiting: OrderedDict[int,
+        ExecutionPeriod] = OrderedDict()
+        self.bwd_reg_to_offl_freed: OrderedDict[int,
+        ExecutionPeriod] = OrderedDict()
+        # the region buffer, which records regions that are offloaded but not released
+        self.reg_buffer_to_free: List[int] = []
+
+        # node dependencies in backward pass
+        self.bwd_node_deps: Dict[Node, int] = {}
+
+        # the region execution flow,
+        # where fwd_reg_flow[i,j] denotes whether the parameters of j-th region are in the GPU
+        # when the execution reaches the i-th region.
+        self.fwd_reg_flow = torch.zeros(
+            (self.region_num, self.region_num)).bool()
+        self.bwd_reg_flow = torch.zeros(
+            (self.region_num, self.region_num)).bool()
+
+    def execute(self):
+        """
+        Simulate asynchronous training process.
+        In forward pass, parameter prefetching is advanced by one region.
+        In backward pass, parameter prefetching is executed at the specified location,
+            and gradient offloading is urgent.
+        """
+
+        for reg in self.region_list:
+            if reg.param_size and reg.r_id < self.region_num - 1:
+                for nr in self.region_list[reg.r_id + 1:]:
+                    if nr.param_size and requires_upload_p_in_fwd(self.region_list[nr.shared_rid]):
+                        reg.fwd_prefetch_region = nr
+                        break
+            self._eval_fwd_cost_per_region(reg)
+            self._eval_fwd_mem_per_region(reg)
+
+        for reg in self.region_list.__reversed__():
+            self._eval_bwd_cost_per_region(reg)
+            self._eval_bwd_mem_per_region(reg)
+
+        # release remaining grads
+        for reg_id, offl_exec in self.bwd_reg_to_offl_waiting.items():
+            self.bwd_reg_to_offl_freed[reg_id] = offl_exec
+            self.runtime_mem -= self.region_list[reg_id].param_size
+        self.bwd_reg_to_offl_waiting.clear()
+
+        self.iter_end_time = max(
+            self.last_comp.end_time, self.last_d2h.end_time)
+
+    def _insert_h2d_exec(self, region: Region, is_fwd: bool = True):
+        """
+        Insert parameter prefetch execution period of the current region to the end of the h2d stream
+        """
+
+        pref_start_time = max(self.last_h2d.end_time, self.last_comp.end_time)
+        pref_end_time = pref_start_time + \
+                        2.0 * self._get_communication_overhead('h2d', region.param_size)    # NOTE(review): the 2.0 factor is not explained here — confirm
+        pref_ep = ExecutionPeriod(
+            start_time=pref_start_time, end_time=pref_end_time)
+        if is_fwd:
+            self.fwd_reg_to_pref[region.r_id] = pref_ep
+        else:
+            self.bwd_reg_to_pref[region.r_id] = pref_ep
+        self.last_h2d = pref_ep
+
+    def _insert_comp_exec(self, region: Region, is_fwd: bool = True):
+        """
+        Insert computation execution period of the current region to the end of the computing stream
+        """
+
+        if is_fwd:
+            reg_to_comp = self.fwd_reg_to_comp
+            reg_to_pref = self.fwd_reg_to_pref
+            flop_key = 'fwd_flop'
+        else:
+            reg_to_comp = self.bwd_reg_to_comp
+            reg_to_pref = self.bwd_reg_to_pref
+            flop_key = 'bwd_flop'
+        comp_start_time = max(self.last_comp.end_time, reg_to_pref.get(
+            region.r_id, ExecutionPeriod(0, 0)).end_time)
+        comp_end_time = comp_start_time + \
+                        sum([self._get_computing_overhead(node.meta.get(flop_key, 0))
+                             for node in region.nodes])
+        comp_ep = ExecutionPeriod(
+            start_time=comp_start_time, end_time=comp_end_time)
+        reg_to_comp[region.r_id] = comp_ep
+        self.last_comp = comp_ep
+
+    def _insert_d2h_exec(self, region: Region):
+        """
+        Insert gradient offload execution period of the current region to the end of the d2h stream
+        """
+
+        offl_start_time = max(self.last_d2h.end_time, self.last_comp.end_time)
+        offl_end_time = offl_start_time + \
+                        self._get_communication_overhead('d2h', region.param_size)
+        offl_ep = ExecutionPeriod(
+            start_time=offl_start_time, end_time=offl_end_time)
+        self.bwd_reg_to_offl_waiting[region.r_id] = offl_ep
+        self.last_d2h = offl_ep
+
+    def _eval_fwd_cost_per_region(self, region: Region):
+        """
+        Evaluate computation and communication execution period of the region in forward pass.
+        """
+
+        # upload parameters of the first region
+        if region.r_id == 0:
+            self._insert_h2d_exec(region)
+
+        # prefetch parameters of the next region
+        fwd_prefetch_region = region.fwd_prefetch_region
+        if fwd_prefetch_region and requires_upload_p_in_fwd(self.region_list[fwd_prefetch_region.shared_rid]):
+            self._insert_h2d_exec(fwd_prefetch_region)
+
+        # execute computation
+        self._insert_comp_exec(region)
+
+    def _eval_fwd_mem_per_region(self, region: Region):
+        """
+        Evaluate the runtime and peak memory when the forward execution reaches the current region.
+        """
+
+        # upload parameters of the current region
+        if region.r_id <= 0:
+            self.runtime_mem += region.param_size
+            self.fwd_reg_flow[region.r_id, region.r_id] = True
+        else:
+            self.fwd_reg_flow[region.r_id] = self.fwd_reg_flow[region.r_id - 1]
+            self.fwd_reg_flow[region.r_id,
+            self.reg_buffer_to_free] = False
+            self.reg_buffer_to_free.clear()
+
+        # prefetch parameters of the next region
+        fwd_prefetch_region = region.fwd_prefetch_region
+        if fwd_prefetch_region and requires_upload_p_in_fwd(self.region_list[fwd_prefetch_region.shared_rid]):
+            self.runtime_mem += fwd_prefetch_region.param_size
+            self.fwd_reg_flow[region.r_id,
+            fwd_prefetch_region.r_id] = True
+
+        for node in region.nodes:
+            self.runtime_mem += calculate_fwd_tmp(node) + \
+                                calculate_fwd_out(node)
+            self.peak_mem = max(self.runtime_mem, self.peak_mem)
+
+            self.total_mem_saving += node.node_info.runtime_fwd_mem - self.runtime_mem
+            self.fwd_node_mem[node] = self.runtime_mem
+
+        if region.need_offload:
+            self.runtime_mem -= region.param_size
+
+            assert len(
+                self.reg_buffer_to_free) <= 1, f'{len(self.reg_buffer_to_free)}'
+            self.reg_buffer_to_free.append(region.r_id)
+
+    def _eval_bwd_cost_per_region(self, region: Region):
+        """
+        Evaluate computation and communication execution period of the region in backward pass.
+        """
+
+        # upload parameters of the current region
+        if region.is_syn:
+            assert region.need_offload
+            self._insert_h2d_exec(region, is_fwd=False)
+
+        # prefetch parameters of the chosen region, which is parallel to computation
+        if region.bwd_prefetch_region is not None:
+            self._insert_h2d_exec(region.bwd_prefetch_region, is_fwd=False)
+
+        # execute computation
+        self._insert_comp_exec(region, is_fwd=False)
+
+        # offload gradient
+        if requires_offload_g_in_bwd(region):
+            self._insert_d2h_exec(region)
+
+        assert len(self.reg_buffer_to_free) == 0
+        for reg_id, offl_exec in self.bwd_reg_to_offl_waiting.items():
+            if offl_exec.end_time >= self.last_comp.start_time:
+                break
+            self.reg_buffer_to_free.append(reg_id)
+            self.bwd_reg_to_offl_freed[reg_id] = offl_exec
+
+        for reg_id in self.reg_buffer_to_free:
+            self.bwd_reg_to_offl_waiting.pop(reg_id)
+
+    def _eval_bwd_mem_per_region(self, region: Region):
+        """
+        Evaluate the runtime and peak memory when the backward execution reaches the current region.
+        """
+
+        if region.r_id + 1 < self.region_num:
+            self.bwd_reg_flow[region.r_id] = self.bwd_reg_flow[region.r_id + 1]
+        else:
+            self.bwd_reg_flow[region.r_id] = self.fwd_reg_flow[-1]
+        self.bwd_reg_flow[region.r_id,
+        self.reg_buffer_to_free] = False
+
+        # free gradients in the buffer
+        while len(self.reg_buffer_to_free):
+            reg_id = self.reg_buffer_to_free.pop(0)
+            self.runtime_mem -= self.region_list[reg_id].param_size
+
+        # upload parameters of the current region
+        if region.is_syn:
+            self.runtime_mem += region.param_size
+            self.bwd_reg_flow[region.r_id, region.r_id] = True
+
+        # prefetch parameters of the chosen region
+        bwd_prefetch_region = region.bwd_prefetch_region
+        if bwd_prefetch_region:
+            self.runtime_mem += bwd_prefetch_region.param_size
+            self.bwd_reg_flow[region.r_id,
+            bwd_prefetch_region.r_id] = True
+
+        # add the gradient of the parameter
+        if region.r_id < region.shared_rid:
+            # gradient accumulation is required for shared parameters
+            self.runtime_mem += 2.0 * region.param_size
+        else:
+            self.runtime_mem += region.param_size
+
+        for node in region.nodes.__reversed__():
+
+            self.runtime_mem -= calculate_fwd_out(node)
+            self.runtime_mem += node.meta['bwd_mem_tmp'] + \
+                                node.meta['bwd_mem_out']
+            self.peak_mem = max(self.runtime_mem, self.peak_mem)
+
+            # The memory savings of a node may be negative due to parameter prefetch.
+            self.total_mem_saving += node.node_info.runtime_bwd_mem - self.runtime_mem
+
+            self.bwd_node_mem[node] = self.runtime_mem
+
+            self.runtime_mem -= (node.meta['bwd_mem_tmp'] +
+                                 calculate_fwd_tmp(node))
+
+            # free bwd_mem_out
+            self.bwd_node_deps[node] = len(node.all_input_nodes)
+            for user_node in node.users:
+                if user_node in self.bwd_node_deps:
+                    self.bwd_node_deps[user_node] -= 1
+                    if self.bwd_node_deps[user_node] <= 0:
+                        self.runtime_mem -= user_node.meta['bwd_mem_out']
+
+            if self.runtime_mem < 0:
+                raise ValueError(f"region id: {region.r_id}, node name: {node.name}, "
+                                 f"runtime_mem: {self.runtime_mem / 1024 ** 2:.3f}MB ---"
+                                 f"runtime memory computed less than 0, which is miscalculated!")
+
+        # release parameters of the region
+        if requires_release_p_in_bwd(self.region_list[region.shared_rid]):
+            self.runtime_mem -= region.param_size
diff --git a/colossalai/auto_parallel/offload/util.py b/colossalai/auto_parallel/offload/util.py
new file mode 100644
index 000000000000..a99c4eb20225
--- /dev/null
+++ b/colossalai/auto_parallel/offload/util.py
@@ -0,0 +1,90 @@
+from dataclasses import dataclass
+from typing import List
+import torch
+from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp
+
+from .region import Region
+
+
+@dataclass
+class NodeInfo:    # per-node record; presumably attached to an fx node as `node.node_info` — TODO confirm
+    node_id: int = 0
+    runtime_fwd_mem: float = 0    # runtime memory recorded for this node in the forward pass
+    runtime_bwd_mem: float = 0    # runtime memory recorded for this node in the backward pass
+
+class NvDevicePower:
+    """
+    NVIDIA GPU computing performance (TFLOPs), one constant per <GPU model>_<precision> pair.
+    """
+
+    RTX3080_FP16 = 70
+    RTX3080_FP32 = 34.1
+
+    RTX3090_FP16 = 71
+    RTX3090_FP32 = 35.7
+
+    V100_FP16 = 31.4
+    V100_FP32 = 15.7
+
+    A100_FP16 = 78
+    A100_FP32 = 19.5
+
+
+class GlobalRuntimeInfo:    # class-level (shared) runtime state; nothing here is per-instance
+    h2d_stream = torch.cuda.Stream()    # NOTE: created when the class body runs, i.e. when this module is imported
+    d2h_stream = torch.cuda.Stream()    # second CUDA stream, created at the same moment
+    fwd_prefetch_event_map = {}    # starts empty; keys and values are set by code outside this file
+    bwd_prefetch_event_map = {}    # starts empty; keys and values are set by code outside this file
+    region_list = []    # starts empty; filled by code outside this file
+
+
+def compute_act_peak_mem(region_list: List[Region]) -> float:
+    act_peak_mem = 0
+    runtime_mem = 0
+    # forward
+    for region in region_list:
+        for node in region.nodes:
+            runtime_mem = runtime_mem + \
+                          calculate_fwd_tmp(node) + calculate_fwd_out(node)
+            act_peak_mem = max(runtime_mem, act_peak_mem)
+    # backward
+    bwd_deps = {}
+    for region in region_list.__reversed__():
+        for node in region.nodes.__reversed__():
+            runtime_mem -= calculate_fwd_out(node)
+            runtime_mem = runtime_mem + \
+                          node.meta['bwd_mem_tmp'] + node.meta['bwd_mem_out']
+
+            act_peak_mem = max(runtime_mem, act_peak_mem)
+
+            runtime_mem = runtime_mem - \
+                          node.meta['bwd_mem_tmp'] - calculate_fwd_tmp(node)
+
+            # free bwd_mem_out
+            bwd_deps[node] = len(node.all_input_nodes)
+            for user_node in node.users:
+                if user_node in bwd_deps:
+                    bwd_deps[user_node] -= 1
+                    if bwd_deps[user_node] <= 0:
+                        runtime_mem -= user_node.meta['bwd_mem_out']
+
+    return act_peak_mem
+
+def compute_max_param_mem(region_list: List[Region]) -> float:    # largest single-region param_size
+    return max(region.param_size for region in region_list)    # max() raises ValueError if region_list is empty
+
+def compute_total_param_mem(region_list: List[Region]) -> float:    # sum of param_size over the counted regions
+    return sum(region.param_size for region in region_list if region.r_id <= region.shared_rid)    # regions with r_id > shared_rid are skipped
+
+def requires_upload_p_in_fwd(shared_reg: Region):
+    return (shared_reg.r_id >= shared_reg.shared_rid) or (
+                shared_reg.r_id < shared_reg.shared_rid and shared_reg.need_offload)
+
+def requires_release_p_in_bwd(shared_reg: Region):
+    return (shared_reg.r_id >= shared_reg.shared_rid) or (
+                shared_reg.r_id < shared_reg.shared_rid and shared_reg.need_offload)
+
+def requires_offload_g_in_bwd(region: Region):    # truthy only for a region with parameters and r_id <= shared_rid
+    return region.param_size and (region.r_id <= region.shared_rid)    # NOTE: yields param_size itself (not False) when it is falsy
+
+
diff --git a/examples/language/gpt/experiments/auto_offload/README.md b/examples/language/gpt/experiments/auto_offload/README.md
new file mode 100644
index 000000000000..a0d252119056
--- /dev/null
+++ b/examples/language/gpt/experiments/auto_offload/README.md
@@ -0,0 +1,37 @@
+# Auto-Offload Demo with GPT2
+
+## Requirements
+
+Before you can launch training, you need to install the following requirements.
+
+### Install PyTorch
+
+```bash
+#conda
+conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch
+#pip
+pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113
+```
+
+### Install [Colossal-AI v0.2.0](https://colossalai.org/download/) From Official Website
+
+```bash
+pip install colossalai==0.2.0+torch1.12cu11.3 -f https://release.colossalai.org
+```
+
+### Install transformers
+
+```bash
+pip install transformers
+```
+
+## Dataset
+
+For simplicity, the input data is randomly generated here.
+
+## Training
+
+```bash
+#Run the auto offload on GPT with default setting and a dummy dataset.
+bash run.sh
+```
diff --git a/examples/language/gpt/experiments/auto_offload/model_zoo.py b/examples/language/gpt/experiments/auto_offload/model_zoo.py
new file mode 100644
index 000000000000..35e44608f810
--- /dev/null
+++ b/examples/language/gpt/experiments/auto_offload/model_zoo.py
@@ -0,0 +1,65 @@
+import torch
+import torch.nn as nn
+from transformers import GPT2Config, GPT2LMHeadModel
+
+class GPTLMModel(nn.Module):
+    """Wrap a HuggingFace ``GPT2LMHeadModel`` built from the given sizes; ``forward`` returns element 0 of its output."""
+    def __init__(self,
+                 hidden_size=768,
+                 num_layers=12,
+                 num_attention_heads=12,
+                 max_seq_len=1024,
+                 vocab_size=50257):
+        super().__init__()
+        self.model = GPT2LMHeadModel(
+            GPT2Config(n_embd=hidden_size,
+                       n_layer=num_layers,
+                       n_head=num_attention_heads,
+                       n_positions=max_seq_len,
+                       n_ctx=max_seq_len,
+                       vocab_size=vocab_size))
+
+    def forward(self, input_ids, attention_mask):
+        # Only return lm_logits
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)[0]
+
+
+class GPTLMLoss(nn.Module):
+    """Cross-entropy between the logits at every position but the last and the labels shifted left by one."""
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+def get_gpt2_components(model_type: str, batch_size: int):
+    vocab_size = 1024
+    seq_len = 8
+
+    def gpt2_model_builder():
+        if model_type == "gpt2_medium":
+            return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16)
+        elif model_type == "gpt2_xl":
+            return GPTLMModel(hidden_size=1600, num_layers=48, num_attention_heads=32)
+        elif model_type == "gpt2_10b":
+            return GPTLMModel(hidden_size=4096, num_layers=50, num_attention_heads=16)
+        elif model_type == "gpt2_14b":
+            return GPTLMModel(hidden_size=4096, num_layers=70, num_attention_heads=16)
+        elif model_type == "gpt2_20b":
+            return GPTLMModel(hidden_size=8192, num_layers=25, num_attention_heads=16)
+        elif model_type == "gpt2_24b":
+            return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16)
+        else:
+            raise TypeError(f"model_builder {model_type}")
+
+    def gpt2_data_gen(device="cuda"):
+        input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
+        attention_mask = torch.ones_like(input_ids, device=device)
+        kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)
+        return kwargs
+
+    return gpt2_model_builder, gpt2_data_gen
\ No newline at end of file
diff --git a/examples/language/gpt/experiments/auto_offload/requirements.txt b/examples/language/gpt/experiments/auto_offload/requirements.txt
new file mode 100644
index 000000000000..3ebde8d460aa
--- /dev/null
+++ b/examples/language/gpt/experiments/auto_offload/requirements.txt
@@ -0,0 +1,2 @@
+colossalai >= 0.1.12
+torch >= 1.8.1
\ No newline at end of file
diff --git a/examples/language/gpt/experiments/auto_offload/run.sh b/examples/language/gpt/experiments/auto_offload/run.sh
new file mode 100644
index 000000000000..6a272ec442ab
--- /dev/null
+++ b/examples/language/gpt/experiments/auto_offload/run.sh
@@ -0,0 +1,8 @@
+export BATCH_SIZE=${BATCH_SIZE:-64}
+export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
+export MEMORY_BUDGET=${MEMORY_BUDGET:-16}
+export SOLVER_TYPE=${SOLVER_TYPE:-"asyn"}
+# Each setting above can be overridden from the environment; expansions below are quoted to avoid word splitting.
+mkdir -p offload_logs
+
+python train_gpt_offload.py --model_type="${MODEL_TYPE}" --memory_budget="${MEMORY_BUDGET}" --solver_type="${SOLVER_TYPE}" --batch_size="${BATCH_SIZE}" 2>&1 | tee "./offload_logs/${MODEL_TYPE}_bs_${BATCH_SIZE}_st_${SOLVER_TYPE}.log"
diff --git a/examples/language/gpt/experiments/auto_offload/train_gpt_offload.py b/examples/language/gpt/experiments/auto_offload/train_gpt_offload.py
new file mode 100644
index 000000000000..729d1ce4456b
--- /dev/null
+++ b/examples/language/gpt/experiments/auto_offload/train_gpt_offload.py
@@ -0,0 +1,94 @@
+import time
+import pytest
+import argparse
+from functools import partial
+
+import torch
+from torch.utils._pytree import tree_map
+import torch.multiprocessing as mp
+
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.fx.profiler import parameter_size
+from colossalai.utils import free_port, get_current_device
+from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
+from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
+from colossalai.auto_parallel.offload.solver import NOT_NVML
+from model_zoo import get_gpt2_components, GPTLMLoss
+
+def parse_args():    # command-line options of the benchmark; every option has a default
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_type', type=str, default="gpt2_medium")
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--solver_type', type=str, default='asyn')
+    parser.add_argument('--memory_budget', type=float, default=16)    # multiplied by 1024**3 in train_gpt
+    return parser.parse_args()
+
+@pytest.mark.skipif(NOT_NVML, reason='pynvml is not installed')
+def train_gpt(args):
+    memory_budget = args.memory_budget * 1024 * 1024 * 1024
+    solver_type = args.solver_type
+    model_type = args.model_type
+    batch_size = args.batch_size
+
+    # build model
+    model_builder, data_gen = get_gpt2_components(model_type=model_type, batch_size=batch_size)
+    label = torch.randint(low=0, high=128, size=(64, 8,), device=get_current_device())
+    criterion = GPTLMLoss()
+
+    start_time = time.time()
+    model = model_builder()
+    model.train()
+    param_size = parameter_size(model) / 1024 ** 2 / 2
+    init_time = time.time() - start_time
+    print(f"init_param_size={param_size:.3f} MB | init_model_time={init_time:.3f} s")
+
+    data_args = data_gen(device="cpu")
+    wrap_fn = lambda x: x.to(dtype=torch.half) if isinstance(x, torch.Tensor) and torch.is_floating_point(x) else x
+    data_args = tree_map(wrap_fn, data_args)
+    start_time = time.time()
+    model = memory_optimize(model, data_args, memory_budget, solver_type)
+    solver_time = time.time() - start_time
+    print(f"solver_time={solver_time:.3f} s")
+
+    hybrid_optimizer = HybridAdam(model.model.parameters(), lr=1e-3)
+    optim = AMPOptimizer(hybrid_optimizer, model)
+
+    torch.cuda.empty_cache()
+    torch.cuda.synchronize()
+    torch.cuda.reset_peak_memory_stats()
+
+    time_list = []
+    data_args = data_gen(device="cuda")
+    data_args = tree_map(wrap_fn, data_args)
+    for step in range(10):
+        optim.zero_grad()
+        torch.cuda.synchronize()
+        start_time = time.time()
+        loss = criterion(model(**data_args), label)
+        optim.backward(loss)
+        torch.cuda.synchronize()
+        time_list.append(time.time() - start_time)
+        optim.step()
+
+    torch.cuda.synchronize()
+
+    exec_time = sum(sorted(time_list)[:5]) / 5
+    runtime_peak_mem_alc = torch.cuda.max_memory_allocated() / 1024 ** 2
+    runtime_peak_mem_res = torch.cuda.max_memory_reserved() / 1024 ** 2
+    print(f'solver_type: {solver_type} | model_type: {model_type}')
+    print(
+        f'| exec_time={exec_time:.3f} s | param_size={param_size:.3f} MB '
+        f'| runtime_peak_mem_alc={runtime_peak_mem_alc:.3f} MB| runtime_peak_mem_res={runtime_peak_mem_res:.3f} MB|'
+    )
+    print(time_list)
+
+def run(rank, world_size, port, args):    # per-process entry point; mp.spawn supplies `rank` as the first argument
+    config = {}    # empty configuration for colossalai.launch
+    colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    train_gpt(args)
+
+if __name__ == '__main__':
+    args = parse_args()
+    run_func = partial(run, world_size=1, port=free_port(), args=args)    # everything but `rank` is bound here
+    mp.spawn(run_func, nprocs=1)    # a single worker process is started
diff --git a/tests/test_auto_parallel/test_offload/model_utils.py b/tests/test_auto_parallel/test_offload/model_utils.py
new file mode 100644
index 000000000000..c22b17ae42ba
--- /dev/null
+++ b/tests/test_auto_parallel/test_offload/model_utils.py
@@ -0,0 +1,86 @@
+import torch
+import torch.nn as nn
+from transformers import GPT2Config, GPT2LMHeadModel
+from transformers import BertConfig, BertLMHeadModel
+from tests.components_to_test.registry import non_distributed_component_funcs
+
+class GPTLMModel(nn.Module):
+
+    def __init__(self,
+                 hidden_size=768,
+                 num_layers=12,
+                 num_attention_heads=12,
+                 max_seq_len=1024,
+                 vocab_size=50257):
+        super().__init__()
+        self.model = GPT2LMHeadModel(
+            GPT2Config(n_embd=hidden_size,
+                       n_layer=num_layers,
+                       n_head=num_attention_heads,
+                       n_positions=max_seq_len,
+                       n_ctx=max_seq_len,
+                       vocab_size=vocab_size))
+
+    def forward(self, input_ids, attention_mask):
+        # Only return lm_logits
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)[0]
+
+
+class LMLoss(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+class BertLMModel(nn.Module):
+    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=32, vocab_size=30522):
+        super().__init__()
+        self.model = BertLMHeadModel(BertConfig(n_embd=hidden_size, num_hidden_layers=num_layers, hidden_size=hidden_size,
+                                                num_attention_heads=num_attention_heads, max_position_embeddings=hidden_size,
+                                                vocab_size=vocab_size))
+
+    def forward(self, input_ids, attention_mask):
+        # Only return lm_logits
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)[0]
+
+@non_distributed_component_funcs.register(name='bert_')
+def get_bert_components():
+    vocab_size = 1024
+    seq_len = 64
+    batchSize = 64
+
+    def bert_model_builder():
+        model = BertLMModel(hidden_size=8192, num_layers=4, num_attention_heads=32, vocab_size=vocab_size)
+        return model
+
+    def bert_data_gen(device="meta"):
+        input_ids = torch.randint(0, vocab_size, (batchSize, seq_len), device=device)
+        attention_mask = torch.ones_like(input_ids, device=device)
+        kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)
+        return kwargs
+
+    return bert_model_builder, bert_data_gen
+
+@non_distributed_component_funcs.register(name='gpt2_')
+def get_gpt2_components():
+    vocab_size = 1024
+    seq_len = 8
+    batchSize = 64
+
+    def gpt2_model_builder():
+        model = GPTLMModel(hidden_size=8192, num_layers=2, num_attention_heads=32, vocab_size=vocab_size)
+        return model
+
+    def gpt2_data_gen(device="meta"):
+        input_ids = torch.randint(0, vocab_size, (batchSize, seq_len), device=device)
+        attention_mask = torch.ones_like(input_ids, device=device)
+        kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)
+        return kwargs
+
+    return gpt2_model_builder, gpt2_data_gen
\ No newline at end of file
diff --git a/tests/test_auto_parallel/test_offload/test_perf.py b/tests/test_auto_parallel/test_offload/test_perf.py
new file mode 100644
index 000000000000..d569570f4b7d
--- /dev/null
+++ b/tests/test_auto_parallel/test_offload/test_perf.py
@@ -0,0 +1,150 @@
+import time
+import pytest
+from functools import partial
+
+import torch
+from torch.utils._pytree import tree_map
+import torch.multiprocessing as mp
+
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.fx.profiler import parameter_size
+from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.utils import free_port, get_current_device
+from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
+from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
+from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
+from colossalai.auto_parallel.offload.solver import NOT_NVML
+from colossalai.testing import parameterize
+
+from tests.test_tensor.common_utils import set_seed
+from tests.test_auto_parallel.test_offload.model_utils import *
+
+
+@parameterize('model_name', ['gpt2_'])
+@parameterize('memory_budget', [5000])
+@parameterize('solver_name', ['asyn'])
+def exam_fwd_bwd(
+        model_name: str,
+        memory_budget: float,
+        solver_name: str
+):
+
+    # build model
+    get_components_func = non_distributed_component_funcs.get_callable(model_name)
+    model_builder, data_gen = get_components_func()
+    label = torch.randint(low=0, high=128, size=(64, 8,), device=get_current_device())
+    criterion = LMLoss()
+
+    set_seed(42)
+    start_time = time.time()
+    model = model_builder()
+    model.train()
+    param_size = parameter_size(model) / 1024 ** 2 / 2
+    init_time = time.time() - start_time
+    print(f"init_param_size={param_size:.3f} MB | init_model_time={init_time:.3f} s")
+
+    data_args = data_gen(device="cpu")
+    wrap_fn = lambda x: x.to(dtype=torch.half) if isinstance(x, torch.Tensor) and torch.is_floating_point(x) else x
+    data_args = tree_map(wrap_fn, data_args)
+    start_time = time.time()
+    model = memory_optimize(model, data_args, memory_budget * 1024 * 1024, solver_name)
+    solver_time = time.time() - start_time
+    print(f"solver_time={solver_time:.3f} s")
+
+    hybrid_optimizer = HybridAdam(model.model.parameters(), lr=1e-3)
+    optim = AMPOptimizer(hybrid_optimizer, model)
+
+    with ColoInitContext(device=torch.device('cpu')):
+        gemini_model = model_builder()
+    gemini_model.train()
+
+    hybrid_optimizer = HybridAdam(gemini_model.parameters(), lr=1e-3)
+    gemini_config = dict(strict_ddp_mode=False,
+                         device=torch.device('cpu'),
+                         placement_policy='cpu',
+                         pin_memory=True,
+                         hidden_dim=8192,
+                         search_range_mb=128)
+    gemini_model = zero_model_wrapper(gemini_model, 3, gemini_config)
+    optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
+    gemini_optim = zero_optim_wrapper(gemini_model, hybrid_optimizer, optim_config=optim_config)
+
+    torch.cuda.empty_cache()
+    torch.cuda.synchronize()
+    torch.cuda.reset_peak_memory_stats()
+
+    # test gemini
+    time_list = []
+    set_seed(42)
+    data_args = data_gen(device="cuda")
+    for step in range(10):
+        gemini_optim.zero_grad()
+        torch.cuda.synchronize()
+        start_time = time.time()
+        gemini_out = gemini_model(**data_args)
+        gemini_loss = criterion(gemini_out, label)
+        gemini_optim.backward(gemini_loss)
+        torch.cuda.synchronize()
+        time_list.append(time.time() - start_time)
+        gemini_optim.step()
+
+    torch.cuda.synchronize()
+
+    exec_time = sum(sorted(time_list)[:5]) / 5
+    runtime_peak_mem_alc = torch.cuda.max_memory_allocated() / 1024 ** 2
+    runtime_peak_mem_res = torch.cuda.max_memory_reserved() / 1024 ** 2
+    print(f'gemini | model_name: {model_name}')
+    print(
+        f'| exec_time={exec_time:.3f} s | param_size={param_size:.3f} MB '
+        f'| runtime_peak_mem_alc={runtime_peak_mem_alc:.3f} MB| runtime_peak_mem_res={runtime_peak_mem_res:.3f} MB|'
+    )
+    print(time_list)
+
+    del data_args
+    del gemini_model
+    del gemini_optim
+    del gemini_out
+    del gemini_loss
+
+    # test asyn offload
+    torch.cuda.empty_cache()
+    torch.cuda.synchronize()
+    torch.cuda.reset_peak_memory_stats()
+
+    time_list = []
+    set_seed(42)
+    data_args = data_gen(device="cuda")
+    data_args = tree_map(wrap_fn, data_args)
+    for step in range(10):
+        optim.zero_grad()
+        torch.cuda.synchronize()
+        start_time = time.time()
+        loss = criterion(model(**data_args), label)
+        optim.backward(loss)
+        torch.cuda.synchronize()
+        time_list.append(time.time() - start_time)
+        optim.step()
+
+    torch.cuda.synchronize()
+
+    exec_time = sum(sorted(time_list)[:5]) / 5
+    runtime_peak_mem_alc = torch.cuda.max_memory_allocated() / 1024 ** 2
+    runtime_peak_mem_res = torch.cuda.max_memory_reserved() / 1024 ** 2
+    print(f'solver_name: {solver_name} | model_name: {model_name}')
+    print(
+        f'| exec_time={exec_time:.3f} s | param_size={param_size:.3f} MB '
+        f'| runtime_peak_mem_alc={runtime_peak_mem_alc:.3f} MB| runtime_peak_mem_res={runtime_peak_mem_res:.3f} MB|'
+    )
+    print(time_list)
+
+@pytest.mark.skipif(NOT_NVML, reason='pynvml is not installed')
+def test_perf(rank, world_size, port):
+    config = {}
+    colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    exam_fwd_bwd()
+
+
+if __name__ == '__main__':
+    run_func = partial(test_perf, world_size=1, port=free_port())
+    mp.spawn(run_func, nprocs=1)
diff --git a/tests/test_auto_parallel/test_offload/test_solver.py b/tests/test_auto_parallel/test_offload/test_solver.py
new file mode 100644
index 000000000000..2efbb750f80d
--- /dev/null
+++ b/tests/test_auto_parallel/test_offload/test_solver.py
@@ -0,0 +1,62 @@
+import pytest
+import torch.fx
+from torch.fx import GraphModule
+from torch.utils._pytree import tree_map
+
+from colossalai.fx import ColoTracer, is_compatible_with_meta
+from colossalai.fx.passes.meta_info_prop import MetaInfoProp
+from colossalai.auto_parallel.offload.region_manager import RegionManager
+from colossalai.auto_parallel.offload.solver import SolverFactory, NOT_NVML
+from colossalai.testing import parameterize
+from tests.test_auto_parallel.test_offload.model_utils import *
+
+@pytest.mark.skipif(NOT_NVML, reason='pynvml is not installed')
+@parameterize('model_name', ['gpt2_', 'bert_'])
+@parameterize('memory_budget', [4000])
+@parameterize('solver_name', ['syn', 'asyn'])
+def solver_test(model_name: str,
+                memory_budget: float,
+                solver_name: str):
+
+    get_components_func = non_distributed_component_funcs.get_callable(model_name)
+    model_builder, data_gen = get_components_func()
+    data_args = data_gen(device="cpu")
+    wrap_fn = lambda x: x.to(dtype=torch.half) if isinstance(x, torch.Tensor) and torch.is_floating_point(x) else x
+    data_args = tree_map(wrap_fn, data_args)
+    model = model_builder()
+    model.train()
+    model = model.cpu().half()
+
+    tracer = ColoTracer()
+    assert is_compatible_with_meta()
+    wrap_fn = lambda x: x.to("meta") if isinstance(x, torch.Tensor) else x
+    meta_args = tree_map(wrap_fn, data_args)
+    graph = tracer.trace(model, meta_args=meta_args)
+    gm = GraphModule(model, graph, model.__class__.__name__)
+
+    interp = MetaInfoProp(gm)
+    interp.propagate(*meta_args.values())
+
+    region_manager = RegionManager(graph, solver_name=solver_name)
+    region_manager._pre_process()
+    region_list = region_manager.region_list
+
+    solver_cls = SolverFactory.create(solver_name)
+    memory_budget = memory_budget * 1024 * 1024
+    solver = solver_cls(region_list, memory_budget)
+    solver._call_solver()
+
+    assert solver.best_ts.peak_mem < memory_budget
+
+    print("****************** execution plan *******************")
+    for region in region_list:
+        need_offload = region.need_offload
+        to_prefetch = region.fwd_prefetch_region.r_id if region.fwd_prefetch_region is not None else None
+        print(f'| {model_name} forward | region id: {region.r_id} | need_offload: {need_offload} | to_prefetch: {to_prefetch}')
+    for region in region_list.__reversed__():
+        need_offload = region.need_offload
+        to_prefetch = region.bwd_prefetch_region.r_id if region.bwd_prefetch_region is not None else None
+        print(f'| {model_name} backward | region id: {region.r_id} | need_offload: {need_offload} | to_prefetch: {to_prefetch}')
+
+if __name__ == '__main__':
+    solver_test()
\ No newline at end of file

From e5f668f280f376e3cb8fc3f6c65bb824dcab1bc8 Mon Sep 17 00:00:00 2001
From: NatalieC323 <127177614+NatalieC323@users.noreply.github.com>
Date: Tue, 21 Mar 2023 16:01:13 +0800
Subject: [PATCH 490/503] [dreambooth] fixing the incompatibity in
 requirements.txt (#3190)

* Update requirements.txt

* Update environment.yaml

* Update README.md

* Update environment.yaml

* Update README.md

* Update README.md

* Delete requirements_colossalai.txt

* Update requirements.txt

* Update README.md
---
 examples/images/diffusion/README.md           |  4 +++-
 examples/images/dreambooth/README.md          | 21 +++++++++++--------
 examples/images/dreambooth/requirements.txt   |  1 -
 .../dreambooth/requirements_colossalai.txt    |  8 -------
 4 files changed, 15 insertions(+), 19 deletions(-)
 delete mode 100644 examples/images/dreambooth/requirements_colossalai.txt

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index 22970ced064e..a70792b9f4a4 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -78,7 +78,9 @@ You can install the latest version (0.2.7) from our official website or from sou
 ##### Download suggested verision for this training
 
 ```
-pip install colossalai=0.2.5
+
+pip install colossalai==0.2.5
+
 ```
 
 ##### Download the latest version from pip for latest torch version
diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md
index 14ed66c8d45b..b067a437c764 100644
--- a/examples/images/dreambooth/README.md
+++ b/examples/images/dreambooth/README.md
@@ -5,12 +5,12 @@ The `train_dreambooth_colossalai.py` script shows how to implement the training
 
 By accommodating model data in CPU and GPU and moving the data to the computing device when necessary, [Gemini](https://www.colossalai.org/docs/advanced_tutorials/meet_gemini), the Heterogeneous Memory Manager of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) can breakthrough the GPU memory wall by using GPU and CPU memory (composed of CPU DRAM or nvme SSD memory) together at the same time. Moreover, the model scale can be further improved by combining heterogeneous training with the other parallel approaches, such as data parallel, tensor parallel and pipeline parallel.
 
-## Installing the dependencies
+## Installation
 
-Before running the scripts, make sure to install the library's training dependencies:
+To begin with, make sure your system has a CUDA version suitable for this training session, i.e. CUDA 11.6-11.8, and that the versions of the installed modules are compatible with the rest of your environment. Before running the scripts, make sure to install the library's training dependencies:
 
 ```bash
-pip install -r requirements_colossalai.txt
+pip install -r requirements.txt
 ```
 
 ### Install [colossalai](https://github.com/hpcaitech/ColossalAI.git)
@@ -37,9 +37,7 @@ The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Mode
 
 ## Training
 
-The arguement `placement` can be `cpu`, `auto`, `cuda`, with `cpu` the GPU RAM required can be minimized to 4GB but will deceleration, with `cuda` you can also reduce GPU memory by half but accelerated training， with `auto` a more balanced solution for speed and memory can be obtained。
-
-**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+We provide the script `colossalai.sh` to run the training task with colossalai. Meanwhile, we also provide the traditional training process of dreambooth, `dreambooth.sh`, for possible comparison. For instance, the script of the training process for the [stable-diffusion-v1-4] model can be modified into:
 
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
@@ -59,12 +57,17 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
   --max_train_steps=400 \
   --placement="cuda"
 ```
-
+- `MODEL_NAME` refers to the model you are training.
+- `INSTANCE_DIR` refers to the personalized path to the instance images; you might need to insert information here.
+- `OUTPUT_DIR` refers to the local path where the trained model is saved; you might need to find a path with enough space.
+- `resolution` refers to the corresponding resolution number of your target model. Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.
+- `placement` refers to the training strategy supported by Colossal AI, default = 'cuda', which refers to loading all the parameters into cuda memory. On the other hand, 'cpu' refers to the 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI.
 
 ### Training with prior-preservation loss
 
 Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
-According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time.
+
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time. The general script can then be modified as follows.
 
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
@@ -91,7 +94,7 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
 
 ## Inference
 
-Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
+Once you have trained a model using the above command, inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier` (e.g. `sks` from `--instance_prompt="a photo of sks dog"` in the above example) in your prompt.
 
 ```python
 from diffusers import StableDiffusionPipeline
diff --git a/examples/images/dreambooth/requirements.txt b/examples/images/dreambooth/requirements.txt
index 6c4f40fb5dd0..1ec828c630ef 100644
--- a/examples/images/dreambooth/requirements.txt
+++ b/examples/images/dreambooth/requirements.txt
@@ -5,4 +5,3 @@ transformers>=4.21.0
 ftfy
 tensorboard
 modelcards
-colossalai
diff --git a/examples/images/dreambooth/requirements_colossalai.txt b/examples/images/dreambooth/requirements_colossalai.txt
deleted file mode 100644
index c4a0e91703bb..000000000000
--- a/examples/images/dreambooth/requirements_colossalai.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-diffusers
-torch
-torchvision
-ftfy
-tensorboard
-modelcards
-transformers
-colossalai==0.2.0+torch1.12cu11.3 -f https://release.colossalai.org

From e7f3bed2d36c5406e9a9ab92438be46a5f9258d7 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 21 Mar 2023 17:39:30 +0800
Subject: [PATCH 491/503] [booster] added the plugin base and torch ddp plugin
 (#3180)

* [booster] added the plugin base and torch ddp plugin

* polish code

* polish code

* polish code
---
 colossalai/booster/booster.py                 |  90 ++++++-----
 colossalai/booster/plugin.py                  |  46 ------
 colossalai/booster/plugin/__init__.py         |   4 +
 colossalai/booster/plugin/plugin_base.py      |  51 ++++++
 colossalai/booster/plugin/torch_ddp_plugin.py | 147 ++++++++++++++++++
 tests/test_booster/test_accelerator.py        |  22 ++-
 .../test_mixed_precision/test_fp16_torch.py   |  21 ++-
 .../test_plugin/test_torch_ddp_plugin.py      |  85 ++++++++++
 8 files changed, 379 insertions(+), 87 deletions(-)
 delete mode 100644 colossalai/booster/plugin.py
 create mode 100644 colossalai/booster/plugin/__init__.py
 create mode 100644 colossalai/booster/plugin/plugin_base.py
 create mode 100644 colossalai/booster/plugin/torch_ddp_plugin.py
 create mode 100644 tests/test_booster/test_plugin/test_torch_ddp_plugin.py

diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 7d7f21ca6cf2..230c65a9e0a1 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -1,9 +1,9 @@
+import warnings
 from contextlib import contextmanager
-from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Callable, Iterator, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from torch import Tensor
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
@@ -55,27 +55,43 @@ def __init__(self,
                  device: str = 'cuda',
                  mixed_precision: Union[MixedPrecision, str] = None,
                  plugin: Optional[Plugin] = None) -> None:
-        # TODO(FrankLeeeee): add plugin control logic
-        # if self.plugin is not None and self.plugin.control_accelerator:
-        #     ...
-        # create acclerator
-        self.acceleartor = Accelerator(device)
-        self.acceleartor.set_default_device()
-
-        # validate and set precision
-        if isinstance(MixedPrecision, str):
-            # the user will take the default arguments for amp training
-            self.mixed_precision = mixed_precision_factory(mixed_precision)
-        elif isinstance(mixed_precision, MixedPrecision):
-            # the user can customize the arguments by passing the precision object
-            self.mixed_precision = mixed_precision
+        if plugin is not None:
+            assert isinstance(
+                plugin, Plugin), f'Expected the argument plugin to be an instance of Plugin, but got {type(plugin)}.'
+        self.plugin = plugin
+
+        # set accelerator
+        if self.plugin and self.plugin.control_device:
+            self.accelerator = None
+            warnings.warn('The plugin will control the accelerator, so the device argument will be ignored.')
         else:
-            raise ValueError(
-                f'Expected the argument mixed_precision to be a string or an instance of Precision, but got {type(mixed_precision)}.'
-            )
+            self.accelerator = Accelerator(device)
 
-    def boost(self, model: nn.Module, optimizer: Optimizer, criterion: Callable, lr_scheduler: LRScheduler,
-              dataloader: DataLoader) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
+        # set precision
+        if mixed_precision is None or (self.plugin and self.plugin.control_precision):
+            self.mixed_precision = None
+            warnings.warn('The plugin will control the precision, so the mixed_precision argument will be ignored.')
+        else:
+            # validate and set precision
+            if isinstance(MixedPrecision, str):
+                # the user will take the default arguments for amp training
+                self.mixed_precision = mixed_precision_factory(mixed_precision)
+            elif isinstance(mixed_precision, MixedPrecision):
+                # the user can customize the arguments by passing the precision object
+                self.mixed_precision = mixed_precision
+            else:
+                raise ValueError(
+                    f'Expected the argument mixed_precision to be a string or an instance of Precision, but got {type(mixed_precision)}.'
+                )
+
+    def boost(
+        self,
+        model: nn.Module,
+        optimizer: Optimizer,
+        criterion: Callable = None,
+        dataloader: DataLoader = None,
+        lr_scheduler: LRScheduler = None,
+    ) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
         """
         Boost the model, optimizer, criterion, lr_scheduler, and dataloader.
 
@@ -83,22 +99,25 @@ def boost(self, model: nn.Module, optimizer: Optimizer, criterion: Callable, lr_
             model (nn.Module): The model to be boosted.
             optimizer (Optimizer): The optimizer to be boosted.
             criterion (Callable): The criterion to be boosted.
-            lr_scheduler (LRScheduler): The lr_scheduler to be boosted.
             dataloader (DataLoader): The dataloader to be boosted.
+            lr_scheduler (LRScheduler): The lr_scheduler to be boosted.
         """
-        # TODO(FrankLeeeee): add plugin control logic
-        # if self.plugin is not None and self.plugin.control_accelerator:
-        #     ...
-        model = self.acceleartor.configure_model(model)
-
         # TODO(FrankLeeeee): consider multi-model and multi-optimizer case
-        # TODO(lsg): Add plugin control logic
-        # e.g.
-        # if self.plugin is not None and self.plugin.control_boost:
-        #    ...
+        # TODO(FrankLeeeee): consider multi-dataloader case
         # transform model for mixed precision
-        model, optimizer, criterion = self.mixed_precision.configure(model, optimizer, criterion)
-        return model, optimizer, criterion, lr_scheduler, dataloader
+        if self.plugin:
+            model, optimizer, criterion, dataloader, lr_scheduler = self.plugin.configure(
+                model, optimizer, criterion, dataloader, lr_scheduler)
+
+        if self.plugin and not self.plugin.control_device:
+            # transform model for accelerator
+            model = self.accelerator.configure(model)
+
+        if self.mixed_precision and self.plugin and not self.plugin.control_precision:
+            # transform model for mixed precision
+            model, optimizer, criterion = self.mixed_precision.configure(model, optimizer, criterion)
+
+        return model, optimizer, criterion, dataloader, lr_scheduler
 
     def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:
         # TODO: implement this method with plugin
@@ -117,8 +136,9 @@ def execute_pipeline(self,
         pass
 
     def no_sync(self, model: nn.Module) -> contextmanager:
-        # TODO: implement this method
-        pass
+        assert self.plugin is not None, f'no_sync is only enabled when a plugin is provided and the plugin supports no_sync.'
+        assert self.plugin.support_no_sync, f'The plugin {self.plugin.__class__.__name__} does not support no_sync.'
+        return self.plugin.no_sync(model)
 
     def save(self,
              obj: Union[nn.Module, Optimizer, LRScheduler],
diff --git a/colossalai/booster/plugin.py b/colossalai/booster/plugin.py
deleted file mode 100644
index 32e0a7bde3f7..000000000000
--- a/colossalai/booster/plugin.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from typing import List, Tuple
-
-import torch
-import torch.nn as nn
-from torch.optim import Optimizer
-from torch.utils.data import DataLoader
-
-from colossalai.device.device_mesh import DeviceMesh
-
-__all__ = ['Plugin']
-
-
-class Plugin:
-
-    @property
-    def supported_devices(self) -> List[torch.device]:
-        pass
-
-    @property
-    def supported_precisions(self) -> List[str]:
-        pass
-
-    @property
-    def control_precision(self) -> bool:
-        pass
-
-    @property
-    def control_device(self) -> bool:
-        pass
-
-    @property
-    def support_no_sync(self) -> bool:
-        pass
-
-    def setup_model(self, model: nn.Module, device_mesh_pool: DeviceMesh) -> nn.Module:
-        pass
-
-    def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
-        pass
-
-    def setup_dataloader(self, dataloader: DataLoader) -> DataLoader:
-        pass
-
-    @property
-    def device_mesh_shape(self) -> List[Tuple[int, ...]]:
-        pass
diff --git a/colossalai/booster/plugin/__init__.py b/colossalai/booster/plugin/__init__.py
new file mode 100644
index 000000000000..3328fe2b9627
--- /dev/null
+++ b/colossalai/booster/plugin/__init__.py
@@ -0,0 +1,4 @@
+from .plugin_base import Plugin
+from .torch_ddp_plugin import TorchDDPPlugin
+
+__all__ = ['Plugin', 'TorchDDPPlugin']
diff --git a/colossalai/booster/plugin/plugin_base.py b/colossalai/booster/plugin/plugin_base.py
new file mode 100644
index 000000000000..3c347cb4252d
--- /dev/null
+++ b/colossalai/booster/plugin/plugin_base.py
@@ -0,0 +1,51 @@
+from abc import ABC, abstractmethod
+from typing import Callable, List, Tuple, Union
+
+import torch.nn as nn
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+from torch.utils.data import DataLoader
+
+from colossalai.booster.interface import OptimizerWrapper
+
+__all__ = ['Plugin']
+
+
+class Plugin(ABC):
+
+    @property
+    @abstractmethod
+    def supported_devices(self) -> List[str]:
+        pass
+
+    @property
+    @abstractmethod
+    def supported_precisions(self) -> List[str]:
+        pass
+
+    @property
+    @abstractmethod
+    def control_precision(self) -> bool:
+        pass
+
+    @property
+    @abstractmethod
+    def control_device(self) -> bool:
+        pass
+
+    @property
+    @abstractmethod
+    def support_no_sync(self) -> bool:
+        pass
+
+    @abstractmethod
+    def configure(
+        self,
+        model: nn.Module,
+        optimizer: Optimizer,
+        criterion: Callable = None,
+        dataloader: DataLoader = None,
+        lr_scheduler: LRScheduler = None,
+    ) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader]]:
+        # implement this method
+        pass
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
new file mode 100644
index 000000000000..07d6be8c748d
--- /dev/null
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -0,0 +1,147 @@
+import random
+from typing import Callable, List, Tuple, Union
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+
+from colossalai.booster.interface import OptimizerWrapper
+
+from .plugin_base import Plugin
+
+__all__ = ['TorchDDPPlugin']
+
+
+class TorchDDPPlugin(Plugin):
+    """
+    Plugin for PyTorch DDP.
+
+    Example:
+        >>> from colossalai.booster import Booster
+        >>> from colossalai.booster.plugin import TorchDDPPlugin
+        >>>
+        >>> model, train_dataset, optimizer, criterion = ...
+        >>> plugin = TorchDDPPlugin()
+
+        >>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8)
+        >>> booster = Booster(plugin=plugin)
+        >>> model, optimizer, train_dataloader, criterion = booster.boost(model, optimizer, train_dataloader, criterion)
+
+    Args:
+        broadcast_buffers (bool, optional): Whether to broadcast module buffers at the beginning of each forward pass. Defaults to True.
+        bucket_cap_mb (int, optional): The bucket size in MB. Defaults to 25.
+        find_unused_parameters (bool, optional): Whether to find unused parameters. Defaults to False.
+        check_reduction (bool, optional): Whether to check reduction. Defaults to False.
+        gradient_as_bucket_view (bool, optional): Whether to use gradient as bucket view. Defaults to False.
+        static_graph (bool, optional): Whether to use static graph. Defaults to False.
+    """
+
+    def __init__(self,
+                 broadcast_buffers: bool = True,
+                 bucket_cap_mb: int = 25,
+                 find_unused_parameters: bool = False,
+                 check_reduction: bool = False,
+                 gradient_as_bucket_view: bool = False,
+                 static_graph: bool = False) -> None:
+
+        assert dist.is_initialized(
+        ), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment'
+        self.rank = dist.get_rank()
+        self.world_size = dist.get_world_size()
+        self.ddp_kwargs = dict(broadcast_buffers=broadcast_buffers,
+                               bucket_cap_mb=bucket_cap_mb,
+                               find_unused_parameters=find_unused_parameters,
+                               check_reduction=check_reduction,
+                               gradient_as_bucket_view=gradient_as_bucket_view,
+                               static_graph=static_graph)
+
+    def support_no_sync(self) -> bool:
+        return True
+
+    def control_precision(self) -> bool:
+        return False
+
+    def supported_precisions(self) -> List[str]:
+        return ['fp16', 'fp16_apex', 'bf16', 'fp8']
+
+    def control_device(self) -> bool:
+        return True
+
+    def supported_devices(self) -> List[str]:
+        return ['cuda']
+
+    def prepare_train_dataloader(self,
+                                 dataset,
+                                 batch_size,
+                                 shuffle=False,
+                                 seed=1024,
+                                 drop_last=False,
+                                 pin_memory=False,
+                                 num_workers=0,
+                                 **kwargs):
+        r"""
+        Prepare a dataloader for distributed training. The dataset will be wrapped by
+        `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`.
+
+        Note:
+            1. Evaluation datasets should not be passed to this function.
+
+        Args:
+            dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
+            shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
+            seed (int, optional): Random worker seed for sampling, defaults to 1024.
+            batch_size (int): Batch size passed to the underlying ``DataLoader``.
+            drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
+                is not divisible by the batch size. If False and the size of dataset is not divisible by
+                the batch size, then the last batch will be smaller, defaults to False.
+            pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
+            num_workers (int, optional): Number of worker subprocesses for this dataloader. Defaults to 0.
+            kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
+                    `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
+
+        Returns:
+            :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
+        """
+        _kwargs = kwargs.copy()
+        sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle)
+
+        # Deterministic dataloader
+        def seed_worker(worker_id):
+            worker_seed = seed
+            np.random.seed(worker_seed)
+            torch.manual_seed(worker_seed)
+            random.seed(worker_seed)
+
+        return DataLoader(dataset,
+                          batch_size=batch_size,
+                          sampler=sampler,
+                          worker_init_fn=seed_worker,
+                          drop_last=drop_last,
+                          pin_memory=pin_memory,
+                          num_workers=num_workers,
+                          **_kwargs)
+
+    def configure(
+        self,
+        model: nn.Module,
+        optimizer: Optimizer,
+        criterion: Callable = None,
+        dataloader: DataLoader = None,
+        lr_scheduler: LRScheduler = None,
+    ) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader]]:
+        # cast model to cuda
+        model = model.cuda()
+
+        # wrap the model with PyTorch DDP
+        model = DDP(model, **self.ddp_kwargs)
+
+        if not isinstance(optimizer, OptimizerWrapper):
+            optimizer = OptimizerWrapper(optimizer)
+
+        return model, optimizer, criterion, dataloader, lr_scheduler
diff --git a/tests/test_booster/test_accelerator.py b/tests/test_booster/test_accelerator.py
index 4bfa3fd0631e..6958a87e2a08 100644
--- a/tests/test_booster/test_accelerator.py
+++ b/tests/test_booster/test_accelerator.py
@@ -1,13 +1,27 @@
-import pytest
+from functools import partial
+
+import torch.multiprocessing as mp
 import torch.nn as nn
-from torchvision.models import resnet18
 
 from colossalai.booster.accelerator import Accelerator
+from colossalai.testing import parameterize, rerun_if_address_is_in_use
 
 
-@pytest.mark.parametrize('device', ['cpu', 'cuda'])
-def test_accelerator(device):
+@parameterize('device', ['cpu', 'cuda'])
+def run_accelerator(device):
     acceleartor = Accelerator(device)
     model = nn.Linear(8, 8)
     model = acceleartor.configure_model(model)
     assert next(model.parameters()).device.type == device
+    del model, acceleartor
+
+
+def run_dist(rank):
+    run_accelerator()
+
+
+@rerun_if_address_is_in_use()
+def test_accelerator():
+    world_size = 1
+    run_func = partial(run_dist)
+    mp.spawn(run_func, nprocs=world_size)
diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index 98d00cd2caca..bacf29014193 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -1,12 +1,21 @@
+from functools import partial
+
 import torch
+import torch.multiprocessing as mp
 from torch.optim import Adam
 
+import colossalai
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.utils import free_port
 from tests.kit.model_zoo import model_zoo
 
 
-def test_torch_amp():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _) in model_zoo.items():
+def run_torch_amp(rank, world_size, port):
+    # init dist env
+    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+    sub_model_zoo = model_zoo.get_sub_registry('timm')
+    for name, (model_fn, data_gen_fn, output_transform_fn, _) in sub_model_zoo.items():
         # dlrm_interactionarch has not parameters, so skip
         if name == 'dlrm_interactionarch':
             continue
@@ -27,3 +36,11 @@ def test_torch_amp():
         optimizer.backward(loss)
         optimizer.clip_grad_by_norm(1.0)
         optimizer.step()
+        del model, optimizer, criterion, data, output, mixed_precision
+
+
+@rerun_if_address_is_in_use()
+def test_torch_ddp_plugin():
+    world_size = 1
+    run_func = partial(run_torch_amp, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
new file mode 100644
index 000000000000..58aef54c4967
--- /dev/null
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -0,0 +1,85 @@
+from functools import partial
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim import SGD
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.interface import OptimizerWrapper
+from colossalai.booster.plugin import TorchDDPPlugin
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from tests.kit.model_zoo import model_zoo
+
+
+def check_torch_ddp_plugin():
+    plugin = TorchDDPPlugin()
+    booster = Booster(plugin=plugin)
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _) in model_zoo.items():
+        if name == 'dlrm_interactionarch':
+            continue
+
+        model = model_fn()
+        optimizer = SGD(model.parameters(), lr=1e-3)
+        criterion = lambda x: x.mean()
+        data = data_gen_fn()
+
+        data = {
+            k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
+        }
+
+        model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+        assert isinstance(model, DDP)
+        assert isinstance(optimizer, OptimizerWrapper)
+
+        output = model(**data)
+        output = output_transform_fn(output)
+        output_key = list(output.keys())[0]
+        loss = criterion(output[output_key])
+
+        booster.backward(loss, optimizer)
+        optimizer.clip_grad_by_norm(1.0)
+        optimizer.step()
+
+
+def check_dataloader_sharding():
+    plugin = TorchDDPPlugin()
+
+    # create a custom dataset holding the values 0 to 9
+    dataset = torch.utils.data.TensorDataset(torch.arange(0, 10))
+    train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2)
+
+    # get the first batch of data
+    batch = next(iter(train_dataloader))[0].cuda()
+    is_rank_0 = dist.get_rank() == 0
+
+    if is_rank_0:
+        batch_to_compare = batch.clone()
+    else:
+        batch_to_compare = batch
+    # broadcast rank 1's batch to all ranks so that rank 0 can compare it with its own
+    dist.broadcast(batch_to_compare, src=1)
+
+    # compare on rank 0
+    if is_rank_0:
+        assert not torch.equal(batch,
+                               batch_to_compare), 'Same number was found across ranks but expected it to be different'
+
+
+def run_dist(rank, world_size, port):
+    # init dist env
+    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+    check_dataloader_sharding()
+    check_torch_ddp_plugin()
+
+
+@rerun_if_address_is_in_use()
+def test_torch_ddp_plugin():
+    world_size = 2
+    run_func = partial(run_dist, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)

From b429529365ba15a4a72f6c6ba0f6556d9d9d1fe4 Mon Sep 17 00:00:00 2001
From: pgzhang <37991273+pgzhang@users.noreply.github.com>
Date: Wed, 22 Mar 2023 09:59:42 +0800
Subject: [PATCH 492/503] [chatgpt] add supervised learning fine-tune code
 (#3183)

* [chatgpt] add supervised fine-tune code

* [chatgpt] delete unused code and modified comment code

* [chatgpt] use pytorch distributed sampler instead

---------

Co-authored-by: zhangpengpeng <zhangpengpeng@joyy.com>
---
 .../ChatGPT/chatgpt/dataset/__init__.py       |   3 +-
 .../ChatGPT/chatgpt/dataset/sft_dataset.py    |  40 ++++++
 .../ChatGPT/chatgpt/models/base/__init__.py   |   3 +-
 .../ChatGPT/chatgpt/models/base/lm.py         |  33 +++++
 .../ChatGPT/chatgpt/models/bloom/__init__.py  |   3 +-
 .../ChatGPT/chatgpt/models/bloom/bloom_lm.py  |  36 ++++++
 .../ChatGPT/chatgpt/models/gpt/__init__.py    |   3 +-
 .../ChatGPT/chatgpt/models/gpt/gpt_lm.py      |  36 ++++++
 .../ChatGPT/chatgpt/models/opt/__init__.py    |   3 +-
 .../ChatGPT/chatgpt/models/opt/opt_lm.py      |  36 ++++++
 .../ChatGPT/chatgpt/trainer/__init__.py       |   3 +-
 applications/ChatGPT/chatgpt/trainer/sft.py   | 101 ++++++++++++++++
 applications/ChatGPT/examples/train_sft.py    | 114 ++++++++++++++++++
 applications/ChatGPT/examples/train_sft.sh    |  20 +++
 14 files changed, 428 insertions(+), 6 deletions(-)
 create mode 100644 applications/ChatGPT/chatgpt/dataset/sft_dataset.py
 create mode 100644 applications/ChatGPT/chatgpt/models/base/lm.py
 create mode 100644 applications/ChatGPT/chatgpt/models/bloom/bloom_lm.py
 create mode 100644 applications/ChatGPT/chatgpt/models/gpt/gpt_lm.py
 create mode 100644 applications/ChatGPT/chatgpt/models/opt/opt_lm.py
 create mode 100644 applications/ChatGPT/chatgpt/trainer/sft.py
 create mode 100644 applications/ChatGPT/examples/train_sft.py
 create mode 100755 applications/ChatGPT/examples/train_sft.sh

diff --git a/applications/ChatGPT/chatgpt/dataset/__init__.py b/applications/ChatGPT/chatgpt/dataset/__init__.py
index 83393098775f..78fd2c0705a9 100644
--- a/applications/ChatGPT/chatgpt/dataset/__init__.py
+++ b/applications/ChatGPT/chatgpt/dataset/__init__.py
@@ -1,4 +1,5 @@
 from .reward_dataset import RmStaticDataset, HhRlhfDataset
 from .utils import is_rank_0
+from .sft_dataset import SFTDataset
 
-__all__ = ['RmStaticDataset', 'HhRlhfDataset','is_rank_0']
+__all__ = ['RmStaticDataset', 'HhRlhfDataset','is_rank_0', 'SFTDataset']
diff --git a/applications/ChatGPT/chatgpt/dataset/sft_dataset.py b/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
new file mode 100644
index 000000000000..53ad205073e5
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
@@ -0,0 +1,40 @@
+from typing import Callable
+import random
+from torch.utils.data import Dataset
+import torch.distributed as dist
+from tqdm import tqdm
+import torch
+
+from .utils import is_rank_0
+
+
+class SFTDataset(Dataset):
+    """
+    Dataset for sft model
+
+    Args:
+        dataset: dataset for supervised model
+        tokenizer: tokenizer for supervised model
+        max_length: max length of input
+    """
+
+    def __init__(self, dataset, tokenizer: Callable, max_length: int=512) -> None:
+        super().__init__()
+        self.prompts = []
+
+        for data in tqdm(dataset, disable=not is_rank_0()):
+            prompt = data['prompt'] + data['completion'] + "<|endoftext|>"
+            prompt_token = tokenizer(prompt,
+                                     max_length=max_length,
+                                     padding="max_length",
+                                     truncation=True,
+                                     return_tensors="pt")
+
+            self.prompts.append(prompt_token)
+
+    def __len__(self):
+        length = len(self.prompts)
+        return length
+
+    def __getitem__(self, idx):
+        return self.prompts[idx]
diff --git a/applications/ChatGPT/chatgpt/models/base/__init__.py b/applications/ChatGPT/chatgpt/models/base/__init__.py
index 86f403556904..7c7b1ceba257 100644
--- a/applications/ChatGPT/chatgpt/models/base/__init__.py
+++ b/applications/ChatGPT/chatgpt/models/base/__init__.py
@@ -1,5 +1,6 @@
 from .actor import Actor
 from .critic import Critic
 from .reward_model import RewardModel
+from .lm import LM
 
-__all__ = ['Actor', 'Critic', 'RewardModel']
+__all__ = ['Actor', 'Critic', 'RewardModel', 'LM']
diff --git a/applications/ChatGPT/chatgpt/models/base/lm.py b/applications/ChatGPT/chatgpt/models/base/lm.py
new file mode 100644
index 000000000000..b6bd7aff8315
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/base/lm.py
@@ -0,0 +1,33 @@
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..generation import generate
+from .actor import Actor
+
+
+class LM(Actor):
+    """
+    Language model base class.
+
+    Args:
+        model (nn.Module): Language Model.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = 'none') -> None:
+        super().__init__(model=model, lora_rank=lora_rank, lora_train_bias=lora_train_bias)
+
+    def forward(self,
+                sequences: torch.LongTensor,
+                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Returns output log probs
+        """
+        output = self.model(sequences, attention_mask=attention_mask)
+        logits = output['logits']
+        log_probs = F.log_softmax(logits, dim=-1)
+        return log_probs
+
diff --git a/applications/ChatGPT/chatgpt/models/bloom/__init__.py b/applications/ChatGPT/chatgpt/models/bloom/__init__.py
index d0e7f7b1ef94..7d6d7753bb9a 100644
--- a/applications/ChatGPT/chatgpt/models/bloom/__init__.py
+++ b/applications/ChatGPT/chatgpt/models/bloom/__init__.py
@@ -1,5 +1,6 @@
 from .bloom_actor import BLOOMActor
 from .bloom_critic import BLOOMCritic
 from .bloom_rm import BLOOMRM
+from .bloom_lm import BLOOMLM
 
-__all__ = ['BLOOMActor', 'BLOOMCritic', 'BLOOMRM']
+__all__ = ['BLOOMActor', 'BLOOMCritic', 'BLOOMRM', 'BLOOMLM']
diff --git a/applications/ChatGPT/chatgpt/models/bloom/bloom_lm.py b/applications/ChatGPT/chatgpt/models/bloom/bloom_lm.py
new file mode 100644
index 000000000000..81e17f27c11a
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/bloom/bloom_lm.py
@@ -0,0 +1,36 @@
+from typing import Optional
+
+import torch
+from transformers import BloomConfig, BloomForCausalLM, BloomModel
+
+from ..base import LM
+
+
+class BLOOMLM(LM):
+    """
+    BLOOM language model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (BloomConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: str = None,
+                 config: Optional[BloomConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = BloomForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = BloomForCausalLM(config)
+        else:
+            model = BloomForCausalLM(BloomConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        super().__init__(model, lora_rank, lora_train_bias)
+
diff --git a/applications/ChatGPT/chatgpt/models/gpt/__init__.py b/applications/ChatGPT/chatgpt/models/gpt/__init__.py
index 63dc5ab0f5ea..c6ae05113cc0 100644
--- a/applications/ChatGPT/chatgpt/models/gpt/__init__.py
+++ b/applications/ChatGPT/chatgpt/models/gpt/__init__.py
@@ -1,5 +1,6 @@
 from .gpt_actor import GPTActor
 from .gpt_critic import GPTCritic
 from .gpt_rm import GPTRM
+from .gpt_lm import GPTLM
 
-__all__ = ['GPTActor', 'GPTCritic', 'GPTRM']
+__all__ = ['GPTActor', 'GPTCritic', 'GPTRM', 'GPTLM']
diff --git a/applications/ChatGPT/chatgpt/models/gpt/gpt_lm.py b/applications/ChatGPT/chatgpt/models/gpt/gpt_lm.py
new file mode 100644
index 000000000000..5740c80d3e77
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/gpt/gpt_lm.py
@@ -0,0 +1,36 @@
+from typing import Optional
+
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+from ..base import LM
+
+
+class GPTLM(LM):
+    """
+    GPT language model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (GPT2Config): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): Rank of the LoRa layer.
+        lora_train_bias (str): Bias training strategy for the LoRa layer.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[GPT2Config] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = GPT2LMHeadModel.from_pretrained(pretrained)
+        elif config is not None:
+            model = GPT2LMHeadModel(config)
+        else:
+            model = GPT2LMHeadModel(GPT2Config())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        super().__init__(model, lora_rank, lora_train_bias)
+
diff --git a/applications/ChatGPT/chatgpt/models/opt/__init__.py b/applications/ChatGPT/chatgpt/models/opt/__init__.py
index 334f4df0032a..fccec3bdff99 100644
--- a/applications/ChatGPT/chatgpt/models/opt/__init__.py
+++ b/applications/ChatGPT/chatgpt/models/opt/__init__.py
@@ -1,5 +1,6 @@
 from .opt_actor import OPTActor
 from .opt_critic import OPTCritic
 from .opt_rm import OPTRM
+from .opt_lm import OPTLM
 
-__all__ = ['OPTActor', 'OPTCritic', 'OPTRM']
+__all__ = ['OPTActor', 'OPTCritic', 'OPTRM', 'OPTLM']
diff --git a/applications/ChatGPT/chatgpt/models/opt/opt_lm.py b/applications/ChatGPT/chatgpt/models/opt/opt_lm.py
new file mode 100644
index 000000000000..35bfe198a225
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/opt/opt_lm.py
@@ -0,0 +1,36 @@
+from typing import Optional
+
+from transformers.models.opt.configuration_opt import OPTConfig
+from transformers.models.opt.modeling_opt import OPTForCausalLM
+
+from ..base import LM
+
+
+class OPTLM(LM):
+    """
+    OPT language model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (OPTConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): Rank of the low-rank approximation.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[OPTConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = OPTForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = OPTForCausalLM(config)
+        else:
+            model = OPTForCausalLM(OPTConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        super().__init__(model, lora_rank, lora_train_bias)
+
diff --git a/applications/ChatGPT/chatgpt/trainer/__init__.py b/applications/ChatGPT/chatgpt/trainer/__init__.py
index c47c76347ee5..525b57bf21d3 100644
--- a/applications/ChatGPT/chatgpt/trainer/__init__.py
+++ b/applications/ChatGPT/chatgpt/trainer/__init__.py
@@ -1,5 +1,6 @@
 from .base import Trainer
 from .ppo import PPOTrainer
 from .rm import RewardModelTrainer
+from .sft import SFTTrainer
 
-__all__ = ['Trainer', 'PPOTrainer', 'RewardModelTrainer']
+__all__ = ['Trainer', 'PPOTrainer', 'RewardModelTrainer', 'SFTTrainer']
diff --git a/applications/ChatGPT/chatgpt/trainer/sft.py b/applications/ChatGPT/chatgpt/trainer/sft.py
new file mode 100644
index 000000000000..e3913d46bd45
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/trainer/sft.py
@@ -0,0 +1,101 @@
+from abc import ABC
+from typing import Optional
+import loralib as lora
+import torch
+from chatgpt.dataset import SFTDataset
+from chatgpt.models.loss import GPTLMLoss
+from torch.optim import Adam, Optimizer
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm
+import torch.distributed as dist
+from .strategies import Strategy
+from .utils import is_rank_0
+from colossalai.logging import get_dist_logger
+
+
+class SFTTrainer(ABC):
+    """
+        Trainer to use for supervised fine-tuning (SFT) of a language model.
+
+    Args:
+        model (torch.nn.Module): the model to train
+        strategy (Strategy): the strategy to use for training
+        optim(Optimizer): the optimizer to use for training
+        train_dataset (SFTDataset): the dataset to use for training
+        eval_dataset (SFTDataset): the dataset to use for evaluation
+        batch_size (int, defaults to 1): the batch size for the training and evaluation dataloaders
+        max_epochs (int, defaults to 2): the number of epochs to train
+        sampler (DistributedSampler, optional): sampler for the training dataloader; if None, the training data is shuffled instead
+    """
+
+    def __init__(
+        self,
+        model,
+        strategy: Strategy,
+        optim: Optimizer,
+        train_dataset: SFTDataset,
+        eval_dataset: SFTDataset,
+        sampler: Optional[DistributedSampler] = None,
+        batch_size: int = 1,
+        max_epochs: int = 2,
+    ) -> None:
+        super().__init__()
+        self.strategy = strategy
+        self.epochs = max_epochs
+        self.train_dataset = train_dataset
+        self.eval_dataset = eval_dataset
+        self.sampler = sampler
+
+        self.train_dataloader = DataLoader(self.train_dataset, shuffle=(sampler is None),
+                                           sampler=sampler, batch_size=batch_size)
+        self.eval_dataloader = DataLoader(self.eval_dataset, batch_size=batch_size)
+
+        self.model = strategy.setup_model(model)
+        if "DDP" in str(self.strategy):
+            self.model = self.model.module
+        self.loss_fn = GPTLMLoss()
+        self.optimizer = strategy.setup_optimizer(optim, self.model)
+
+    def fit(self, logger, use_lora, log_interval=10):
+        epoch_bar = tqdm(range(self.epochs), desc='Train epoch', disable=not is_rank_0())
+        for epoch in range(self.epochs):
+            if isinstance(self.sampler, DistributedSampler):
+                self.sampler.set_epoch(epoch)
+            # train
+            self.model.train()
+            for batch_id, batch in enumerate(self.train_dataloader):
+                prompt_ids = batch["input_ids"]
+                p_mask = batch["attention_mask"]
+                prompt_ids = prompt_ids.squeeze(1).cuda()
+                p_mask = p_mask.squeeze(1).cuda()
+                prompt_logits = self.model(prompt_ids, attention_mask=p_mask)
+
+                loss = self.loss_fn(prompt_logits, prompt_ids)
+                self.strategy.backward(loss, self.model, self.optimizer)
+                self.strategy.optimizer_step(self.optimizer)
+                self.optimizer.zero_grad()
+                if batch_id % log_interval == 0:
+                    logger.info(f'Train Epoch {epoch}/{self.epochs} Batch {batch_id} Rank {dist.get_rank()} loss {loss.item()}')
+
+            # eval
+            self.model.eval()
+            with torch.no_grad():
+                loss_sum = 0
+                num_seen = 0
+                for batch in self.eval_dataloader:
+                    prompt_ids = batch["input_ids"]
+                    p_mask = batch["attention_mask"]
+                    prompt_ids = prompt_ids.squeeze(1).cuda()
+                    p_mask = p_mask.squeeze(1).cuda()
+
+                    prompt_logits = self.model(prompt_ids, attention_mask=p_mask)
+                    loss = self.loss_fn(prompt_logits, prompt_ids)
+                    loss_sum += loss.item()
+                    num_seen += prompt_ids.size(0)
+
+                loss_mean = loss_sum / num_seen
+                if dist.get_rank() == 0:
+                    logger.info(f'Eval Epoch {epoch}/{self.epochs} loss {loss_mean}')
+            epoch_bar.update()
+
diff --git a/applications/ChatGPT/examples/train_sft.py b/applications/ChatGPT/examples/train_sft.py
new file mode 100644
index 000000000000..4b3f85a2a491
--- /dev/null
+++ b/applications/ChatGPT/examples/train_sft.py
@@ -0,0 +1,114 @@
+import argparse
+
+import loralib as lora
+import torch
+import torch.distributed as dist
+from torch.utils.data.distributed import DistributedSampler
+from chatgpt.dataset import SFTDataset
+from chatgpt.models.base import RewardModel
+from chatgpt.models.bloom import BLOOMLM
+from chatgpt.models.gpt import GPTLM
+from chatgpt.models.opt import OPTLM
+from chatgpt.trainer import SFTTrainer
+from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from datasets import load_dataset
+from torch.optim import Adam
+from transformers import AutoTokenizer, BloomTokenizerFast
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.logging import get_dist_logger
+
+
+def train(args):
+    # configure strategy
+    if args.strategy == 'naive':
+        strategy = NaiveStrategy()
+    elif args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    # configure model
+    with strategy.model_init_context():
+        if args.model == 'bloom':
+            model = BLOOMLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        elif args.model == 'opt':
+            model = OPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        elif args.model == 'gpt2':
+            model = GPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        else:
+            raise ValueError(f'Unsupported model "{args.model}"')
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+    tokenizer.pad_token = tokenizer.eos_token
+
+    max_len = 512
+
+    # configure optimizer
+    if args.strategy.startswith('colossalai'):
+        optim = HybridAdam(model.parameters(), lr=5e-5)
+    else:
+        optim = Adam(model.parameters(), lr=5e-5)
+
+    logger = get_dist_logger()
+
+    train_data = load_dataset(args.dataset, 'super_natural_instructions', split='train')
+    eval_data = load_dataset(args.dataset, 'super_natural_instructions', split='test')
+
+    train_dataset = SFTDataset(train_data, tokenizer, max_len)
+    eval_dataset = SFTDataset(eval_data, tokenizer, max_len)
+
+    if dist.is_initialized() and dist.get_world_size() > 1:
+        sampler = DistributedSampler(train_dataset, shuffle=True, seed=42, drop_last=True)
+        logger.info("Using Distributed Sampler")
+    else:
+        sampler = None
+
+    trainer = SFTTrainer(model=model,
+                         strategy=strategy,
+                         optim=optim,
+                         train_dataset=train_dataset,
+                         eval_dataset=eval_dataset,
+                         sampler=sampler,
+                         batch_size=args.batch_size,
+                         max_epochs=args.max_epochs)
+
+    trainer.fit(logger=logger, use_lora=args.lora_rank, log_interval=args.log_interval)
+
+    # save model checkpoint after fitting on only rank0
+    strategy.save_model(model, 'sft_checkpoint.pt', only_rank0=True)
+    # save optimizer checkpoint on all ranks
+    strategy.save_optimizer(optim, 'sft_optim_checkpoint_%d.pt' % (torch.cuda.current_device()), only_rank0=False)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='naive')
+    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt'], default='bloom')
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--dataset', type=str, default='yizhongw/self_instruct')
+    parser.add_argument('--save_path', type=str, default='sft_ckpt.pth')
+    parser.add_argument('--max_epochs', type=int, default=1)
+    parser.add_argument('--batch_size', type=int, default=4)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
+    args = parser.parse_args()
+    train(args)
+
diff --git a/applications/ChatGPT/examples/train_sft.sh b/applications/ChatGPT/examples/train_sft.sh
new file mode 100755
index 000000000000..9f747b24689e
--- /dev/null
+++ b/applications/ChatGPT/examples/train_sft.sh
@@ -0,0 +1,20 @@
+set_n_least_used_CUDA_VISIBLE_DEVICES() {    # export CUDA_VISIBLE_DEVICES with the $1 (default 9999) GPUs that report the least used memory
+    local n=${1:-"9999"}    # upper bound on how many GPU ids are kept
+    echo "GPU Memory Usage:"    # the numbered list follows via tee /dev/tty; NOTE(review): /dev/tty presumably needs a controlling terminal - confirm for batch jobs
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+        | tail -n +2 \
+        | nl -v 0 \
+        | tee /dev/tty \
+        | sort -g -k 2 \
+        | awk '{print $1}' \
+        | head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')    # unquoted echo puts the ids on one space-separated line; sed turns the spaces into commas
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 8    # keep at most 8 GPUs visible; the active torchrun line below starts 2 processes
+
+#torchrun --standalone --nproc_per_node=2 train_sft.py --pretrain 'bigscience/bloomz-560m' --model 'bloom' --strategy colossalai_zero2 --log_interval 10
+#torchrun --standalone --nproc_per_node=8 train_sft.py  --model 'gpt2' --strategy colossalai_zero2 --batch_size 1 --log_interval 10
+torchrun --standalone --nproc_per_node=2 train_sft.py --pretrain "facebook/opt-350m" --model 'opt' --strategy colossalai_zero2 --log_interval 10

From f57d34958babae9781e351f8f8008ad0f47f01dd Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 22 Mar 2023 10:40:33 +0800
Subject: [PATCH 493/503] [FX] refactor experimental tracer and adapt it with
 hf models (#3157)

* pass gpt trace and meta_prop

* pass t5 trace and meta_prop

* [FX] refactor experimental tracer and adapt it with hf models

* pass all mainstream model zoo

* fix CI

* fix CI

* fix CI

* fix CI

* fix CI

* fix CI

* fix CI

* fix CI

* skip tests

* fix CI

* using packaging version

* polish
---
 .../_subclasses/_meta_registration.py         | 829 +++++++++---------
 .../_analyzer/_subclasses/_monkey_patch.py    |  76 +-
 .../_analyzer/_subclasses/flop_tensor.py      | 236 ++---
 colossalai/_analyzer/fx/__init__.py           |   3 +-
 colossalai/_analyzer/fx/graph_module.py       |  72 +-
 colossalai/_analyzer/fx/node_util.py          |   6 +-
 colossalai/_analyzer/fx/passes/shape_prop.py  |  19 +-
 colossalai/_analyzer/fx/tracer/__init__.py    |   2 +
 .../fx/{ => tracer}/bias_addition.py          |   3 +-
 .../_analyzer/fx/tracer/custom_leaf_module.py |  29 +
 colossalai/_analyzer/fx/tracer/proxy.py       | 112 +++
 .../_analyzer/fx/tracer/symbolic_trace.py     | 157 ++++
 .../{symbolic_trace.py => tracer/tracer.py}   | 355 ++------
 tests/kit/model_zoo/__init__.py               |   1 -
 tests/kit/model_zoo/transformers/gpt.py       |  10 +-
 .../test_fx/test_bias_addition.py             |   3 +-
 .../test_hf_model/hf_tracer_utils.py          |   3 +-
 .../test_hf_model/test_hf_albert.py           |   4 +
 .../test_tracer/test_hf_model/test_hf_bert.py |   4 +
 .../test_tracer/test_hf_model/test_hf_gpt.py  |  12 +-
 .../test_tracer/test_hf_model/test_hf_opt.py  |   4 +
 .../test_tracer/test_hf_model/test_hf_t5.py   |   4 +
 .../test_timm_model/test_timm_model.py        |   5 +-
 .../test_torchaudio_model.py                  |   8 +-
 .../test_torchaudio_model/torchaudio_utils.py |   2 +-
 .../test_torchrec_model/test_deepfm_model.py  |   2 +-
 .../test_torchrec_model/test_dlrm_model.py    |   2 +-
 .../test_torchvision_model.py                 |   2 +-
 28 files changed, 1058 insertions(+), 907 deletions(-)
 create mode 100644 colossalai/_analyzer/fx/tracer/__init__.py
 rename colossalai/_analyzer/fx/{ => tracer}/bias_addition.py (98%)
 create mode 100644 colossalai/_analyzer/fx/tracer/custom_leaf_module.py
 create mode 100644 colossalai/_analyzer/fx/tracer/proxy.py
 create mode 100644 colossalai/_analyzer/fx/tracer/symbolic_trace.py
 rename colossalai/_analyzer/fx/{symbolic_trace.py => tracer/tracer.py} (53%)

diff --git a/colossalai/_analyzer/_subclasses/_meta_registration.py b/colossalai/_analyzer/_subclasses/_meta_registration.py
index 20ab46054c8e..2af7e05399af 100644
--- a/colossalai/_analyzer/_subclasses/_meta_registration.py
+++ b/colossalai/_analyzer/_subclasses/_meta_registration.py
@@ -6,11 +6,15 @@
 from typing import Callable, List, Optional, Tuple, Union
 
 import torch
+from packaging import version
 from torch.utils._pytree import tree_map
 
 aten = torch.ops.aten
 
-meta_lib = torch.library.Library("aten", "IMPL", "Meta")
+try:
+    meta_lib = torch.library.Library("aten", "IMPL", "Meta")
+except AttributeError:    # presumably raised when the installed torch has no `torch.library` - TODO confirm
+    meta_lib = None    # fall back to None instead of failing while the module is imported
 
 meta_table = {}
 
@@ -50,432 +54,411 @@ def add_func(op):
     return wrapper
 
 
-# ============================== Convolutions ======================================
-# https://github.com/pytorch/pytorch/pull/79834
-@register_meta(aten.convolution.default)
-def meta_conv(
-    input_tensor: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    stride: List[int],
-    padding: List[int],
-    dilation: List[int],
-    is_transposed: bool,
-    output_padding: List[int],
-    groups: int,
-):
-
-    def _formula(ln: int, p: int, d: int, k: int, s: int) -> int:
-        """
-        Formula to apply to calculate the length of some dimension of the output
-        See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
-        Args:
-            ln: length of the dimension
-            p: padding in that dim
-            d: dilation in that dim
-            k: kernel size in that dim
-            s: stride in that dim
-        Returns:
-            The output length
-        """
-        return (ln + 2 * p - d * (k - 1) - 1) // s + 1
-
-    def _formula_transposed(ln: int, p: int, d: int, k: int, s: int, op: int) -> int:
-        """
-        Formula to apply to calculate the length of some dimension of the output
-        if transposed convolution is used.
-        See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html
-        Args:
-            ln: length of the dimension
-            p: padding in that dim
-            d: dilation in that dim
-            k: kernel size in that dim
-            s: stride in that dim
-            op: output padding in that dim
-        Returns:
-            The output length
-        """
-        return (ln - 1) * s - 2 * p + d * (k - 1) + op + 1
-
-    def calc_conv_nd_return_shape(
-        dims: torch.Size,
-        kernel_size: torch.Size,
-        stride: Union[List[int], int],
-        padding: Union[List[int], int],
-        dilation: Union[List[int], int],
-        output_padding: Optional[Union[List[int], int]] = None,
+if version.parse(torch.__version__) >= version.parse('1.12.0'):
+    # ============================== Convolutions ======================================
+    # https://github.com/pytorch/pytorch/pull/79834
+    @register_meta(aten.convolution.default)
+    def meta_conv(
+        input_tensor: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        stride: List[int],
+        padding: List[int],
+        dilation: List[int],
+        is_transposed: bool,
+        output_padding: List[int],
+        groups: int,
     ):
-        ret_shape = []
-        if isinstance(stride, int):
-            stride = [stride] * len(dims)
-        elif len(stride) == 1:
-            stride = [stride[0]] * len(dims)
-
-        if isinstance(padding, int):
-            padding = [padding] * len(dims)
-        elif len(padding) == 1:
-            padding = [padding[0]] * len(dims)
-
-        if isinstance(dilation, int):
-            dilation = [dilation] * len(dims)
-        elif len(dilation) == 1:
-            dilation = [dilation[0]] * len(dims)
-
-        output_padding_list: Optional[List[int]] = None
-        if output_padding:
-            if isinstance(output_padding, int):
-                output_padding_list = [output_padding] * len(dims)
-            elif len(output_padding) == 1:
-                output_padding_list = [output_padding[0]] * len(dims)
-            else:
-                output_padding_list = output_padding
-
-        for i in range(len(dims)):
-            # If output_padding is present, we are dealing with a transposed convolution
-            if output_padding_list:
-                ret_shape.append(
-                    _formula_transposed(
-                        dims[i],
-                        padding[i],
-                        dilation[i],
-                        kernel_size[i],
-                        stride[i],
-                        output_padding_list[i],
-                    ))
-            else:
-                ret_shape.append(_formula(dims[i], padding[i], dilation[i], kernel_size[i], stride[i]))
-        return ret_shape
-
-    def pick_memory_format():
-        if input_tensor.is_contiguous(memory_format=torch.channels_last):
-            return torch.channels_last
-        elif input_tensor.is_contiguous(memory_format=torch.contiguous_format):
-            return torch.contiguous_format
-        elif input_tensor.is_contiguous(memory_format=torch.preserve_format):
-            return torch.preserve_format
-
-    kernel_size = weight.shape[2:]
-    dims = input_tensor.shape[2:]
-    if is_transposed:
-        out_channels = groups * weight.shape[1]
-
-        shape_out = calc_conv_nd_return_shape(
-            dims,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            output_padding,
-        )
-
-    else:
-        out_channels = weight.shape[0]
-        if weight.shape[1] != input_tensor.shape[1] / groups:
-            raise RuntimeError("Invalid channel dimensions")
-        shape_out = calc_conv_nd_return_shape(dims, kernel_size, stride, padding, dilation)
-    out = input_tensor.new_empty((input_tensor.shape[0], out_channels, *shape_out))
-    mem_fmt = pick_memory_format()
-    out = out.to(memory_format=mem_fmt)    # type: ignore[call-overload]
-    return out
-
-
-@register_meta(aten._convolution.default)
-def meta__conv(input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, stride: List[int],
-               padding: List[int], dilation: List[int], is_transposed: bool, output_padding: List[int], groups: int,
-               *extra_args):
-    out = meta_conv(input_tensor, weight, bias, stride, padding, dilation, is_transposed, output_padding, groups)
-    return out
-
-
-@register_meta(aten.convolution_backward.default)
-def meta_conv_backward(grad_output: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, bias_sizes, stride,
-                       padding, dilation, transposed, output_padding, groups, output_mask):
-    return new_like(input), new_like(weight), new((bias_sizes))
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AdaptiveAveragePooling.cpp
-@register_meta(aten._adaptive_avg_pool2d_backward.default)
-def meta_adaptive_avg_pool2d_backward(
-    grad_output: torch.Tensor,
-    input: torch.Tensor,
-):
-    return new_like(input)
-
-
-# ================================ RNN =============================================
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
-@register_meta(aten._cudnn_rnn.default)
-def meta_cuda_rnn(
-    input,
-    weight,
-    weight_stride0,
-    weight_buf,
-    hx,
-    cx,
-    mode,
-    hidden_size,
-    proj_size,
-    num_layers,
-    batch_first,
-    dropout,
-    train,
-    bidirectional,
-    batch_sizes,
-    dropout_state,
-):
-
-    is_input_packed = len(batch_sizes) != 0
-    if is_input_packed:
-        seq_length = len(batch_sizes)
-        mini_batch = batch_sizes[0]
-        batch_sizes_sum = input.shape[0]
-    else:
-        seq_length = input.shape[1] if batch_first else input.shape[0]
-        mini_batch = input.shape[0] if batch_first else input.shape[1]
-        batch_sizes_sum = -1
-
-    num_directions = 2 if bidirectional else 1
-    out_size = proj_size if proj_size != 0 else hidden_size
-    if is_input_packed:
-        out_shape = [batch_sizes_sum, out_size * num_directions]
-    else:
-        out_shape = ([mini_batch, seq_length, out_size *
-                      num_directions] if batch_first else [seq_length, mini_batch, out_size * num_directions])
-    output = input.new_empty(out_shape)
-
-    cell_shape = [num_layers * num_directions, mini_batch, hidden_size]
-    cy = new(0) if cx is None else cx.new_empty(cell_shape)
-
-    hy = hx.new_empty([num_layers * num_directions, mini_batch, out_size])
-
-    # TODO: Query cudnnGetRNNTrainingReserveSize (expose to python)
-    reserve_shape = 0 if train else 0
-    reserve = input.new_empty(reserve_shape, dtype=torch.uint8)
-
-    return output, hy, cy, reserve, weight_buf
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
-@register_meta(aten._cudnn_rnn_backward.default)
-def meta_cudnn_rnn_backward(input: torch.Tensor,
-                            weight: torch.Tensor,
-                            weight_stride0: int,
-                            hx: torch.Tensor,
-                            cx: Optional[torch.Tensor] = None,
-                            *args,
-                            **kwargs):
-    return new_like(input), new_like(weight), new_like(hx), new_like(cx) if cx is not None else new(
-        ())    # (grad_input, grad_weight, grad_hx, grad_cx)
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Activation.cpp
-# ============================== Activations =======================================
-_unregistered_ewise = [
-    aten.relu.default,
-    aten.prelu.default,
-    aten.hardswish.default,
-    aten.hardtanh.default,
-    aten.prelu_backward.default,
-    aten.hardswish_backward.default,
-    aten.hardtanh_backward.default,
-]
-
-
-@register_meta(_unregistered_ewise)
-def meta_unregistered_ewise(input: torch.Tensor, *args):
-    return new_like(input)
-
-
-# ============================== Normalization =====================================
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
-@register_meta(aten.native_batch_norm.default)
-def meta_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
-    n_input = input.size(1)
-    return new_like(input), new((n_input)), new((n_input))
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
-@register_meta(aten.native_batch_norm_backward.default)
-def meta_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var, save_mean,
-                     save_invstd, train, eps, output_mask):
-    return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta)
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
-@register_meta(aten.cudnn_batch_norm.default)
-def meta_cudnn_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
-    n_input = input.size(1)
-    return new_like(input), new((n_input)), new((n_input)), new(
-        (0), dtype=torch.uint8)    # (output, running_mean, running_var, reserve)
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
-# NB: CuDNN only implements the backward algorithm for batchnorm
-# in training mode (evaluation mode batchnorm has a different algorithm),
-# which is why this doesn't accept a 'training' parameter.
-@register_meta(aten.cudnn_batch_norm_backward.default)
-def meta_cudnn_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var,
-                           save_mean, save_invstd, eps, reserve):
-    return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta)
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
-@register_meta(aten.native_layer_norm.default)
-def meta_ln(input: torch.Tensor, normalized_shape, weight, bias, eps):
-    bs, n_input = input.size(0), input.size(1)
-    return new_like(input), new((bs, n_input, 1)), new((bs, n_input, 1))    # (output, running_mean, running_var)
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
-@register_meta(aten.native_layer_norm_backward.default)
-def meta_ln_backward(dY: torch.Tensor, input: torch.Tensor, normalized_shape, mean, rstd, weight, bias,
-                     grad_input_mask):
-    return new_like(input), new_like(weight), new_like(bias)    # (dX, dgamma, dbeta)
-
-
-# ================================== Misc ==========================================
-# Maybe incorrect
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Im2Col.cpp
-@register_meta(aten.im2col.default)
-def meta_im2col(input: torch.Tensor, kernel_size, dilation, padding, stride):
-    return new_like(input)
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
-@register_meta(aten.eye.m_out)
-def meta_eye(n: int, m: int, out: torch.Tensor):
-    return out
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
-@register_meta(aten.roll.default)
-def meta_roll(input: torch.Tensor, shifts, dims):
-    return input
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Scalar.cpp
-@register_meta(aten._local_scalar_dense.default)
-def meta_local_scalar_dense(self: torch.Tensor):
-    return 0
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorCompare.cpp
-@register_meta(aten.where.self)
-def meta_where_self(condition: torch.Tensor, self: torch.Tensor, other: torch.Tensor):
-    result_type = torch.result_type(self, other)
-    return new_like(condition + self + other, dtype=result_type)
-
-
-@register_meta(aten.index.Tensor)
-def meta_index_Tensor(self, indices):
-    assert indices, "at least one index must be provided"
-    # aten::index is the internal advanced indexing implementation
-    # checkIndexTensorTypes and expandTensors
-    result: List[Optional[torch.Tensor]] = []
-    for i, index in enumerate(indices):
-        if index is not None:
-            assert index.dtype in [torch.long, torch.int8, torch.bool],\
-                "tensors used as indices must be long, byte or bool tensors"
-            if index.dtype in [torch.int8, torch.bool]:
-                nonzero = index.nonzero()
-                k = len(result)
-                assert k + index.ndim <= self.ndim, f"too many indices for tensor of dimension {self.ndim}"
-                for j in range(index.ndim):
-                    assert index.shape[j] == self.shape[
-                        k +
-                        j], f"The shape of the mask {index.shape} at index {i} does not match the shape of the indexed tensor {self.shape} at index {k + j}"
-                    result.append(nonzero.select(1, j))
-            else:
-                result.append(index)
+
+        def _formula(ln: int, p: int, d: int, k: int, s: int) -> int:
+            """
+            Formula to apply to calculate the length of some dimension of the output
+            See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+            Args:
+                ln: length of the dimension
+                p: padding in that dim
+                d: dilation in that dim
+                k: kernel size in that dim
+                s: stride in that dim
+            Returns:
+                The output length
+            """
+            return (ln + 2 * p - d * (k - 1) - 1) // s + 1
+
+        def _formula_transposed(ln: int, p: int, d: int, k: int, s: int, op: int) -> int:
+            """
+            Formula to apply to calculate the length of some dimension of the output
+            if transposed convolution is used.
+            See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html
+            Args:
+                ln: length of the dimension
+                p: padding in that dim
+                d: dilation in that dim
+                k: kernel size in that dim
+                s: stride in that dim
+                op: output padding in that dim
+            Returns:
+                The output length
+            """
+            return (ln - 1) * s - 2 * p + d * (k - 1) + op + 1
+
+        def calc_conv_nd_return_shape(
+            dims: torch.Size,
+            kernel_size: torch.Size,
+            stride: Union[List[int], int],
+            padding: Union[List[int], int],
+            dilation: Union[List[int], int],
+            output_padding: Optional[Union[List[int], int]] = None,
+        ):
+            ret_shape = []
+            if isinstance(stride, int):
+                stride = [stride] * len(dims)
+            elif len(stride) == 1:
+                stride = [stride[0]] * len(dims)
+
+            if isinstance(padding, int):
+                padding = [padding] * len(dims)
+            elif len(padding) == 1:
+                padding = [padding[0]] * len(dims)
+
+            if isinstance(dilation, int):
+                dilation = [dilation] * len(dims)
+            elif len(dilation) == 1:
+                dilation = [dilation[0]] * len(dims)
+
+            output_padding_list: Optional[List[int]] = None
+            if output_padding:
+                if isinstance(output_padding, int):
+                    output_padding_list = [output_padding] * len(dims)
+                elif len(output_padding) == 1:
+                    output_padding_list = [output_padding[0]] * len(dims)
+                else:
+                    output_padding_list = output_padding
+
+            for i in range(len(dims)):
+                # If output_padding is present, we are dealing with a transposed convolution
+                if output_padding_list:
+                    ret_shape.append(
+                        _formula_transposed(
+                            dims[i],
+                            padding[i],
+                            dilation[i],
+                            kernel_size[i],
+                            stride[i],
+                            output_padding_list[i],
+                        ))
+                else:
+                    ret_shape.append(_formula(dims[i], padding[i], dilation[i], kernel_size[i], stride[i]))
+            return ret_shape
+
+        def pick_memory_format():
+            if input_tensor.is_contiguous(memory_format=torch.channels_last):
+                return torch.channels_last
+            elif input_tensor.is_contiguous(memory_format=torch.contiguous_format):
+                return torch.contiguous_format
+            elif input_tensor.is_contiguous(memory_format=torch.preserve_format):
+                return torch.preserve_format
+
+        kernel_size = weight.shape[2:]
+        dims = input_tensor.shape[2:]
+        if is_transposed:
+            out_channels = groups * weight.shape[1]
+
+            shape_out = calc_conv_nd_return_shape(
+                dims,
+                kernel_size,
+                stride,
+                padding,
+                dilation,
+                output_padding,
+            )
+
         else:
-            result.append(index)
-    indices = result
-    assert len(indices) <= self.ndim, f"too many indices for tensor of dimension {self.ndim} (got {len(indices)})"
-    # expand_outplace
-    import torch._refs as refs
-
-    indices = list(refs._maybe_broadcast(*indices))
-    # add missing null tensors
-    while len(indices) < self.ndim:
-        indices.append(None)
-
-    # hasContiguousSubspace
-    #   true if all non-null tensors are adjacent
-    # See:
-    # https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
-    # https://stackoverflow.com/questions/53841497/why-does-numpy-mixed-basic-advanced-indexing-depend-on-slice-adjacency
-    state = 0
-    has_contiguous_subspace = False
-    for index in indices:
-        if state == 0:
-            if index is not None:
-                state = 1
-        elif state == 1:
-            if index is None:
-                state = 2
+            out_channels = weight.shape[0]
+            if weight.shape[1] != input_tensor.shape[1] / groups:
+                raise RuntimeError("Invalid channel dimensions")
+            shape_out = calc_conv_nd_return_shape(dims, kernel_size, stride, padding, dilation)
+        out = input_tensor.new_empty((input_tensor.shape[0], out_channels, *shape_out))
+        mem_fmt = pick_memory_format()
+        out = out.to(memory_format=mem_fmt)    # type: ignore[call-overload]
+        return out
+
+    @register_meta(aten._convolution.default)
+    def meta__conv(input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, stride: List[int],
+                   padding: List[int], dilation: List[int], is_transposed: bool, output_padding: List[int], groups: int,
+                   *extra_args):
+        """Meta kernel for ``aten._convolution``: ``extra_args`` are dropped and the rest goes to ``meta_conv``."""
+        return meta_conv(input_tensor, weight, bias, stride, padding, dilation, is_transposed, output_padding, groups)
+
+    @register_meta(aten.convolution_backward.default)
+    def meta_conv_backward(grad_output: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, bias_sizes, stride,
+                           padding, dilation, transposed, output_padding, groups, output_mask):
+        return new_like(input), new_like(weight), new(bias_sizes)    # (grad_input, grad_weight, grad_bias)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AdaptiveAveragePooling.cpp
+    @register_meta(aten._adaptive_avg_pool2d_backward.default)
+    def meta_adaptive_avg_pool2d_backward(grad_output: torch.Tensor, input: torch.Tensor):
+        """Meta kernel for ``aten._adaptive_avg_pool2d_backward``."""
+        # Only ``input`` is consulted; ``grad_output`` is never read.
+        grad_input = new_like(input)
+        return grad_input
+
+    # ================================ RNN =============================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
+    @register_meta(aten._cudnn_rnn.default)
+    def meta_cuda_rnn(
+        input,
+        weight,
+        weight_stride0,
+        weight_buf,
+        hx,
+        cx,
+        mode,
+        hidden_size,
+        proj_size,
+        num_layers,
+        batch_first,
+        dropout,
+        train,
+        bidirectional,
+        batch_sizes,
+        dropout_state,
+    ):
+
+        is_input_packed = len(batch_sizes) != 0
+        if is_input_packed:
+            seq_length = len(batch_sizes)
+            mini_batch = batch_sizes[0]
+            batch_sizes_sum = input.shape[0]
         else:
-            if index is not None:
-                break
-    else:
-        has_contiguous_subspace = True
-
-    # transposeToFront
-    # This is the logic that causes the newly inserted dimensions to show up
-    # at the beginning of the tensor, if they're not contiguous
-    if not has_contiguous_subspace:
-        dims = []
-        transposed_indices = []
+            seq_length = input.shape[1] if batch_first else input.shape[0]
+            mini_batch = input.shape[0] if batch_first else input.shape[1]
+            batch_sizes_sum = -1
+
+        num_directions = 2 if bidirectional else 1
+        out_size = proj_size if proj_size != 0 else hidden_size
+        if is_input_packed:
+            out_shape = [batch_sizes_sum, out_size * num_directions]
+        else:
+            out_shape = ([mini_batch, seq_length, out_size *
+                          num_directions] if batch_first else [seq_length, mini_batch, out_size * num_directions])
+        output = input.new_empty(out_shape)
+
+        cell_shape = [num_layers * num_directions, mini_batch, hidden_size]
+        cy = new(0) if cx is None else cx.new_empty(cell_shape)
+
+        hy = hx.new_empty([num_layers * num_directions, mini_batch, out_size])
+
+        # TODO: Query cudnnGetRNNTrainingReserveSize (expose to python)
+        reserve_shape = 0 if train else 0
+        reserve = input.new_empty(reserve_shape, dtype=torch.uint8)
+
+        return output, hy, cy, reserve, weight_buf
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
+    @register_meta(aten._cudnn_rnn_backward.default)
+    def meta_cudnn_rnn_backward(input: torch.Tensor, weight: torch.Tensor, weight_stride0: int, hx: torch.Tensor,
+                                cx: Optional[torch.Tensor] = None, *args, **kwargs):
+        """Meta kernel for ``aten._cudnn_rnn_backward`` -> (grad_input, grad_weight, grad_hx, grad_cx)."""
+        # The fourth slot mirrors ``cx`` when given, else it is the ``new(())`` placeholder.
+        if cx is None:
+            grad_cx = new(())
+        else:
+            grad_cx = new_like(cx)
+        return new_like(input), new_like(weight), new_like(hx), grad_cx
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Activation.cpp
+    # ============================== Activations =======================================
+    _unregistered_ewise = [    # ops that all share the single kernel meta_unregistered_ewise registered below
+        aten.relu.default,
+        aten.prelu.default,
+        aten.hardswish.default,
+        aten.hardtanh.default,
+        aten.prelu_backward.default,
+        aten.hardswish_backward.default,
+        aten.hardtanh_backward.default,
+    ]
+
+    @register_meta(_unregistered_ewise)
+    def meta_unregistered_ewise(input: torch.Tensor, *args):    # any operands after the first are accepted but ignored
+        return new_like(input)    # the result is templated on the first operand only
+
+    # ============================== Normalization =====================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    @register_meta(aten.native_batch_norm.default)
+    def meta_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
+        channels = input.size(1)    # dim 1 of ``input`` sizes the two extra outputs
+        return new_like(input), new(channels), new(channels)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    @register_meta(aten.native_batch_norm_backward.default)
+    def meta_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var,
+                         save_mean, save_invstd, train, eps, output_mask):
+        return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta); dbeta is templated on weight because no bias is passed in
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    @register_meta(aten.cudnn_batch_norm.default)
+    def meta_cudnn_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
+        channels = input.size(1)
+        reserve = new(0, dtype=torch.uint8)    # reserve slot: requested with size 0
+        return new_like(input), new(channels), new(channels), reserve    # (output, running_mean, running_var, reserve)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    # NB: CuDNN only implements the backward algorithm for batchnorm
+    # in training mode (evaluation mode batchnorm has a different algorithm),
+    # which is why this doesn't accept a 'training' parameter.
+    @register_meta(aten.cudnn_batch_norm_backward.default)
+    def meta_cudnn_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var,
+                               save_mean, save_invstd, eps, reserve):
+        return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta); dbeta is templated on weight because no bias is passed in
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
+    @register_meta(aten.native_layer_norm.default)
+    def meta_ln(input: torch.Tensor, normalized_shape, weight, bias, eps):
+        stat_shape = (*input.shape[:input.dim() - len(normalized_shape)], *((1,) * len(normalized_shape)))
+        return new_like(input), new(stat_shape), new(stat_shape)    # (output, mean, rstd); stats: leading dims, then 1s
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
+    @register_meta(aten.native_layer_norm_backward.default)
+    def meta_ln_backward(dY: torch.Tensor, input: torch.Tensor, normalized_shape, mean, rstd, weight, bias,
+                         grad_input_mask):
+        return new_like(input), new_like(weight), new_like(bias)    # (dX, dgamma, dbeta); NOTE(review): assumes weight and bias are tensors - confirm behaviour when either is None
+
+    # ================================== Misc ==========================================
+    # Maybe incorrect: the result is templated on input alone; kernel_size, dilation, padding and stride are ignored
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Im2Col.cpp
+    @register_meta(aten.im2col.default)
+    def meta_im2col(input: torch.Tensor, kernel_size, dilation, padding, stride):
+        return new_like(input)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+    @register_meta(aten.eye.m_out)
+    def meta_eye(n: int, m: int, out: torch.Tensor):
+        return out
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+    @register_meta(aten.roll.default)
+    def meta_roll(input: torch.Tensor, shifts, dims):
+        return input    # hands back the argument object itself; NOTE(review): confirm aliasing is intended
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Scalar.cpp
+    @register_meta(aten._local_scalar_dense.default)
+    def meta_local_scalar_dense(self: torch.Tensor):
+        return 0
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorCompare.cpp
+    @register_meta(aten.where.self)
+    def meta_where_self(condition: torch.Tensor, self: torch.Tensor, other: torch.Tensor):
+        result_type = torch.result_type(self, other)
+        return new_like(condition + self + other, dtype=result_type)
+
+    @register_meta(aten.index.Tensor)
+    def meta_index_Tensor(self, indices):
+        assert indices, "at least one index must be provided"
+        # aten::index is the internal advanced indexing implementation
+        # checkIndexTensorTypes and expandTensors
+        result: List[Optional[torch.Tensor]] = []
         for i, index in enumerate(indices):
             if index is not None:
-                dims.append(i)
-                transposed_indices.append(index)
-        for i, index in enumerate(indices):
-            if index is None:
-                dims.append(i)
-                transposed_indices.append(index)
-        self = self.permute(dims)
-        indices = transposed_indices
-
-    # AdvancedIndex::AdvancedIndex
-    # Now we can assume the indices have contiguous subspace
-    # This is simplified from AdvancedIndex which goes to more effort
-    # to put the input and indices in a form so that TensorIterator can
-    # take them.  If we write a ref for this, probably that logic should
-    # get implemented
-    before_shape: List[int] = []
-    after_shape: List[int] = []
-    replacement_shape: List[int] = []
-    for dim, index in enumerate(indices):
-        if index is None:
-            if replacement_shape:
-                after_shape.append(self.shape[dim])
+                assert index.dtype in [torch.long, torch.int8, torch.bool],\
+                    "tensors used as indices must be long, byte or bool tensors"
+                if index.dtype in [torch.int8, torch.bool]:
+                    nonzero = index.nonzero()
+                    k = len(result)
+                    assert k + index.ndim <= self.ndim, f"too many indices for tensor of dimension {self.ndim}"
+                    for j in range(index.ndim):
+                        assert index.shape[j] == self.shape[
+                            k +
+                            j], f"The shape of the mask {index.shape} at index {i} does not match the shape of the indexed tensor {self.shape} at index {k + j}"
+                        result.append(nonzero.select(1, j))
+                else:
+                    result.append(index)
+            else:
+                result.append(index)
+        indices = result
+        assert len(indices) <= self.ndim, f"too many indices for tensor of dimension {self.ndim} (got {len(indices)})"
+        # expand_outplace
+        import torch._refs as refs
+
+        indices = list(refs._maybe_broadcast(*indices))
+        # add missing null tensors
+        while len(indices) < self.ndim:
+            indices.append(None)
+
+        # hasContiguousSubspace
+        #   true if all non-null tensors are adjacent
+        # See:
+        # https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
+        # https://stackoverflow.com/questions/53841497/why-does-numpy-mixed-basic-advanced-indexing-depend-on-slice-adjacency
+        state = 0
+        has_contiguous_subspace = False
+        for index in indices:
+            if state == 0:
+                if index is not None:
+                    state = 1
+            elif state == 1:
+                if index is None:
+                    state = 2
             else:
-                before_shape.append(self.shape[dim])
+                if index is not None:
+                    break
         else:
-            replacement_shape = list(index.shape)
-    return self.new_empty(before_shape + replacement_shape + after_shape)
-
-
-# ============================== Embedding =========================================
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Embedding.cpp
-@register_meta(aten.embedding_dense_backward.default)
-def meta_embedding_dense_backward(grad_output: torch.Tensor, indices: torch.Tensor, num_weights, padding_idx,
-                                  scale_grad_by_freq):
-    return new((num_weights, grad_output.size(-1)),
-               dtype=grad_output.dtype,
-               device=grad_output.device,
-               layout=grad_output.layout)
-
-
-# ============================== Dropout ===========================================
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
-@register_meta(aten.native_dropout.default)
-def meta_native_dropout_default(input: torch.Tensor, p: float, train: bool = False):
-    # notice that mask is bool
-    return new_like(input), new_like(input, dtype=torch.bool)    # (output, mask)
-
-
-# https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
-@register_meta(aten.native_dropout_backward.default)
-def meta_native_dropout_backward_default(grad: torch.Tensor, mask: torch.Tensor, scale: float):
-    return new_like(grad)    # (grad_in)
+            has_contiguous_subspace = True
+
+        # transposeToFront
+        # This is the logic that causes the newly inserted dimensions to show up
+        # at the beginning of the tensor, if they're not contiguous
+        if not has_contiguous_subspace:
+            dims = []
+            transposed_indices = []
+            for i, index in enumerate(indices):
+                if index is not None:
+                    dims.append(i)
+                    transposed_indices.append(index)
+            for i, index in enumerate(indices):
+                if index is None:
+                    dims.append(i)
+                    transposed_indices.append(index)
+            self = self.permute(dims)
+            indices = transposed_indices
+
+        # AdvancedIndex::AdvancedIndex
+        # Now we can assume the indices have contiguous subspace
+        # This is simplified from AdvancedIndex which goes to more effort
+        # to put the input and indices in a form so that TensorIterator can
+        # take them.  If we write a ref for this, probably that logic should
+        # get implemented
+        before_shape: List[int] = []
+        after_shape: List[int] = []
+        replacement_shape: List[int] = []
+        for dim, index in enumerate(indices):
+            if index is None:
+                if replacement_shape:
+                    after_shape.append(self.shape[dim])
+                else:
+                    before_shape.append(self.shape[dim])
+            else:
+                replacement_shape = list(index.shape)
+        return self.new_empty(before_shape + replacement_shape + after_shape)
+
+    # ============================== Embedding =========================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Embedding.cpp
+    @register_meta(aten.embedding_dense_backward.default)
+    def meta_embedding_dense_backward(grad_output: torch.Tensor, indices: torch.Tensor, num_weights, padding_idx,
+                                      scale_grad_by_freq):
+        return new((num_weights, grad_output.size(-1)),
+                   dtype=grad_output.dtype,
+                   device=grad_output.device,
+                   layout=grad_output.layout)
+
+    # ============================== Dropout ===========================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
+    @register_meta(aten.native_dropout.default)
+    def meta_native_dropout_default(input: torch.Tensor, p: float, train: bool = False):
+        # notice that mask is bool
+        return new_like(input), new_like(input, dtype=torch.bool)    # (output, mask)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
+    @register_meta(aten.native_dropout_backward.default)
+    def meta_native_dropout_backward_default(grad: torch.Tensor, mask: torch.Tensor, scale: float):
+        return new_like(grad)    # (grad_in)
diff --git a/colossalai/_analyzer/_subclasses/_monkey_patch.py b/colossalai/_analyzer/_subclasses/_monkey_patch.py
index 1c7b972ab2f6..7c1c3d3d8cd4 100644
--- a/colossalai/_analyzer/_subclasses/_monkey_patch.py
+++ b/colossalai/_analyzer/_subclasses/_monkey_patch.py
@@ -1,5 +1,6 @@
 import torch
 import torch.distributed as dist
+from packaging import version
 
 aten = torch.ops.aten
 
@@ -49,40 +50,45 @@
     "scatter",
 ]
 
-# TODO: dive deep here
-# refer to https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp
-_AliasATen = [
-    aten.detach.default,
-    aten.detach_.default,
-    aten.t.default,
-    aten.transpose.int,
-    aten.view.default,
-    aten._unsafe_view.default,
-    aten._reshape_alias.default,
-]
+if version.parse(torch.__version__) >= version.parse('1.12.0'):
+    # TODO: dive deep here
+    # refer to https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp
+    _AliasATen = [
+        aten.detach.default,
+        aten.detach_.default,
+        aten.t.default,
+        aten.transpose.int,
+        aten.view.default,
+        aten._unsafe_view.default,
+        aten._reshape_alias.default,
+    ]
 
-_InplaceATen = [
-    aten.add_.Tensor,
-    aten.add_.Scalar,
-    aten.sub_.Tensor,
-    aten.sub_.Scalar,
-    aten.mul_.Tensor,
-    aten.mul_.Scalar,
-    aten.div_.Tensor,
-    aten.div_.Scalar,
-    aten.pow_.Tensor,
-    aten.pow_.Scalar,
-]
+    _InplaceATen = [
+        aten.add_.Tensor,
+        aten.add_.Scalar,
+        aten.sub_.Tensor,
+        aten.sub_.Scalar,
+        aten.mul_.Tensor,
+        aten.mul_.Scalar,
+        aten.div_.Tensor,
+        aten.div_.Scalar,
+        aten.pow_.Tensor,
+        aten.pow_.Scalar,
+    ]
 
-# use `MaybeInplace` because they call ``as_strided()`` or ``slice()``
-_MaybeInplaceATen = [
-    aten.diagonal.default,
-    aten.expand.default,
-    aten.select.int,
-    aten.slice.Tensor,
-    aten.split.Tensor,
-    aten.squeeze.default,
-    aten.permute.default,
-    aten.unsqueeze.default,
-    aten.as_strided.default,
-]
+    # use `MaybeInplace` because they call ``as_strided()`` or ``slice()``
+    _MaybeInplaceATen = [
+        aten.diagonal.default,
+        aten.expand.default,
+        aten.select.int,
+        aten.slice.Tensor,
+        aten.split.Tensor,
+        aten.squeeze.default,
+        aten.permute.default,
+        aten.unsqueeze.default,
+        aten.as_strided.default,
+    ]
+else:
+    _AliasATen = []
+    _InplaceATen = []
+    _MaybeInplaceATen = []
diff --git a/colossalai/_analyzer/_subclasses/flop_tensor.py b/colossalai/_analyzer/_subclasses/flop_tensor.py
index ab93551467b8..dd35b00b3fab 100644
--- a/colossalai/_analyzer/_subclasses/flop_tensor.py
+++ b/colossalai/_analyzer/_subclasses/flop_tensor.py
@@ -11,6 +11,7 @@
 from typing import Any, Callable, List, Optional, Union
 
 import torch
+from packaging import version
 from torch.utils._pytree import tree_map
 
 from .meta_tensor import MetaTensor
@@ -403,134 +404,139 @@ def zero_flop_jit(*args):
     return 0
 
 
-flop_mapping = {
+if version.parse(torch.__version__) >= version.parse('1.12.0'):
+    flop_mapping = {
     # gemm
-    aten.mm.default: matmul_flop_jit,
-    aten.matmul.default: matmul_flop_jit,
-    aten.addmm.default: addmm_flop_jit,
-    aten.bmm.default: bmm_flop_jit,
+        aten.mm.default: matmul_flop_jit,
+        aten.matmul.default: matmul_flop_jit,
+        aten.addmm.default: addmm_flop_jit,
+        aten.bmm.default: bmm_flop_jit,
 
     # convolution
-    aten.convolution.default: conv_flop_jit,
-    aten._convolution.default: conv_flop_jit,
-    aten.convolution_backward.default: conv_backward_flop_jit,
+        aten.convolution.default: conv_flop_jit,
+        aten._convolution.default: conv_flop_jit,
+        aten.convolution_backward.default: conv_backward_flop_jit,
 
     # normalization
-    aten.native_batch_norm.default: batchnorm_flop_jit,
-    aten.native_batch_norm_backward.default: batchnorm_flop_jit,
-    aten.cudnn_batch_norm.default: batchnorm_flop_jit,
-    aten.cudnn_batch_norm_backward.default: partial(batchnorm_flop_jit, training=True),
-    aten.native_layer_norm.default: norm_flop_counter(2, 0),
-    aten.native_layer_norm_backward.default: norm_flop_counter(2, 0),
+        aten.native_batch_norm.default: batchnorm_flop_jit,
+        aten.native_batch_norm_backward.default: batchnorm_flop_jit,
+        aten.cudnn_batch_norm.default: batchnorm_flop_jit,
+        aten.cudnn_batch_norm_backward.default: partial(batchnorm_flop_jit, training=True),
+        aten.native_layer_norm.default: norm_flop_counter(2, 0),
+        aten.native_layer_norm_backward.default: norm_flop_counter(2, 0),
 
     # pooling
-    aten.avg_pool1d.default: ewise_flop_counter(1, 0),
-    aten.avg_pool2d.default: ewise_flop_counter(1, 0),
-    aten.avg_pool2d_backward.default: ewise_flop_counter(0, 1),
-    aten.avg_pool3d.default: ewise_flop_counter(1, 0),
-    aten.avg_pool3d_backward.default: ewise_flop_counter(0, 1),
-    aten.max_pool1d.default: ewise_flop_counter(1, 0),
-    aten.max_pool2d.default: ewise_flop_counter(1, 0),
-    aten.max_pool3d.default: ewise_flop_counter(1, 0),
-    aten.max_pool1d_with_indices.default: ewise_flop_counter(1, 0),
-    aten.max_pool2d_with_indices.default: ewise_flop_counter(1, 0),
-    aten.max_pool2d_with_indices_backward.default: ewise_flop_counter(0, 1),
-    aten.max_pool3d_with_indices.default: ewise_flop_counter(1, 0),
-    aten.max_pool3d_with_indices_backward.default: ewise_flop_counter(0, 1),
-    aten._adaptive_avg_pool2d.default: ewise_flop_counter(1, 0),
-    aten._adaptive_avg_pool2d_backward.default: ewise_flop_counter(0, 1),
-    aten._adaptive_avg_pool3d.default: ewise_flop_counter(1, 0),
-    aten._adaptive_avg_pool3d_backward.default: ewise_flop_counter(0, 1),
-    aten.embedding_dense_backward.default: ewise_flop_counter(0, 1),
-    aten.embedding.default: ewise_flop_counter(1, 0),
-}
-
-ewise_flop_aten = [
+        aten.avg_pool1d.default: ewise_flop_counter(1, 0),
+        aten.avg_pool2d.default: ewise_flop_counter(1, 0),
+        aten.avg_pool2d_backward.default: ewise_flop_counter(0, 1),
+        aten.avg_pool3d.default: ewise_flop_counter(1, 0),
+        aten.avg_pool3d_backward.default: ewise_flop_counter(0, 1),
+        aten.max_pool1d.default: ewise_flop_counter(1, 0),
+        aten.max_pool2d.default: ewise_flop_counter(1, 0),
+        aten.max_pool3d.default: ewise_flop_counter(1, 0),
+        aten.max_pool1d_with_indices.default: ewise_flop_counter(1, 0),
+        aten.max_pool2d_with_indices.default: ewise_flop_counter(1, 0),
+        aten.max_pool2d_with_indices_backward.default: ewise_flop_counter(0, 1),
+        aten.max_pool3d_with_indices.default: ewise_flop_counter(1, 0),
+        aten.max_pool3d_with_indices_backward.default: ewise_flop_counter(0, 1),
+        aten._adaptive_avg_pool2d.default: ewise_flop_counter(1, 0),
+        aten._adaptive_avg_pool2d_backward.default: ewise_flop_counter(0, 1),
+        aten._adaptive_avg_pool3d.default: ewise_flop_counter(1, 0),
+        aten._adaptive_avg_pool3d_backward.default: ewise_flop_counter(0, 1),
+        aten.embedding_dense_backward.default: ewise_flop_counter(0, 1),
+        aten.embedding.default: ewise_flop_counter(1, 0),
+    }
+
+    ewise_flop_aten = [
     # basic op
-    aten.add.Tensor,
-    aten.add_.Tensor,
-    aten.div.Tensor,
-    aten.div_.Tensor,
-    aten.div.Scalar,
-    aten.div_.Scalar,
-    aten.mul.Tensor,
-    aten.mul.Scalar,
-    aten.mul_.Tensor,
-    aten.neg.default,
-    aten.pow.Tensor_Scalar,
-    aten.rsub.Scalar,
-    aten.sum.default,
-    aten.sum.dim_IntList,
-    aten.mean.dim,
+        aten.add.Tensor,
+        aten.add_.Tensor,
+        aten.div.Tensor,
+        aten.div_.Tensor,
+        aten.div.Scalar,
+        aten.div_.Scalar,
+        aten.mul.Tensor,
+        aten.mul.Scalar,
+        aten.mul_.Tensor,
+        aten.neg.default,
+        aten.pow.Tensor_Scalar,
+        aten.rsub.Scalar,
+        aten.sum.default,
+        aten.sum.dim_IntList,
+        aten.mean.dim,
 
     # activation op
-    aten.hardswish.default,
-    aten.hardswish_.default,
-    aten.hardswish_backward.default,
-    aten.hardtanh.default,
-    aten.hardtanh_.default,
-    aten.hardtanh_backward.default,
-    aten.hardsigmoid_backward.default,
-    aten.hardsigmoid.default,
-    aten.gelu.default,
-    aten.gelu_backward.default,
-    aten.silu.default,
-    aten.silu_.default,
-    aten.silu_backward.default,
-    aten.sigmoid.default,
-    aten.sigmoid_backward.default,
-    aten._softmax.default,
-    aten._softmax_backward_data.default,
-    aten.relu_.default,
-    aten.relu.default,
-    aten.tanh.default,
-    aten.tanh_backward.default,
-    aten.threshold_backward.default,
+        aten.hardswish.default,
+        aten.hardswish_.default,
+        aten.hardswish_backward.default,
+        aten.hardtanh.default,
+        aten.hardtanh_.default,
+        aten.hardtanh_backward.default,
+        aten.hardsigmoid_backward.default,
+        aten.hardsigmoid.default,
+        aten.gelu.default,
+        aten.gelu_backward.default,
+        aten.silu.default,
+        aten.silu_.default,
+        aten.silu_backward.default,
+        aten.sigmoid.default,
+        aten.sigmoid_backward.default,
+        aten._softmax.default,
+        aten._softmax_backward_data.default,
+        aten.relu_.default,
+        aten.relu.default,
+        aten.tanh.default,
+        aten.tanh_backward.default,
+        aten.threshold_backward.default,
 
     # dropout
-    aten.native_dropout.default,
-    aten.native_dropout_backward.default,
+        aten.native_dropout.default,
+        aten.native_dropout_backward.default,
 
     # distribution
-    aten.bernoulli_.float,
+        aten.bernoulli_.float,
 
     # where
-    aten.where.self,
-]
-for op in ewise_flop_aten:
-    flop_mapping[op] = ewise_flop_counter(1, 0)
-
-# fix-me: this will be removed in future
-zero_flop_aten = [
-    aten.as_strided.default,
-    aten.as_strided_.default,
-    aten.cat.default,
-    aten.clone.default,
-    aten.copy_.default,
-    aten.detach.default,
-    aten.expand.default,
-    aten.empty_like.default,
-    aten.new_empty.default,
-    aten.new_empty_strided.default,
-    aten.ones_like.default,
-    aten._reshape_alias.default,
-    aten.select.int,
-    aten.select_backward.default,
-    aten.squeeze.dim,
-    aten.slice.Tensor,
-    aten.slice_backward.default,
-    aten.split.Tensor,
-    aten.permute.default,
-    aten.t.default,
-    aten.transpose.int,
-    aten._to_copy.default,
-    aten.unsqueeze.default,
-    aten.unbind.int,
-    aten._unsafe_view.default,
-    aten.view.default,
-    aten.zero_.default,
-    aten.zeros_like.default,
-]
-
-for op in zero_flop_aten:
-    flop_mapping[op] = zero_flop_jit
+        aten.where.self,
+    ]
+    for op in ewise_flop_aten:
+        flop_mapping[op] = ewise_flop_counter(1, 0)
+
+    # fix-me: this will be removed in future
+    zero_flop_aten = [
+        aten.as_strided.default,
+        aten.as_strided_.default,
+        aten.cat.default,
+        aten.clone.default,
+        aten.copy_.default,
+        aten.detach.default,
+        aten.expand.default,
+        aten.empty_like.default,
+        aten.new_empty.default,
+        aten.new_empty_strided.default,
+        aten.ones_like.default,
+        aten._reshape_alias.default,
+        aten.select.int,
+        aten.select_backward.default,
+        aten.squeeze.dim,
+        aten.slice.Tensor,
+        aten.slice_backward.default,
+        aten.split.Tensor,
+        aten.permute.default,
+        aten.t.default,
+        aten.transpose.int,
+        aten._to_copy.default,
+        aten.unsqueeze.default,
+        aten.unbind.int,
+        aten._unsafe_view.default,
+        aten.view.default,
+        aten.zero_.default,
+        aten.zeros_like.default,
+    ]
+
+    for op in zero_flop_aten:
+        flop_mapping[op] = zero_flop_jit
+else:
+    flop_mapping = {}
+    ewise_flop_aten = []    # same name and container type as in the torch >= 1.12 branch
+    zero_flop_aten = []
diff --git a/colossalai/_analyzer/fx/__init__.py b/colossalai/_analyzer/fx/__init__.py
index 2e857b1b054b..aa01de0bbe6c 100644
--- a/colossalai/_analyzer/fx/__init__.py
+++ b/colossalai/_analyzer/fx/__init__.py
@@ -1,4 +1,3 @@
-from .bias_addition import *
 from .node_util import MetaInfo
 from .symbolic_profile import symbolic_profile
-from .symbolic_trace import symbolic_trace
+from .tracer.symbolic_trace import symbolic_trace
diff --git a/colossalai/_analyzer/fx/graph_module.py b/colossalai/_analyzer/fx/graph_module.py
index 779b42ebaafd..1fdedd758c01 100644
--- a/colossalai/_analyzer/fx/graph_module.py
+++ b/colossalai/_analyzer/fx/graph_module.py
@@ -1,4 +1,7 @@
+import linecache
 import os
+import sys
+import traceback
 import warnings
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
@@ -6,11 +9,74 @@
 import torch
 import torch.fx
 import torch.nn as nn
-from torch.fx.graph import PythonCode, _PyTreeCodeGen
-from torch.fx.graph_module import _exec_with_source, _forward_from_src, _WrappedCall
+from torch.fx.graph import PythonCode
+
+try:
+    from torch.fx.graph import _PyTreeCodeGen
+    SUPPORT_PT_CODEGEN = True
+except ImportError:
+    SUPPORT_PT_CODEGEN = False
+
+from torch.fx.graph_module import _exec_with_source, _forward_from_src
 from torch.nn.modules.module import _addindent
 
 
+# This is a copy of torch.fx.graph_module._WrappedCall.
+# It should be removed when we stop supporting torch < 1.12.0.
+class _WrappedCall:
+
+    def __init__(self, cls, cls_call):
+        self.cls = cls
+        self.cls_call = cls_call
+
+    # Previously, if an error occurred when valid
+    # symbolically-traced code was run with an invalid input, the
+    # user would see the source of the error as coming from
+    # `File "<eval_with_key_N">`, where N is some number. We use
+    # this function to generate a more informative error message. We
+    # return the traceback itself, a message explaining that the
+    # error occurred in a traced Module's generated forward
+    # function, and five lines of context surrounding the faulty
+    # line
+    @staticmethod
+    def _generate_error_message(frame_summary: traceback.FrameSummary) -> str:
+        # auxiliary variables (for readability)
+        err_lineno = frame_summary.lineno
+        assert err_lineno is not None
+        line = frame_summary.line
+        assert line is not None
+        err_line_len = len(line)
+        all_src_lines = linecache.getlines(frame_summary.filename)
+
+        # constituent substrings of the error message
+        tb_repr = traceback.format_exc()
+        custom_msg = ("Call using an FX-traced Module, "
+                      f"line {err_lineno} of the traced Module's "
+                      "generated forward function:")
+        before_err = "".join(all_src_lines[err_lineno - 2:err_lineno])
+        marker = "~" * err_line_len + "~~~ <--- HERE"
+        err_and_after_err = "\n".join(all_src_lines[err_lineno:err_lineno + 2])
+
+        # joined message
+        return "\n".join([tb_repr, custom_msg, before_err, marker, err_and_after_err])
+
+    def __call__(self, obj, *args, **kwargs):
+        try:
+            if self.cls_call is not None:
+                return self.cls_call(obj, *args, **kwargs)
+            else:
+                return super(self.cls, obj).__call__(*args, **kwargs)    # type: ignore[misc]
+        except Exception as e:
+            assert e.__traceback__
+            topmost_framesummary: traceback.FrameSummary = \
+                traceback.StackSummary.extract(traceback.walk_tb(e.__traceback__))[-1]  # type: ignore[arg-type]
+            if "eval_with_key" in topmost_framesummary.filename:
+                print(_WrappedCall._generate_error_message(topmost_framesummary), file=sys.stderr)
+                raise e.with_traceback(None)
+            else:
+                raise e
+
+
 class ColoGraphModule(torch.fx.GraphModule):
     """
     ColoGraphGraphModule is an nn.Module generated from an fx.Graph.
@@ -65,7 +131,7 @@ def recompile(self) -> PythonCode:
         called after editing the contained ``graph``, otherwise the generated
         code of this ``GraphModule`` will be out of date.
         """
-        if isinstance(self._graph._codegen, _PyTreeCodeGen):
+        if SUPPORT_PT_CODEGEN and isinstance(self._graph._codegen, _PyTreeCodeGen):
             self._in_spec = self._graph._codegen.pytree_info.in_spec
             self._out_spec = self._graph._codegen.pytree_info.out_spec
         python_code = self._graph.python_code(root_module='self')
diff --git a/colossalai/_analyzer/fx/node_util.py b/colossalai/_analyzer/fx/node_util.py
index d06fa8b93fc6..8c8956d8ea7c 100644
--- a/colossalai/_analyzer/fx/node_util.py
+++ b/colossalai/_analyzer/fx/node_util.py
@@ -20,7 +20,7 @@ def union(a, b):
     return {**a, **b}
 
 
-def compute_size_in_bytes(elem: torch.Tensor | Dict | List | Tuple | int) -> int:
+def compute_size_in_bytes(elem: Union[torch.Tensor, Dict, List, Tuple, int]) -> int:
     """Compute the size of a tensor or a collection of tensors in bytes.
 
     Args:
@@ -195,8 +195,8 @@ def __repr__(self):
             s += f'\n\thas buffer of size {_format_memory(self.buffer_size)}'
         if self.output_size:
             s += f'\n\thas output activation of size {_format_memory(self.output_size)}'
-        if self.total_size:
-            s += f'\n\thas total activation of size {_format_memory(self.total_size)}'
+        # if self.total_size:
+        #     s += f'\n\thas total activation of size {_format_memory(self.total_size)}'
         if self.temp_size:
             s += f'\n\thas temp activation of size {_format_memory(self.temp_size)}'
         if self.backward_size:
diff --git a/colossalai/_analyzer/fx/passes/shape_prop.py b/colossalai/_analyzer/fx/passes/shape_prop.py
index 3691497ed8cd..ab3e1a4d6a3d 100644
--- a/colossalai/_analyzer/fx/passes/shape_prop.py
+++ b/colossalai/_analyzer/fx/passes/shape_prop.py
@@ -111,7 +111,24 @@ def run_node(self, n: torch.fx.Node) -> Any:
         with self.global_hook:
             r = getattr(self, n.op)(n.target, args, kwargs)
 
-        unwrap_fn = lambda elem: elem._tensor if isinstance(elem, MetaTensor) else elem
+        def unwrap_fn(elem):
+            """Unwrap a ``MetaTensor`` to its ``_tensor`` and move any tensor to the ``meta`` device."""
+
+            def _convert_meta(t: torch.Tensor):
+                # a ``torch.device`` never compares equal to a ``str``, so inspect ``.type`` instead
+                if t.device.type == 'meta':
+                    return t
+                return t.to('meta')
+
+            if isinstance(elem, MetaTensor):
+                # drop the wrapper and keep only the inner tensor
+                return _convert_meta(elem._tensor)
+            if isinstance(elem, torch.Tensor):
+                return _convert_meta(elem)
+            # anything that is not a tensor is passed through unchanged
+            return elem
+
+        # unwrap_fn = lambda elem: elem._tensor if isinstance(elem, MetaTensor) else elem
         is_pure_tensor = lambda elem: isinstance(elem, MetaTensor) and not isinstance(elem, torch.nn.Parameter)
         n_info = MetaInfo(n)
         n_info.outputs = _normalize_tuple(r)
diff --git a/colossalai/_analyzer/fx/tracer/__init__.py b/colossalai/_analyzer/fx/tracer/__init__.py
new file mode 100644
index 000000000000..6b1b2256aa44
--- /dev/null
+++ b/colossalai/_analyzer/fx/tracer/__init__.py
@@ -0,0 +1,2 @@
+from .bias_addition import *
+from .custom_leaf_module import *
diff --git a/colossalai/_analyzer/fx/bias_addition.py b/colossalai/_analyzer/fx/tracer/bias_addition.py
similarity index 98%
rename from colossalai/_analyzer/fx/bias_addition.py
rename to colossalai/_analyzer/fx/tracer/bias_addition.py
index 5359752d4cb4..1e75b47ca5b0 100644
--- a/colossalai/_analyzer/fx/bias_addition.py
+++ b/colossalai/_analyzer/fx/tracer/bias_addition.py
@@ -4,11 +4,10 @@
 """
 
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.modules.utils import _pair, _single, _triple
 
-from .symbolic_trace import register_tracer_impl
+from .tracer import register_tracer_impl
 
 __all__ = []
 
diff --git a/colossalai/_analyzer/fx/tracer/custom_leaf_module.py b/colossalai/_analyzer/fx/tracer/custom_leaf_module.py
new file mode 100644
index 000000000000..112c7c9637d2
--- /dev/null
+++ b/colossalai/_analyzer/fx/tracer/custom_leaf_module.py
@@ -0,0 +1,29 @@
+import torch
+
+from .tracer import register_leaf_module, register_leaf_module_impl
+
+try:
+    import apex
+    register_leaf_module(apex.normalization.FusedLayerNorm)
+    register_leaf_module(apex.normalization.FusedRMSNorm)
+    register_leaf_module(apex.normalization.MixedFusedLayerNorm)
+    register_leaf_module(apex.normalization.MixedFusedRMSNorm)
+
+    @register_leaf_module_impl(apex.normalization.FusedLayerNorm)
+    @register_leaf_module_impl(apex.normalization.FusedRMSNorm)
+    @register_leaf_module_impl(apex.normalization.MixedFusedLayerNorm)
+    @register_leaf_module_impl(apex.normalization.MixedFusedRMSNorm)
+    def torch_nn_normalize(self, input: torch.Tensor):
+        # BatchNorm rank checks; NOTE(review): registered only for apex norms here - confirm these can ever match
+        if isinstance(self, torch.nn.BatchNorm1d):
+            assert input.dim() in [2, 3]
+        elif isinstance(self, torch.nn.BatchNorm2d):
+            assert input.dim() == 4
+        elif isinstance(self, torch.nn.BatchNorm3d):
+            assert input.dim() == 5
+
+        # normalization keeps the input's shape, so a clone of the input stands in for the output
+        return input.clone()
+
+except (ImportError, AttributeError):
+    pass
diff --git a/colossalai/_analyzer/fx/tracer/proxy.py b/colossalai/_analyzer/fx/tracer/proxy.py
new file mode 100644
index 000000000000..ce379efdcf0d
--- /dev/null
+++ b/colossalai/_analyzer/fx/tracer/proxy.py
@@ -0,0 +1,112 @@
+import operator
+from typing import Any, Callable, Dict, Optional, Set, Union
+
+import torch
+import torch.nn as nn
+from torch.fx import Graph, Node, Proxy, Tracer
+from torch.fx.graph import _Namespace
+from torch.utils._pytree import tree_map
+
+from colossalai._analyzer._subclasses import MetaTensor
+
+Target = Union[Callable[..., Any], str]
+
+
+class ColoProxy(Proxy):
+    _func_dispatch: Dict[Target, Callable[..., Any]] = {}
+
+    def __init__(self, *args, data=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._meta_data = data
+
+    @property
+    def meta_data(self):
+        return self._meta_data
+
+    @meta_data.setter
+    def meta_data(self, args):
+        wrap_fn = lambda x: MetaTensor(x) if isinstance(x, torch.Tensor) else x
+        self._meta_data = tree_map(wrap_fn, args)
+
+    @classmethod
+    def __torch_function__(cls, orig_method, types, args=(), kwargs=None):
+        kwargs = {} if kwargs is None else kwargs
+        if orig_method in cls._func_dispatch:
+            impl = cls._func_dispatch.pop(orig_method)    # avoid recursion
+            proxy = impl(*args, **kwargs)
+            cls._func_dispatch[orig_method] = impl
+            return proxy
+        else:
+            proxy = cls.from_torch_proxy(super().__torch_function__(orig_method, types, args, kwargs))
+            unwrap_fn = lambda p: p.meta_data if isinstance(p, ColoProxy) else p
+            if proxy.meta_data is None:
+                proxy.meta_data = orig_method(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
+            return proxy
+
+    @classmethod
+    def from_torch_proxy(cls, proxy: Proxy):
+        return cls(proxy.node, proxy.tracer)
+
+    def __repr__(self):
+        return f"ColoProxy({self.node.name}, meta_data={self.meta_data})"
+
+    def __len__(self):
+        return len(self.meta_data)
+
+    def __int__(self):
+        return int(self.meta_data)
+
+    def __index__(self):
+        try:
+            return int(self.meta_data)
+        except:
+            return torch.zeros(self.meta_data.shape, dtype=torch.bool).numpy().__index__()
+
+    def __float__(self):
+        return float(self.meta_data)
+
+    def __bool__(self):
+        return self.meta_data
+
+    def __getattr__(self, k):
+        return ColoAttribute(self, k, getattr(self._meta_data, k, None))
+
+    def __setitem__(self, key, value):
+        proxy = self.tracer.create_proxy('call_function', operator.setitem, (self, key, value), {})
+        proxy.meta_data = self._meta_data
+        return proxy
+
+    def __contains__(self, key):
+        if self.node.op == "placeholder":
+            # this is used to handle like
+            # if x in kwargs
+            # we don't handle this case for now
+            return False
+        return super().__contains__(key)
+
+    def __isinstancecheck__(self, type):
+        return isinstance(self.meta_data, type)
+
+
+class ColoAttribute(ColoProxy):
+
+    def __init__(self, root, attr: str, data=None):
+        self.root = root
+        self.attr = attr
+        self.tracer = root.tracer
+        self._meta_data = data
+        self._node: Optional[Node] = None
+
+    @property
+    def node(self):
+        # the node for attributes is added lazily, since most will just be method calls
+        # which do not rely on the getitem call
+        if self._node is None:
+            self._node = self.tracer.create_proxy('call_function', getattr, (self.root, self.attr), {}).node
+        return self._node
+
+    def __call__(self, *args, **kwargs):
+        return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs)
+
+    def __repr__(self):
+        return f"ColoAttribute({self.node.name}, attr={self.attr})"
diff --git a/colossalai/_analyzer/fx/tracer/symbolic_trace.py b/colossalai/_analyzer/fx/tracer/symbolic_trace.py
new file mode 100644
index 000000000000..2018863f6f5f
--- /dev/null
+++ b/colossalai/_analyzer/fx/tracer/symbolic_trace.py
@@ -0,0 +1,157 @@
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union
+
+import torch
+from torch.fx import Tracer
+from torch.utils._pytree import tree_map
+
+from colossalai._analyzer._subclasses import MetaTensor
+
+try:
+    from ..codegen import ActivationCheckpointCodeGen
+    SUPPORT_ACTIVATION = True
+except:
+    SUPPORT_ACTIVATION = False
+from ..graph_module import ColoGraphModule
+from .tracer import ColoTracer
+
+
+def _default_device():
+    return torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+
+
+def _current_device(module: torch.nn.Module):
+    try:
+        return next(module.parameters()).device
+    except:
+        return _default_device()
+
+
+def symbolic_trace(
+    root: Union[torch.nn.Module, Callable[..., Any]],
+    concrete_args: Optional[Dict[str, Any]] = None,
+    meta_args: Optional[Dict[str, Any]] = None,
+    trace_act_ckpt: bool = False,
+    bias_addition_split: bool = False,
+) -> ColoGraphModule:
+    """
+    Traces a ``torch.nn.Module`` or a function and returns a ``GraphModule`` with ``Node``s and ``MetaInfo``
+    attached to the ``Node``s.
+
+    Can be used to trace the usage of ``torch.utils.checkpoint`` and the path of module
+    (https://github.com/pytorch/examples/blob/main/fx/module_tracer.py).
+
+    This tracer is able to trace basic control flow and for loops.
+
+    It will split the bias addition into two parts if ``bias_addition_split`` is set to be ``True``.
+    (See ./bias_addition.py for more details).
+
+    Examples:
+    1. Tracing a ``torch.nn.Module`` with control flow.
+
+    .. code-block:: python
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2)
+
+            def forward(self, x):
+                if x.size(0) > 1:
+                    x = x.sum(dim=0)
+                return self.linear(x)
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)})
+
+        # traced code like:
+        # def forward(self, x):
+        #     linear_1 = self.linear(x)
+        #     return linear_1
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(2, 2, 2)})
+
+        # traced code like:
+        # def forward(self, x):
+        #     sum = x.sum(dim=0); x = None
+        #     linear = self.linear(sum); sum = None
+        #     return linear
+
+    2. Tracing a ``torch.nn.Module`` with ``torch.utils.checkpoint``.
+
+    .. code-block:: python
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2)
+
+            def forward(self, x):
+                def custom_forward(x):
+                    return self.linear(x)
+                return torch.utils.checkpoint.checkpoint(custom_forward, x)
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)}, trace_act_ckpt=True)
+
+        # traced code like:
+        # def checkpoint_0(self, x):
+        #     linear = self.linear(x); x = None
+        #     return linear
+        #
+        # def forward(self, x):
+        #     linear = torch.utils.checkpoint.checkpoint(checkpoint_0, x); x = None
+        #     return linear
+
+    3. Tracing a ``torch.nn.Module`` with ``bias_addition_split``.
+
+    .. code-block:: python
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2, bias=True)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)}, bias_addition_split=True)
+
+        # traced code like:
+        # def forward(self, x):
+        #     linear_bias = self.linear.bias
+        #     linear_weight = self.linear.weight
+        #     linear = torch._C._nn.linear(x, linear_weight);  x = linear_weight = None
+        #     add = linear + linear_bias;  linear = linear_bias = None
+        #     return add
+
+    Args:
+        root (Union[torch.nn.Module, Callable[..., Any]]): The ``torch.nn.Module`` or function to be traced.
+        concrete_args (Optional[Dict[str, Any]], optional): Concrete arguments to be passed to the ``root``.
+            Defaults to {}.
+        meta_args (Optional[Dict[str, Any]], optional): Meta arguments to be passed to the ``root``. Mostly used
+            for tracing control flow. Defaults to {}.
+        trace_act_ckpt (bool, optional): Whether to trace the usage of ``torch.utils.checkpoint``.
+            Defaults to False.
+        bias_addition_split (bool, optional): Whether to split the bias addition into two parts. Defaults to False.
+
+    Returns:
+        ColoGraphModule: A traced ``GraphModule`` that is ready for activation checkpoint ``CodeGen``.
+
+    Remarks:
+        This part of ``symbolic_trace()`` is maintained by Colossal-AI team. If you encountered
+        any unexpected error during tracing, feel free to raise an issue on Colossal-AI GitHub
+        repo. We welcome any feedback and contributions to enhance the extensibility of
+        Colossal-AI.
+    """
+    if meta_args:
+        device, orig_device = _default_device(), _current_device(root)
+        wrap_fn = lambda elem: MetaTensor(elem, device=device) if isinstance(elem, torch.Tensor) else elem
+        graph = ColoTracer(trace_act_ckpt=trace_act_ckpt,
+                           bias_addition_split=bias_addition_split).trace(root.to(device),
+                                                                          concrete_args=concrete_args,
+                                                                          meta_args=tree_map(wrap_fn, meta_args))
+        if trace_act_ckpt and SUPPORT_ACTIVATION:
+            graph.set_codegen(ActivationCheckpointCodeGen())
+        root.to(orig_device)
+    else:
+        graph = Tracer().trace(root, concrete_args=concrete_args)
+    name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
+    return ColoGraphModule(root, graph, name)
diff --git a/colossalai/_analyzer/fx/symbolic_trace.py b/colossalai/_analyzer/fx/tracer/tracer.py
similarity index 53%
rename from colossalai/_analyzer/fx/symbolic_trace.py
rename to colossalai/_analyzer/fx/tracer/tracer.py
index 5d858c87a3c8..1a247449f3d8 100644
--- a/colossalai/_analyzer/fx/symbolic_trace.py
+++ b/colossalai/_analyzer/fx/tracer/tracer.py
@@ -1,28 +1,19 @@
 import functools
 import inspect
-import operator
 from contextlib import contextmanager
-from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union
+from typing import Any, Callable, Dict, Iterable, Optional, Set, Tuple, Type, Union
 
 import torch
 import torch.nn as nn
 from torch.fx import Graph, Node, Proxy, Tracer
-from torch.fx.graph import _Namespace
 from torch.utils._pytree import tree_map
 
-from colossalai._analyzer._subclasses import MetaTensor, _TensorPropertyMethod, _TorchFactoryMethod
+from colossalai._analyzer._subclasses import _TensorPropertyMethod, _TorchFactoryMethod
 
-from .codegen import ActivationCheckpointCodeGen
-from .graph_module import ColoGraphModule
-from .node_util import MetaInfo
+from ..node_util import MetaInfo
+from .proxy import ColoProxy
 
 Target = Union[Callable[..., Any], str]
-Argument = Optional[Union[Tuple[Any, ...],    # actually Argument, but mypy can't represent recursive types
-                          List[Any],    # actually Argument
-                          Dict[str, Any],    # actually Argument
-                          slice,    # Slice[Argument, Argument, Argument], but slice is not a templated type in typing
-                          'Node',]]
-zeros = torch.zeros
 
 
 def _truncate_suffix(s: str):
@@ -32,17 +23,6 @@ def _truncate_suffix(s: str):
     return re.sub(r'_\d+$', '', s)
 
 
-def _default_device():
-    return torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
-
-
-def _current_device(module):
-    try:
-        return next(module.parameters()).device
-    except:
-        return _default_device()
-
-
 def register_tracer_impl(func: Callable[..., Any], name: Optional[str] = '_custom_impl'):
 
     def wrapper(impl):
@@ -70,149 +50,6 @@ def register_non_leaf_module(module: nn.Module):
     ColoTracer._custom_non_leaf_module.add(module)
 
 
-class ColoProxy(Proxy):
-    _func_dispatch: Dict[Target, Callable[..., Any]] = {}
-
-    def __init__(self, *args, data=None, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._meta_data = data
-
-    @property
-    def meta_data(self):
-        return self._meta_data
-
-    @meta_data.setter
-    def meta_data(self, args):
-        wrap_fn = lambda x: MetaTensor(x) if isinstance(x, torch.Tensor) else x
-        self._meta_data = tree_map(wrap_fn, args)
-
-    @classmethod
-    def __torch_function__(cls, orig_method, types, args=(), kwargs=None):
-        kwargs = {} if kwargs is None else kwargs
-        if orig_method in cls._func_dispatch:
-            impl = cls._func_dispatch.pop(orig_method)    # avoid recursion
-            proxy = impl(*args, **kwargs)
-            cls._func_dispatch[orig_method] = impl
-            return proxy
-        else:
-            proxy = cls.from_torch_proxy(super().__torch_function__(orig_method, types, args, kwargs))
-            unwrap_fn = lambda p: p.meta_data if isinstance(p, ColoProxy) else p
-            if proxy.meta_data is None:
-                proxy.meta_data = orig_method(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
-            return proxy
-
-    @classmethod
-    def from_torch_proxy(cls, proxy: Proxy):
-        return cls(proxy.node, proxy.tracer)
-
-    def __repr__(self):
-        return f"ColoProxy({self.node.name}, meta_data={self.meta_data})"
-
-    def __len__(self):
-        return len(self.meta_data)
-
-    def __int__(self):
-        return int(self.meta_data)
-
-    def __index__(self):
-        try:
-            return int(self.meta_data)
-        except:
-            return zeros(self.meta_data.shape, dtype=torch.bool).numpy().__index__()
-
-    def __float__(self):
-        return float(self.meta_data)
-
-    def __bool__(self):
-        return self.meta_data
-
-    def __getattr__(self, k):
-        return ColoAttribute(self, k, getattr(self._meta_data, k, None))
-
-    def __setitem__(self, key, value):
-        proxy = self.tracer.create_proxy('call_function', operator.setitem, (self, key, value), {})
-        proxy.meta_data = self._meta_data
-        return proxy
-
-    def __contains__(self, key):
-        if self.node.op == "placeholder":
-            # this is used to handle like
-            # if x in kwargs
-            # we don't handle this case for now
-            return False
-        return super().__contains__(key)
-
-    def __isinstancecheck__(self, type):
-        return isinstance(self.meta_data, type)
-
-    def size(self, dim=None):
-        if self._meta_data is None:
-            return self._meta_data.size(*[dim] if dim else [])
-        return self.tracer.create_proxy('call_method', 'size', (self, dim) if dim else (self,), {})
-
-    def dim(self):
-        if self._meta_data is not None:
-            return self._meta_data.dim()
-        return self.tracer.create_proxy('call_method', 'dim', (self,), {})
-
-    @property
-    def shape(self):
-        if self._meta_data is not None:
-            return self._meta_data.shape
-        return self.tracer.create_proxy('call_function', getattr, (self, 'shape'), {})
-
-    @property
-    def ndim(self):
-        if self._meta_data is not None:
-            return self._meta_data.ndim
-        return self.tracer.create_proxy('call_function', getattr, (self, 'ndim'), {})
-
-    @property
-    def device(self):
-        if self._meta_data is not None:
-            return self._meta_data.device
-        return self.tracer.create_proxy('call_function', getattr, (self, 'device'), {})
-
-    @property
-    def dtype(self):
-        if self._meta_data is not None:
-            return self._meta_data.dtype
-        return self.tracer.create_proxy('call_function', getattr, (self, 'dtype'), {})
-
-    def to(self, *args, **kwargs):
-        return self.tracer.create_proxy('call_method', 'to', (self, *args), {**kwargs})
-
-    def cpu(self, *args, **kwargs):
-        return self.tracer.create_proxy('call_method', 'cpu', (self, *args), {**kwargs})
-
-    def cuda(self, *args, **kwargs):
-        return self.tracer.create_proxy('call_method', 'cuda', (self, *args), {**kwargs})
-
-
-class ColoAttribute(ColoProxy):
-
-    def __init__(self, root, attr: str, data=None):
-        self.root = root
-        self.attr = attr
-        self.tracer = root.tracer
-        self._meta_data = data
-        self._node: Optional[Node] = None
-
-    @property
-    def node(self):
-        # the node for attributes is added lazily, since most will just be method calls
-        # which do not rely on the getitem call
-        if self._node is None:
-            self._node = self.tracer.create_proxy('call_function', getattr, (self.root, self.attr), {}).node
-        return self._node
-
-    def __call__(self, *args, **kwargs):
-        return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs)
-
-    def __repr__(self):
-        return f"ColoAttribute({self.node.name}, attr={self.attr})"
-
-
 class ColoTracer(Tracer):
     _custom_leaf_module: Set[Type[nn.Module]] = set()
     _custom_leaf_module_impl: Dict[Type[nn.Module], Callable[..., Any]] = {}
@@ -249,7 +86,6 @@ def is_leaf_module(self, m: nn.Module, module_qualified_name: str) -> bool:
         # we will enter the module and split the bias-addition ops
         if self.bias_addition_split and type(m) in self._bias_addition_module and m.bias is not None:
             return False
-
         # user can specify which modules are leaf modules and which are not
         return (type(m) not in self._custom_non_leaf_module
                 and (type(m) in self._custom_leaf_module or super().is_leaf_module(m, module_qualified_name)))
@@ -306,9 +142,13 @@ def create_proxy(self,
             mod = self.root.get_submodule(target)
             self.disable_module_getattr = True
             try:
-                proxy.meta_data = self._custom_leaf_module_impl.get(type(mod),
-                                                                    mod.forward)(*tree_map(unwrap_fn, args),
-                                                                                 **tree_map(unwrap_fn, kwargs))
+                args = tree_map(unwrap_fn, args)
+                kwargs = tree_map(unwrap_fn, kwargs)
+                if type(mod) in self._custom_leaf_module:
+                    target = self._custom_leaf_module_impl[type(mod)]
+                    proxy.meta_data = target(mod, *args, **kwargs)
+                else:
+                    proxy.meta_data = mod.forward(*args, **kwargs)
             finally:
                 self.disable_module_getattr = False
         return proxy
@@ -320,15 +160,21 @@ def create_node(self, *args, **kwargs) -> Node:
 
     def trace(self,
               root: torch.nn.Module,
-              concrete_args: Optional[Dict[str, torch.Tensor]] = {},
-              meta_args: Optional[Dict[str, torch.Tensor]] = {}) -> Graph:
+              concrete_args: Optional[Dict[str, torch.Tensor]] = None,
+              meta_args: Optional[Dict[str, torch.Tensor]] = None) -> Graph:
+
+        if meta_args is None:
+            meta_args = {}
+
+        if concrete_args is None:
+            concrete_args = {}
 
         # check concrete and meta args have valid names
         sig = inspect.signature(root.forward)
         sig_names = set(sig.parameters.keys())
         meta_arg_names = set(meta_args.keys())
         concrete_arg_names = set(concrete_args.keys())
-
+        non_concrete_arg_names = sig_names - concrete_arg_names
         # update concrete args with default values
         for k, v in sig.parameters.items():
             if k in sig_names - meta_arg_names and \
@@ -352,6 +198,34 @@ def _check_arg_name_valid(names: Iterable[str]):
             self.graph = super().trace(root, concrete_args=concrete_args)
             self.mod_dir = ''
         self.graph.lint()
+
+        for node in self.graph.nodes:
+            if node.op == "placeholder":
+                # Removing default values for inputs as the forward pass will fail with them.
+                if node.target in non_concrete_arg_names:
+                    node.args = ()
+                    # Without this, torch.jit.script fails because the inputs type is Optional[torch.Tensor].
+                    # It cannot infer on the attributes and methods the input should have, and fails.
+                    node.type = torch.Tensor
+                # It is a concrete arg so it is not used and should be removed.
+                else:
+                    if hasattr(torch.fx._symbolic_trace, "_assert_is_none"):
+                        # Newer versions of torch.fx emit an assert statement
+                        # for concrete arguments; delete those before we delete
+                        # the concrete arg.
+                        to_delete = []
+                        for user in node.users:
+                            if user.target == torch.fx._symbolic_trace._assert_is_none:
+                                to_delete.append(user)
+                        for user in to_delete:
+                            self.graph.erase_node(user)
+
+                    self.graph.erase_node(node)
+
+            # TODO: solves GraphModule creation.
+            # Without this, return type annotation "Tuple" is causing code execution failure.
+            if node.op == "output":
+                node.type = None
         return self.graph
 
     @contextmanager
@@ -454,7 +328,7 @@ def _post_check(self, non_concrete_arg_names: Set[str]):
             if node.op == "output":
                 node.type = None
             self.graph.lint()
-     
+
     def getattr(self, attr, attr_val, parameter_proxy_cache):
         return self._module_getattr(attr, attr_val, parameter_proxy_cache)
 
@@ -487,134 +361,3 @@ def maybe_get_proxy_for_attr(attr_val, collection_to_search, parameter_proxy_cac
                 return maybe_parameter_proxy
 
         return attr_val
-
-
-def symbolic_trace(
-    root: Union[torch.nn.Module, Callable[..., Any]],
-    concrete_args: Optional[Dict[str, Any]] = {},
-    meta_args: Optional[Dict[str, Any]] = {},
-    trace_act_ckpt: bool = False,
-    bias_addition_split: bool = False,
-) -> ColoGraphModule:
-    """
-    Traces a ``torch.nn.Module`` or a function and returns a ``GraphModule`` with ``Node``s and ``MetaInfo``
-    attached to the ``Node``s.
-
-    Can be used to trace the usage of ``torch.utils.checkpoint`` and the path of module
-    (https://github.com/pytorch/examples/blob/main/fx/module_tracer.py).
-
-    This tracer is able to trace basic control flow and for loops.
-
-    It will split the bias addition into two parts if ``bias_addition_split`` is set to be ``True``.
-    (See ./bias_addition.py for more details).
-
-    Examples:
-    1. Tracing a ``torch.nn.Module`` with control flow.
-
-    .. code-block:: python
-
-        class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.linear = torch.nn.Linear(2, 2)
-
-            def forward(self, x):
-                if x.size(0) > 1:
-                    x = x.sum(dim=0)
-                return self.linear(x)
-
-        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)})
-
-        # traced code like:
-        # def forward(self, x):
-        #     linear_1 = self.linear(x)
-        #     return linear_1
-
-        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(2, 2, 2)})
-
-        # traced code like:
-        # def forward(self, x):
-        #     sum = x.sum(dim=0); x = None
-        #     linear = self.linear(sum); sum = None
-        #     return linear
-
-    2. Tracing a ``torch.nn.Module`` with ``torch.utils.checkpoint``.
-
-    .. code-block:: python
-
-        class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.linear = torch.nn.Linear(2, 2)
-
-            def forward(self, x):
-                def custom_forward(x):
-                    return self.linear(x)
-                return torch.utils.checkpoint.checkpoint(custom_forward, x)
-
-        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)}, trace_act_ckpt=True)
-
-        # traced code like:
-        # def checkpoint_0(self, x):
-        #     linear = self.linear(x); x = None
-        #     return linear
-        #
-        # def forward(self, x):
-        #     linear = torch.utils.checkpoint.checkpoint(checkpoint_0, x); x = None
-        #     return linear
-
-    3. Tracing a ``torch.nn.Module`` with ``bias_addition_split``.
-
-    .. code-block:: python
-
-        class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.linear = torch.nn.Linear(2, 2, bias=True)
-
-            def forward(self, x):
-                return self.linear(x)
-
-        traced = symbolic_trace(MyModule(), meta_args={'x': torch.randn(1, 2, 2)}, bias_addition_split=True)
-
-        # traced code like:
-        # def forward(self, x):
-        #     linear_bias = self.linear.bias
-        #     linear_weight = self.linear.weight
-        #     linear = torch._C._nn.linear(x, linear_weight);  x = linear_weight = None
-        #     add = linear + linear_bias;  linear = linear_bias = None
-        #     return add
-
-    Args:
-        root (Union[torch.nn.Module, Callable[..., Any]]): The ``torch.nn.Module`` or function to be traced.
-        concrete_args (Optional[Dict[str, Any]], optional): Concrete arguments to be passed to the ``root``.
-            Defaults to {}.
-        meta_args (Optional[Dict[str, Any]], optional): Meta arguments to be passed to the ``root``. Mostly used
-            for tracing control flow. Defaults to {}.
-        trace_act_ckpt (bool, optional): Whether to trace the usage of ``torch.utils.checkpoint``.
-            Defaults to False.
-        bias_addition_split (bool, optional): Whether to split the bias addition into two parts. Defaults to False.
-
-    Returns:
-        ColoGraphModule: A traced ``GraphModule`` that is ready for activation checkpoint ``CodeGen``.
-
-    Remarks:
-        This part of ``symbolic_trace()`` is maintained by Colossal-AI team. If you encountered
-        any unexpected error during tracing, feel free to raise an issue on Colossal-AI GitHub
-        repo. We welcome any feedback and contributions to enhance the extensibility of
-        Colossal-AI.
-    """
-    if meta_args:
-        device, orig_device = _default_device(), _current_device(root)
-        wrap_fn = lambda elem: MetaTensor(elem, device=device) if isinstance(elem, torch.Tensor) else elem
-        graph = ColoTracer(trace_act_ckpt=trace_act_ckpt,
-                           bias_addition_split=bias_addition_split).trace(root.to(device),
-                                                                          concrete_args=concrete_args,
-                                                                          meta_args=tree_map(wrap_fn, meta_args))
-        if trace_act_ckpt:
-            graph.set_codegen(ActivationCheckpointCodeGen())
-        root.to(orig_device)
-    else:
-        graph = Tracer().trace(root, concrete_args=concrete_args)
-    name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
-    return ColoGraphModule(root, graph, name)
diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 710038ffa387..466a2a558829 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,5 +1,4 @@
 from . import diffusers, timm, torchaudio, torchrec, torchvision, transformers
-
 from .registry import model_zoo
 
 __all__ = ['model_zoo']
diff --git a/tests/kit/model_zoo/transformers/gpt.py b/tests/kit/model_zoo/transformers/gpt.py
index 2a100c981dea..5ed4fbe70dc9 100644
--- a/tests/kit/model_zoo/transformers/gpt.py
+++ b/tests/kit/model_zoo/transformers/gpt.py
@@ -17,6 +17,14 @@ def data_gen():
     return dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
 
 
+def seq_classification_data_gen():
+    # batch sizes should be 1 if no padding token is defined.
+    input_ids = torch.zeros((1, SEQ_LENGTH), dtype=torch.int64)
+    token_type_ids = torch.zeros((1, SEQ_LENGTH), dtype=torch.int64)
+    attention_mask = torch.zeros((1, SEQ_LENGTH), dtype=torch.int64)
+    return dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+
+
 output_transform_fn = lambda x: x
 
 config = transformers.GPT2Config(n_position=64, n_layer=2, n_head=4)
@@ -44,6 +52,6 @@ def data_gen():
                    model_attribute=ModelAttribute(has_control_flow=True))
 model_zoo.register(name='transformers_gpt_for_sequence_classification',
                    model_fn=lambda: transformers.GPT2ForSequenceClassification(config),
-                   data_gen_fn=data_gen,
+                   data_gen_fn=seq_classification_data_gen,
                    output_transform_fn=output_transform_fn,
                    model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_analyzer/test_fx/test_bias_addition.py b/tests/test_analyzer/test_fx/test_bias_addition.py
index 5c9ec7cc3477..044a464be8ef 100644
--- a/tests/test_analyzer/test_fx/test_bias_addition.py
+++ b/tests/test_analyzer/test_fx/test_bias_addition.py
@@ -1,5 +1,6 @@
 import pytest
 import torch
+from packaging import version
 from torch.utils.checkpoint import checkpoint
 
 try:
@@ -73,7 +74,7 @@ def forward(self, x):
         return x
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("bias_addition_split", [True, False])
 @pytest.mark.parametrize("shape", [(3, 3, 3), (3, 3, 3, 3)])
diff --git a/tests/test_fx/test_tracer/test_hf_model/hf_tracer_utils.py b/tests/test_fx/test_tracer/test_hf_model/hf_tracer_utils.py
index 6d93fe0408d7..7a4bf131ae36 100644
--- a/tests/test_fx/test_tracer/test_hf_model/hf_tracer_utils.py
+++ b/tests/test_fx/test_tracer/test_hf_model/hf_tracer_utils.py
@@ -3,7 +3,8 @@
 from torch.fx import GraphModule
 from torch.utils._pytree import tree_flatten
 
-from colossalai.fx import symbolic_trace
+# from colossalai.fx import symbolic_trace
+from colossalai._analyzer.fx import symbolic_trace
 
 
 def trace_model_and_compare_output(model, data_gen):
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
index b1c9c211a9a0..31ba2290ed99 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
@@ -1,4 +1,7 @@
+import pytest
+import torch
 from hf_tracer_utils import trace_model_and_compare_output
+from packaging import version
 
 from tests.kit.model_zoo import model_zoo
 
@@ -6,6 +9,7 @@
 SEQ_LENGTH = 16
 
 
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 def test_albert():
     sub_registry = model_zoo.get_sub_registry('transformers_albert')
 
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py
index 1bf4947c31a0..8db6817c66dc 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_bert.py
@@ -1,8 +1,12 @@
+import pytest
+import torch
 from hf_tracer_utils import trace_model_and_compare_output
+from packaging import version
 
 from tests.kit.model_zoo import model_zoo
 
 
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 def test_bert():
     sub_registry = model_zoo.get_sub_registry('transformers_bert')
 
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
index 67a3178fae1b..796c17e398d5 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
@@ -1,16 +1,24 @@
 import pytest
+import torch
 from hf_tracer_utils import trace_model_and_compare_output
+from packaging import version
 
 from tests.kit.model_zoo import model_zoo
 
 
-# TODO: remove this skip once we handle the latest gpt model
-@pytest.mark.skip
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 def test_gpt():
     sub_registry = model_zoo.get_sub_registry('transformers_gpt')
 
     for name, (model_fn, data_gen_fn, _, _) in sub_registry.items():
         model = model_fn()
+
+        # TODO: support the following models
+        # 1. GPT2DoubleHeadsModel
+        # as they are not supported, let's skip them
+        if model.__class__.__name__ in ['GPT2DoubleHeadsModel']:
+            continue
+
         trace_model_and_compare_output(model, data_gen_fn)
 
 
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py
index 740f5a9f0c57..e7bfa607082e 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_opt.py
@@ -1,8 +1,12 @@
+import pytest
+import torch
 from hf_tracer_utils import trace_model_and_compare_output
+from packaging import version
 
 from tests.kit.model_zoo import model_zoo
 
 
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 def test_opt():
     sub_registry = model_zoo.get_sub_registry('transformers_opt')
 
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py
index 7073fd63470b..5f7e4f81c44e 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_t5.py
@@ -1,8 +1,12 @@
+import pytest
+import torch
 from hf_tracer_utils import trace_model_and_compare_output
+from packaging import version
 
 from tests.kit.model_zoo import model_zoo
 
 
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 def test_t5():
     sub_registry = model_zoo.get_sub_registry('transformers_t5')
 
diff --git a/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py b/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py
index 31baa3e89798..b175d8b10c67 100644
--- a/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py
+++ b/tests/test_fx/test_tracer/test_timm_model/test_timm_model.py
@@ -1,8 +1,8 @@
 import pytest
-import timm.models as tm
 import torch
+from packaging import version
 
-from colossalai.fx import symbolic_trace
+from colossalai._analyzer.fx import symbolic_trace
 from tests.kit.model_zoo import model_zoo
 
 
@@ -42,6 +42,7 @@ def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
             f'{model.__class__.__name__} has inconsistent outputs, {fx_output_val} vs {non_fx_output_val}'
 
 
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 def test_timm_models():
     torch.backends.cudnn.deterministic = True
 
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py
index bf6c7ae551ab..65f9f5149dda 100644
--- a/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py
+++ b/tests/test_fx/test_tracer/test_torchaudio_model/test_torchaudio_model.py
@@ -1,20 +1,18 @@
-import re
-
+import pytest
 import torch
+from packaging import version
 from torchaudio_utils import trace_and_compare
 
 from tests.kit.model_zoo import model_zoo
 
 
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 def test_torchaudio_models():
     torch.backends.cudnn.deterministic = True
 
     sub_model_zoo = model_zoo.get_sub_registry('torchaudio')
 
     for name, (model_fn, data_gen_fn, output_transform_fn, attribute) in sub_model_zoo.items():
-        # FIXME(ver217): temporarily skip these models
-        if re.search(f'(conformer|emformer|tacotron|wav2vec2_base|hubert_base)', name):
-            continue
         model = model_fn()
         trace_and_compare(model,
                           data_gen_fn,
diff --git a/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py b/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py
index 18d86fc05941..239f38680cec 100644
--- a/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py
+++ b/tests/test_fx/test_tracer/test_torchaudio_model/torchaudio_utils.py
@@ -1,6 +1,6 @@
 import torch
 
-from colossalai.fx import symbolic_trace
+from colossalai._analyzer.fx import symbolic_trace
 
 
 def trace_and_compare(model, data_gen, output_transform_fn, need_meta=False, need_concrete=False):
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
index a4e847dbcfcd..40f83d47a7cc 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-from colossalai.fx import symbolic_trace
+from colossalai._analyzer.fx import symbolic_trace
 from tests.kit.model_zoo import model_zoo
 
 BATCH = 2
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
index ac377ff1d5f8..6d4b6ab81b12 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-from colossalai.fx import symbolic_trace
+from colossalai._analyzer.fx import symbolic_trace
 from tests.kit.model_zoo import model_zoo
 
 BATCH = 2
diff --git a/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py b/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py
index 455638818463..8dbbf9f5aab7 100644
--- a/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py
+++ b/tests/test_fx/test_tracer/test_torchvision_model/test_torchvision_model.py
@@ -1,6 +1,6 @@
 import torch
 
-from colossalai.fx import symbolic_trace
+from colossalai._analyzer.fx import symbolic_trace
 from tests.kit.model_zoo import model_zoo
 
 
From 019a847432f850d790912b5fc1e048d85fe99e2a Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Wed, 22 Mar 2023 13:38:11 +0800
Subject: [PATCH 494/503] [Analyzer] fix analyzer tests (#3197)

---
 .../test_fx/test_bias_addition.py             | 33 +++++++-----
 .../test_analyzer/test_fx/test_shape_prop.py  | 30 ++++++-----
 .../test_fx/test_symbolic_profile.py          | 18 ++++---
 tests/test_analyzer/test_fx/zoo.py            |  8 +--
 .../test_subclasses/test_flop_tensor.py       | 11 ++--
 .../test_subclasses/test_meta_mode.py         |  7 +--
 tests/test_analyzer/test_subclasses/zoo.py    | 53 -------------------
 7 files changed, 60 insertions(+), 100 deletions(-)
 delete mode 100644 tests/test_analyzer/test_subclasses/zoo.py

diff --git a/tests/test_analyzer/test_fx/test_bias_addition.py b/tests/test_analyzer/test_fx/test_bias_addition.py
index 044a464be8ef..61951e9a5da9 100644
--- a/tests/test_analyzer/test_fx/test_bias_addition.py
+++ b/tests/test_analyzer/test_fx/test_bias_addition.py
@@ -3,6 +3,8 @@
 from packaging import version
 from torch.utils.checkpoint import checkpoint
 
+from colossalai.testing.utils import parameterize
+
 try:
     from colossalai._analyzer.fx import symbolic_trace
 except:
@@ -56,9 +58,13 @@ def __init__(self, bias) -> None:
         self.linear = LinearModel(3, 3, bias)
         self.conv = ConvModel(3, 6, 3, bias)
 
-    def forward(self, x, select=0):
+    def forward(self, x, select=torch.Tensor([0])):
         x = self.linear(x)
-        x = checkpoint(self.conv, x, select)
+        if select:
+            x = checkpoint(self.conv, x, 0)
+        else:
+            x = checkpoint(self.conv, x, 1)
+
         return x
 
 
@@ -75,10 +81,10 @@ def forward(self, x):
 
 
 @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
-@pytest.mark.parametrize("bias", [True, False])
-@pytest.mark.parametrize("bias_addition_split", [True, False])
-@pytest.mark.parametrize("shape", [(3, 3, 3), (3, 3, 3, 3)])
-@pytest.mark.parametrize("select", [0, 1])
+@parameterize("bias", [True, False])
+@parameterize("bias_addition_split", [True, False])
+@parameterize("shape", [(3, 3, 3), (3, 3, 3, 3)])
+@parameterize("select", [torch.Tensor([0]), torch.Tensor([1])])
 def test_siu_model(bias, bias_addition_split, shape, select):
     model = SiuModel(bias=bias)
     x = torch.rand(shape)
@@ -87,18 +93,18 @@ def test_siu_model(bias, bias_addition_split, shape, select):
                         concrete_args={'select': select},
                         trace_act_ckpt=True,
                         bias_addition_split=bias_addition_split)
-    assert torch.allclose(model(x, select), gm(x, select)), 'original model and traced model should be the same!'
+    assert torch.allclose(model(x, select), gm(x)), 'original model and traced model should be the same!'
     if bias and bias_addition_split:
         assert '+' in gm.code, 'bias addition should be split!'
     else:
         assert '+' not in gm.code, 'bias addition should not be split!'
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
-@pytest.mark.parametrize("alpha", [1, 2])
-@pytest.mark.parametrize("beta", [1, 2])
-@pytest.mark.parametrize("bias_addition_split", [True, False])
-@pytest.mark.parametrize("shape", [(3, 3), (5, 5)])
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
+@parameterize("alpha", [1, 2])
+@parameterize("beta", [1, 2])
+@parameterize("bias_addition_split", [True, False])
+@parameterize("shape", [(3, 3), (5, 5)])
 def test_addmm_model(alpha, beta, bias_addition_split, shape):
     model = AddmmModel(alpha=alpha, beta=beta)
     x = torch.rand(shape)
@@ -111,4 +117,5 @@ def test_addmm_model(alpha, beta, bias_addition_split, shape):
 
 
 if __name__ == '__main__':
-    test_siu_model(True, True, (3, 3, 3))
+    test_siu_model()
+    test_addmm_model()
diff --git a/tests/test_analyzer/test_fx/test_shape_prop.py b/tests/test_analyzer/test_fx/test_shape_prop.py
index b19884a70fb2..08f4ff2cbd1f 100644
--- a/tests/test_analyzer/test_fx/test_shape_prop.py
+++ b/tests/test_analyzer/test_fx/test_shape_prop.py
@@ -1,16 +1,17 @@
 import pytest
-import timm.models as tmm
 import torch
 import torchvision.models as tm
-from .zoo import tm_models, tmm_models
+from packaging import version
+
+from colossalai.testing.utils import parameterize
+from tests.test_analyzer.test_fx.zoo import tm_models, tmm_models
 
 try:
     from colossalai._analyzer._subclasses import MetaTensorMode
     from colossalai._analyzer.fx import symbolic_trace
     from colossalai._analyzer.fx.passes.shape_prop import shape_prop_pass
     from colossalai._analyzer.fx.symbolic_profile import register_shape_impl
-    
-    
+
     @register_shape_impl(torch.nn.functional.linear)
     def linear_impl(*args, **kwargs):
         assert True
@@ -23,15 +24,15 @@ def _check_gm_validity(gm: torch.fx.GraphModule):
     for node in gm.graph.nodes:
         assert node.meta['info'].outputs, f'In {gm.__class__.__name__}, {node} has no output shape.'
         if node.op in [
-        # 'call_module',    # can apply to params
-        # 'call_function',  # can apply to params
-        # 'call_method',    # can apply to params
+                'call_module',    # can apply to params
+                'call_function',    # can apply to params
+                'call_method',    # can apply to params
         ]:
-            assert node.meta['info'].inputs, f'In {gm.__class__.__name__}, {node} has no input shape.'
+            assert hasattr(node.meta['info'], 'inputs'), f'In {gm.__class__.__name__}, {node} has no input shape.'
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
-@pytest.mark.parametrize('m', tm_models)
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
+@parameterize('m', tm_models)
 def test_torchvision_shape_prop(m):
     with MetaTensorMode():
         model = m()
@@ -44,8 +45,8 @@ def test_torchvision_shape_prop(m):
     _check_gm_validity(gm)
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
-@pytest.mark.parametrize('m', tmm_models)
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
+@parameterize('m', tmm_models)
 def test_timm_shape_prop(m):
     with MetaTensorMode():
         model = m()
@@ -53,11 +54,12 @@ def test_timm_shape_prop(m):
     meta_args = {
         "x": data,
     }
+
     gm = symbolic_trace(model, meta_args=meta_args)
     shape_prop_pass(gm, data)
     _check_gm_validity(gm)
 
 
 if __name__ == "__main__":
-    test_torchvision_shape_prop(tm.resnet18)
-    test_timm_shape_prop(tmm.vgg11)
+    test_torchvision_shape_prop()
+    test_timm_shape_prop()
diff --git a/tests/test_analyzer/test_fx/test_symbolic_profile.py b/tests/test_analyzer/test_fx/test_symbolic_profile.py
index 5f749e6f3c50..be781599f14b 100644
--- a/tests/test_analyzer/test_fx/test_symbolic_profile.py
+++ b/tests/test_analyzer/test_fx/test_symbolic_profile.py
@@ -1,8 +1,10 @@
 import pytest
-import timm.models as tmm
 import torch
 import torchvision.models as tm
-from .zoo import tm_models, tmm_models
+from packaging import version
+
+from colossalai.testing.utils import parameterize
+from tests.test_analyzer.test_fx.zoo import tm_models, tmm_models
 
 try:
     from colossalai._analyzer._subclasses import MetaTensorMode
@@ -16,8 +18,8 @@ def _check_gm_validity(gm: torch.fx.GraphModule):
         assert len(node.meta['info'].global_ctx), f'In {gm.__class__.__name__}, {node} has empty global context.'
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
-@pytest.mark.parametrize('m', tm_models)
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
+@parameterize('m', tm_models)
 def test_torchvision_profile(m, verbose=False, bias_addition_split=False):
     with MetaTensorMode():
         model = m()
@@ -30,8 +32,8 @@ def test_torchvision_profile(m, verbose=False, bias_addition_split=False):
     _check_gm_validity(gm)
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
-@pytest.mark.parametrize('m', tmm_models)
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
+@parameterize('m', tmm_models)
 def test_timm_profile(m, verbose=False, bias_addition_split=False):
     with MetaTensorMode():
         model = m()
@@ -45,5 +47,5 @@ def test_timm_profile(m, verbose=False, bias_addition_split=False):
 
 
 if __name__ == "__main__":
-    test_torchvision_profile(tm.vit_b_16, verbose=True, bias_addition_split=False)
-    test_timm_profile(tmm.gmlp_b16_224, verbose=True, bias_addition_split=False)
+    test_torchvision_profile()
+    test_timm_profile()
diff --git a/tests/test_analyzer/test_fx/zoo.py b/tests/test_analyzer/test_fx/zoo.py
index 925078d0dcbe..a96aa3949134 100644
--- a/tests/test_analyzer/test_fx/zoo.py
+++ b/tests/test_analyzer/test_fx/zoo.py
@@ -33,18 +33,18 @@
     tmm.dm_nfnet_f0,
     tmm.eca_nfnet_l0,
     tmm.efficientformer_l1,
-    tmm.ese_vovnet19b_dw,
+    # tmm.ese_vovnet19b_dw,
     tmm.gmixer_12_224,
     tmm.gmlp_b16_224,
-    tmm.hardcorenas_a,
+    # tmm.hardcorenas_a,
     tmm.hrnet_w18_small,
     tmm.inception_v3,
     tmm.mixer_b16_224,
     tmm.nf_ecaresnet101,
     tmm.nf_regnet_b0,
     # tmm.pit_b_224,  # pretrained only
-    tmm.regnetv_040,
-    tmm.skresnet18,
+    # tmm.regnetv_040,
+    # tmm.skresnet18,
     # tmm.swin_base_patch4_window7_224,     # fx bad case
     # tmm.tnt_b_patch16_224,    # bad case
     tmm.vgg11,
diff --git a/tests/test_analyzer/test_subclasses/test_flop_tensor.py b/tests/test_analyzer/test_subclasses/test_flop_tensor.py
index 551628103325..752836141fe7 100644
--- a/tests/test_analyzer/test_subclasses/test_flop_tensor.py
+++ b/tests/test_analyzer/test_subclasses/test_flop_tensor.py
@@ -1,9 +1,10 @@
 import pytest
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 import torchvision.models as tm
-from .zoo import tm_models, tmm_models
+from packaging import version
+
+from tests.test_analyzer.test_fx.zoo import tm_models, tmm_models
 
 try:
     from colossalai._analyzer._subclasses import MetaTensorMode, flop_count
@@ -11,7 +12,7 @@
     pass
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 @pytest.mark.parametrize('m', tm_models + tmm_models)
 def test_flop_count_module(m):
     x = torch.rand(2, 3, 224, 224)
@@ -37,7 +38,7 @@ def test_flop_count_module(m):
 ]
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 @pytest.mark.parametrize('func, args, kwargs', odd_cases)
 def test_flop_count_function(func, args, kwargs):
     rs_fwd, rs_bwd = flop_count(func, *args, **kwargs, verbose=True)
@@ -46,5 +47,5 @@ def test_flop_count_function(func, args, kwargs):
 
 
 if __name__ == '__main__':
-    test_flop_count_module(tm.resnet18, torch.rand(2, 3, 224, 224))
+    test_flop_count_module(tm.resnet18)
     test_flop_count_function(F.relu, (torch.rand(2, 3, 224, 224, requires_grad=True),), {'inplace': True})
diff --git a/tests/test_analyzer/test_subclasses/test_meta_mode.py b/tests/test_analyzer/test_subclasses/test_meta_mode.py
index d8122b019619..160d411f6c39 100644
--- a/tests/test_analyzer/test_subclasses/test_meta_mode.py
+++ b/tests/test_analyzer/test_subclasses/test_meta_mode.py
@@ -1,12 +1,13 @@
 import pytest
 import torch
-import torch.distributed as dist
 import torchvision.models as tm
+from packaging import version
+
 try:
     from colossalai._analyzer._subclasses import MetaTensor, MetaTensorMode
 except:
     pass
-from .zoo import tm_models, tmm_models
+from tests.test_analyzer.test_fx.zoo import tm_models, tmm_models
 
 
 def compare_all(tensor: torch.Tensor, meta_tensor: torch.Tensor):
@@ -28,7 +29,7 @@ def run_and_compare(model):
     compare_all(x.grad, meta_x.grad)
 
 
-@pytest.mark.skipif(torch.__version__ < '1.12.0', reason='torch version < 12')
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason='torch version < 12')
 @pytest.mark.parametrize('m', tm_models + tmm_models)
 def test_meta_mode_shape(m):
     run_and_compare(m())
diff --git a/tests/test_analyzer/test_subclasses/zoo.py b/tests/test_analyzer/test_subclasses/zoo.py
deleted file mode 100644
index 925078d0dcbe..000000000000
--- a/tests/test_analyzer/test_subclasses/zoo.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import timm.models as tmm
-import torchvision.models as tm
-
-# input shape: (batch_size, 3, 224, 224)
-tm_models = [
-    tm.alexnet,
-    tm.convnext_base,
-    tm.densenet121,
-    # tm.efficientnet_v2_s,
-    # tm.googlenet,   # output bad case
-    # tm.inception_v3,  # bad case
-    tm.mobilenet_v2,
-    tm.mobilenet_v3_small,
-    tm.mnasnet0_5,
-    tm.resnet18,
-    tm.regnet_x_16gf,
-    tm.resnext50_32x4d,
-    tm.shufflenet_v2_x0_5,
-    tm.squeezenet1_0,
-    # tm.swin_s,  # fx bad case
-    tm.vgg11,
-    tm.vit_b_16,
-    tm.wide_resnet50_2,
-]
-
-tmm_models = [
-    tmm.beit_base_patch16_224,
-    tmm.beitv2_base_patch16_224,
-    tmm.cait_s24_224,
-    tmm.coat_lite_mini,
-    tmm.convit_base,
-    tmm.deit3_base_patch16_224,
-    tmm.dm_nfnet_f0,
-    tmm.eca_nfnet_l0,
-    tmm.efficientformer_l1,
-    tmm.ese_vovnet19b_dw,
-    tmm.gmixer_12_224,
-    tmm.gmlp_b16_224,
-    tmm.hardcorenas_a,
-    tmm.hrnet_w18_small,
-    tmm.inception_v3,
-    tmm.mixer_b16_224,
-    tmm.nf_ecaresnet101,
-    tmm.nf_regnet_b0,
-    # tmm.pit_b_224,  # pretrained only
-    tmm.regnetv_040,
-    tmm.skresnet18,
-    # tmm.swin_base_patch4_window7_224,     # fx bad case
-    # tmm.tnt_b_patch16_224,    # bad case
-    tmm.vgg11,
-    tmm.vit_base_patch16_18x2_224,
-    tmm.wide_resnet50_2,
-]

From e3ad88fb482fdd95241a1f74866559b83ab4f56b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 22 Mar 2023 14:11:54 +0800
Subject: [PATCH 495/503] [booster] implemented the cluster module (#3191)

* [booster] implemented the cluster module

* polish code
---
 colossalai/cluster/__init__.py              |   5 +
 colossalai/cluster/device_mesh_manager.py   |  36 +++++
 colossalai/cluster/dist_coordinator.py      | 158 ++++++++++++++++++++
 colossalai/cluster/process_group_manager.py |  75 ++++++++++
 4 files changed, 274 insertions(+)
 create mode 100644 colossalai/cluster/__init__.py
 create mode 100644 colossalai/cluster/device_mesh_manager.py
 create mode 100644 colossalai/cluster/dist_coordinator.py
 create mode 100644 colossalai/cluster/process_group_manager.py

diff --git a/colossalai/cluster/__init__.py b/colossalai/cluster/__init__.py
new file mode 100644
index 000000000000..2fbdfd3cc999
--- /dev/null
+++ b/colossalai/cluster/__init__.py
@@ -0,0 +1,5 @@
+from .device_mesh_manager import DeviceMeshManager
+from .dist_coordinator import DistCoordinator
+from .process_group_manager import ProcessGroupManager
+
+__all__ = ['DistCoordinator', 'ProcessGroupManager', 'DeviceMeshManager']
diff --git a/colossalai/cluster/device_mesh_manager.py b/colossalai/cluster/device_mesh_manager.py
new file mode 100644
index 000000000000..744799182e22
--- /dev/null
+++ b/colossalai/cluster/device_mesh_manager.py
@@ -0,0 +1,36 @@
+from colossalai.device.device_mesh import DeviceMesh
+
+
+class DeviceMeshManager:
+    """
+    Device mesh manager is responsible for creating and managing device meshes, which are kept in a name -> DeviceMesh store.
+    """
+
+    def __init__(self):
+        self.device_mesh_store = dict()
+
+    def create_device_mesh(self, name, *args, **kwargs) -> DeviceMesh:
+        """
+        Create a device mesh and store it in the manager. A ValueError is raised if the name is already in use.
+
+        Args:
+            name (str): name of the device mesh
+            *args: args for DeviceMesh
+            **kwargs: kwargs for DeviceMesh
+        """
+        # TODO(Yuliang): replace *args, **kwargs with explicit arguments
+        if name in self.device_mesh_store:
+            raise ValueError(f'Device mesh {name} already exists.')
+        device_mesh = DeviceMesh(*args, **kwargs)
+        self.device_mesh_store[name] = device_mesh
+        return device_mesh
+
+    def get(self, name: str) -> DeviceMesh:
+        """Return the device mesh stored under ``name``; raise ValueError if no such mesh exists."""
+        if name not in self.device_mesh_store:
+            raise ValueError(f'Device mesh {name} does not exist.')
+        return self.device_mesh_store[name]
+
+    def destroy(self):
+        pass    # stub: no behaviour implemented yet
+
+    def destroy_all(self):
+        pass    # stub: no behaviour implemented yet
diff --git a/colossalai/cluster/dist_coordinator.py b/colossalai/cluster/dist_coordinator.py
new file mode 100644
index 000000000000..6b48faf5b720
--- /dev/null
+++ b/colossalai/cluster/dist_coordinator.py
@@ -0,0 +1,158 @@
+import os
+from contextlib import contextmanager
+
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from colossalai.context.singleton_meta import SingletonMeta
+
+
+class DistCoordinator(metaclass=SingletonMeta):
+    """
+    This class is used to coordinate distributed training. It is a singleton class, which means that there is only one instance of this
+    class in the whole program.
+
+    There are some terms that are used in this class:
+        - rank: the rank of the current process
+        - world size: the total number of processes
+        - local rank: the rank of the current process on the current node
+        - master: the process with rank 0
+        - node master: the process with local rank 0 on the current node
+
+    Example:
+        >>> from colossalai.cluster.dist_coordinator import DistCoordinator
+        >>> coordinator = DistCoordinator()
+        >>>
+        >>> if coordinator.is_master():
+        >>>     do_something()
+        >>>
+        >>> coordinator.print_on_master('hello world')
+
+    Attributes:
+        rank (int): the rank of the current process
+        world_size (int): the total number of processes
+        local_rank (int): the rank of the current process on the current node, -1 if the LOCAL_RANK environment variable is not set
+    """
+
+    def __init__(self):
+        assert dist.is_initialized(
+        ), 'Distributed is not initialized. Please call `torch.distributed.init_process_group` or `colossalai.launch` first.'
+        self._rank = dist.get_rank()
+        self._world_size = dist.get_world_size()
+        # LOCAL_RANK is often exported by launchers such as torchrun; environment values are strings, so convert to int (-1 = not set)
+        self._local_rank = int(os.environ.get('LOCAL_RANK', -1))
+
+    @property
+    def rank(self) -> int:
+        return self._rank
+
+    @property
+    def world_size(self) -> int:
+        return self._world_size
+
+    @property
+    def local_rank(self) -> int:
+        return self._local_rank
+
+    def _assert_local_rank_set(self):
+        """
+        Assert that the local rank is set. This is often passed by launchers such as torchrun.
+        """
+        assert self.local_rank >= 0, 'The environment variable LOCAL_RANK is not set, thus the coordinator is not aware of the local rank of the current process.'
+
+    def is_master(self, process_group: ProcessGroup = None) -> bool:
+        """
+        Check if the current process is the master process (rank is 0). It can accept a sub process group to check the rank 0 with respect to the process group.
+
+        Args:
+            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.
+
+        Returns:
+            bool: True if the current process is the master process, False otherwise
+        """
+        rank = dist.get_rank(group=process_group)
+        return rank == 0
+
+    def is_node_master(self) -> bool:
+        """
+        Check if the current process is the master process on the current node (local rank is 0).
+
+        Returns:
+            bool: True if the current process is the master process on the current node, False otherwise
+        """
+        self._assert_local_rank_set()
+        return self.local_rank == 0
+
+    def is_last_process(self, process_group: ProcessGroup = None) -> bool:
+        """
+        Check if the current process is the last process (rank is world size - 1). It can accept a sub process group to check the last rank with respect to the process group.
+
+        Args:
+            process_group (ProcessGroup, optional): process group to use for the last rank check. Defaults to None, which refers to the default process group.
+
+        Returns:
+            bool: True if the current process is the last process, False otherwise
+        """
+        rank = dist.get_rank(group=process_group)
+        world_size = dist.get_world_size(group=process_group)
+        return rank == world_size - 1
+
+    def print_on_master(self, msg: str, process_group: ProcessGroup = None):
+        """
+        Print message only from rank 0.
+
+        Args:
+            msg (str): message to print
+            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.
+        """
+        rank = dist.get_rank(group=process_group)
+        if rank == 0:
+            print(msg)
+
+    def print_on_node_master(self, msg: str):
+        """
+        Print message only from local rank 0. Local rank 0 refers to the 0th process running on the current node.
+
+        Args:
+            msg (str): message to print
+        """
+        self._assert_local_rank_set()
+        if self.local_rank == 0:
+            print(msg)
+
+    @contextmanager
+    def priority_execution(self, executor_rank: int = 0, process_group: ProcessGroup = None):
+        """
+        This context manager is used to allow one process to execute while blocking all
+        other processes in the same process group. This is often useful when downloading is required
+        as we only want to download in one process to prevent file corruption.
+
+        Example:
+            >>> from colossalai.cluster import DistCoordinator
+            >>> dist_coordinator = DistCoordinator()
+            >>> with dist_coordinator.priority_execution():
+            >>>     dataset = CIFAR10(root='./data', download=True)
+
+        Args:
+            executor_rank (int): the process rank to execute without blocking, all other processes will be blocked
+            process_group (ProcessGroup, optional): process group to use for the executor rank check. Defaults to None, which refers to the default process group.
+        """
+        should_block = dist.get_rank(group=process_group) != executor_rank
+
+        if should_block:
+            dist.barrier(group=process_group)
+        try:
+            yield
+        finally:
+            # release the waiting ranks even if the executor's body raised, otherwise they stay blocked on the barrier
+            if not should_block:
+                dist.barrier(group=process_group)
+
+    def destroy(self, process_group: ProcessGroup = None):
+        """
+        Destroy the distributed process group.
+
+        Args:
+            process_group (ProcessGroup, optional): process group to destroy. Defaults to None, which refers to the default process group.
+        """
+        dist.destroy_process_group(process_group)
diff --git a/colossalai/cluster/process_group_manager.py b/colossalai/cluster/process_group_manager.py
new file mode 100644
index 000000000000..e52661846f3e
--- /dev/null
+++ b/colossalai/cluster/process_group_manager.py
@@ -0,0 +1,75 @@
+from typing import List
+
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+
+class ProcessGroupManager:
+    """
+    ProcessGroupManager is used to manage the process groups in the cluster.
+
+    There are some terms used in this class:
+        - pg: the short name for process group
+        - pg_name: the name of the process group
+        - pg_size: the world size of the process group
+        - rank: the rank of the current process in the process group
+        - world_size: the total number of processes in the process group
+    """
+
+    def __init__(self):
+        self.pg_store = dict()    # maps pg_name -> ProcessGroup created by this manager
+
+    def create_process_group(self, name: str, ranks: List[int], backend: str = 'nccl') -> ProcessGroup:
+        """
+        Create a process group and register it under ``name``. Raises ``ValueError`` if the name is already in use.
+
+        Args:
+            name (str): name of the process group
+            ranks (List[int]): ranks of the process group
+            backend (str, optional): backend of the process group. Defaults to 'nccl'.
+
+        Returns:
+            ProcessGroup: the newly created process group
+        """
+        if name not in self.pg_store:
+            pg = dist.new_group(ranks=ranks, backend=backend)
+            self.pg_store[name] = pg
+            return pg
+        else:
+            raise ValueError(f'Process group {name} already exists.')
+
+    def get(self, name: str) -> ProcessGroup:
+        """
+        Get a process group by name. Raises ``ValueError`` if no group with that name is registered.
+
+        Args:
+            name (str): name of the process group
+
+        Returns:
+            ProcessGroup: the process group
+        """
+        if name in self.pg_store:
+            return self.pg_store[name]
+        else:
+            raise ValueError(f'Process group {name} does not exist.')
+
+    def destroy(self, name: str) -> None:
+        """
+        Destroy a process group by name and unregister it. Raises ``ValueError`` if the name is unknown.
+
+        Args:
+            name (str): name of the process group
+        """
+        if name in self.pg_store:
+            dist.destroy_process_group(self.pg_store[name])
+            del self.pg_store[name]
+        else:
+            raise ValueError(f'Process group {name} does not exist.')
+
+    def destroy_all(self) -> None:
+        """
+        Destroy every process group registered in this manager and clear the registry.
+        """
+        for name in self.pg_store:
+            dist.destroy_process_group(self.pg_store[name])
+        self.pg_store.clear()

From 1e1b9d2feabc6252818352fdd71772dd46fbe41d Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Wed, 22 Mar 2023 15:44:31 +0800
Subject: [PATCH 496/503] [chatgpt]support llama (#3070)

---
 .../ChatGPT/chatgpt/models/llama/__init__.py  |  5 +++
 .../chatgpt/models/llama/llama_actor.py       | 38 +++++++++++++++++
 .../chatgpt/models/llama/llama_critic.py      | 42 +++++++++++++++++++
 .../ChatGPT/chatgpt/models/llama/llama_rm.py  | 41 ++++++++++++++++++
 4 files changed, 126 insertions(+)
 create mode 100644 applications/ChatGPT/chatgpt/models/llama/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/models/llama/llama_actor.py
 create mode 100644 applications/ChatGPT/chatgpt/models/llama/llama_critic.py
 create mode 100644 applications/ChatGPT/chatgpt/models/llama/llama_rm.py

diff --git a/applications/ChatGPT/chatgpt/models/llama/__init__.py b/applications/ChatGPT/chatgpt/models/llama/__init__.py
new file mode 100644
index 000000000000..9b2a024afdb2
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/llama/__init__.py
@@ -0,0 +1,5 @@
+from .llama_actor import LlamaActor
+from .llama_critic import LlamaCritic
+from .llama_rm import LlamaRM
+
+__all__ = ['LlamaActor', 'LlamaCritic', 'LlamaRM']
diff --git a/applications/ChatGPT/chatgpt/models/llama/llama_actor.py b/applications/ChatGPT/chatgpt/models/llama/llama_actor.py
new file mode 100644
index 000000000000..2c7adb390d8b
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/llama/llama_actor.py
@@ -0,0 +1,38 @@
+from typing import Optional
+
+import torch
+from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
+
+from ..base import Actor
+
+
+class LlamaActor(Actor):
+    """
+    Llama Actor model built on ``LlamaForCausalLM``.
+
+    Args:
+        pretrained (str, optional): Pretrained model name or path. Takes precedence over ``config``.
+        config (LlamaConfig, optional): Model config, used only when ``pretrained`` is None. Falls back to ``LlamaConfig()``.
+        checkpoint (bool): Enable gradient checkpointing. Defaults to False.
+        lora_rank (int): LoRA rank, forwarded to ``Actor``. Defaults to 0.
+        lora_train_bias (str): LoRA bias training mode, forwarded to ``Actor``. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[LlamaConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+
+        if pretrained is not None:
+            model = LlamaForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = LlamaForCausalLM(config)
+        else:
+            model = LlamaForCausalLM(LlamaConfig())
+
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+
+        super().__init__(model, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/models/llama/llama_critic.py b/applications/ChatGPT/chatgpt/models/llama/llama_critic.py
new file mode 100644
index 000000000000..cd565031e112
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/llama/llama_critic.py
@@ -0,0 +1,42 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
+
+from ..base import Critic
+
+
+class LlamaCritic(Critic):
+    """
+    Llama Critic model: a ``LlamaForCausalLM`` backbone plus a linear value head with one output.
+
+    Args:
+        pretrained (str, optional): Pretrained model name or path. Takes precedence over ``config``.
+        config (LlamaConfig, optional): Model config, used only when ``pretrained`` is None. Falls back to ``LlamaConfig()``.
+        checkpoint (bool): Enable gradient checkpointing. Defaults to False.
+        lora_rank (int): LoRA rank, forwarded to ``Critic``. Defaults to 0.
+        lora_train_bias (str): LoRA bias training mode. Extra ``**kwargs`` are forwarded to ``Critic`` as well.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[LlamaConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none',
+                 **kwargs) -> None:
+
+        if pretrained is not None:
+            model = LlamaForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = LlamaForCausalLM(config)
+        else:
+            model = LlamaForCausalLM(LlamaConfig())
+
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+
+        value_head = nn.Linear(model.config.hidden_size, 1)
+
+        super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)
diff --git a/applications/ChatGPT/chatgpt/models/llama/llama_rm.py b/applications/ChatGPT/chatgpt/models/llama/llama_rm.py
new file mode 100644
index 000000000000..81fa22d1969d
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/llama/llama_rm.py
@@ -0,0 +1,41 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers import LlamaConfig, LlamaForCausalLM
+
+from ..base import RewardModel
+
+
+class LlamaRM(RewardModel):
+    """
+    Llama Reward model: a ``LlamaForCausalLM`` backbone plus a linear value head with one output.
+
+    Args:
+        pretrained (str, optional): Pretrained model name or path. Takes precedence over ``config``.
+        config (LlamaConfig, optional): Model config, used only when ``pretrained`` is None. Falls back to ``LlamaConfig()``.
+        checkpoint (bool): Enable gradient checkpointing. Defaults to False.
+        lora_rank (int): LoRA rank, forwarded to ``RewardModel``. Defaults to 0.
+        lora_train_bias (str): LoRA bias training mode, forwarded to ``RewardModel``. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[LlamaConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+
+        if pretrained is not None:
+            model = LlamaForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = LlamaForCausalLM(config)
+        else:
+            model = LlamaForCausalLM(LlamaConfig())
+
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+
+        value_head = nn.Linear(model.config.hidden_size, 1)
+        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.hidden_size + 1))
+
+        super().__init__(model, value_head, lora_rank, lora_train_bias)

From 9998d5ef64cb809e9858681f10b5307da1ff9196 Mon Sep 17 00:00:00 2001
From: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Date: Wed, 22 Mar 2023 19:09:39 +0800
Subject: [PATCH 497/503] [chatgpt]add reward model code for deberta (#3199)

Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
---
 .../chatgpt/models/deberta/__init__.py        |  4 ++
 .../chatgpt/models/deberta/deberta_critic.py  | 36 ++++++++++++++++++
 .../chatgpt/models/deberta/deberta_rm.py      | 37 +++++++++++++++++++
 .../ChatGPT/examples/requirements.txt         |  1 +
 applications/ChatGPT/examples/test_ci.sh      |  6 +++
 .../ChatGPT/examples/train_reward_model.py    |  9 ++++-
 applications/ChatGPT/examples/train_rm.sh     |  4 +-
 7 files changed, 93 insertions(+), 4 deletions(-)
 create mode 100644 applications/ChatGPT/chatgpt/models/deberta/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/models/deberta/deberta_critic.py
 create mode 100644 applications/ChatGPT/chatgpt/models/deberta/deberta_rm.py

diff --git a/applications/ChatGPT/chatgpt/models/deberta/__init__.py b/applications/ChatGPT/chatgpt/models/deberta/__init__.py
new file mode 100644
index 000000000000..b66888f34fd0
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/deberta/__init__.py
@@ -0,0 +1,4 @@
+from .deberta_critic import DebertaCritic
+from .deberta_rm import DebertaRM
+
+__all__ = ['DebertaCritic', 'DebertaRM']
diff --git a/applications/ChatGPT/chatgpt/models/deberta/deberta_critic.py b/applications/ChatGPT/chatgpt/models/deberta/deberta_critic.py
new file mode 100644
index 000000000000..e84c1dbd8380
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/deberta/deberta_critic.py
@@ -0,0 +1,36 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers import DebertaV2Config, DebertaV2Model
+
+from ..base import Critic
+
+
+class DebertaCritic(Critic):
+    """
+    Deberta Critic model: a ``DebertaV2Model`` backbone plus a linear value head with one output.
+
+    Args:
+        pretrained (str, optional): Pretrained model name or path. Takes precedence over ``config``.
+        config (DebertaV2Config, optional): Model config, used only when ``pretrained`` is None. Falls back to ``DebertaV2Config()``.
+        checkpoint (bool): Enable gradient checkpointing. Defaults to False.
+        lora_rank (int): Rank of the LoRA decomposition. Defaults to 0.
+        lora_train_bias (str): LoRA bias training mode. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[DebertaV2Config] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = DebertaV2Model.from_pretrained(pretrained)
+        elif config is not None:
+            model = DebertaV2Model(config)
+        else:
+            model = DebertaV2Model(DebertaV2Config())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        value_head = nn.Linear(model.config.hidden_size, 1)
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/chatgpt/models/deberta/deberta_rm.py b/applications/ChatGPT/chatgpt/models/deberta/deberta_rm.py
new file mode 100644
index 000000000000..2448c879ec85
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/deberta/deberta_rm.py
@@ -0,0 +1,37 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers import DebertaV2Config, DebertaV2Model
+
+from ..base import RewardModel
+
+
+class DebertaRM(RewardModel):
+    """
+    Deberta Reward model: a ``DebertaV2Model`` backbone plus a linear value head with one output.
+
+    Args:
+        pretrained (str, optional): Pretrained model name or path. Takes precedence over ``config``.
+        config (DebertaV2Config, optional): Model config, used only when ``pretrained`` is None. Falls back to ``DebertaV2Config()``.
+        checkpoint (bool): Enable gradient checkpointing. Defaults to False.
+        lora_rank (int): Rank of the LoRA decomposition. Defaults to 0.
+        lora_train_bias (str): LoRA bias training mode. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[DebertaV2Config] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+        if pretrained is not None:
+            model = DebertaV2Model.from_pretrained(pretrained)
+        elif config is not None:
+            model = DebertaV2Model(config)
+        else:
+            model = DebertaV2Model(DebertaV2Config())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        value_head = nn.Linear(model.config.hidden_size, 1)
+        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.hidden_size + 1))
+        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/ChatGPT/examples/requirements.txt b/applications/ChatGPT/examples/requirements.txt
index 6c5dac292486..40e6edc7ea73 100644
--- a/applications/ChatGPT/examples/requirements.txt
+++ b/applications/ChatGPT/examples/requirements.txt
@@ -1 +1,2 @@
 pandas>=1.4.1
+sentencepiece
diff --git a/applications/ChatGPT/examples/test_ci.sh b/applications/ChatGPT/examples/test_ci.sh
index abc43ab1ee9e..1d05c4c58341 100755
--- a/applications/ChatGPT/examples/test_ci.sh
+++ b/applications/ChatGPT/examples/test_ci.sh
@@ -88,4 +88,10 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
                              --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base'\
                              --test True --lora_rank 4
 
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                             --pretrain 'microsoft/deberta-v3-large' --model 'deberta' \
+                             --strategy colossalai_zero2 --loss_fn 'log_sig'\
+                             --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base'\
+                             --test True --lora_rank 4
+
 rm -rf ${BASE}/rm_ckpt.pt
diff --git a/applications/ChatGPT/examples/train_reward_model.py b/applications/ChatGPT/examples/train_reward_model.py
index 47dd988b8117..a9c844b7b1f8 100644
--- a/applications/ChatGPT/examples/train_reward_model.py
+++ b/applications/ChatGPT/examples/train_reward_model.py
@@ -8,12 +8,13 @@
 from chatgpt.models.bloom import BLOOMRM
 from chatgpt.models.gpt import GPTRM
 from chatgpt.models.opt import OPTRM
+from chatgpt.models.deberta import DebertaRM
 from chatgpt.trainer import RewardModelTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from datasets import load_dataset
 from random import randint
 from torch.optim import Adam
-from transformers import AutoTokenizer, BloomTokenizerFast
+from transformers import AutoTokenizer, BloomTokenizerFast, DebertaV2Tokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
 from colossalai.nn.optimizer import HybridAdam
@@ -39,6 +40,8 @@ def train(args):
             model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         elif args.model == 'gpt2':
             model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+        elif args.model == 'deberta':
+            model = DebertaRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
         else:
             raise ValueError(f'Unsupported model "{args.model}"')
         
@@ -54,6 +57,8 @@ def train(args):
         tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
     elif args.model == 'opt':
         tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    elif args.model == 'deberta':
+        tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
     max_len = args.max_len
@@ -119,7 +124,7 @@ def train(args):
     parser.add_argument('--strategy',
                         choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
                         default='naive')
-    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt'], default='bloom')
+    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'deberta'], default='bloom')
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--model_path', type=str, default=None)
     parser.add_argument('--need_optim_ckpt', type=bool, default=False)
diff --git a/applications/ChatGPT/examples/train_rm.sh b/applications/ChatGPT/examples/train_rm.sh
index 981b7a15fcd4..4f9f55b6b59a 100755
--- a/applications/ChatGPT/examples/train_rm.sh
+++ b/applications/ChatGPT/examples/train_rm.sh
@@ -1,7 +1,7 @@
 set_n_least_used_CUDA_VISIBLE_DEVICES 1
 
-python train_reward_model.py --pretrain '/home/lczht/data2/bloom-560m' \
-                             --model 'bloom' \
+python train_reward_model.py --pretrain 'microsoft/deberta-v3-large' \
+                             --model 'deberta' \
                              --strategy naive \
                              --loss_fn 'log_exp'\
                              --save_path 'rmstatic.pt' \

From 189347963aa761839946f501334b2b7c6be53318 Mon Sep 17 00:00:00 2001
From: Yan Fang <30396678+Suffoquer-fang@users.noreply.github.com>
Date: Thu, 23 Mar 2023 10:22:08 +0800
Subject: [PATCH 498/503] [auto] fix requirements typo for issue #3125 (#3209)

---
 .../language/gpt/experiments/auto_parallel/requirements.txt     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/gpt/experiments/auto_parallel/requirements.txt b/examples/language/gpt/experiments/auto_parallel/requirements.txt
index ff046ad1cae9..1b2561f098d5 100644
--- a/examples/language/gpt/experiments/auto_parallel/requirements.txt
+++ b/examples/language/gpt/experiments/auto_parallel/requirements.txt
@@ -1,4 +1,4 @@
 colossalai >= 0.1.12
 torch >= 1.8.1
-transformers >= 4.231
+transformers >= 4.23.1
 PuLP >= 2.7.0

From f8289d42218878fb864c6ca3f9c05d45bdb8a560 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Thu, 23 Mar 2023 10:53:06 +0800
Subject: [PATCH 499/503] [lazyinit] combine lazy tensor with dtensor (#3204)

* [lazyinit] lazy tensor add distribute

* [lazyinit] refactor distribute

* [lazyinit] add test dist lazy init

* [lazyinit] add verbose info for dist lazy init

* [lazyinit] fix rnn flatten weight op

* [lazyinit] polish test

* [lazyinit] polish test

* [lazyinit] fix lazy tensor data setter

* [lazyinit] polish test

* [lazyinit] fix clean

* [lazyinit] make materialize inplace

* [lazyinit] refactor materialize

* [lazyinit] refactor test distribute

* [lazyinit] fix requires_grad

* [lazyinit] fix tolist after materialization

* [lazyinit] refactor distribute module

* [lazyinit] polish docstr

* [lazyinit] polish lazy init context

* [lazyinit] temporarily skip test

* [lazyinit] polish test

* [lazyinit] add docstr
---
 colossalai/utils/model/experimental.py        | 231 ++++++++++++------
 .../test_lazy_init/test_distribute.py         | 110 +++++++++
 tests/test_utils/test_lazy_init/utils.py      |  16 ++
 3 files changed, 281 insertions(+), 76 deletions(-)
 create mode 100644 tests/test_utils/test_lazy_init/test_distribute.py

diff --git a/colossalai/utils/model/experimental.py b/colossalai/utils/model/experimental.py
index 00cb532d9c1d..6427a147a5c0 100644
--- a/colossalai/utils/model/experimental.py
+++ b/colossalai/utils/model/experimental.py
@@ -1,11 +1,15 @@
-from typing import Callable, List, Optional, Union
+from types import MethodType
+from typing import Callable, Optional, Union
 
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 from torch import Tensor
 from torch.utils._pytree import tree_map
 
 from colossalai.fx.profiler.tensor import MetaTensor
+from colossalai.tensor.d_tensor.d_tensor import DTensor
+from colossalai.tensor.d_tensor.layout import Layout
 
 # reference: https://pytorch.org/cppdocs/notes/tensor_creation.html
 _NORMAL_FACTORY = [
@@ -30,6 +34,11 @@
 
 _EARLY_MATERIALIZED_OPS = ['__getitem__', 'split']
 
+# If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset)
+# without autograd tracking the change, remove the .data / .detach() call and wrap the change in a `with torch.no_grad():` block.
+# These ops cannot be unwrapped using .data
+_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__']
+
 _LEGACY_TENSOR_CONSTRUCTOR = {
     'FloatTensor': torch.float,
     'DoubleTensor': torch.double,
@@ -43,6 +52,8 @@
     'BoolTensor': torch.bool,
 }
 
+_EMPTY_DATA = torch.empty(0)
+
 
 class _MyTensor(Tensor):
     """This class is only for correctness verification.
@@ -64,6 +75,29 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
         return super().__torch_function__(func, types, args, kwargs)
 
 
+def _convert_cls(tensor: 'LazyTensor', target: torch.Tensor) -> torch.Tensor:
+    """Convert a lazy tensor's class to target's class, with target's data.
+
+    The reason why we change the class of a lazy tensor in-place is that this can easily handle shared modules/parameters, which is common in huggingface models.
+    If we create a new tensor and update the module by ``setattr(module, name, param)``, the shared parameters will not be updated. And we have to track all shared parameters and update them manually.
+
+    Args:
+        tensor (LazyTensor): the LazyTensor to be converted
+        target (torch.Tensor): target tensor
+
+    Returns:
+        torch.Tensor: the converted tensor
+    """
+    cls_to_become = nn.Parameter if isinstance(tensor, nn.Parameter) else torch.Tensor
+    tensor.__class__ = cls_to_become
+    tensor.data = target
+    tensor.requires_grad = target.requires_grad
+    # subclass of torch.Tensor does not have tolist() method
+    # overwrite this method after materialization or distribution
+    tensor.tolist = MethodType(torch.Tensor.tolist, target)
+    return tensor
+
+
 class LazyTensor(torch.Tensor):
     """A naive implementation of LazyTensor (https://arxiv.org/pdf/2102.13267.pdf).
 
@@ -112,14 +146,8 @@ def __new__(cls, func, *args, meta_data=None, concrete_data=None, **kwargs):
                 elem = func(*args, **{**kwargs, 'device': 'meta'})
                 meta_data = MetaTensor(elem, fake_device=device)
             elem = meta_data._tensor
-        r = torch.Tensor._make_wrapper_subclass(cls,
-                                                elem.size(),
-                                                strides=elem.stride(),
-                                                storage_offset=elem.storage_offset(),
-                                                dtype=elem.dtype,
-                                                layout=elem.layout,
-                                                device=elem.device,
-                                                requires_grad=elem.requires_grad)
+        # As a meta tensor cannot be modified __class__ to torch.Tensor, we should use an empty real tensor here
+        r = torch.Tensor._make_subclass(cls, _EMPTY_DATA, require_grad=elem.requires_grad)
         r._meta_data = meta_data
         return r
 
@@ -129,15 +157,28 @@ def __init__(self, func, *args, meta_data=None, concrete_data=None, **kwargs):
         self._materialized_data: Optional[torch.Tensor] = concrete_data    # materialized data
 
     def materialize(self) -> torch.Tensor:
-        """Materialize the ``LazyTensor`` to ``torch.Tensor``.
+        """Materialize the ``LazyTensor`` to ``torch.Tensor`` by modifying __class__ (inplace).
 
         Returns:
-            torch.Tensor: The materialized tensor.
+            torch.Tensor: The materialized tensor (self).
         """
         target = self._materialize_data()
-        if isinstance(self, nn.Parameter):
-            target = nn.Parameter(target, requires_grad=self.requires_grad)
-        return target
+        self.clean()
+        return _convert_cls(self, target)
+
+    def distribute(self, layout: Layout) -> torch.Tensor:
+        """Distribute the ``LazyTensor`` to ``torch.Tensor`` by modifying __class__ (inplace), according to the layout.
+
+        Args:
+            layout (Layout): Distribution layout.
+
+        Returns:
+            torch.Tensor: The distributed tensor (self).
+        """
+        target = self._materialize_data()
+        self.clean()
+        local_tensor = DTensor(target, layout).local_tensor
+        return _convert_cls(self, local_tensor)
 
     def clean(self) -> None:
         """Clean all stored operations, meta data and materialized data, which prevents memory leaking. This should be called after all tensors are materialized.
@@ -216,6 +257,8 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
         is_inplace: bool = (func.__name__.endswith('_') and not (func.__name__.endswith('__'))
                             or func.__name__ == "__setitem__")
 
+        is_change_meta_op: bool = func.__name__ in _CHANGE_META_OPS
+
         if isinstance(func, torch._C.ScriptMethod):
             # FIXME(ver217): torch script functions are not verified
 
@@ -239,10 +282,10 @@ def unwrap(x):
                 if isinstance(x, LazyTensor):
                     if x._materialized_data is not None:
                         # for early materialized tensor, use its materialized data directly
-                        return x._materialized_data.data
+                        return x._materialized_data if is_change_meta_op else x._materialized_data.data
                     t = x if is_inplace else x.clone()
                     t._op_buffer.append((func, args, kwargs))
-                    meta = x._meta_data.data
+                    meta = x._meta_data if is_change_meta_op else x._meta_data.data
                     meta_to_lazy[meta] = t
                     return meta
                 return x
@@ -290,13 +333,36 @@ def data(self):
 
     @data.setter
     def data(self, other: 'LazyTensor'):
+        """This is slightly different from the original `data` setter.
+
+        E.g.:
+            >>> a = torch.randn(3, 3) # a is a Tensor
+            >>> b = torch.rand(2, 2)
+            >>> a.data = b
+            >>> b.add_(1)   # this will affect a
+            >>> x = torch.randn(3, 3) # x is a LazyTensor
+            >>> y = torch.rand(2, 2) # y is a LazyTensor
+            >>> x.data = y
+            >>> y.add_(1)   # this will not affect x
+
+        """
         if other is self:
             return
-        # TODO(ver217): to avoid infinity recursion, do early materialization
-        self._materialized_data = other._materialize_data()
+
+        self._op_buffer.append(other._factory_method)
+
+        def replace(x):
+            if x is other:
+                return self
+            return x
+
+        for func, args, kwargs in other._op_buffer:
+            self._op_buffer.append((func, tree_map(replace, args), tree_map(replace, kwargs)))
 
     def tolist(self) -> list:
-        t = self.materialize()
+        # Though self.__class__ is modified to torch.Tensor, in C++ side, it is still a subclass of torch.Tensor
+        # And subclass of torch.Tensor does not have tolist() method
+        t = self._materialize_data()
         return t.tolist()
 
     def __hash__(self):
@@ -421,71 +487,84 @@ def __exit__(self, exc_type, exc_val, exc_tb):
             setattr(torch, name, orig)
 
     @staticmethod
-    def materialize(module: torch.nn.Module, verbose: bool = False):
-        """Initialize all ``nn.Parameter`` from ``LazyTensor``.
+    def materialize(module: nn.Module, verbose: bool = False) -> nn.Module:
+        """Initialize all ``nn.Parameter`` from ``LazyTensor``. This function will modify the module in-place.
 
         Args:
-            module (torch.nn.Module): Target ``nn.Module``
+            module (nn.Module): Target ``nn.Module``
             verbose (bool): Whether to print lazy initialization rate. Defaults to False.
         """
-        if verbose:
-            param_cnt = 0
-            param_lazy_cnt = 0
-            buf_cnt = 0
-            buf_lazy_cnt = 0
-            non_lazy_numel = 0
-
-        # do post cleaning to handle shared parameter
-        visited_lazy_tensors: List[LazyTensor] = []
-        # handle shared module
-        visited_modules = set()
-
-        @torch.no_grad()
-        def init_recursively(module: nn.Module):
-            nonlocal param_cnt, param_lazy_cnt, buf_cnt, buf_lazy_cnt, non_lazy_numel
-            # recursively initialize the module
-            for mod in module.children():
-                if id(mod) not in visited_modules:
-                    visited_modules.add(id(mod))
-                    init_recursively(mod)
-
-            # initialize tensors directly attached to the current module
-            for name, param in module.named_parameters(recurse=False):
-                if verbose:
-                    param_cnt += 1
-                    if getattr(param, '_materialized_data', False) is None:
-                        # if no _materialized_data attr, the tensor is not lazy
-                        param_lazy_cnt += 1
-                    else:
-                        non_lazy_numel += param.numel()
-                if hasattr(param, 'materialize'):
-                    # TODO(ver217): apex layers cannot be captured
-                    visited_lazy_tensors.append(param)
-                    setattr(module, name, param.materialize())
-
-            for name, buf in module.named_buffers(recurse=False):
-                if verbose:
-                    buf_cnt += 1
-                    if getattr(buf, "_materialized_data", False) is None:
-                        # if no _materialized_data attr, the tensor is not lazy
-                        buf_lazy_cnt += 1
-                    else:
-                        non_lazy_numel += buf.numel()
-                if hasattr(buf, 'materialize'):
-                    # TODO(ver217): apex layers cannot be captured
-                    visited_lazy_tensors.append(buf)
-                    setattr(module, name, buf.materialize())
 
-        init_recursively(module)
+        def apply_fn(name: str, p: LazyTensor):
+            p.materialize()
+
+        return _apply_to_lazy_module(module, apply_fn, verbose)
+
+    @staticmethod
+    def distribute(module: nn.Module, layout_dict: dict, verbose: bool = False) -> nn.Module:
+        """Distribute all ``nn.Parameter`` from ``LazyTensor``. This function will modify the module in-place.
+
+        Args:
+            module (nn.Module): Target ``nn.Module``
+            layout_dict (dict): Dict of layout for each parameter/buffer. The key is the parameter/buffer name, and the value is the layout.
+            verbose (bool, optional): Whether to print lazy initialization rate. Defaults to False.
+        """
+
+        def apply_fn(name: str, p: LazyTensor):
+            p.distribute(layout_dict[name])
+
+        return _apply_to_lazy_module(module, apply_fn, verbose)
+
 
-        for t in visited_lazy_tensors:
-            t.clean()
+def _apply_to_lazy_module(module: nn.Module,
+                          apply_fn: Callable[[str, torch.Tensor], None],
+                          verbose: bool = False) -> nn.Module:
+    if verbose:
+        # verbose info
+        param_cnt = 0
+        param_lazy_cnt = 0
+        buf_cnt = 0
+        buf_lazy_cnt = 0
+        total_numel = 0
+        non_lazy_numel = 0
+
+    for name, p in module.named_parameters():
+        if verbose:
+            param_cnt += 1
+            total_numel += p.numel()
+            if getattr(p, '_materialized_data', False) is None:
+                # if no _materialized_data attr, the tensor is not lazy
+                param_lazy_cnt += 1
+            else:
+                non_lazy_numel += p.numel()
+        if isinstance(p, LazyTensor):
+            apply_fn(name, p)
 
+    for name, buf in module.named_buffers():
         if verbose:
-            print(f'Param lazy rate: {param_lazy_cnt}/{param_cnt}')
-            print(f'Buffer lazy rate: {buf_lazy_cnt}/{buf_cnt}')
-            print(f'Non-lazy numel: {non_lazy_numel} ({non_lazy_numel/1024**2:.3f} M)')
-        return module
+            buf_cnt += 1
+            total_numel += buf.numel()
+            if getattr(buf, "_materialized_data", False) is None:
+                # if no _materialized_data attr, the tensor is not lazy
+                buf_lazy_cnt += 1
+            else:
+                non_lazy_numel += buf.numel()
+        if isinstance(buf, LazyTensor):
+            apply_fn(name, buf)
+
+    if verbose:
+        non_lazy_numel_ratio = non_lazy_numel / total_numel * 100 if non_lazy_numel != 0 else 0
+        _print_rank_0(f'Param lazy rate: {param_lazy_cnt}/{param_cnt}')
+        _print_rank_0(f'Buffer lazy rate: {buf_lazy_cnt}/{buf_cnt}')
+        _print_rank_0(
+            f'Non lazy numel: {non_lazy_numel} ({non_lazy_numel/1024**2:.3f} M), ratio: {non_lazy_numel_ratio}%')
+
+    return module
+
+
+def _print_rank_0(*args, **kwargs):
+    if not dist.is_initialized() or dist.get_rank() == 0:
+        print(*args, **kwargs)
 
 
 def _is_int_tuple(args) -> bool:
diff --git a/tests/test_utils/test_lazy_init/test_distribute.py b/tests/test_utils/test_lazy_init/test_distribute.py
new file mode 100644
index 000000000000..37b2c5da1efa
--- /dev/null
+++ b/tests/test_utils/test_lazy_init/test_distribute.py
@@ -0,0 +1,110 @@
+from functools import partial
+from typing import Optional
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+
+import colossalai
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.tensor.d_tensor.layout import Layout
+from colossalai.tensor.d_tensor.sharding_spec import ShardingSpec
+from colossalai.testing import parameterize, rerun_if_address_is_in_use
+from colossalai.utils import free_port
+from colossalai.utils.common import print_rank_0
+from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor
+from tests.kit.model_zoo import model_zoo
+
+# from utils import assert_dist_model_equal, set_seed
+
+
+def find_shard_dim(shape: torch.Size) -> Optional[int]:
+    for dim, size in enumerate(shape):
+        if size % 2 == 0:
+            return dim
+
+
+def make_layout(device_mesh: DeviceMesh, original_tensor: torch.Tensor) -> Layout:
+    shard_dim = find_shard_dim(original_tensor.shape)
+    dim_partition_dict = {shard_dim: [0]} if shard_dim is not None else {}
+    target_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict=dim_partition_dict)
+    layout = Layout(device_mesh=device_mesh,
+                    device_type=torch.device('cuda'),
+                    sharding_spec=target_sharding_spec,
+                    entire_shape=original_tensor.shape)
+    return layout
+
+
+def _get_current_name(prefix: str, name: str) -> str:
+    return f'{prefix}.{name}'.lstrip('.')
+
+
+def generate_layout_dict(model: nn.Module, device_mesh: DeviceMesh) -> dict:
+    layout_dict = {}
+
+    @torch.no_grad()
+    def generate_recursively(module: nn.Module, prefix: str = ''):
+        # recursively initialize the module
+        for name, mod in module.named_children():
+            generate_recursively(mod, prefix=_get_current_name(prefix, name))
+
+        # initialize tensors directly attached to the current module
+        for name, param in module.named_parameters(recurse=False):
+            if isinstance(param, LazyTensor):
+                layout = make_layout(device_mesh, param)
+                layout_dict[_get_current_name(prefix, name)] = layout
+
+        for name, buf in module.named_buffers(recurse=False):
+            if isinstance(buf, LazyTensor):
+                layout = make_layout(device_mesh, buf)
+                layout_dict[_get_current_name(prefix, name)] = layout
+
+    generate_recursively(model)
+
+    return layout_dict
+
+
+@parameterize('subset', ['torchvision', 'diffusers', 'timm', 'transformers', 'torchaudio', 'deepfm', 'dlrm'])
+def run_dist_lazy_init(subset, seed: int = 42):
+    sub_model_zoo = model_zoo.get_sub_registry(subset)
+    device_mesh = DeviceMesh(torch.Tensor([0, 1, 2, 3]), (2, 2), init_process_group=True)
+    # FIXME(ver217): uncomment this line
+    # _MyTensor._pre_op_fn = lambda *args: set_seed(seed)
+    # LazyTensor._pre_op_fn = lambda *args: set_seed(seed)
+
+    for name, entry in sub_model_zoo.items():
+        # TODO(ver217): lazy init does not support weight norm, skip these models
+        if name in ('torchaudio_wav2vec2_base', 'torchaudio_hubert_base'):
+            continue
+        print_rank_0(name)
+        model_fn, data_gen_fn, output_transform_fn, model_attr = entry
+        ctx = LazyInitContext(tensor_cls=_MyTensor)
+        with ctx:
+            model = model_fn()
+        ctx = LazyInitContext()
+        with ctx:
+            deferred_model = model_fn()
+        layout_dict = generate_layout_dict(deferred_model, device_mesh)
+        ctx.distribute(deferred_model, layout_dict, verbose=True)
+        # FIXME(ver217): uncomment this line
+        # assert_dist_model_equal(model, deferred_model, layout_dict)
+
+
+def run_dist(rank, world_size, port) -> None:
+    colossalai.launch({}, rank=rank, world_size=world_size, host='localhost', port=port)
+    run_dist_lazy_init()
+
+
+# FIXME(ver217): temporarily skip this test since torch 1.11 does not fully support meta tensor
+@pytest.mark.skip
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_dist_lazy_init():
+    world_size = 4
+    run_func = partial(run_dist, world_size=world_size, port=free_port())
+    mp.spawn(run_func, nprocs=world_size)
+
+
+if __name__ == '__main__':
+    test_dist_lazy_init()
diff --git a/tests/test_utils/test_lazy_init/utils.py b/tests/test_utils/test_lazy_init/utils.py
index 47ba534bc434..a8aeb4c8930c 100644
--- a/tests/test_utils/test_lazy_init/utils.py
+++ b/tests/test_utils/test_lazy_init/utils.py
@@ -4,6 +4,7 @@
 import numpy as np
 import torch
 
+from colossalai.tensor.d_tensor.layout_converter import to_global
 from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor
 from tests.kit.model_zoo.registry import ModelAttribute
 
@@ -67,3 +68,18 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False,
         assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn)
     if verbose:
         print(f'{model.__class__.__name__} pass')
+
+
+def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.Module, layout_dict: dict) -> None:
+    state = model.state_dict()
+    distributed_state = distributed_model.state_dict()
+
+    assert len(state) == len(distributed_state), f'len {len(state)} vs {len(distributed_state)}'
+
+    for (n1, t1), (n2, t2) in zip(state.items(), distributed_state.items()):
+        assert n1 == n2
+        t1 = t1.cuda()
+        t2 = t2.cuda()
+        if n2 in layout_dict:
+            t2 = to_global(t2, layout_dict[n2])
+        assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}'

From cd142fbefa964d62048a9bafb180322369ab89f8 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 23 Mar 2023 10:53:17 +0800
Subject: [PATCH 500/503] [api] implemented the checkpoint io module (#3205)

* [api] implemented the checkpoint io module

* polish code

* polish code
---
 colossalai/checkpoint_io/__init__.py          |   4 +
 .../checkpoint_io/checkpoint_io_base.py       | 374 ++++++++++++++++++
 .../checkpoint_io/general_checkpoint_io.py    |  66 ++++
 .../test_general_checkpoint_io.py             |  70 ++++
 4 files changed, 514 insertions(+)
 create mode 100644 colossalai/checkpoint_io/__init__.py
 create mode 100644 colossalai/checkpoint_io/checkpoint_io_base.py
 create mode 100644 colossalai/checkpoint_io/general_checkpoint_io.py
 create mode 100644 tests/test_checkpoint_io/test_general_checkpoint_io.py

diff --git a/colossalai/checkpoint_io/__init__.py b/colossalai/checkpoint_io/__init__.py
new file mode 100644
index 000000000000..3cec630b2f86
--- /dev/null
+++ b/colossalai/checkpoint_io/__init__.py
@@ -0,0 +1,4 @@
+from .checkpoint_io_base import CheckpointIO, ShardCheckpointIndexFile
+from .general_checkpoint_io import GeneralCheckpointIO
+
+__all__ = ['CheckpointIO', 'ShardCheckpointIndexFile', 'GeneralCheckpointIO']
diff --git a/colossalai/checkpoint_io/checkpoint_io_base.py b/colossalai/checkpoint_io/checkpoint_io_base.py
new file mode 100644
index 000000000000..00a65424bece
--- /dev/null
+++ b/colossalai/checkpoint_io/checkpoint_io_base.py
@@ -0,0 +1,374 @@
+import json
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+
+import torch
+import torch.nn as nn
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+
+__all__ = ['CheckpointIO', 'ShardCheckpointIndexFile']
+
+
+class CheckpointIO(ABC):
+    """
+    CheckpointIO is the base class for all checkpoint IO classes. It defines the interface for checkpoint IO.
+
+
+    Examples:
+        >>> from colossalai.checkpoint_io import GeneralCheckpointIO
+        >>> checkpoint_io = GeneralCheckpointIO()
+        >>>
+        >>> # load model from checkpoint
+        >>> model = checkpoint_io.load_model(model, 'model.pt')
+        >>>
+        >>> # save model to checkpoint
+        >>> checkpoint_io.save_model(model, 'model.pt')
+        >>>
+        >>> # save model to sharded checkpoints
+        >>> checkpoint_io.save_model(model, './checkpoints/', shard=True)
+        >>>
+        >>> # load model from sharded checkpoints
+        >>> model = checkpoint_io.load_model(model, './checkpoints/')
+        >>>
+        >>> # load optimizer from checkpoint
+        >>> optimizer = checkpoint_io.load_optimizer(optimizer, 'optimizer.pt')
+        >>>
+        >>> # save optimizer to checkpoint
+        >>> checkpoint_io.save_optimizer(optimizer, 'optimizer.pt')
+
+    """
+
+    # ======================================
+    # Abstract methods for implementation
+    # ======================================
+
+    @abstractmethod
+    def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
+        """
+        Load model from checkpoint.
+
+        Args:
+            model (nn.Module): model to be loaded.
+            checkpoint (str): checkpoint path. This value is made compatible with the model checkpoints in the
+                        mainstream model zoos such as Hugging Face and TIMM. The checkpoint path can be:
+                        1. a file path, e.g. 'model.pt'
+                        2. a path to a json file which defines the index to the sharded checkpoint
+                        3. a path to a folder containing a unique .index.json file for sharded checkpoint
+            strict (bool): whether to strictly enforce that the param names in
+                the checkpoint match the keys returned by this module's state_dict().
+        """
+        pass
+
+    @abstractmethod
+    def save_model(self,
+                   model: nn.Module,
+                   checkpoint: str,
+                   prefix: str = None,
+                   shard: bool = False,
+                   size_per_shard: int = 1024):
+        """
+        Save model to checkpoint.
+
+        Examples:
+            >>> from colossalai.checkpoint_io import GeneralCheckpointIO
+            >>> checkpoint_io = GeneralCheckpointIO()
+            >>>
+            >>> # save model to a single file
+            >>> checkpoint_io.save_model(model, 'model.pt')
+            >>>
+            >>> # save model to a sharded checkpoint
+            >>> checkpoint_io.save_model(model, './checkpoints/', shard=True)
+
+        Args:
+            model (nn.Module): model to be saved.
+            checkpoint: checkpoint path. The checkpoint path can be :
+                1. a file path, e.g. 'model.pt'
+                2. a directory path to save the sharded checkpoint, e.g. './checkpoints/' when shard = True.
+            shard: whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
+                multiple files. The model shards will be specified by a `model.index.json` file. When shard = True, please ensure
+                that the checkpoint path is a directory path instead of a file path.
+            size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard is set to True.
+        """
+        pass
+
+    @abstractmethod
+    def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
+        """
+        Load optimizer from checkpoint.
+
+        Args:
+            optimizer (Optimizer): optimizer to be loaded.
+            checkpoint (str): checkpoint path.
+        """
+        pass
+
+    @abstractmethod
+    def save_optimizer(self, optimizer: Optimizer, checkpoint: str, shard: bool = False, size_per_shard: int = 1024):
+        """
+        Save optimizer to checkpoint.
+
+        Args:
+            optimizer (Optimizer): optimizer to be saved.
+            checkpoint: checkpoint path. The checkpoint path can be :
+                1. a file path, e.g. 'model.pt'
+                2. a path to a json file which defines the index to the sharded checkpoint for the optimizer
+                3. a path to a folder containing a unique .index.json file for sharded checkpoint
+        """
+        pass
+
+    # ============================================
+    # methods for loading and saving lr scheduler
+    # as this is quite standard, there is no need
+    # to make them abstract
+    # ============================================
+
+    def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
+        """
+        Save lr scheduler to checkpoint.
+
+        Args:
+            lr_scheduler (LRScheduler): lr scheduler to be saved.
+            checkpoint: checkpoint path. The checkpoint path can only be a file path.
+        """
+        torch.save(lr_scheduler.state_dict(), checkpoint)
+
+    def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
+        """
+        Load lr scheduler from checkpoint.
+
+        Args:
+            lr_scheduler (LRScheduler): lr scheduler to be loaded.
+            checkpoint (str): the path for a single checkpoint file.
+        """
+        state_dict = torch.load(checkpoint)
+        lr_scheduler.load_state_dict(state_dict)
+
+    # ========================================
+    # Helper functions for loading state dict
+    # ========================================
+
+    def get_sharded_checkpoint_index_file(self, checkpoint_path: Path):
+        """
+        Get the index file path for a sharded checkpoint.
+
+        Args:
+            checkpoint_path (Path): path to the checkpoint.
+
+        Returns:
+            Path: path to the index file.
+        """
+        if checkpoint_path.is_file():
+            # check if it is .index.json
+            if checkpoint_path.name.endswith('.index.json'):
+                return checkpoint_path
+            else:
+                raise ValueError(f'Invalid checkpoint path: {checkpoint_path}. ')
+        elif checkpoint_path.is_dir():
+            # check if there is exactly one file ending with .index.json in this directory
+            index_files = list(checkpoint_path.glob('*.index.json'))
+            if len(index_files) == 1:
+                return index_files[0]
+            else:
+                raise ValueError(f'Found {len(index_files)} index files in {checkpoint_path}. ')
+
+    def is_sharded_checkpoint(self, checkpoint_path: Path):
+        """
+        Check whether the checkpoint is sharded.
+
+        Args:
+            checkpoint_path (Path): path to the checkpoint.
+
+        Returns:
+            bool: whether the checkpoint is sharded.
+        """
+        if checkpoint_path.is_file():
+            # check if it is .index.json
+            if checkpoint_path.name.endswith('.index.json'):
+                return True
+            else:
+                return False
+        elif checkpoint_path.is_dir():
+            # check if there is exactly one file ending with .index.json in this directory
+            index_files = list(checkpoint_path.glob('*.index.json'))
+            if len(index_files) == 1:
+                return True
+            else:
+                raise ValueError(f'Found {len(index_files)} index files in {checkpoint_path}. ')
+
+    def get_checkpoint_shard_filenames(self, index_file_path: Path):
+        """
+        Get checkpoint shard filenames from a json file.
+
+        Args:
+            index_file_path (Path): path to the json index file.
+
+        Returns:
+            list: sorted, de-duplicated absolute paths of the shard files listed in the index.
+        """
+        with open(str(index_file_path), 'r') as f:
+            index = json.load(f)
+
+        if "weight_map" in index:
+            index = index["weight_map"]
+
+        checkpoint_root_path = index_file_path.absolute().parent
+
+        # read the checkpoint file list from the json file and get a list of unique file names
+        checkpoint_files = sorted(set(index.values()))
+
+        # get the absolute paths for all checkpoint files
+        checkpoint_files = [checkpoint_root_path.joinpath(f) for f in checkpoint_files]
+        return checkpoint_files
+
+    def load_safetensors_state_dict(self, *args, **kwargs):
+        """
+        Load safetensors state dict from checkpoint.
+        """
+        # TODO(FrankLeeeee): support huggingface safetensors
+        raise NotImplementedError("This method is not implemented to support safe tensors")
+
+    def load_state_dict(self, checkpoint_file_path: Path):
+        """
+        Load state dict from checkpoint.
+
+        Args:
+            checkpoint_file_path (Path): path to the checkpoint file.
+
+        Returns:
+            dict: state dict.
+        """
+        return torch.load(str(checkpoint_file_path))
+
+    # ======================================
+    # Helper functions for saving state dict
+    # ======================================
+
+    def save_safetensors_state_dict(self, *args, **kwargs):
+        """
+        Save safetensors state dict to checkpoint.
+        """
+        # TODO(FrankLeeeee): support huggingface safetensors
+        raise NotImplementedError("This method is not implemented to support safe tensors")
+
+    def generate_checkpoint_shard_file_name(self, index: int, total_number: int, prefix: str = None):
+        """
+        Generate checkpoint shard file name.
+
+        Args:
+            index (int): index of the shard.
+            total_number (int): total number of shards.
+            prefix (str): prefix of the shard file name. Default: None.
+        """
+        if prefix is None:
+            return f"{index}-of-{total_number}.bin"
+        else:
+            return f"{prefix}-{index}-of-{total_number}.bin"
+
+    def save_checkpoint(self, state_dict: dict, checkpoint_file_path: Path):
+        """
+        Save state dict to checkpoint.
+
+        Args:
+            state_dict (dict): state dict.
+            checkpoint_file_path (Path): path to the checkpoint file.
+        """
+        torch.save(state_dict, str(checkpoint_file_path))
+
+    def save_state_dict_as_shard(self, state_dict: dict, index: int, total_number: int, prefix: str,
+                                 checkpoint_path: Path):
+        """
+        Save state dict as shard.
+
+        Args:
+            state_dict (dict): state dict.
+            checkpoint_path (Path): path to the checkpoint file.
+        """
+        # generate the shard name
+        shard_file_name = self.generate_checkpoint_shard_file_name(index, total_number, prefix)
+        shard_file_path = checkpoint_path.joinpath(shard_file_name)
+
+        # save the shard
+        self.save_checkpoint(state_dict, shard_file_path)
+
+    def calculate_param_size(self, param: torch.Tensor):
+        """
+        Calculate the size of a parameter in MB. Used to compute whether a group of params exceed the shard size.
+        If so, a new shard should be created.
+
+        Args:
+            param (torch.Tensor): parameter tensor.
+        """
+        # TODO(FrankLeeeee): check if this tensor is a DTensor, compute its global size if so
+        return param.numel() * param.element_size() / 1024 / 1024
+
+
+class ShardCheckpointIndexFile:
+    """
+    This class is a data structure to keep the content in the index.json file for sharded checkpoint.
+
+    Example:
+        >>> index = ShardCheckpointIndexFile()
+        >>> index.load('index.json')
+        >>> index.append_meta_data('model_type', 'bert')
+        >>> index.append_weight_map('bert.embeddings.word_embeddings.weight', 'bert.embeddings.word_embeddings.weight-0-of-2.bin')
+        >>> index.export('index.json')
+    """
+
+    def __init__(self) -> None:
+        self.metadata: dict = dict()
+        self.weight_map: dict = dict()
+
+    def load(self, json_path: str):
+        """
+        Load the index file from a json file.
+
+        Args:
+            json_path (str): path to the json file.
+        """
+        # load the json file
+        with open(json_path, 'r') as f:
+            index = json.load(f)
+
+        # assign attributes if exists
+        if "metadata" in index:
+            self.metadata = index["metadata"]
+        if "weight_map" in index:
+            self.weight_map = index["weight_map"]
+
+    def export(self, json_path: str):
+        """
+        Export the index file to a json file.
+
+        Args:
+            json_path (str): path to the json file.
+        """
+        # create the index file
+        index = dict()
+        index["metadata"] = self.metadata
+        index["weight_map"] = self.weight_map
+
+        # export the index file
+        with open(json_path, 'w') as f:
+            json.dump(index, f, indent=4)
+
+    def append_weight_map(self, param_name: str, shard_file: str):
+        """
+        Append a weight map entry to the index file.
+
+        Args:
+            param_name (str): name of the parameter.
+            shard_file (str): name of the shard file.
+        """
+        self.weight_map[param_name] = shard_file
+
+    def append_meta_data(self, name: str, val: Any):
+        """
+        Append a metadata entry to the index file.
+
+        Args:
+            name (str): name of the metadata.
+            val (Any): value of the metadata.
+        """
+        self.metadata[name] = val
diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py
new file mode 100644
index 000000000000..0a3636655530
--- /dev/null
+++ b/colossalai/checkpoint_io/general_checkpoint_io.py
@@ -0,0 +1,66 @@
+from pathlib import Path
+
+import torch.nn as nn
+from torch.optim import Optimizer
+
+from .checkpoint_io_base import CheckpointIO
+
+__all__ = ['GeneralCheckpointIO']
+
+
+class GeneralCheckpointIO(CheckpointIO):
+
+    def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
+        checkpoint = Path(checkpoint)
+        is_sharded = self.is_sharded_checkpoint(checkpoint)
+
+        if not is_sharded:
+            checkpoint = self.load_state_dict(checkpoint)
+            model.load_state_dict(checkpoint, strict=strict)
+        else:
+            # find the index file
+            checkpoint_path = Path(checkpoint)
+            index_file_path = self.get_sharded_checkpoint_index_file(checkpoint_path)
+
+            # iterate over the shard checkpoint files
+            # and load each
+            shard_files = self.get_checkpoint_shard_filenames(index_file_path)
+            for shard_file in shard_files:
+                shard_checkpoint = self.load_state_dict(shard_file)
+                model.load_state_dict(shard_checkpoint, strict=strict)
+
+        return model
+
+    def save_model(self,
+                   model: nn.Module,
+                   checkpoint: str,
+                   prefix: str = None,
+                   shard: bool = False,
+                   size_per_shard: int = 1024):
+        checkpoint = Path(checkpoint)
+        if shard:
+            # TODO(FrankLeeeee): implement checkpoint saving to sharded checkpoint
+            raise NotImplementedError("Not implemented yet")
+        else:
+            self.save_checkpoint(model.state_dict(), checkpoint)
+
+    def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
+        checkpoint = Path(checkpoint)
+        is_sharded = self.is_sharded_checkpoint(checkpoint)
+
+        if not is_sharded:
+            checkpoint = self.load_state_dict(checkpoint)
+            optimizer.load_state_dict(checkpoint)
+        else:
+            # TODO(FrankLeeeee): implement checkpoint loading from sharded checkpoint
+            # This is not an urgent feature, so we can leave it for later
+            # fail loudly until then instead of returning an optimizer that was never loaded
+            raise NotImplementedError("Loading a sharded optimizer checkpoint is not implemented yet")
+        return optimizer
+
+    def save_optimizer(self, optimizer: Optimizer, checkpoint: str, shard: bool = False, size_per_shard: int = 1024):
+        if shard:
+            # TODO(FrankLeeeee): implement checkpoint saving to sharded checkpoint
+            raise NotImplementedError("Saving a sharded optimizer checkpoint is not implemented yet")
+        else:
+            self.save_checkpoint(optimizer.state_dict(), checkpoint)
diff --git a/tests/test_checkpoint_io/test_general_checkpoint_io.py b/tests/test_checkpoint_io/test_general_checkpoint_io.py
new file mode 100644
index 000000000000..48376aaa88bf
--- /dev/null
+++ b/tests/test_checkpoint_io/test_general_checkpoint_io.py
@@ -0,0 +1,70 @@
+import tempfile
+
+import torch
+from torch.optim import Adam
+from torchvision.models import resnet18
+
+from colossalai.checkpoint_io import GeneralCheckpointIO
+
+# ========
+# Note:
+# 1. since checkpoint IO can be quite slow if tested with all models, we will only test on resnet for now
+# 2. we will test on both sharded and unsharded checkpoints
+# 3. TODO(FrankLeeeee): implement sharded checkpoint and test it
+# ========
+
+
+def test_unsharded_checkpoint():
+    # create a model and optimizer
+    model = resnet18()
+    optimizer = Adam(model.parameters(), lr=0.001)
+
+    # create test data sample
+    x = torch.randn(1, 3, 224, 224)
+
+    # run fwd and bwd
+    y = model(x)
+    loss = y.sum()
+    loss.backward()
+    optimizer.step()
+
+    # create a temp file for checkpoint
+    model_ckpt_tempfile = tempfile.NamedTemporaryFile()
+    optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile()
+
+    # save the model and optimizer
+    ckpt_io = GeneralCheckpointIO()
+    ckpt_io.save_model(model, model_ckpt_tempfile.name)
+    ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name)
+
+    # create new model
+    new_model = resnet18()
+    new_optimizer = Adam(new_model.parameters(), lr=0.001)
+
+    # load the model and optimizer
+    new_model = ckpt_io.load_model(new_model, model_ckpt_tempfile.name)
+    new_optimizer = ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name)
+
+    # do recursive check for the optimizer state dict
+    # if the value is a dict, compare its values
+    # if the value is a list, compare all elements one-by-one
+    # if the value is a torch.Tensor, use torch.equal
+    # otherwise use a plain equality assert
+    def recursive_check(d1, d2):
+        for k, v in d1.items():
+            if isinstance(v, dict):
+                recursive_check(v, d2[k])
+            elif isinstance(v, list):
+                for i in range(len(v)):
+                    if isinstance(v[i], torch.Tensor):
+                        assert torch.equal(v[i], d2[k][i])
+                    else:
+                        assert v[i] == d2[k][i]
+            elif isinstance(v, torch.Tensor):
+                assert torch.equal(v, d2[k])
+            else:
+                assert v == d2[k]
+
+    # check for model and optimizer state dict recursively
+    recursive_check(model.state_dict(), new_model.state_dict())
+    recursive_check(optimizer.state_dict(), new_optimizer.state_dict())

From 4fd4bd9d9a88bde184d347a4b283b117e5025630 Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Thu, 23 Mar 2023 16:46:20 +0800
Subject: [PATCH 501/503] [chatgpt] support instuct training (#3216)

---
 .../ChatGPT/chatgpt/dataset/__init__.py       |   4 +-
 .../ChatGPT/chatgpt/dataset/sft_dataset.py    | 122 +++++++++++++++++-
 applications/ChatGPT/chatgpt/dataset/utils.py |  15 +++
 .../ChatGPT/chatgpt/models/llama/__init__.py  |   3 +-
 .../ChatGPT/chatgpt/models/llama/llama_lm.py  |  38 ++++++
 applications/ChatGPT/chatgpt/trainer/sft.py   |  50 ++++---
 .../ChatGPT/chatgpt/utils/__init__.py         |   3 +
 .../ChatGPT/chatgpt/utils/tokenizer_utils.py  |  74 +++++++++++
 applications/ChatGPT/examples/train_sft.py    |  43 ++++--
 9 files changed, 313 insertions(+), 39 deletions(-)
 create mode 100644 applications/ChatGPT/chatgpt/models/llama/llama_lm.py
 create mode 100644 applications/ChatGPT/chatgpt/utils/__init__.py
 create mode 100644 applications/ChatGPT/chatgpt/utils/tokenizer_utils.py

diff --git a/applications/ChatGPT/chatgpt/dataset/__init__.py b/applications/ChatGPT/chatgpt/dataset/__init__.py
index 78fd2c0705a9..df484f46d24c 100644
--- a/applications/ChatGPT/chatgpt/dataset/__init__.py
+++ b/applications/ChatGPT/chatgpt/dataset/__init__.py
@@ -1,5 +1,5 @@
 from .reward_dataset import RmStaticDataset, HhRlhfDataset
 from .utils import is_rank_0
-from .sft_dataset import SFTDataset
+from .sft_dataset import SFTDataset, AlpacaDataset, AlpacaDataCollator
 
-__all__ = ['RmStaticDataset', 'HhRlhfDataset','is_rank_0', 'SFTDataset']
+__all__ = ['RmStaticDataset', 'HhRlhfDataset','is_rank_0', 'SFTDataset', 'AlpacaDataset', 'AlpacaDataCollator']
diff --git a/applications/ChatGPT/chatgpt/dataset/sft_dataset.py b/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
index 53ad205073e5..67e1b761c60f 100644
--- a/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
+++ b/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
@@ -1,12 +1,46 @@
-from typing import Callable
+#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+import copy
+from dataclasses import dataclass, field
+from typing import Callable, Dict, Sequence
 import random
 from torch.utils.data import Dataset
 import torch.distributed as dist
 from tqdm import tqdm
 import torch
 
-from .utils import is_rank_0
+from .utils import is_rank_0, jload
+
+import transformers
+from colossalai.logging import get_dist_logger
 
+logger = get_dist_logger()
+
+IGNORE_INDEX = -100
+PROMPT_DICT = {
+    "prompt_input": (
+        "Below is an instruction that describes a task, paired with an input that provides further context. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
+    ),
+    "prompt_no_input": (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Response:"
+    ),
+}
 
 class SFTDataset(Dataset):
     """
@@ -38,3 +72,87 @@ def __len__(self):
 
     def __getitem__(self, idx):
         return self.prompts[idx]
+    
+
+def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
+    """Tokenize a list of strings."""
+    tokenized_list = [
+        tokenizer(
+            text,
+            return_tensors="pt",
+            padding="longest",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+        )
+        for text in strings
+    ]
+    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
+    input_ids_lens = labels_lens = [
+        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
+    ]
+    return dict(
+        input_ids=input_ids,
+        labels=labels,
+        input_ids_lens=input_ids_lens,
+        labels_lens=labels_lens,
+    )
+
+def preprocess(
+    sources: Sequence[str],
+    targets: Sequence[str],
+    tokenizer: transformers.PreTrainedTokenizer,
+) -> Dict:
+    """Preprocess the data by tokenizing."""
+    examples = [s + t for s, t in zip(sources, targets)]
+    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
+    input_ids = examples_tokenized["input_ids"]
+    labels = copy.deepcopy(input_ids)
+    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
+        label[:source_len] = IGNORE_INDEX
+    return dict(input_ids=input_ids, labels=labels)
+
+class AlpacaDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
+        super(AlpacaDataset, self).__init__()
+        logger.info("Loading data...")
+        list_data_dict = jload(data_path)
+
+        logger.info("Formatting inputs...")
+        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
+        sources = [
+            prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
+            for example in list_data_dict
+        ]
+        targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
+
+        logger.info("Tokenizing inputs... This may take some time...")
+        data_dict = preprocess(sources, targets, tokenizer)
+
+        self.input_ids = data_dict["input_ids"]
+        self.labels = data_dict["labels"]
+
+    def __len__(self):
+        return len(self.input_ids)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        return dict(input_ids=self.input_ids[i], labels=self.labels[i])
+    
+@dataclass
+class AlpacaDataCollator(object):
+    """Collate examples for supervised fine-tuning."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
+        )
+        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+        return dict(
+            input_ids=input_ids,
+            labels=labels,
+            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
+        )
diff --git a/applications/ChatGPT/chatgpt/dataset/utils.py b/applications/ChatGPT/chatgpt/dataset/utils.py
index 6c9f7f085f8c..0e88cc8c39b4 100644
--- a/applications/ChatGPT/chatgpt/dataset/utils.py
+++ b/applications/ChatGPT/chatgpt/dataset/utils.py
@@ -1,5 +1,20 @@
+import io
+import json
+
 import torch.distributed as dist
 
 
 def is_rank_0() -> bool:
     return not dist.is_initialized() or dist.get_rank() == 0
+
+def _make_r_io_base(f, mode: str):
+    if not isinstance(f, io.IOBase):
+        f = open(f, mode=mode)
+    return f
+
+def jload(f, mode="r"):
+    """Load JSON from a path or an open file object; the handle is closed afterwards."""
+    f = _make_r_io_base(f, mode)
+    with f:    # closes the handle even when json.load raises
+        jdict = json.load(f)
+    return jdict
\ No newline at end of file
diff --git a/applications/ChatGPT/chatgpt/models/llama/__init__.py b/applications/ChatGPT/chatgpt/models/llama/__init__.py
index 9b2a024afdb2..3edb51e14376 100644
--- a/applications/ChatGPT/chatgpt/models/llama/__init__.py
+++ b/applications/ChatGPT/chatgpt/models/llama/__init__.py
@@ -1,5 +1,6 @@
 from .llama_actor import LlamaActor
 from .llama_critic import LlamaCritic
 from .llama_rm import LlamaRM
+from .llama_lm import LlamaLM
 
-__all__ = ['LlamaActor', 'LlamaCritic', 'LlamaRM']
+__all__ = ['LlamaActor', 'LlamaCritic', 'LlamaRM', 'LlamaLM']
diff --git a/applications/ChatGPT/chatgpt/models/llama/llama_lm.py b/applications/ChatGPT/chatgpt/models/llama/llama_lm.py
new file mode 100644
index 000000000000..c63077b1ac04
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/models/llama/llama_lm.py
@@ -0,0 +1,38 @@
+from typing import Optional
+
+from transformers import LlamaConfig, LlamaForCausalLM
+
+from ..base import LM
+
+
+class LlamaLM(LM):
+    """
+    Llama language model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (LlamaConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self,
+                 pretrained: Optional[str] = None,
+                 config: Optional[LlamaConfig] = None,
+                 checkpoint: bool = False,
+                 lora_rank: int = 0,
+                 lora_train_bias: str = 'none') -> None:
+
+        if pretrained is not None:
+            model = LlamaForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            model = LlamaForCausalLM(config)
+        else:
+            model = LlamaForCausalLM(LlamaConfig())
+
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+            
+        super().__init__(model, lora_rank, lora_train_bias)
+
diff --git a/applications/ChatGPT/chatgpt/trainer/sft.py b/applications/ChatGPT/chatgpt/trainer/sft.py
index e3913d46bd45..dd5cd35f5f4d 100644
--- a/applications/ChatGPT/chatgpt/trainer/sft.py
+++ b/applications/ChatGPT/chatgpt/trainer/sft.py
@@ -2,7 +2,6 @@
 from typing import Optional
 import loralib as lora
 import torch
-from chatgpt.dataset import SFTDataset
 from chatgpt.models.loss import GPTLMLoss
 from torch.optim import Adam, Optimizer
 from torch.utils.data import DataLoader
@@ -22,8 +21,8 @@ class SFTTrainer(ABC):
         model (torch.nn.Module): the model to train
         strategy (Strategy): the strategy to use for training
         optim(Optimizer): the optimizer to use for training
-        train_dataset (SFTDataset or SFTDistributedDataset): the dataset to use for training
-        eval_dataset (SFTDataset or SFTDistributedDataset): the dataset to use for evaluation
+        train_dataloader: the dataloader to use for training
+        eval_dataloader: the dataloader to use for evaluation
         batch_size (int, defaults to 1): the batch size while training
         max_epochs (int, defaults to 2): the number of epochs to train
         optim_kwargs (dict, defaults to {'lr':1e-4}): the kwargs to use while initializing optimizer
@@ -34,8 +33,8 @@ def __init__(
         model,
         strategy: Strategy,
         optim: Optimizer,
-        train_dataset: SFTDataset,
-        eval_dataset: SFTDataset,
+        train_dataloader: DataLoader,
+        eval_dataloader: DataLoader = None,
         sampler: Optional[DistributedSampler] = None,
         batch_size: int = 1,
         max_epochs: int = 2,
@@ -43,13 +42,10 @@ def __init__(
         super().__init__()
         self.strategy = strategy
         self.epochs = max_epochs
-        self.train_dataset = train_dataset
-        self.eval_dataset = eval_dataset
         self.sampler = sampler
 
-        self.train_dataloader = DataLoader(self.train_dataset, shuffle=(sampler is None),
-                                           sampler=sampler, batch_size=batch_size)
-        self.eval_dataloader = DataLoader(self.eval_dataset, batch_size=batch_size)
+        self.train_dataloader = train_dataloader
+        self.eval_dataloader = eval_dataloader
 
         self.model = strategy.setup_model(model)
         if "DDP" in str(self.strategy):
@@ -79,23 +75,25 @@ def fit(self, logger, use_lora, log_interval=10):
                     logger.info(f'Train Epoch {epoch}/{self.epochs} Batch {batch_id} Rank {dist.get_rank()} loss {loss.item()}')
 
             # eval
-            self.model.eval()
-            with torch.no_grad():
-                loss_sum = 0
-                num_seen = 0
-                for batch in self.eval_dataloader:
-                    prompt_ids = batch["input_ids"]
-                    p_mask = batch["attention_mask"]
-                    prompt_ids = prompt_ids.squeeze(1).cuda()
-                    p_mask = p_mask.squeeze(1).cuda()
+            if self.eval_dataloader is not None:
+                self.model.eval()
+                with torch.no_grad():
+                    loss_sum = 0
+                    num_seen = 0
+                    for batch in self.eval_dataloader:
+                        prompt_ids = batch["input_ids"]
+                        p_mask = batch["attention_mask"]
+                        prompt_ids = prompt_ids.squeeze(1).cuda()
+                        p_mask = p_mask.squeeze(1).cuda()
 
-                    prompt_logits = self.model(prompt_ids, attention_mask=p_mask)
-                    loss = self.loss_fn(prompt_logits, prompt_ids)
-                    loss_sum += loss.item()
-                    num_seen += prompt_ids.size(0)
+                        prompt_logits = self.model(prompt_ids, attention_mask=p_mask)
+                        loss = self.loss_fn(prompt_logits, prompt_ids)
+                        loss_sum += loss.item()
+                        num_seen += prompt_ids.size(0)
 
-                loss_mean = loss_sum / num_seen
-                if dist.get_rank() == 0:
-                    logger.info(f'Eval Epoch {epoch}/{self.epochs} loss {loss_mean}')
+                    loss_mean = loss_sum / num_seen
+                    if dist.get_rank() == 0:
+                        logger.info(f'Eval Epoch {epoch}/{self.epochs} loss {loss_mean}')
+                        
             epoch_bar.update()
 
diff --git a/applications/ChatGPT/chatgpt/utils/__init__.py b/applications/ChatGPT/chatgpt/utils/__init__.py
new file mode 100644
index 000000000000..8f526d7efdad
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/utils/__init__.py
@@ -0,0 +1,3 @@
+from .tokenizer_utils import smart_tokenizer_and_embedding_resize, prepare_llama_tokenizer_and_embedding
+
+__all__ = ['smart_tokenizer_and_embedding_resize', 'prepare_llama_tokenizer_and_embedding']
\ No newline at end of file
diff --git a/applications/ChatGPT/chatgpt/utils/tokenizer_utils.py b/applications/ChatGPT/chatgpt/utils/tokenizer_utils.py
new file mode 100644
index 000000000000..8699bf64c7b5
--- /dev/null
+++ b/applications/ChatGPT/chatgpt/utils/tokenizer_utils.py
@@ -0,0 +1,74 @@
+#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+from typing import Dict
+
+import transformers
+
+DEFAULT_PAD_TOKEN = "[PAD]"
+DEFAULT_EOS_TOKEN = "</s>"
+DEFAULT_BOS_TOKEN = "</s>"
+DEFAULT_UNK_TOKEN = "</s>"
+
+def prepare_llama_tokenizer_and_embedding(
+    tokenizer: transformers.PreTrainedTokenizer,
+    model: transformers.PreTrainedModel,
+    special_tokens_dict: Dict = dict(pad_token=DEFAULT_PAD_TOKEN),
+):
+    """Add the tokens in ``special_tokens_dict`` (resizing ``model``'s embeddings) when
+    ``tokenizer`` has no pad token, then set the default EOS/BOS/UNK tokens.
+    Returns the same tokenizer object."""
+
+    if tokenizer.pad_token is None:
+        smart_tokenizer_and_embedding_resize(
+            special_tokens_dict=special_tokens_dict,    # honour the caller's override
+            tokenizer=tokenizer,
+            model=model,
+        )
+
+    tokenizer.add_special_tokens(
+        {
+            "eos_token": DEFAULT_EOS_TOKEN,
+            "bos_token": DEFAULT_BOS_TOKEN,
+            "unk_token": DEFAULT_UNK_TOKEN,
+        }
+    )
+
+    return tokenizer
+
+
+def smart_tokenizer_and_embedding_resize(
+    tokenizer: transformers.PreTrainedTokenizer,
+    model: transformers.PreTrainedModel,
+    special_tokens_dict: Dict = dict(pad_token=DEFAULT_PAD_TOKEN),
+):
+    """Resize tokenizer and embedding.
+
+    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
+    """
+
+    if tokenizer.pad_token is None:
+        num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+        model.resize_token_embeddings(len(tokenizer))
+
+        if num_new_tokens > 0:
+            input_embeddings = model.get_input_embeddings().weight.data
+            output_embeddings = model.get_output_embeddings().weight.data
+
+            input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+            output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+
+            input_embeddings[-num_new_tokens:] = input_embeddings_avg
+            output_embeddings[-num_new_tokens:] = output_embeddings_avg
+            
\ No newline at end of file
diff --git a/applications/ChatGPT/examples/train_sft.py b/applications/ChatGPT/examples/train_sft.py
index 4b3f85a2a491..83b34f9dd1ea 100644
--- a/applications/ChatGPT/examples/train_sft.py
+++ b/applications/ChatGPT/examples/train_sft.py
@@ -4,15 +4,18 @@
 import torch
 import torch.distributed as dist
 from torch.utils.data.distributed import DistributedSampler
-from chatgpt.dataset import SFTDataset
+from chatgpt.dataset import SFTDataset, AlpacaDataset, AlpacaDataCollator
 from chatgpt.models.base import RewardModel
 from chatgpt.models.bloom import BLOOMLM
 from chatgpt.models.gpt import GPTLM
 from chatgpt.models.opt import OPTLM
+from chatgpt.models.llama import LlamaLM
 from chatgpt.trainer import SFTTrainer
 from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from chatgpt.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
 from torch.optim import Adam
+from torch.utils.data import DataLoader
 from transformers import AutoTokenizer, BloomTokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
@@ -41,6 +44,8 @@ def train(args):
             model = OPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
         elif args.model == 'gpt2':
             model = GPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
+        elif args.model == 'llama':
+            model = LlamaLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
         else:
             raise ValueError(f'Unsupported model "{args.model}"')
 
@@ -53,9 +58,19 @@ def train(args):
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'opt':
         tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    elif args.model == 'llama':
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrain,
+            padding_side="right",
+            use_fast=False,
+        )
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
-    tokenizer.pad_token = tokenizer.eos_token
+    
+    if args.model == 'llama':
+        tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, model)
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
 
     max_len = 512
 
@@ -67,11 +82,19 @@ def train(args):
 
     logger = get_dist_logger()
 
-    train_data = load_dataset(args.dataset, 'super_natural_instructions', split='train')
-    eval_data = load_dataset(args.dataset, 'super_natural_instructions', split='test')
+    # configure dataset
+    if args.dataset == 'yizhongw/self_instruct':
+        train_data = load_dataset(args.dataset, 'super_natural_instructions', split='train')
+        eval_data = load_dataset(args.dataset, 'super_natural_instructions', split='test')
 
-    train_dataset = SFTDataset(train_data, tokenizer, max_len)
-    eval_dataset = SFTDataset(eval_data, tokenizer, max_len)
+        train_dataset = SFTDataset(train_data, tokenizer, max_len)
+        eval_dataset = SFTDataset(eval_data, tokenizer, max_len)
+
+    elif 'alpaca' in args.dataset:
+        train_dataset = AlpacaDataset(tokenizer=tokenizer, data_path=args.dataset)
+        eval_dataset = None    # no evaluation split is loaded for Alpaca-style data
+        # pads each batch and builds the attention mask (see AlpacaDataCollator)
+        data_collator = AlpacaDataCollator(tokenizer=tokenizer)
 
     if dist.is_initialized() and dist.get_world_size() > 1:
         sampler = DistributedSampler(train_dataset, shuffle=True, seed=42, drop_last=True)
@@ -79,11 +102,15 @@ def train(args):
     else:
         sampler = None
 
+    # only the Alpaca branch defines data_collator; every other dataset keeps DataLoader's default collation
+    collate_fn = data_collator if isinstance(train_dataset, AlpacaDataset) else None
+    train_dataloader = DataLoader(train_dataset, shuffle=(sampler is None), sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn)
+    eval_dataloader = DataLoader(eval_dataset, batch_size=args.batch_size, collate_fn=collate_fn) if eval_dataset is not None else None
     trainer = SFTTrainer(model=model,
                          strategy=strategy,
                          optim=optim,
-                         train_dataset=train_dataset,
-                         eval_dataset=eval_dataset,
+                         train_dataloader=train_dataloader,
+                         eval_dataloader=eval_dataloader,
                          sampler=sampler,
                          batch_size=args.batch_size,
                          max_epochs=args.max_epochs)

From fa97a9cab4e0aa3b3fe188a2193eee6b09bc38e0 Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Thu, 23 Mar 2023 17:38:30 +0800
Subject: [PATCH 502/503] [chatgpt] unify datasets (#3218)

---
 applications/ChatGPT/chatgpt/dataset/sft_dataset.py | 11 ++++++++---
 applications/ChatGPT/chatgpt/trainer/sft.py         |  6 ++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/dataset/sft_dataset.py b/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
index 67e1b761c60f..11ec61908aef 100644
--- a/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
+++ b/applications/ChatGPT/chatgpt/dataset/sft_dataset.py
@@ -54,7 +54,8 @@ class SFTDataset(Dataset):
 
     def __init__(self, dataset, tokenizer: Callable, max_length: int=512) -> None:
         super().__init__()
-        self.prompts = []
+        # self.prompts = []
+        self.input_ids = []
 
         for data in tqdm(dataset, disable=not is_rank_0()):
             prompt = data['prompt'] + data['completion'] + "<|endoftext|>"
@@ -64,14 +65,18 @@ def __init__(self, dataset, tokenizer: Callable, max_length: int=512) -> None:
                                      truncation=True,
                                      return_tensors="pt")
 
-            self.prompts.append(prompt_token)
+            self.input_ids.append(prompt_token)
+
+        # labels mirror the inputs; copy once after the loop instead of on every sample
+        self.labels = copy.deepcopy(self.input_ids)
 
     def __len__(self):
-        length = len(self.prompts)
-        return length
+        # self.prompts no longer exists; the samples now live in self.input_ids
+        return len(self.input_ids)
 
     def __getitem__(self, idx):
-        return self.prompts[idx]
+        # same sample layout as AlpacaDataset.__getitem__
+        return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
     
 
 def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
diff --git a/applications/ChatGPT/chatgpt/trainer/sft.py b/applications/ChatGPT/chatgpt/trainer/sft.py
index dd5cd35f5f4d..3b35f516816f 100644
--- a/applications/ChatGPT/chatgpt/trainer/sft.py
+++ b/applications/ChatGPT/chatgpt/trainer/sft.py
@@ -63,11 +63,13 @@ def fit(self, logger, use_lora, log_interval=10):
             for batch_id, batch in enumerate(self.train_dataloader):
                 prompt_ids = batch["input_ids"]
                 p_mask = batch["attention_mask"]
+                labels = batch["labels"].squeeze(1).cuda()    # same shape/device handling as prompt_ids and p_mask below
                 prompt_ids = prompt_ids.squeeze(1).cuda()
                 p_mask = p_mask.squeeze(1).cuda()
-                prompt_logits = self.model(prompt_ids, attention_mask=p_mask)
+                # prompt_logits = self.model(prompt_ids, attention_mask=p_mask, labels=labels)
+                loss, prompt_logits = self.model(prompt_ids, attention_mask=p_mask, labels=labels)
 
-                loss = self.loss_fn(prompt_logits, prompt_ids)
+                # loss = self.loss_fn(prompt_logits, labels)
                 self.strategy.backward(loss, self.model, self.optimizer)
                 self.strategy.optimizer_step(self.optimizer)
                 self.optimizer.zero_grad()

From bbac6760e59beed8be6d74f62f9589c8f7240cda Mon Sep 17 00:00:00 2001
From: Fazzie-Maqianli <55798671+Fazziekey@users.noreply.github.com>
Date: Thu, 23 Mar 2023 20:56:35 +0800
Subject: [PATCH 503/503] fix torch version (#3225)

---
 .../chatgpt/trainer/strategies/colossalai.py  | 20 ++++++++++++++-----
 applications/ChatGPT/requirements.txt         |  2 +-
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
index b20b02d3d34d..64ebf12f1922 100644
--- a/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
+++ b/applications/ChatGPT/chatgpt/trainer/strategies/colossalai.py
@@ -9,6 +9,10 @@
 from chatgpt.models.lora import LoraLinear
 from torch.optim import Optimizer
 
+
+from transformers.modeling_utils import PreTrainedModel
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
 import colossalai
 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 from colossalai.nn.parallel import ZeroDDP, zero_model_wrapper, zero_optim_wrapper
@@ -143,7 +147,7 @@ def _unwrap_actor(actor: Actor) -> nn.Module:
             return model.module
         return model
 
-    def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
+    def save_model(self, model: nn.Module, path: str, only_rank0: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
         unwrapped_model = self._unwrap_model(model)
         # TODO : better way to get torch model from gemini model
         # to get torch model from gemini model
@@ -159,10 +163,16 @@ def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> N
                 module.merge_weights=True
                 module.eval()
         # get state_dict and save
-        state_dict = unwrapped_model.state_dict()
-        if only_rank0 and dist.get_rank() != 0:
-            return
-        torch.save(state_dict, path)
+
+        if not isinstance(self.model, PreTrainedModel):
+            state_dict = unwrapped_model.state_dict()
+            if only_rank0 and dist.get_rank() != 0:
+                return
+            torch.save(state_dict, path)
+        else:
+            self.model.save_pretrained(path)
+            if tokenizer is not None:
+                tokenizer.save_pretrained(path)
 
     def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
         if only_rank0:
diff --git a/applications/ChatGPT/requirements.txt b/applications/ChatGPT/requirements.txt
index 15a960c2c650..3469111925ff 100644
--- a/applications/ChatGPT/requirements.txt
+++ b/applications/ChatGPT/requirements.txt
@@ -3,5 +3,5 @@ tqdm
 datasets
 loralib
 colossalai>=0.2.4
-torch
+torch==1.12.1
 langchain